feat(monitoring): MacMiniRunnerOffline alert (Sprint 28 reconcile)
This commit is contained in:
@@ -75,6 +75,20 @@ data:
|
||||
cluster: "rke2"
|
||||
role: "agent"
|
||||
|
||||
# Static scrape job for the Mac mini macOS runner node (INFRA VLAN).
# NOTE(review): Prometheus rejects a scrape_timeout that exceeds the effective
# scrape_interval; the interval is not visible in this hunk — confirm it is
# at least 15s.
- job_name: "macmini-node"
  scrape_timeout: 15s
  static_configs:
    - targets:
        - "10.0.56.115:9100"
      labels:
        # Host identity.
        instance: "macmini"
        host: "macmini.iamworkin.lan"
        arch: "arm64"
        vlan: "infra"
        role: "macos-runner"
        # Configuration-management metadata.
        puppet_managed: "true"
        puppet_server: "puppet.iamworkin.lan"
|
||||
|
||||
# In-cluster node-exporter DaemonSet
|
||||
- job_name: "k8s-node-exporter"
|
||||
kubernetes_sd_configs:
|
||||
@@ -697,6 +711,18 @@ data:
|
||||
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
|
||||
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
|
||||
|
||||
# Alert rules for the macmini-* GitHub Actions runners.
- name: macmini-runners
  rules:
    # Fires when a macmini-* runner reports flowercore_github_runner_online == 0,
    # or when no macmini-* series exists at all (absent branch), for 10 minutes.
    - alert: MacMiniRunnerOffline
      expr: (flowercore_github_runner_online{runner=~"macmini-.*"} == 0) or absent(flowercore_github_runner_online{runner=~"macmini-.*"})
      for: 10m
      labels:
        severity: warning
        service: github-runner
      annotations:
        # absent() only copies labels from equality matchers, so with the regex
        # matcher above the `runner` label is empty on the absent branch. Fall
        # back to fixed text instead of rendering "offline ()".
        summary: "Mac mini GitHub runner offline ({{ if $labels.runner }}{{ $labels.runner }}{{ else }}no macmini-* runner metrics reported{{ end }})"
        description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-<slug>.plist; runners survive reboot and do not require a GUI session."
|
||||
|
||||
- name: remote-desktop
|
||||
rules:
|
||||
- alert: RemoteDesktopWebDown
|
||||
@@ -3427,6 +3453,32 @@ data:
|
||||
relativeTimeRange: {from: 120, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||
# Grafana-managed counterpart of MacMiniRunnerOffline: fires when the lowest
# macmini-* runner online value is below 1 for 10 minutes.
- uid: macmini-runner-offline
  title: MacMiniRunnerOffline
  condition: C
  for: 10m
  noDataState: Alerting
  execErrState: OK
  annotations:
    summary: Mac mini GitHub runner offline
    description: "One or more macmini-* GitHub Actions runners have not reported online for more than 10 minutes. LaunchDaemons survive reboot and do not require the bluejay GUI session."
    runbook: "1. ssh fcadmin@macmini.iamworkin.lan 2. launchctl print system/io.flowercore.github-runner-<slug> 3. Check /Users/fcadmin/Library/Logs/github-runners/<slug>/stderr.log 4. Re-register the repo runner if .runner is missing"
  labels:
    severity: warning
    service: github-runner
  data:
    # A: minimum online value across all macmini-* runners. The aggregation
    # must happen BEFORE `or vector(0)`: min() yields a series with an empty
    # label set, which matches vector(0), so the 0 fallback is used only when
    # no runner series exists. With `or vector(0)` inside min(), the unlabelled
    # 0 sample is always appended next to the labelled runner series and the
    # minimum is always 0, making the alert fire permanently.
    - refId: A
      relativeTimeRange: {from: 600, to: 0}
      datasourceUid: prometheus
      model: {expr: 'min(flowercore_github_runner_online{runner=~"macmini-.*"}) or vector(0)', instant: true, refId: A}
    # B: reduce A to its last value.
    - refId: B
      relativeTimeRange: {from: 600, to: 0}
      datasourceUid: __expr__
      model: {type: reduce, expression: A, reducer: last, refId: B}
    # C: alert condition — true when B < 1.
    - refId: C
      relativeTimeRange: {from: 600, to: 0}
      datasourceUid: __expr__
      model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||
- uid: high-cpu
|
||||
title: High CPU (>85%)
|
||||
condition: C
|
||||
|
||||
Reference in New Issue
Block a user