feat(monitoring): MacMiniRunnerOffline alert (Sprint 28 reconcile)
This commit is contained in:
@@ -75,6 +75,20 @@ data:
|
|||||||
cluster: "rke2"
|
cluster: "rke2"
|
||||||
role: "agent"
|
role: "agent"
|
||||||
|
|
||||||
|
# Mac mini macOS runner node (INFRA VLAN), scraped as a single static target.
- job_name: 'macmini-node'
  scrape_timeout: 15s
  static_configs:
    - targets:
        - '10.0.56.115:9100'
      labels:
        instance: 'macmini'
        host: 'macmini.iamworkin.lan'
        vlan: 'infra'
        arch: 'arm64'
        role: 'macos-runner'
        # Quoted so the value stays a string label rather than a YAML boolean.
        puppet_managed: 'true'
        puppet_server: 'puppet.iamworkin.lan'
|
||||||
|
|
||||||
# In-cluster node-exporter DaemonSet
|
# In-cluster node-exporter DaemonSet
|
||||||
- job_name: "k8s-node-exporter"
|
- job_name: "k8s-node-exporter"
|
||||||
kubernetes_sd_configs:
|
kubernetes_sd_configs:
|
||||||
@@ -697,6 +711,18 @@ data:
|
|||||||
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
|
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
|
||||||
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
|
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
|
||||||
|
|
||||||
|
# Alert rules for the Puppet-managed macmini-* GitHub Actions runners.
- name: macmini-runners
  rules:
    # Fires when any macmini-* runner reports 0, or when no macmini-* series
    # exists at all (the absent() branch), sustained for 10 minutes.
    - alert: MacMiniRunnerOffline
      expr: (flowercore_github_runner_online{runner=~"macmini-.*"} == 0) or absent(flowercore_github_runner_online{runner=~"macmini-.*"})
      for: 10m
      labels:
        severity: warning
        service: github-runner
      annotations:
        # absent() with a regex matcher returns a series that carries no
        # `runner` label, so fall back to fixed text instead of rendering "()".
        summary: "Mac mini GitHub runner offline ({{ if $labels.runner }}{{ $labels.runner }}{{ else }}no macmini-* runner metrics{{ end }})"
        description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-<slug>.plist; runners survive reboot and do not require a GUI session."
|
||||||
|
|
||||||
- name: remote-desktop
|
- name: remote-desktop
|
||||||
rules:
|
rules:
|
||||||
- alert: RemoteDesktopWebDown
|
- alert: RemoteDesktopWebDown
|
||||||
@@ -3427,6 +3453,32 @@ data:
|
|||||||
relativeTimeRange: {from: 120, to: 0}
|
relativeTimeRange: {from: 120, to: 0}
|
||||||
datasourceUid: __expr__
|
datasourceUid: __expr__
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||||
|
# Grafana alert rule: fires when any macmini-* runner is offline, or when no
# macmini-* runner series is reported at all, for 10 minutes.
- uid: macmini-runner-offline
  title: MacMiniRunnerOffline
  condition: C
  for: 10m
  noDataState: Alerting
  # NOTE(review): OK means a failing query never raises this alert; confirm
  # that is intended for an "offline" check.
  execErrState: OK
  annotations:
    summary: Mac mini GitHub runner offline
    description: "One or more macmini-* GitHub Actions runners have not reported online for more than 10 minutes. LaunchDaemons survive reboot and do not require the bluejay GUI session."
    runbook: "1. ssh fcadmin@macmini.iamworkin.lan 2. launchctl print system/io.flowercore.github-runner-<slug> 3. Check /Users/fcadmin/Library/Logs/github-runners/<slug>/stderr.log 4. Re-register the repo runner if .runner is missing"
  labels:
    severity: warning
    service: github-runner
  data:
    # A: lowest online value across macmini-* runners. The aggregation must
    # happen BEFORE `or vector(0)`: vector(0) has an empty label set that
    # never matches a labelled runner series, so `min(X or vector(0))` would
    # always include the 0 sample and evaluate to 0 even with every runner
    # online. `min(X) or vector(0)` only substitutes 0 when no series exist.
    - refId: A
      relativeTimeRange: {from: 600, to: 0}
      datasourceUid: prometheus
      model: {expr: 'min(flowercore_github_runner_online{runner=~"macmini-.*"}) or vector(0)', instant: true, refId: A}
    # B: reduce A to its last value.
    - refId: B
      relativeTimeRange: {from: 600, to: 0}
      datasourceUid: __expr__
      model: {type: reduce, expression: A, reducer: last, refId: B}
    # C: alert condition, true when B is below 1.
    - refId: C
      relativeTimeRange: {from: 600, to: 0}
      datasourceUid: __expr__
      model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||||
- uid: high-cpu
|
- uid: high-cpu
|
||||||
title: High CPU (>85%)
|
title: High CPU (>85%)
|
||||||
condition: C
|
condition: C
|
||||||
|
|||||||
Reference in New Issue
Block a user