NOTE(review): line breaks and YAML indentation below were reconstructed from a whitespace-collapsed copy of this patch;
confirm them (and the <runner> placeholders) against the target file before applying. The post-image hash on the index line predates these edits.

diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml
index 1698160..dd934b4 100644
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -75,6 +75,21 @@ data:
               cluster: "rke2"
               role: "agent"
 
+      # Mac mini macOS runner node (INFRA VLAN)
+      - job_name: "macmini-node"
+        # Prometheus rejects a scrape_timeout greater than the job's effective scrape_interval.
+        scrape_timeout: 15s
+        static_configs:
+          - targets: ["10.0.56.115:9100"]
+            labels:
+              instance: "macmini"
+              host: "macmini.iamworkin.lan"
+              vlan: "infra"
+              arch: "arm64"
+              role: "macos-runner"
+              puppet_managed: "true"
+              puppet_server: "puppet.iamworkin.lan"
+
       # In-cluster node-exporter DaemonSet
       - job_name: "k8s-node-exporter"
         kubernetes_sd_configs:
@@ -697,6 +712,20 @@ data:
               summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
               description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
 
+      # Mac mini GitHub Actions runner availability
+      - name: macmini-runners
+        rules:
+          - alert: MacMiniRunnerOffline
+            # The absent() branch fires when no macmini-* series exists; that sample carries no runner label.
+            expr: (flowercore_github_runner_online{runner=~"macmini-.*"} == 0) or absent(flowercore_github_runner_online{runner=~"macmini-.*"})
+            for: 10m
+            labels:
+              severity: warning
+              service: github-runner
+            annotations:
+              summary: "Mac mini GitHub runner offline ({{ if $labels.runner }}{{ $labels.runner }}{{ else }}no macmini-* series reporting{{ end }})"
+              description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-<runner>.plist; runners survive reboot and do not require a GUI session."
+
       - name: remote-desktop
         rules:
           - alert: RemoteDesktopWebDown
@@ -3427,6 +3456,34 @@ data:
                 relativeTimeRange: {from: 120, to: 0}
                 datasourceUid: __expr__
                 model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
+          # Alert rule on the same macmini-* runner-online metric as the MacMiniRunnerOffline Prometheus rule
+          - uid: macmini-runner-offline
+            title: MacMiniRunnerOffline
+            condition: C
+            for: 10m
+            noDataState: Alerting
+            execErrState: OK
+            annotations:
+              summary: Mac mini GitHub runner offline
+              description: "One or more macmini-* GitHub Actions runners have not reported online for more than 10 minutes. LaunchDaemons survive reboot and do not require the bluejay GUI session."
+              runbook: "1. ssh fcadmin@macmini.iamworkin.lan 2. launchctl print system/io.flowercore.github-runner-<runner> 3. Check /Users/fcadmin/Library/Logs/github-runners/<runner>/stderr.log 4. Re-register the repo runner if .runner is missing"
+            labels:
+              severity: warning
+              service: github-runner
+            data:
+              # A: lowest online value across macmini-* runners, falling back to 0 only when no series exists; B: last value of A; C: true when B < 1
+              - refId: A
+                relativeTimeRange: {from: 600, to: 0}
+                datasourceUid: prometheus
+                model: {expr: 'min(flowercore_github_runner_online{runner=~"macmini-.*"}) or vector(0)', instant: true, refId: A}
+              - refId: B
+                relativeTimeRange: {from: 600, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 600, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
           - uid: high-cpu
             title: High CPU (>85%)
             condition: C