monitoring: mirror Sprint 57 coverage rules

2026-06-03 22:46:33 -05:00
parent 404d884863
commit 0ed9b989fa
2 changed files with 177 additions and 5 deletions
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -480,14 +480,16 @@ data:
              - "https://argocd.iamworkin.lan/"
              - "https://intranet.iamworkin.lan/"
              - "https://signage.iamworkin.lan/healthz"   # root 401 auth-gated 2026-06-01; /healthz anon 200
+              - "https://signalcontrol.iamworkin.lan/health" # FlowerCore.SignalControl explicit health route
              - "https://kiosk.iamworkin.lan/"
              - "https://media.iamworkin.lan/"
              - "https://mysql.iamworkin.lan/healthz"   # root 401 auth-gated 2026-06-01; /healthz anon 200
              - "https://php.iamworkin.lan/healthz"     # root 401 auth-gated 2026-06-01; /healthz anon 200
+              - "https://dns.iamworkin.lan/"
              - "https://zabbix.iamworkin.lan/"
+              - "https://flowercore.iamworkin.lan/healthz"
              - "https://desktop.iamworkin.lan/"
              - "https://print.iamworkin.lan/"
-              - "https://dns.iamworkin.lan/"
              - "https://chat.iamworkin.lan/"
              - "https://dist.iamworkin.lan/"
              - "https://dms.iamworkin.lan/"
@@ -496,9 +498,15 @@ data:
              - "https://presentations.iamworkin.lan/"
              - "https://retail.iamworkin.lan/"
              - "https://ttsreader.iamworkin.lan/"
+              - "https://updates.iamworkin.lan/api/v1/manifests/_schema"
              # Explicit healthcheck paths
              - "https://fc-llm-bridge.iamworkin.lan/healthz"
              - "https://acme.iamworkin.lan/health"
+              - "https://replay.iamworkin.lan/healthz"
+              - "https://updatecenter-internal.iamworkin.lan/api/v1/manifests/_schema"
+              - "https://worldbuilder.iamworkin.lan/healthz"
+              # Coverage gaps logged Q-MR-129/Q-MR-130: devices.iamworkin.lan
+              # returns 503 and e2e-test-pma/wpdemo only return 404.
              # NOTE: services intentionally NOT in this probe surface
              #   - grafana.iamworkin.lan: every endpoint (incl. /api/health
              #     and /login) returns 401 behind Traefik basic-auth.
@@ -907,11 +915,14 @@ data:
          # for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min
          # of idle and SNMP times out, so 5m for: would page nightly. A
          # genuine printer outage (jam, disconnected) lasts well over 30m.
+          # Use a range-window expression: instant up{} can go stale/absent
+          # after repeated snmp-exporter 500s. hour() has no labels, so gate with "and on()".
           - alert: EpsonPrinterDown
-            expr: up{job="snmp-printer"} == 0
+            expr: (max_over_time(up{job="snmp-printer"}[35m]) == bool 0) == 1 and on() (hour() >= 13 or hour() < 1)
            for: 30m
            labels:
              severity: warning
+              alert_channel: irc
            annotations:
              summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)"

@@ -1020,7 +1031,9 @@ data:
      - name: kubernetes-state
        rules:
          - alert: KubeContainerRestartingFrequently
-            expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
+            # Exclude github-runner: ephemeral runners register, run one job,
+            # exit cleanly, then restart by design.
+            expr: increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[1h]) > 5
            for: 15m
            labels:
              severity: warning
@@ -1029,7 +1042,9 @@ data:
              description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."

          - alert: KubeContainerCrashLooping
-            expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
+            # Exclude github-runner: ephemeral runners register, run one job,
+            # exit cleanly, then restart by design.
+            expr: increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[15m]) > 3
            for: 5m
            labels:
              severity: critical
@@ -1057,7 +1072,8 @@ data:
              description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."

          - alert: KubeDeploymentReplicasMismatch
-            expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
+            # Exclude github-runner: ephemeral runner deployments flap 0/1 between jobs by design.
+            expr: kube_deployment_spec_replicas{namespace!="github-runner"} != kube_deployment_status_replicas_available{namespace!="github-runner"}
            for: 15m
            labels:
              severity: warning
@@ -3636,6 +3652,38 @@ data:
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
+      - orgId: 1
+        name: SNMP Devices
+        folder: Infrastructure Alerts
+        interval: 1m
+        rules:
+          - uid: epson-printer-down-stale-window
+            title: EpsonPrinterDown
+            condition: C  # fire on the threshold stage (refId C) below
+            for: 30m
+            noDataState: OK  # query A is empty when the printer answered or outside the hour() gate
+            execErrState: OK
+            annotations:
+              summary: Epson ET-3750 SNMP unreachable
+              description: The Epson ET-3750 snmp-printer target has reported only failed scrapes for at least 35 minutes.
+              runbook: "1. Check if printer is intentionally powered off 2. If printing needed: press power button on printer 3. Ping 10.0.58.107 after wake-up 4. Check WiFi on printer LCD if still unreachable"
+            labels:
+              severity: info
+              service: printer
+              alert_channel: irc
+            data:
+              - refId: A
+                relativeTimeRange: {from: 2100, to: 0}  # 2100 s = 35 m, same span as the [35m] window
+                datasourceUid: prometheus
+                model: {expr: '(max_over_time(up{job="snmp-printer"}[35m]) == bool 0) == 1 and on() (hour() >= 13 or hour() < 1)', instant: true, refId: A}  # on(): hour() carries no labels, a bare "and" never matches
+              - refId: B
+                relativeTimeRange: {from: 2100, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 2100, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
      - orgId: 1
        name: CI Runners
        folder: CI Alerts