diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index 461ced3..412380d 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -668,31 +668,39 @@ data: summary: "RemoteDesktop /metrics scrape returning no data" description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape target is misconfigured or the web deployment stopped exporting metrics. Zabbix template carries the same 10m no-data trigger for cross-monitor parity." - # fc_desktop_pool_depleted is emitted as state-as-label: one series - # per template per status (Ready/Warming/BelowDesiredSize/Disabled). - # The publisher does NOT reset old series to 0 when a template - # transitions states — it just emits a new series with new labels. - # So a template that was Warming yesterday still has its - # Warming-labeled series stuck at 1 even when current status=Ready. - # Filter on the Critical alert_level (= BelowDesiredSize) so only - # genuine current-state depletion fires. Same fix on Deficit below. + # PUBLISHER QUIRK: fc_desktop_pool_depleted / _deficit emit one + # series per template per status (Ready/Warming/BelowDesiredSize/ + # Disabled), and the historical series for non-current statuses + # stay at their last value. So just `_depleted > 0` fires forever + # on any template that ever entered a bad state. + # + # SAFE PATTERN: alert only when the canonical "Ready" status + # gauge does NOT report ready=1 for the enabled template. This + # is the publisher's own canary — _ready{status="Ready"}==1 is + # always the current "everything is fine" signal. - alert: RemoteDesktopPoolDepleted - expr: fc_desktop_pool_depleted{alert_level="Critical",enabled="true"} > 0 + expr: | + group by(template) (fc_desktop_pool_ready{enabled="true"}) + unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1) for: 5m labels: severity: warning annotations: summary: "RemoteDesktop pool depleted ({{ $labels.template }})" - description: "Pool for template {{ $labels.template }} (status={{ $labels.status }}) has been depleted for 5 minutes. New launches will cold-start. Operator should check for pod-scheduling failures, image pull issues, or exhausted node capacity before warm-pool parity is expected back." + description: "Pool for template {{ $labels.template }} has no Ready warm pod for 5 minutes. New launches will cold-start. Check pod-scheduling failures, image pull issues, or exhausted node capacity." + # Same pattern, but only fires when template explicitly reports + # a sustained Warning-level alert state (current-status series). - alert: RemoteDesktopPoolDeficitSustained - expr: fc_desktop_pool_deficit{alert_level=~"Warning|Critical",enabled="true"} > 0 + expr: | + fc_desktop_pool_deficit{enabled="true",alert_level="Warning"} > 0 + unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1) for: 10m labels: severity: info annotations: summary: "RemoteDesktop pool {{ $labels.template }} below desired for 10m" - description: "Pool {{ $labels.template }} (status={{ $labels.status }}) has a persistent deficit of {{ $value }} warm pods. The operator is reconciling but can't reach desired size — likely an image pull, NFS affinity, or claim-init issue." + description: "Pool {{ $labels.template }} has a persistent deficit of {{ $value }} warm pods AND no Ready series. Likely image pull, NFS affinity, or claim-init issue." - alert: RemoteDesktopSessionChurnSpike expr: sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20