From 4b777b16ac9d1ed1f7cd78271243898fc16e354c Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 6 May 2026 16:01:44 -0500 Subject: [PATCH] monitoring: mirror fc-signage-marquee alert group into noc-monitoring K8s target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror the fc-signage-marquee group from FlowerCore.Notes/scripts/monitoring/alerts.yml into the K8s migration target apps/monitoring/noc-monitoring.yaml so that a future migration of the noc1 Podman monitoring stack into RKE2 inherits the marquee alert ruleset automatically. Three rules added: - MarqueeDroppedFramesHigh (5% / 5min / warning) - MarqueeRenderLatencyP99High (16ms / 10min / warning) - MarqueeAnimationDurationDrift (10% / 15min / info) All three are gated with `unless on() absent_over_time(metric[7d])` so they don't fire during the metric-not-yet-published window before Track 3 IR-21 source IMPL ships the MarqueeMeter into Common + Web + WPF. The live source of truth (the noc1 Podman Prometheus reads from /opt/monitoring/prometheus/alerts.yml) was updated and reloaded in the same session — Notes commit 300daa0 carries the matching alerts.yml + Grafana fc-signage-dashboard.json change. Per feedback_monitoring_k8s_target_vs_live_podman: this file is the forward-looking K8s migration target, NOT what the live Podman Prometheus reads. ArgoCD-syncing this file does NOT push alerts to the live monitoring stack.
Companion to: - FlowerCore.Notes 300daa0 (live alerts.yml + Grafana panels deployed) - docs/signage/marquee-performance-telemetry-design.md (Track 3 IR-21 spec) - docs/signage/marquee-animation-phases.md (Track 6 13-phase coverage matrix) Memory: project_marquee_vr_promotion_landed_2026_05_06 Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/monitoring/noc-monitoring.yaml | 66 +++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index 8efd051..6c63a67 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -1024,6 +1024,72 @@ data: summary: "Longhorn node {{ $labels.node }} not Ready" description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers." + # ============================================================ + # FC Signage Marquee Performance — Track 3 + 8 (2026-05-06) + # Mirrored from the live ruleset in FlowerCore.Notes/scripts/monitoring/alerts.yml. + # Source-of-truth for the live Podman Prometheus on noc1 is the + # Notes file; this K8s ConfigMap exists so a future migration to + # in-cluster Prometheus inherits the ruleset automatically. + # See feedback_monitoring_k8s_target_vs_live_podman.
+      # ============================================================
+      - name: fc-signage-marquee  # marquee render-performance alert group
+        rules:  # NOTE(review): the [7d] absent_over_time guards look redundant (rate() of a missing metric is empty) - confirm
+          - alert: MarqueeDroppedFramesHigh  # 5m dropped-frame rate over render-latency sample rate (presumably one sample per frame - confirm)
+            expr: |
+              (
+                sum by (renderer, phase, node_id) (rate(marquee_dropped_frames_total[5m]))
+                /
+                sum by (renderer, phase, node_id) (rate(marquee_render_latency_ms_count[5m]))
+              ) > 0.05
+              unless on()
+                absent_over_time(marquee_dropped_frames_total[7d])
+            for: 5m  # ratio must stay above 0.05 for 5m before the alert fires
+            labels:
+              severity: warning
+              service: signage
+              alert_channel: irc
+            annotations:
+              summary: "Marquee dropped-frame rate >5% on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
+              description: "Renderer {{ $labels.renderer }} on {{ $labels.node_id }} drops >5% of frames during {{ $labels.phase }}. Animation visibly stuttery."
+
+          - alert: MarqueeRenderLatencyP99High  # p99 of the render-latency histogram (5m rate), per renderer/phase/node_id
+            expr: |
+              histogram_quantile(
+                0.99,
+                sum by (renderer, phase, node_id, le) (rate(marquee_render_latency_ms_bucket[5m]))
+              ) > 16
+              unless on()
+                absent_over_time(marquee_render_latency_ms_bucket[7d])
+            for: 10m  # p99 must stay above 16 (ms) for 10m before the alert fires
+            labels:
+              severity: warning
+              service: signage
+              alert_channel: irc
+            annotations:
+              summary: "Marquee render latency p99 > 16ms on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
+              description: "Per-frame render latency p99 has exceeded the Pi-class 16ms budget for 10 minutes."
+
+          - alert: MarqueeAnimationDurationDrift  # relative deviation of median cycle duration from the per-phase average target
+            expr: |
+              abs(
+                histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m])))
+                -
+                on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
+              )
+              /
+              on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
+              > 0.10
+              unless on()
+                absent_over_time(marquee_animation_duration_ms_bucket[7d])
+            for: 15m  # deviation must stay above 0.10 for 15m; NOTE(review): the [7d] guard in expr looks redundant - confirm
+            labels:
+              severity: info
+              service: signage
+              alert_channel: irc
+            annotations:
+              summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
+              description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
+
 # =============================================================================
 # ConfigMap: Blackbox Exporter Configuration
 # =============================================================================