Adds a "Signage Marquee" alert group (three rules) to apps/monitoring/noc-monitoring.yaml.
Every rule chains query A -> reduce B (last) -> threshold C and uses C as its condition.
  MarqueeDroppedFramesHigh       dropped-frame percentage > 5, held for 5m
  MarqueeRenderLatencyP99High    p99 of marquee_render_latency_ms > 16, held for 5m
  MarqueeAnimationDurationDrift  relative drift from target duration > 0.1, held for 10m
NOTE(review): leading indentation below was reconstructed from a whitespace-collapsed
copy of this patch; confirm it against the target file before applying.

diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml
index 6c63a67..1200189 100644
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -3362,6 +3362,92 @@ data:
                 relativeTimeRange: {from: 120, to: 0}
                 datasourceUid: __expr__
                 model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
+      - orgId: 1
+        name: Signage Marquee
+        folder: AI Stack Alerts
+        interval: 1m
+        rules:
+          - uid: marquee-dropped-frames-high
+            title: MarqueeDroppedFramesHigh
+            condition: C
+            for: 5m
+            noDataState: OK
+            execErrState: OK
+            annotations:
+              summary: Marquee dropped-frame rate above 5%
+              description: "Dropped frames exceeded the IR-21 budget for a renderer/phase/node tuple. Grafana owns alert delivery to IRC #alerts; Prometheus rules remain only the visibility source."
+              runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Filter renderer/node/phase 3. Compare latest AAT baseline diff 4. Restart only the affected player if the issue is node-local"
+            labels:
+              severity: warning
+              service: signage
+              alert_channel: irc
+            data:
+              - refId: A
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model: {expr: '(sum by (renderer, node_id, phase) (rate(marquee_dropped_frames_total[5m])) / sum by (renderer, node_id, phase) (rate(marquee_render_latency_ms_count[5m]))) * 100', instant: true, refId: A}
+              - refId: B
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [5], type: gt}}], refId: C}
+          - uid: marquee-render-latency-p99-high
+            title: MarqueeRenderLatencyP99High
+            condition: C
+            for: 5m
+            noDataState: OK
+            execErrState: OK
+            annotations:
+              summary: Marquee render latency p99 above 16ms
+              description: "Renderer p99 latency exceeded the Pi-class 16ms budget. Grafana delivers this alert to IRC #alerts."
+              runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Check render latency p99 by renderer/node/phase 3. Compare with dropped frames and node CPU 4. If isolated to WPF, capture current Player.Wpf frame set before restart"
+            labels:
+              severity: warning
+              service: signage
+              alert_channel: irc
+            data:
+              - refId: A
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model: {expr: 'histogram_quantile(0.99, sum by (renderer, node_id, phase, le) (rate(marquee_render_latency_ms_bucket[5m])))', instant: true, refId: A}
+              - refId: B
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [16], type: gt}}], refId: C}
+          - uid: marquee-animation-duration-drift
+            title: MarqueeAnimationDurationDrift
+            condition: C
+            for: 10m
+            noDataState: OK
+            execErrState: OK
+            annotations:
+              summary: Marquee animation duration drift above 10%
+              description: "Observed cycle duration has drifted more than 10% from target for a renderer/phase pair. Grafana delivers this alert to IRC #alerts."
+              runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Compare observed vs target duration 3. Check recent theme/preset changes 4. Re-run MarqueeHolidayBrandTrajectoryTests before promoting a baseline"
+            labels:
+              severity: warning
+              service: signage
+              alert_channel: irc
+            data:
+              - refId: A
+                relativeTimeRange: {from: 900, to: 0}
+                datasourceUid: prometheus
+                model: {expr: 'abs((histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m]))) - avg by (renderer, phase) (marquee_animation_duration_target_ms)) / avg by (renderer, phase) (marquee_animation_duration_target_ms))', instant: true, refId: A}
+              - refId: B
+                relativeTimeRange: {from: 900, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 900, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.1], type: gt}}], refId: C}
       - orgId: 1
         name: Infrastructure
         folder: AI Stack Alerts