Compare commits

..

1 Commits

Author SHA1 Message Date
Codex
5cf665e77d monitoring: add signage marquee Grafana alerts 2026-05-08 19:19:32 -05:00
2 changed files with 97 additions and 24 deletions

View File

@@ -396,15 +396,10 @@ spec:
# Confirmed via debug pod: PVC content IS a real bootable ISO9660 # Confirmed via debug pod: PVC content IS a real bootable ISO9660
# (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the # (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the
# only bug was boot priority. # only bug was boot priority.
# 2026-05-08 PM: cdrom bus flipped sata→scsi for windows-iso to address
# the OVMF SATA-CDROM read timeout (`BdsDxe: failed to start Boot0001 ...
# Time out`). The SCSI CDROM uses virtio-scsi controller which has a
# longer read window and works cleanly on Filesystem-backed PVCs.
# See diagnostic chain in HANDOFF.md / CODEX-STATUS.md "OPEN — ci1".
- name: windows-iso - name: windows-iso
bootOrder: 1 bootOrder: 1
cdrom: cdrom:
bus: scsi bus: sata
- name: rootdisk - name: rootdisk
bootOrder: 2 bootOrder: 2
disk: disk:
@@ -435,25 +430,17 @@ spec:
persistentVolumeClaim: persistentVolumeClaim:
claimName: ci1-rootdisk claimName: ci1-rootdisk
- name: windows-iso - name: windows-iso
# 2026-05-08 PM: REVERTED from NFS Path B back to the original CDI # Path B (2026-05-08): mount ISO from Synology NFS instead of
# Longhorn Filesystem PVC. NFS Path B (commit fc2aca0) failed at the # Longhorn Filesystem PVC. The Filesystem-PVC path was confirmed to
# storage layer because the Synology export `/volume1/ISOs` denies # contain a valid bootable ISO9660 image but caused OVMF's
# non-root client UIDs at the directory level (qemu uid 107 cannot # SATA-CDROM read window to time out:
# `ls /iso/` even with file mode 0777). Confirmed via uid-107 + # BdsDxe: failed to start Boot0001 ... Time out
# uid-0 busybox probe pods on rke2-agent2 — same export-only-root # Block-mode DataVolume was attempted as Path A but blocked by CDI
# pattern as `/volume1/kubernetes` documented in # v1.65.0's upload pod capability drop. NFS-mounted ISO bypasses
# `feedback_synology_nfs_kubernetes_export_root_only`. Memory: # both issues. See win2025-iso-nfs-pv.yaml header for full rationale
# `feedback_synology_iso_export_root_only_uid_107_denied.md`. # and Synology layout.
#
# The Longhorn PVC `windows-server-2025-iso` (CDI Filesystem mode,
# 10Gi) was confirmed to contain valid ISO bytes that uid 107 CAN
# read (mode 0660 root:107). The OVMF SATA-CDROM read timeout from
# the original Path A is now addressed by the `bus: scsi` swap on
# the disks block above. The NFS PVC + PV are RETAINED on disk so
# the Path B state is recoverable; they can be pruned in a
# follow-up commit once SCSI boot is proven.
persistentVolumeClaim: persistentVolumeClaim:
claimName: windows-server-2025-iso claimName: windows-server-2025-iso-nfs
- name: virtio-drivers - name: virtio-drivers
containerDisk: containerDisk:
# Pinned to v1.8.2 (latest stable as of 2026-05-08). # Pinned to v1.8.2 (latest stable as of 2026-05-08).

View File

@@ -3362,6 +3362,92 @@ data:
relativeTimeRange: {from: 120, to: 0} relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__ datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C} model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
# Alert rule group for the signage marquee renderer metrics, evaluated
# every 1m. All three rules share the same three-step pipeline:
#   A = instant PromQL query against the `prometheus` datasource
#   B = reduce A to its last value
#   C = threshold on B (C is the alert condition)
# Every rule sets noDataState and execErrState to OK, so (per Grafana's
# documented semantics for these fields) missing series or a failing query
# leave the rule in the OK state instead of firing.
# NOTE(review): the `alert_channel: irc` label is presumably what routes
# these to IRC #alerts via a notification policy; the policy is not visible
# here, so confirm.
- orgId: 1
  name: Signage Marquee
  folder: AI Stack Alerts
  interval: 1m
  rules:
    # Dropped-frame percentage per (renderer, node_id, phase): rate of
    # dropped frames divided by rate of latency-histogram observations,
    # times 100. Fires after the value stays above 5 for 5m.
    - uid: marquee-dropped-frames-high
      title: MarqueeDroppedFramesHigh
      condition: C
      for: 5m
      noDataState: OK
      execErrState: OK
      annotations:
        summary: Marquee dropped-frame rate above 5%
        description: "Dropped frames exceeded the IR-21 budget for a renderer/phase/node tuple. Grafana owns alert delivery to IRC #alerts; Prometheus rules remain only the visibility source."
        runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Filter renderer/node/phase 3. Compare latest AAT baseline diff 4. Restart only the affected player if the issue is node-local"
      labels:
        severity: warning
        service: signage
        alert_channel: irc
      data:
        - refId: A
          relativeTimeRange: {from: 300, to: 0}
          datasourceUid: prometheus
          model:
            expr: '(sum by (renderer, node_id, phase) (rate(marquee_dropped_frames_total[5m])) / sum by (renderer, node_id, phase) (rate(marquee_render_latency_ms_count[5m]))) * 100'
            instant: true
            refId: A
        - refId: B
          relativeTimeRange: {from: 300, to: 0}
          datasourceUid: __expr__
          model:
            type: reduce
            expression: A
            reducer: last
            refId: B
        - refId: C
          relativeTimeRange: {from: 300, to: 0}
          datasourceUid: __expr__
          model:
            type: threshold
            expression: B
            conditions:
              - evaluator:
                  params: [5]
                  type: gt
            refId: C
    # 99th-percentile render latency per (renderer, node_id, phase),
    # computed from the latency histogram buckets over 5m. Fires after the
    # value stays above 16 for 5m (milliseconds, going by the metric name).
    - uid: marquee-render-latency-p99-high
      title: MarqueeRenderLatencyP99High
      condition: C
      for: 5m
      noDataState: OK
      execErrState: OK
      annotations:
        summary: Marquee render latency p99 above 16ms
        description: "Renderer p99 latency exceeded the Pi-class 16ms budget. Grafana delivers this alert to IRC #alerts."
        runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Check render latency p99 by renderer/node/phase 3. Compare with dropped frames and node CPU 4. If isolated to WPF, capture current Player.Wpf frame set before restart"
      labels:
        severity: warning
        service: signage
        alert_channel: irc
      data:
        - refId: A
          relativeTimeRange: {from: 300, to: 0}
          datasourceUid: prometheus
          model:
            expr: 'histogram_quantile(0.99, sum by (renderer, node_id, phase, le) (rate(marquee_render_latency_ms_bucket[5m])))'
            instant: true
            refId: A
        - refId: B
          relativeTimeRange: {from: 300, to: 0}
          datasourceUid: __expr__
          model:
            type: reduce
            expression: A
            reducer: last
            refId: B
        - refId: C
          relativeTimeRange: {from: 300, to: 0}
          datasourceUid: __expr__
          model:
            type: threshold
            expression: B
            conditions:
              - evaluator:
                  params: [16]
                  type: gt
            refId: C
    # Relative drift of the median animation duration from its target,
    # per (renderer, phase); node_id is aggregated away in this rule.
    # The value is a fraction, not a percentage, so the 0.1 threshold
    # corresponds to the 10% in the summary. Uses a 15m rate window
    # (matching the 900s query range) and must hold for 10m.
    - uid: marquee-animation-duration-drift
      title: MarqueeAnimationDurationDrift
      condition: C
      for: 10m
      noDataState: OK
      execErrState: OK
      annotations:
        summary: Marquee animation duration drift above 10%
        description: "Observed cycle duration has drifted more than 10% from target for a renderer/phase pair. Grafana delivers this alert to IRC #alerts."
        runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Compare observed vs target duration 3. Check recent theme/preset changes 4. Re-run MarqueeHolidayBrandTrajectoryTests before promoting a baseline"
      labels:
        severity: warning
        service: signage
        alert_channel: irc
      data:
        - refId: A
          relativeTimeRange: {from: 900, to: 0}
          datasourceUid: prometheus
          model:
            expr: 'abs((histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m]))) - avg by (renderer, phase) (marquee_animation_duration_target_ms)) / avg by (renderer, phase) (marquee_animation_duration_target_ms))'
            instant: true
            refId: A
        - refId: B
          relativeTimeRange: {from: 900, to: 0}
          datasourceUid: __expr__
          model:
            type: reduce
            expression: A
            reducer: last
            refId: B
        - refId: C
          relativeTimeRange: {from: 900, to: 0}
          datasourceUid: __expr__
          model:
            type: threshold
            expression: B
            conditions:
              - evaluator:
                  params: [0.1]
                  type: gt
            refId: C
- orgId: 1 - orgId: 1
name: Infrastructure name: Infrastructure
folder: AI Stack Alerts folder: AI Stack Alerts