Compare commits

..

1 Commits

Author SHA1 Message Date
Codex
5cf665e77d monitoring: add signage marquee Grafana alerts 2026-05-08 19:19:32 -05:00
5 changed files with 106 additions and 117 deletions

View File

@@ -58,7 +58,7 @@ spec:
nodeName: rke2-server nodeName: rke2-server
containers: containers:
- name: web - name: web
image: localhost/fc-updater-web:v20260509-4162dca-authgate image: localhost/fc-updater-web:v20260508-pub3-deepening-2bdf108
imagePullPolicy: Never imagePullPolicy: Never
ports: ports:
- containerPort: 8080 - containerPort: 8080

View File

@@ -377,22 +377,7 @@ spec:
firmware: firmware:
bootloader: bootloader:
efi: efi:
# 2026-05-08: SecureBoot=false during initial install. With SecureBoot secureBoot: true
# enabled, OVMF's BdsDxe times out reading Boot0001 from the SCSI
# CDROM ("BdsDxe: failed to start Boot0001 ... Time out") before the
# EFI bootloader signature can verify against the OVMF VARS trust DB.
# KubeVirt's `/usr/share/OVMF/OVMF_VARS.secboot.fd` template doesn't
# appear to include the Microsoft KEK/DB by default, so signed
# Windows EFI bootloaders fail validation. Disabling SecureBoot lets
# OVMF skip the chain check and boot directly. This is acceptable for
# a CI runner — TPM 2.0 is still emulated (`tpm: {}` below) so
# BitLocker / Hyper-V / WSL still work.
# When the operator wants SecureBoot back, the path is:
# 1. Custom-build OVMF_VARS.fd with Microsoft KEK/DB enrolled
# 2. Mount it into the VM via firmware.bootloader.efi.persistent
# 3. Set secureBoot: true again
# Tracked separately from the install unblock.
secureBoot: false
devices: devices:
tpm: {} # Non-persistent vTPM — sufficient for runner; no BitLocker tpm: {} # Non-persistent vTPM — sufficient for runner; no BitLocker
disks: disks:
@@ -411,22 +396,10 @@ spec:
# Confirmed via debug pod: PVC content IS a real bootable ISO9660 # Confirmed via debug pod: PVC content IS a real bootable ISO9660
# (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the # (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the
# only bug was boot priority. # only bug was boot priority.
# 2026-05-08 PM: cdrom bus SCSI + containerDisk delivery. This
# combination boots qemu cleanly and reaches OVMF, but OVMF
# BdsDxe still hits "starting Boot0001 ... Time out" on the
# cdrom — see HANDOFF.md / CODEX-STATUS.md "OPEN — ci1" for the
# full diagnostic chain. virtio-blk disk swap was attempted as a
# workaround but introduced a separate QEMU rootdisk flock issue
# without fixing the underlying OVMF cdrom problem; reverted.
# Operator decision needed for next architectural step (OVMF
# custom build with extended timeout, KubeVirt version bump,
# Hyper-V/VirtualBox-and-export, or BIOS legacy boot). The
# containerDisk distribution pipeline (build/save/scp/ctr import)
# is proven and ready to reuse for any of those.
- name: windows-iso - name: windows-iso
bootOrder: 1 bootOrder: 1
cdrom: cdrom:
bus: scsi bus: sata
- name: rootdisk - name: rootdisk
bootOrder: 2 bootOrder: 2
disk: disk:
@@ -457,40 +430,17 @@ spec:
persistentVolumeClaim: persistentVolumeClaim:
claimName: ci1-rootdisk claimName: ci1-rootdisk
- name: windows-iso - name: windows-iso
# 2026-05-08 PM (Path C, CONTAINERDISK): the ISO is now packaged as # Path B (2026-05-08): mount ISO from Synology NFS instead of
# a KubeVirt containerDisk OCI image baked from # Longhorn Filesystem PVC. The Filesystem-PVC path was confirmed to
# `FROM scratch ; ADD --chown=107:107 disk.img /disk/disk.img`. # contain a valid bootable ISO9660 image but caused OVMF's
# The qemu user (uid 107) reads the ISO directly from a tmpfs view # SATA-CDROM read window to time out:
# of the OCI layer, bypassing both: # BdsDxe: failed to start Boot0001 ... Time out
# - Synology NFS export ACL (Path B failed: uid 107 denied at # Block-mode DataVolume was attempted as Path A but blocked by CDI
# directory level even with mode 0777, see memory # v1.65.0's upload pod capability drop. NFS-mounted ISO bypasses
# feedback_synology_iso_export_root_only_uid_107_denied) # both issues. See win2025-iso-nfs-pv.yaml header for full rationale
# - OVMF cdrom read-window timeout (Path A and Path B's SCSI # and Synology layout.
# retry both hit `BdsDxe: failed to start Boot0001 ... Time out` persistentVolumeClaim:
# when the cdrom was backed by a PVC the storage controller claimName: windows-server-2025-iso-nfs
# couldn't satisfy reads from fast enough).
#
# Image build (one-time, per ISO version):
# 1. Copy ISO to disk.img, write Dockerfile
# 2. podman build --tag localhost/win-server-2025:1.0 . (on noc1)
# 3. podman save -o win-server-2025-1.0.tar localhost/win-server-2025:1.0
# 4. SCP tar to all 3 RKE2 nodes (rke2-server, rke2-agent1, rke2-agent2)
# 5. sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock \
# -n k8s.io images import /tmp/win-server-2025-1.0.tar
# Standard FC pattern per `feedback_rke2_localhost_imagepullpolicy`.
#
# When a new Windows ISO version ships, bump the tag (1.1, 1.2, ...),
# rebuild + redistribute, and update the image: line below in a new
# commit. KubeVirt picks up the new image via a VM restart.
#
# The legacy NFS PVC + PV (apps/kubevirt-vms/win2025-iso-nfs-pv.yaml)
# and CDI Longhorn PVC (`windows-server-2025-iso`) are RETAINED for
# this commit so the prior states are recoverable. Once the
# containerDisk path proves on a successful Windows install, both
# legacy artifacts can be pruned in a follow-up commit.
containerDisk:
image: localhost/win-server-2025:1.0
imagePullPolicy: Never
- name: virtio-drivers - name: virtio-drivers
containerDisk: containerDisk:
# Pinned to v1.8.2 (latest stable as of 2026-05-08). # Pinned to v1.8.2 (latest stable as of 2026-05-08).

View File

@@ -974,39 +974,6 @@ data:
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch" summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC." description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."
# Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
# cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
# outage (21h) hit because no alert fired on the rising multus working
# set — only downstream blackbox / Traefik / service alerts. With
# 1Gi limit (bluejay-infra@eb8693e), 80% = ~800MiB; steady-state
# runs ~150-250MiB so this only fires when an avalanche starts.
- alert: MultusMemoryPressure
expr: |
container_memory_working_set_bytes{container="kube-multus"}
/ container_spec_memory_limit_bytes{container="kube-multus"} > 0.8
for: 5m
labels:
severity: critical
alert_channel: thermal_print
annotations:
summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m"
description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)."
# Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the
# operator-leak avalanche pattern BEFORE it cascades into a multus
# CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder)
# emitting pods without ownerReferences will accumulate them when
# the operator crashes. >25 pending pods in any namespace for 30m
# is the signal to investigate the reconciler.
- alert: NamespacePendingPodBacklog
expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25
for: 30m
labels:
severity: warning
annotations:
summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m"
description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade."
# Longhorn storage health alerts. Required: longhorn scrape job # Longhorn storage health alerts. Required: longhorn scrape job
# (added 2026-04-26 — see scrape_configs above). The K8s events # (added 2026-04-26 — see scrape_configs above). The K8s events
# for "snapshot becomes not ready to use" are transient lifecycle # for "snapshot becomes not ready to use" are transient lifecycle
@@ -3395,6 +3362,92 @@ data:
relativeTimeRange: {from: 120, to: 0} relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__ datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C} model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
- orgId: 1
name: Signage Marquee
folder: AI Stack Alerts
interval: 1m
rules:
- uid: marquee-dropped-frames-high
title: MarqueeDroppedFramesHigh
condition: C
for: 5m
noDataState: OK
execErrState: OK
annotations:
summary: Marquee dropped-frame rate above 5%
description: "Dropped frames exceeded the IR-21 budget for a renderer/phase/node tuple. Grafana owns alert delivery to IRC #alerts; Prometheus rules remain only the visibility source."
runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Filter renderer/node/phase 3. Compare latest AAT baseline diff 4. Restart only the affected player if the issue is node-local"
labels:
severity: warning
service: signage
alert_channel: irc
data:
- refId: A
relativeTimeRange: {from: 300, to: 0}
datasourceUid: prometheus
model: {expr: '(sum by (renderer, node_id, phase) (rate(marquee_dropped_frames_total[5m])) / sum by (renderer, node_id, phase) (rate(marquee_render_latency_ms_count[5m]))) * 100', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__
model: {type: reduce, expression: A, reducer: last, refId: B}
- refId: C
relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [5], type: gt}}], refId: C}
- uid: marquee-render-latency-p99-high
title: MarqueeRenderLatencyP99High
condition: C
for: 5m
noDataState: OK
execErrState: OK
annotations:
summary: Marquee render latency p99 above 16ms
description: "Renderer p99 latency exceeded the Pi-class 16ms budget. Grafana delivers this alert to IRC #alerts."
runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Check render latency p99 by renderer/node/phase 3. Compare with dropped frames and node CPU 4. If isolated to WPF, capture current Player.Wpf frame set before restart"
labels:
severity: warning
service: signage
alert_channel: irc
data:
- refId: A
relativeTimeRange: {from: 300, to: 0}
datasourceUid: prometheus
model: {expr: 'histogram_quantile(0.99, sum by (renderer, node_id, phase, le) (rate(marquee_render_latency_ms_bucket[5m])))', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__
model: {type: reduce, expression: A, reducer: last, refId: B}
- refId: C
relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [16], type: gt}}], refId: C}
- uid: marquee-animation-duration-drift
title: MarqueeAnimationDurationDrift
condition: C
for: 10m
noDataState: OK
execErrState: OK
annotations:
summary: Marquee animation duration drift above 10%
description: "Observed cycle duration has drifted more than 10% from target for a renderer/phase pair. Grafana delivers this alert to IRC #alerts."
runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Compare observed vs target duration 3. Check recent theme/preset changes 4. Re-run MarqueeHolidayBrandTrajectoryTests before promoting a baseline"
labels:
severity: warning
service: signage
alert_channel: irc
data:
- refId: A
relativeTimeRange: {from: 900, to: 0}
datasourceUid: prometheus
model: {expr: 'abs((histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m]))) - avg by (renderer, phase) (marquee_animation_duration_target_ms)) / avg by (renderer, phase) (marquee_animation_duration_target_ms))', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 900, to: 0}
datasourceUid: __expr__
model: {type: reduce, expression: A, reducer: last, refId: B}
- refId: C
relativeTimeRange: {from: 900, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.1], type: gt}}], refId: C}
- orgId: 1 - orgId: 1
name: Infrastructure name: Infrastructure
folder: AI Stack Alerts folder: AI Stack Alerts

View File

@@ -188,24 +188,13 @@ spec:
- name: kube-multus - name: kube-multus
image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
command: [ "/usr/src/multus-cni/bin/multus-daemon" ] command: [ "/usr/src/multus-cni/bin/multus-daemon" ]
# 2026-05-11: upstream default of 50Mi memory limit OOM-cascades when
# an operator-owned namespace accumulates >100 pending pods retrying
# CNI ADD. RemoteDesktop emitted 219 orphan rd-browser-only pods
# (missing OwnerReferences), kubelet's CNI ADD avalanche pushed multus
# over 50Mi, OOMKilled, restarted with even bigger backlog → loop.
# 21h cluster outage. See FlowerCore.Notes:
# feedback_multus_50mi_limit_oom_orphan_pod_avalanche.md
# 1Gi limit / 512Mi request comfortably handles a 200+ pod CNI
# catchup burst on 64GB nodes (nodes are <25% used in steady-state).
# Drop back toward 256Mi only after MultusMemoryPressure alert
# proves steady-state working set sits well below 200Mi.
resources: resources:
requests: requests:
cpu: "100m" cpu: "100m"
memory: "512Mi" memory: "50Mi"
limits: limits:
cpu: "100m" cpu: "100m"
memory: "1Gi" memory: "50Mi"
securityContext: securityContext:
privileged: true privileged: true
terminationMessagePolicy: FallbackToLogsOnError terminationMessagePolicy: FallbackToLogsOnError

View File

@@ -127,13 +127,10 @@ spec:
initContainers: initContainers:
- name: fix-data-perms - name: fix-data-perms
image: busybox:latest image: busybox:latest
# Must run as root to chown the hostPath /tmp/tts-audio that may be # Also chown /shared-tts (hostPath /tmp/tts-audio) so the non-root
# root-owned after node reboot. Pod-level runAsNonRoot:true would # app user (uid 1654) can write Piper .sln16 files that Asterisk
# otherwise inherit and chown would fail with EPERM (see Notes memory # reads at /var/lib/asterisk/sounds/tts. World-readable (755) is
# feedback_hostpath_initcontainer_chown_perms). # fine — Asterisk runs as a different uid in the other pod.
securityContext:
runAsUser: 0
runAsNonRoot: false
command: ["sh", "-c", "chown -R 1654:1654 /data && chown 1654:1654 /shared-tts && chmod 0755 /shared-tts"] command: ["sh", "-c", "chown -R 1654:1654 /data && chown 1654:1654 /shared-tts && chmod 0755 /shared-tts"]
volumeMounts: volumeMounts:
- name: telephony-data - name: telephony-data