monitoring: add signage marquee Grafana alerts

2026-05-08 19:19:32 -05:00
7 changed files with 108 additions and 291 deletions
--- a/apps/fc-redis/fc-redis.yaml
+++ b/apps/fc-redis/fc-redis.yaml
@@ -1,171 +0,0 @@
-# fc-redis — SignalR backplane for cross-product event bus
-#
-# Lands per Q-SO-1 resolution (2026-05-11 PM): SignalR backplane in Phase A,
-# not Phase C as originally drafted. Operator directive: "Redis can be
-# deployed just fine as it's another FlowerCore technology we'll want to
-# manage."
-#
-# Phase A scope (this file):
-#   - Single Redis 7.x Alpine pod
-#   - 1Gi Longhorn RWO PVC for AOF persistence
-#   - ClusterIP Service at `redis.fc-redis.svc.cluster.local:6379`
-#   - No AUTH (in-cluster only; not exposed externally)
-#   - No IngressRoute (backplane is server-to-server only)
-#
-# Consumers (Phase A IMPL across FC services):
-#   - FlowerCore.Signage.Web (OpsConsoleHub)
-#   - FlowerCore.Scoreboard.Web (ScoreboardHub)
-#   - FlowerCore.SignalControl.Web
-#   - FlowerCore.DMS.Web
-#   - Any other product joining the cross-product event bus
-#
-# Each consumer adds:
-#   services.AddSignalR()
-#           .AddStackExchangeRedis(
-#               "redis.fc-redis.svc.cluster.local:6379",
-#               opts => opts.Configuration.ChannelPrefix =
-#                   StackExchange.Redis.RedisChannel.Literal("fc-opsconsole"));
-#
-# Phase B / C follow-ons (out of scope here):
-#   - Redis Sentinel for HA (3-node)
-#   - AUTH password from 1Password Connect (rotate via /rotate-password)
-#   - redis_exporter sidecar for Prometheus scrape
-#   - Network policies restricting which namespaces can dial 6379
-#
-# Design: docs/signage/operations-console-phase-2-design.md §3.5
-# Decision: Q-SO-1 (RESOLVED 2026-05-11 PM)
-# Memory: feedback_blooming_ui_pattern_no_iframes
----
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: fc-redis
-  labels:
-    app.kubernetes.io/part-of: flowercore
-    app.kubernetes.io/managed-by: argocd
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: fc-redis-data
-  namespace: fc-redis
-spec:
-  accessModes:
-    - ReadWriteOnce
-  storageClassName: longhorn
-  resources:
-    requests:
-      storage: 1Gi
----
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: fc-redis-config
-  namespace: fc-redis
-data:
-  redis.conf: |
-    # Phase A — minimal config; no AUTH, no replication.
-    bind 0.0.0.0
-    protected-mode no
-    port 6379
-    tcp-backlog 511
-    timeout 0
-    tcp-keepalive 300
-
-    # Persistence: AOF (fsync every second is the standard SignalR-backplane
-    # durability sweet spot — the backplane only needs to survive Redis
-    # restarts, not absolute zero loss).
-    appendonly yes
-    appendfsync everysec
-    auto-aof-rewrite-percentage 100
-    auto-aof-rewrite-min-size 64mb
-
-    # Reasonable defaults — let Redis pick most things.
-    maxmemory-policy allkeys-lru
-    maxmemory 256mb
-
-    # Logging
-    loglevel notice
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: fc-redis
-  namespace: fc-redis
-  labels:
-    app: fc-redis
-spec:
-  replicas: 1
-  strategy:
-    type: Recreate           # RWO PVC; do not do rolling update
-  selector:
-    matchLabels:
-      app: fc-redis
-  template:
-    metadata:
-      labels:
-        app: fc-redis
-    spec:
-      securityContext:
-        runAsNonRoot: true
-        runAsUser: 999       # redis:7-alpine default uid
-        runAsGroup: 999
-        fsGroup: 999
-      containers:
-        - name: redis
-          image: redis:7-alpine
-          imagePullPolicy: IfNotPresent
-          command: ["redis-server", "/etc/redis/redis.conf"]
-          ports:
-            - name: redis
-              containerPort: 6379
-          resources:
-            requests:
-              cpu: "50m"
-              memory: "128Mi"
-            limits:
-              cpu: "500m"
-              memory: "384Mi"
-          volumeMounts:
-            - name: data
-              mountPath: /data
-            - name: config
-              mountPath: /etc/redis
-              readOnly: true
-          livenessProbe:
-            tcpSocket:
-              port: 6379
-            initialDelaySeconds: 5
-            periodSeconds: 10
-          readinessProbe:
-            exec:
-              command: ["redis-cli", "ping"]
-            initialDelaySeconds: 2
-            periodSeconds: 5
-          securityContext:
-            allowPrivilegeEscalation: false
-            readOnlyRootFilesystem: true
-            capabilities:
-              drop: [ALL]
-      volumes:
-        - name: data
-          persistentVolumeClaim:
-            claimName: fc-redis-data
-        - name: config
-          configMap:
-            name: fc-redis-config
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: redis
-  namespace: fc-redis
-spec:
-  type: ClusterIP
-  selector:
-    app: fc-redis
-  ports:
-    - name: redis
-      port: 6379
-      targetPort: 6379
-      protocol: TCP
--- a/apps/fc-updater/fc-updater.yaml
+++ b/apps/fc-updater/fc-updater.yaml
@@ -58,7 +58,7 @@ spec:
      nodeName: rke2-server
      containers:
        - name: web
-          image: localhost/fc-updater-web:v20260509-4162dca-authgate
+          image: localhost/fc-updater-web:v20260508-pub3-deepening-2bdf108
          imagePullPolicy: Never
          ports:
            - containerPort: 8080
--- a/apps/guacamole/guacamole.yaml
+++ b/apps/guacamole/guacamole.yaml
@@ -466,11 +466,11 @@ spec:
  itemPath: vaults/IAmWorkin/items/Guacamole JSON Auth
 ---
 ---
-# 1Password-backed credentials for Mac mini VNC access (Phase 1 <EFBFBD> 2026-04-28)
+# 1Password-backed credentials for Mac mini VNC access (Phase 1 — 2026-04-28)
 # The operator mints Secret 'macmini-vnc-creds' with keys: username, password, VNC Password
 # Note: '1Password' field label 'VNC Password' -> K8s Secret key 'VNC Password' (space retained)
 # Guacamole VNC connection password is sourced from the 'VNC Password' field.
-# Actual IP is 10.0.56.115 (INFRA VLAN) <EFBFBD> the 1P item 'IP' field is kept as backup reference.
+# Actual IP is 10.0.56.115 (INFRA VLAN) — the 1P item 'IP' field is kept as backup reference.
 apiVersion: onepassword.com/v1
 kind: OnePasswordItem
 metadata:
@@ -481,7 +481,6 @@ metadata:
    app.kubernetes.io/part-of: flowercore
 spec:
  itemPath: vaults/IAmWorkin/items/Mac Mini
----
 # Blue Jay Branding Extension (CSS + translations)
 apiVersion: v1
 kind: ConfigMap
--- a/apps/kubevirt-vms/ci1.yaml
+++ b/apps/kubevirt-vms/ci1.yaml
@@ -377,22 +377,7 @@ spec:
        firmware:
          bootloader:
            efi:
-              # 2026-05-08: SecureBoot=false during initial install. With SecureBoot
-              # enabled, OVMF's BdsDxe times out reading Boot0001 from the SCSI
-              # CDROM ("BdsDxe: failed to start Boot0001 ... Time out") before the
-              # EFI bootloader signature can verify against the OVMF VARS trust DB.
-              # KubeVirt's `/usr/share/OVMF/OVMF_VARS.secboot.fd` template doesn't
-              # appear to include the Microsoft KEK/DB by default, so signed
-              # Windows EFI bootloaders fail validation. Disabling SecureBoot lets
-              # OVMF skip the chain check and boot directly. This is acceptable for
-              # a CI runner — TPM 2.0 is still emulated (`tpm: {}` below) so
-              # BitLocker / Hyper-V / WSL still work.
-              # When the operator wants SecureBoot back, the path is:
-              #   1. Custom-build OVMF_VARS.fd with Microsoft KEK/DB enrolled
-              #   2. Mount it into the VM via firmware.bootloader.efi.persistent
-              #   3. Set secureBoot: true again
-              # Tracked separately from the install unblock.
-              secureBoot: false
+              secureBoot: true
        devices:
          tpm: {}             # Non-persistent vTPM — sufficient for runner; no BitLocker
          disks:
@@ -411,22 +396,10 @@ spec:
            # Confirmed via debug pod: PVC content IS a real bootable ISO9660
            # (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the
            # only bug was boot priority.
-            # 2026-05-08 PM: cdrom bus SCSI + containerDisk delivery. This
-            # combination boots qemu cleanly and reaches OVMF, but OVMF
-            # BdsDxe still hits "starting Boot0001 ... Time out" on the
-            # cdrom — see HANDOFF.md / CODEX-STATUS.md "OPEN — ci1" for the
-            # full diagnostic chain. virtio-blk disk swap was attempted as a
-            # workaround but introduced a separate QEMU rootdisk flock issue
-            # without fixing the underlying OVMF cdrom problem; reverted.
-            # Operator decision needed for next architectural step (OVMF
-            # custom build with extended timeout, KubeVirt version bump,
-            # Hyper-V/VirtualBox-and-export, or BIOS legacy boot). The
-            # containerDisk distribution pipeline (build/save/scp/ctr import)
-            # is proven and ready to reuse for any of those.
            - name: windows-iso
              bootOrder: 1
              cdrom:
-                bus: scsi
+                bus: sata
            - name: rootdisk
              bootOrder: 2
              disk:
@@ -457,40 +430,17 @@ spec:
          persistentVolumeClaim:
            claimName: ci1-rootdisk
        - name: windows-iso
-          # 2026-05-08 PM (Path C, CONTAINERDISK): the ISO is now packaged as
-          # a KubeVirt containerDisk OCI image baked from
-          # `FROM scratch ; ADD --chown=107:107 disk.img /disk/disk.img`.
-          # The qemu user (uid 107) reads the ISO directly from a tmpfs view
-          # of the OCI layer, bypassing both:
-          #   - Synology NFS export ACL (Path B failed: uid 107 denied at
-          #     directory level even with mode 0777, see memory
-          #     feedback_synology_iso_export_root_only_uid_107_denied)
-          #   - OVMF cdrom read-window timeout (Path A and Path B's SCSI
-          #     retry both hit `BdsDxe: failed to start Boot0001 ... Time out`
-          #     when the cdrom was backed by a PVC the storage controller
-          #     couldn't satisfy reads from fast enough).
-          #
-          # Image build (one-time, per ISO version):
-          #   1. Copy ISO to disk.img, write Dockerfile
-          #   2. podman build --tag localhost/win-server-2025:1.0 .  (on noc1)
-          #   3. podman save -o win-server-2025-1.0.tar localhost/win-server-2025:1.0
-          #   4. SCP tar to all 3 RKE2 nodes (rke2-server, rke2-agent1, rke2-agent2)
-          #   5. sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock \
-          #        -n k8s.io images import /tmp/win-server-2025-1.0.tar
-          # Standard FC pattern per `feedback_rke2_localhost_imagepullpolicy`.
-          #
-          # When a new Windows ISO version ships, bump the tag (1.1, 1.2, ...),
-          # rebuild + redistribute, and update the image: line below in a new
-          # commit. KubeVirt picks up the new image via a VM restart.
-          #
-          # The legacy NFS PVC + PV (apps/kubevirt-vms/win2025-iso-nfs-pv.yaml)
-          # and CDI Longhorn PVC (`windows-server-2025-iso`) are RETAINED for
-          # this commit so the prior states are recoverable. Once the
-          # containerDisk path proves on a successful Windows install, both
-          # legacy artifacts can be pruned in a follow-up commit.
-          containerDisk:
-            image: localhost/win-server-2025:1.0
-            imagePullPolicy: Never
+          # Path B (2026-05-08): mount ISO from Synology NFS instead of
+          # Longhorn Filesystem PVC. The Filesystem-PVC path was confirmed to
+          # contain a valid bootable ISO9660 image but caused OVMF's
+          # SATA-CDROM read window to time out:
+          #   BdsDxe: failed to start Boot0001 ... Time out
+          # Block-mode DataVolume was attempted as Path A but blocked by CDI
+          # v1.65.0's upload pod capability drop. NFS-mounted ISO bypasses
+          # both issues. See win2025-iso-nfs-pv.yaml header for full rationale
+          # and Synology layout.
+          persistentVolumeClaim:
+            claimName: windows-server-2025-iso-nfs
        - name: virtio-drivers
          containerDisk:
            # Pinned to v1.8.2 (latest stable as of 2026-05-08).
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -974,39 +974,6 @@ data:
              summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
              description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."

-          # Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
-          # cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
-          # outage (21h) hit because no alert fired on the rising multus working
-          # set — only downstream blackbox / Traefik / service alerts. With
-          # 1Gi limit (bluejay-infra@eb8693e), 80% = ~800MiB; steady-state
-          # runs ~150-250MiB so this only fires when an avalanche starts.
-          - alert: MultusMemoryPressure
-            expr: |
-              container_memory_working_set_bytes{container="kube-multus"}
-                / container_spec_memory_limit_bytes{container="kube-multus"} > 0.8
-            for: 5m
-            labels:
-              severity: critical
-              alert_channel: thermal_print
-            annotations:
-              summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m"
-              description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)."
-
-          # Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the
-          # operator-leak avalanche pattern BEFORE it cascades into a multus
-          # CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder)
-          # emitting pods without ownerReferences will accumulate them when
-          # the operator crashes. >25 pending pods in any namespace for 30m
-          # is the signal to investigate the reconciler.
-          - alert: NamespacePendingPodBacklog
-            expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25
-            for: 30m
-            labels:
-              severity: warning
-            annotations:
-              summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m"
-              description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade."
-
      # Longhorn storage health alerts. Required: longhorn scrape job
      # (added 2026-04-26 — see scrape_configs above). The K8s events
      # for "snapshot becomes not ready to use" are transient lifecycle
@@ -3395,6 +3362,92 @@ data:
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
+      - orgId: 1
+        name: Signage Marquee
+        folder: AI Stack Alerts
+        interval: 1m
+        rules:
+          - uid: marquee-dropped-frames-high  # rule: dropped-frame percentage per renderer/node_id/phase
+            title: MarqueeDroppedFramesHigh
+            condition: C  # the rule's state is taken from the threshold expression (refId C)
+            for: 5m  # condition must hold for 5m before the alert fires
+            noDataState: OK  # an empty query result is treated as healthy
+            execErrState: OK  # a query/evaluation error is treated as healthy too, so a broken datasource will not page
+            annotations:
+              summary: Marquee dropped-frame rate above 5%
+              description: "Dropped frames exceeded the IR-21 budget for a renderer/phase/node tuple. Grafana owns alert delivery to IRC #alerts; Prometheus rules remain only the visibility source."
+              runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Filter renderer/node/phase 3. Compare latest AAT baseline diff 4. Restart only the affected player if the issue is node-local"
+            labels:
+              severity: warning
+              service: signage
+              alert_channel: irc
+            data:
+              - refId: A  # PromQL: dropped-frame rate / render-latency observation rate, scaled to a percentage
+                relativeTimeRange: {from: 300, to: 0}  # 300s = 5m, same span as the rate(...[5m]) windows
+                datasourceUid: prometheus
+                model: {expr: '(sum by (renderer, node_id, phase) (rate(marquee_dropped_frames_total[5m])) / sum by (renderer, node_id, phase) (rate(marquee_render_latency_ms_count[5m]))) * 100', instant: true, refId: A}  # denominator is the latency histogram's _count; presumably one observation per rendered frame (confirm)
+              - refId: B  # reduce: collapse each series from A to its last value
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C  # threshold: true when B > 5; unit is percent because A multiplies by 100
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [5], type: gt}}], refId: C}
+          - uid: marquee-render-latency-p99-high  # rule: p99 render latency per renderer/node_id/phase
+            title: MarqueeRenderLatencyP99High
+            condition: C  # the rule's state is taken from the threshold expression (refId C)
+            for: 5m  # condition must hold for 5m before the alert fires
+            noDataState: OK  # an empty query result is treated as healthy
+            execErrState: OK  # a query/evaluation error is treated as healthy too, so a broken datasource will not page
+            annotations:
+              summary: Marquee render latency p99 above 16ms
+              description: "Renderer p99 latency exceeded the Pi-class 16ms budget. Grafana delivers this alert to IRC #alerts."
+              runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Check render latency p99 by renderer/node/phase 3. Compare with dropped frames and node CPU 4. If isolated to WPF, capture current Player.Wpf frame set before restart"
+            labels:
+              severity: warning
+              service: signage
+              alert_channel: irc
+            data:
+              - refId: A  # PromQL: 99th-percentile estimate from the render-latency histogram buckets
+                relativeTimeRange: {from: 300, to: 0}  # 300s = 5m, same span as the rate(...[5m]) window
+                datasourceUid: prometheus
+                model: {expr: 'histogram_quantile(0.99, sum by (renderer, node_id, phase, le) (rate(marquee_render_latency_ms_bucket[5m])))', instant: true, refId: A}  # `le` must stay in the grouping so histogram_quantile can see the bucket bounds
+              - refId: B  # reduce: collapse each series from A to its last value
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C  # threshold: true when B > 16; same unit as the histogram (ms per the metric name and summary)
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [16], type: gt}}], refId: C}
+          - uid: marquee-animation-duration-drift  # rule: relative drift of median animation duration from its target, per renderer/phase
+            title: MarqueeAnimationDurationDrift
+            condition: C  # the rule's state is taken from the threshold expression (refId C)
+            for: 10m  # longer pending period than the sibling marquee rules (5m)
+            noDataState: OK  # an empty query result is treated as healthy
+            execErrState: OK  # a query/evaluation error is treated as healthy too, so a broken datasource will not page
+            annotations:
+              summary: Marquee animation duration drift above 10%
+              description: "Observed cycle duration has drifted more than 10% from target for a renderer/phase pair. Grafana delivers this alert to IRC #alerts."
+              runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Compare observed vs target duration 3. Check recent theme/preset changes 4. Re-run MarqueeHolidayBrandTrajectoryTests before promoting a baseline"
+            labels:
+              severity: warning
+              service: signage
+              alert_channel: irc
+            data:
+              - refId: A  # PromQL: abs(median observed duration - target) / target
+                relativeTimeRange: {from: 900, to: 0}  # 900s = 15m, same span as the rate(...[15m]) window
+                datasourceUid: prometheus
+                model: {expr: 'abs((histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m]))) - avg by (renderer, phase) (marquee_animation_duration_target_ms)) / avg by (renderer, phase) (marquee_animation_duration_target_ms))', instant: true, refId: A}  # grouped by renderer/phase only (no node_id), unlike the other two marquee rules
+              - refId: B  # reduce: collapse each series from A to its last value
+                relativeTimeRange: {from: 900, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C  # threshold: true when B > 0.1; A is a ratio (not scaled by 100), so 0.1 means 10%
+                relativeTimeRange: {from: 900, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.1], type: gt}}], refId: C}
      - orgId: 1
        name: Infrastructure
        folder: AI Stack Alerts
--- a/apps/multus/multus.yaml
+++ b/apps/multus/multus.yaml
@@ -188,24 +188,13 @@ spec:
        - name: kube-multus
          image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
          command: [ "/usr/src/multus-cni/bin/multus-daemon" ]
-          # 2026-05-11: upstream default of 50Mi memory limit OOM-cascades when
-          # an operator-owned namespace accumulates >100 pending pods retrying
-          # CNI ADD. RemoteDesktop emitted 219 orphan rd-browser-only pods
-          # (missing OwnerReferences), kubelet's CNI ADD avalanche pushed multus
-          # over 50Mi, OOMKilled, restarted with even bigger backlog → loop.
-          # 21h cluster outage. See FlowerCore.Notes:
-          #   feedback_multus_50mi_limit_oom_orphan_pod_avalanche.md
-          # 1Gi limit / 512Mi request comfortably handles a 200+ pod CNI
-          # catchup burst on 64GB nodes (nodes are <25% used in steady-state).
-          # Drop back toward 256Mi only after MultusMemoryPressure alert
-          # proves steady-state working set sits well below 200Mi.
          resources:
            requests:
              cpu: "100m"
-              memory: "512Mi"
+              memory: "50Mi"
            limits:
              cpu: "100m"
-              memory: "1Gi"
+              memory: "50Mi"
          securityContext:
            privileged: true
          terminationMessagePolicy: FallbackToLogsOnError
--- a/apps/telephony/telephony.yaml
+++ b/apps/telephony/telephony.yaml
@@ -127,13 +127,10 @@ spec:
      initContainers:
        - name: fix-data-perms
          image: busybox:latest
-          # Must run as root to chown the hostPath /tmp/tts-audio that may be
-          # root-owned after node reboot. Pod-level runAsNonRoot:true would
-          # otherwise inherit and chown would fail with EPERM (see Notes memory
-          # feedback_hostpath_initcontainer_chown_perms).
-          securityContext:
-            runAsUser: 0
-            runAsNonRoot: false
+          # Besides /data, also chown /shared-tts (hostPath /tmp/tts-audio) so
+          # the non-root app user (uid 1654) can write Piper .sln16 files that
+          # Asterisk reads at /var/lib/asterisk/sounds/tts. World-readable (755)
+          # is fine — Asterisk runs as a different uid in the other pod.
          command: ["sh", "-c", "chown -R 1654:1654 /data && chown 1654:1654 /shared-tts && chmod 0755 /shared-tts"]
          volumeMounts:
            - name: telephony-data