From 653d4472f5174b80d844c1a680dac0ae25686657 Mon Sep 17 00:00:00 2001 From: Codex Date: Mon, 11 May 2026 10:42:27 -0500 Subject: [PATCH] fix(monitoring): mirror Q-MR-3 MultusMemoryPressure + NamespacePendingPodBacklog alerts Two new preventive alert rules added to the kubernetes-state group of the K8s migration target ConfigMap. The live Podman Prometheus on noc1 has already been updated via FlowerCore.Notes/scripts/monitoring/alerts.yml + sudo cp + podman pod restart monitoring (this commit only locks it in the bluejay-infra K8s mirror so a future migration carries it forward). MultusMemoryPressure (critical, thermal_print): fires when kube-multus working set exceeds 80% of its memory limit for 5m. Catches the next multus OOM cascade BEFORE it kills the daemon cluster-wide. The 2026-05-10 21h outage hit because no alert fired on the rising multus working set; only downstream blackbox / Traefik / service alerts triggered, after the fact. NamespacePendingPodBacklog (warning): fires when any single namespace has >25 Pending pods sustained for 30m. Catches the operator-leak avalanche pattern (orphan pods from a crashed reconciler emitting children without ownerReferences) before it cascades into a CNI OOM. 
See FlowerCore.Notes: - feedback_multus_50mi_limit_oom_orphan_pod_avalanche - feedback_monitoring_k8s_target_vs_live_podman (workflow) Companion commits: - bluejay-infra@eb8693e (multus memory limit) - FlowerCore.RemoteDesktop@b02c59b (OwnerReferences fix) Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/monitoring/noc-monitoring.yaml | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index 6c63a67..1698160 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -974,6 +974,39 @@ data: summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch" description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC." + # Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM + # cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10 + # outage (21h) hit because no alert fired on the rising multus working + # set — only downstream blackbox / Traefik / service alerts. With 1Gi + # limit (bluejay-infra@eb8693e), 80% = ~800MiB; steady-state ~150-250MiB, + # so only an avalanche fires it. `> 0` guard: no limit (0) must not yield +Inf. + - alert: MultusMemoryPressure + expr: | + container_memory_working_set_bytes{container="kube-multus"} + / (container_spec_memory_limit_bytes{container="kube-multus"} > 0) > 0.8 + for: 5m + labels: + severity: critical + alert_channel: thermal_print + annotations: + summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m" + description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)."
+ + # Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the + # operator-leak avalanche pattern BEFORE it cascades into a multus + # CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder) + # that emits pods without ownerReferences leaves them behind when it + # crashes, so they pile up. The expr sums the Pending-phase series per + # namespace; >25 in one namespace for 30m means: check the reconciler. + - alert: NamespacePendingPodBacklog + expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25 + for: 30m + labels: + severity: warning + annotations: + summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m" + description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade." + # Longhorn storage health alerts. Required: longhorn scrape job # (added 2026-04-26 — see scrape_configs above). The K8s events # for "snapshot becomes not ready to use" are transient lifecycle