diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml
index 6c63a67..1698160 100644
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -974,6 +974,45 @@ data:
               summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
               description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."

+          # Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
+          # cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
+          # outage (21h) hit because no alert fired on the rising multus working
+          # set — only downstream blackbox / Traefik / service alerts. With the
+          # 1Gi limit (bluejay-infra@eb8693e), 80% = ~819MiB; steady-state
+          # runs ~150-250MiB so this only fires when an avalanche starts.
+          # The `> 0` filter on the denominator drops series that report no
+          # memory limit (limit metric = 0); without it the ratio is +Inf and
+          # the alert fires on any multus container that lost its limit.
+          # NOTE(review): the annotations read $labels.node — this assumes the
+          # cAdvisor scrape attaches a `node` label to these series; confirm.
+          - alert: MultusMemoryPressure
+            expr: |
+              container_memory_working_set_bytes{container="kube-multus"}
+                / (container_spec_memory_limit_bytes{container="kube-multus"} > 0)
+                > 0.8
+            for: 5m
+            labels:
+              severity: critical
+              alert_channel: thermal_print
+            annotations:
+              summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m"
+              description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)."
+
+          # Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the
+          # operator-leak avalanche pattern BEFORE it cascades into a multus
+          # CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder)
+          # emitting pods without ownerReferences will accumulate them when
+          # the operator crashes. >25 pending pods in any namespace for 30m
+          # is the signal to investigate the reconciler.
+          - alert: NamespacePendingPodBacklog
+            expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25
+            for: 30m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m"
+              description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade."
+
           # Longhorn storage health alerts. Required: longhorn scrape job
           # (added 2026-04-26 — see scrape_configs above). The K8s events
           # for "snapshot becomes not ready to use" are transient lifecycle