Compare commits
5 Commits
claude/ci1
...
feat/redis
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c1416c6968 | ||
|
|
6e7d88db49 | ||
|
|
5ae50bd491 | ||
|
|
653d4472f5 | ||
|
|
eb8693e1ce |
@@ -974,6 +974,39 @@ data:
|
|||||||
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
|
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
|
||||||
description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."
|
description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."
|
||||||
|
|
||||||
|
# Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
|
||||||
|
# cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
|
||||||
|
# outage (21h) hit because no alert fired on the rising multus working
|
||||||
|
# set — only downstream blackbox / Traefik / service alerts. With
|
||||||
|
# 1Gi limit (bluejay-infra@eb8693e), 80% = ~800MiB; steady-state
|
||||||
|
# runs ~150-250MiB so this only fires when an avalanche starts.
|
||||||
|
- alert: MultusMemoryPressure
|
||||||
|
expr: |
|
||||||
|
container_memory_working_set_bytes{container="kube-multus"}
|
||||||
|
/ container_spec_memory_limit_bytes{container="kube-multus"} > 0.8
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
alert_channel: thermal_print
|
||||||
|
annotations:
|
||||||
|
summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m"
|
||||||
|
description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)."
|
||||||
|
|
||||||
|
# Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the
|
||||||
|
# operator-leak avalanche pattern BEFORE it cascades into a multus
|
||||||
|
# CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder)
|
||||||
|
# emitting pods without ownerReferences will accumulate them when
|
||||||
|
# the operator crashes. >25 pending pods in any namespace for 30m
|
||||||
|
# is the signal to investigate the reconciler.
|
||||||
|
- alert: NamespacePendingPodBacklog
|
||||||
|
expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25
|
||||||
|
for: 30m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m"
|
||||||
|
description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade."
|
||||||
|
|
||||||
# Longhorn storage health alerts. Required: longhorn scrape job
|
# Longhorn storage health alerts. Required: longhorn scrape job
|
||||||
# (added 2026-04-26 — see scrape_configs above). The K8s events
|
# (added 2026-04-26 — see scrape_configs above). The K8s events
|
||||||
# for "snapshot becomes not ready to use" are transient lifecycle
|
# for "snapshot becomes not ready to use" are transient lifecycle
|
||||||
|
|||||||
@@ -188,13 +188,24 @@ spec:
|
|||||||
- name: kube-multus
|
- name: kube-multus
|
||||||
image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
|
image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
|
||||||
command: [ "/usr/src/multus-cni/bin/multus-daemon" ]
|
command: [ "/usr/src/multus-cni/bin/multus-daemon" ]
|
||||||
|
# 2026-05-11: upstream default of 50Mi memory limit OOM-cascades when
|
||||||
|
# an operator-owned namespace accumulates >100 pending pods retrying
|
||||||
|
# CNI ADD. RemoteDesktop emitted 219 orphan rd-browser-only pods
|
||||||
|
# (missing OwnerReferences), kubelet's CNI ADD avalanche pushed multus
|
||||||
|
# over 50Mi, OOMKilled, restarted with even bigger backlog → loop.
|
||||||
|
# 21h cluster outage. See FlowerCore.Notes:
|
||||||
|
# feedback_multus_50mi_limit_oom_orphan_pod_avalanche.md
|
||||||
|
# 1Gi limit / 512Mi request comfortably handles a 200+ pod CNI
|
||||||
|
# catchup burst on 64GB nodes (nodes are <25% used in steady-state).
|
||||||
|
# Drop back toward 256Mi only after MultusMemoryPressure alert
|
||||||
|
# proves steady-state working set sits well below 200Mi.
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: "100m"
|
cpu: "100m"
|
||||||
memory: "50Mi"
|
memory: "512Mi"
|
||||||
limits:
|
limits:
|
||||||
cpu: "100m"
|
cpu: "100m"
|
||||||
memory: "50Mi"
|
memory: "1Gi"
|
||||||
securityContext:
|
securityContext:
|
||||||
privileged: true
|
privileged: true
|
||||||
terminationMessagePolicy: FallbackToLogsOnError
|
terminationMessagePolicy: FallbackToLogsOnError
|
||||||
|
|||||||
@@ -127,10 +127,13 @@ spec:
|
|||||||
initContainers:
|
initContainers:
|
||||||
- name: fix-data-perms
|
- name: fix-data-perms
|
||||||
image: busybox:latest
|
image: busybox:latest
|
||||||
# Also chown /shared-tts (hostPath /tmp/tts-audio) so the non-root
|
# Must run as root to chown the hostPath /tmp/tts-audio that may be
|
||||||
# app user (uid 1654) can write Piper .sln16 files that Asterisk
|
# root-owned after node reboot. Pod-level runAsNonRoot:true would
|
||||||
# reads at /var/lib/asterisk/sounds/tts. World-readable (755) is
|
# otherwise inherit and chown would fail with EPERM (see Notes memory
|
||||||
# fine — Asterisk runs as a different uid in the other pod.
|
# feedback_hostpath_initcontainer_chown_perms).
|
||||||
|
securityContext:
|
||||||
|
runAsUser: 0
|
||||||
|
runAsNonRoot: false
|
||||||
command: ["sh", "-c", "chown -R 1654:1654 /data && chown 1654:1654 /shared-tts && chmod 0755 /shared-tts"]
|
command: ["sh", "-c", "chown -R 1654:1654 /data && chown 1654:1654 /shared-tts && chmod 0755 /shared-tts"]
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: telephony-data
|
- name: telephony-data
|
||||||
|
|||||||
Reference in New Issue
Block a user