diff --git a/apps/multus/multus.yaml b/apps/multus/multus.yaml
index 15e2a58..2a0d802 100644
--- a/apps/multus/multus.yaml
+++ b/apps/multus/multus.yaml
@@ -188,13 +188,24 @@ spec:
         - name: kube-multus
           image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
           command: [ "/usr/src/multus-cni/bin/multus-daemon" ]
+          # 2026-05-11: the upstream default 50Mi memory limit OOM-cascades when
+          # an operator-owned namespace accumulates >100 pending pods retrying
+          # CNI ADD. RemoteDesktop emitted 219 orphan rd-browser-only pods
+          # (missing OwnerReferences); kubelet's CNI ADD avalanche pushed multus
+          # over 50Mi, it was OOMKilled and restarted into an even bigger
+          # backlog → loop. Result: 21h cluster outage. See FlowerCore.Notes:
+          # feedback_multus_50mi_limit_oom_orphan_pod_avalanche.md
+          # The 1Gi limit / 512Mi request comfortably handles a 200+ pod CNI
+          # catch-up burst on 64GB nodes (nodes are <25% used in steady state).
+          # Drop back toward 256Mi only after the MultusMemoryPressure alert
+          # proves the steady-state working set sits well below 200Mi.
           resources:
             requests:
               cpu: "100m"
-              memory: "50Mi"
+              memory: "512Mi"
             limits:
               cpu: "100m"
-              memory: "50Mi"
+              memory: "1Gi"
           securityContext:
             privileged: true
           terminationMessagePolicy: FallbackToLogsOnError