# =============================================================================
# Multus CNI — Meta-CNI for multi-network attachment to pods/VMs
# =============================================================================
# Purpose: enable KubeVirt VMs (and any future workload) to attach additional
# network interfaces beyond the default Calico-managed pod network. Required
# for ci1 (Windows Server 2025 KubeVirt VM) to bridge onto PROD VLAN 57.
#
# Source: upstream k8snetworkplumbingwg/multus-cni v4.2.2
#   https://github.com/k8snetworkplumbingwg/multus-cni/blob/v4.2.2/deployments/multus-daemonset-thick.yml
#
# Inlined verbatim (with project header + version pin annotation) for
# reproducibility and air-gap safety. Bumping versions = edit this file +
# git push. ArgoCD picks up via the bluejay-infra ApplicationSet
# (apps/* directory generator on main).
#
# Why thick plugin (not thin):
#   - Thick = daemon + thin shim binary; daemon handles NAD watch + CRD reads
#     centrally so each pod's CNI ADD doesn't hit the K8s API server. Better
#     for clusters with many NAD-using pods.
#   - Thin = each CNI ADD process directly contacts K8s API. Simpler but
#     scales worse and has more failure modes.
#   - KubeVirt + multi-VM workload pattern fits thick perfectly.
#
# Cluster context (verified 2026-05-08):
#   - RKE2 v1.34.5 on 3 nodes (rke2-server, rke2-agent1, rke2-agent2)
#   - Calico CNI (Tigera-managed) at /etc/cni/net.d + /opt/cni/bin (default)
#   - openSUSE Leap 16, kernel 6.12, containerd 2.1.5
#   - host bridge for PROD VLAN 57 = `br-prod` (PUPPET HOST WORK — see Phase 1.5
#     in docs/infrastructure/windows-server-build-runner-plan.md)
#
# Version pin: NOT yet pinned — both containers run the floating
# `snapshot-thick` tag. Pinning to the v4.2.2 release tag at deploy time
# would require a private mirror of the image. Upstream `snapshot-thick` tag
# is updated on every release, so for now we trust upstream + Calico's
# established pattern. Pin to a specific SHA256 once we mirror to Gitea OCI.
#
# Apply (once committed to bluejay-infra main, ApplicationSet auto-syncs):
#   git add apps/multus/multus.yaml && git commit && git push origin main
#   # ArgoCD `infra-multus` Application appears within 3 min via ApplicationSet
#
# Verify:
#   kubectl -n kube-system get ds kube-multus-ds
#   kubectl -n kube-system rollout status ds kube-multus-ds
#   kubectl get crd network-attachment-definitions.k8s.cni.cncf.io
# =============================================================================
---
# CRD: registers the namespaced NetworkAttachmentDefinition kind
# (group k8s.cni.cncf.io, short name `net-attach-def`). The only schema'd spec
# field is `config`, a string holding a JSON CNI configuration.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: network-attachment-definitions.k8s.cni.cncf.io
  annotations:
    bluejay.iamworkin.lan/source: "k8snetworkplumbingwg/multus-cni v4.2.2"
spec:
  group: k8s.cni.cncf.io
  scope: Namespaced
  names:
    plural: network-attachment-definitions
    singular: network-attachment-definition
    kind: NetworkAttachmentDefinition
    shortNames:
      - net-attach-def
  versions:
    - name: v1
      served: true
      storage: true
      schema:
        openAPIV3Schema:
          description: 'NetworkAttachmentDefinition is a CRD schema specified by the Network Plumbing
            Working Group to express the intent for attaching pods to one or more logical or physical
            networks.
            More information available at: https://github.com/k8snetworkplumbingwg/multi-net-spec'
          type: object
          properties:
            apiVersion:
              type: string
            kind:
              type: string
            metadata:
              type: object
            spec:
              description: 'NetworkAttachmentDefinition spec defines the desired state of a network attachment'
              type: object
              properties:
                config:
                  description: 'NetworkAttachmentDefinition config is a JSON-formatted CNI configuration'
                  type: string
---
# ClusterRole: permissions for the `multus` ServiceAccount —
#   * everything in the k8s.cni.cncf.io API group,
#   * get/list/update/watch on pods and pods/status,
#   * create/patch/update on events (core and events.k8s.io).
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: multus
rules:
  - apiGroups: ["k8s.cni.cncf.io"]
    resources:
      - '*'
    verbs:
      - '*'
  - apiGroups:
      - ""
    resources:
      - pods
      - pods/status
    verbs:
      - get
      - list
      - update
      - watch
  - apiGroups:
      - ""
      - events.k8s.io
    resources:
      - events
    verbs:
      - create
      - patch
      - update
---
# Binds the `multus` ClusterRole to the kube-system/multus ServiceAccount.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: multus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: multus
subjects:
  - kind: ServiceAccount
    name: multus
    namespace: kube-system
---
# Identity used by the DaemonSet pods (serviceAccountName: multus).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: multus
  namespace: kube-system
---
# Daemon configuration; the DaemonSet mounts the `daemon-config.json` key
# read-only under /etc/cni/net.d/multus.d in the kube-multus container.
# Paths beginning with /host/ and /hostroot refer to the hostPath mounts
# declared on that container.
kind: ConfigMap
apiVersion: v1
metadata:
  name: multus-daemon-config
  namespace: kube-system
  labels:
    tier: node
    app: multus
data:
  daemon-config.json: |
    {
        "chrootDir": "/hostroot",
        "cniVersion": "0.3.1",
        "logLevel": "verbose",
        "logToStderr": true,
        "cniConfigDir": "/host/etc/cni/net.d",
        "multusAutoconfigDir": "/host/etc/cni/net.d",
        "multusConfigFile": "auto",
        "socketDir": "/host/run/multus/"
    }
---
# DaemonSet: one privileged multus-daemon pod per node (host network + host
# PID), tolerating any NoSchedule/NoExecute taint. An init container first
# copies the `multus-shim` and `passthru` binaries into the host CNI bin dir.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kube-multus-ds
  namespace: kube-system
  labels:
    tier: node
    app: multus
    name: multus
spec:
  selector:
    matchLabels:
      name: multus
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        tier: node
        app: multus
        name: multus
    spec:
      hostNetwork: true
      hostPID: true
      tolerations:
        - operator: Exists
          effect: NoSchedule
        - operator: Exists
          effect: NoExecute
      serviceAccountName: multus
      containers:
        - name: kube-multus
          # NOTE(review): `snapshot-thick` is a floating tag, while the CRD's
          # source annotation names v4.2.2, and no imagePullPolicy is set
          # here. Confirm all nodes end up on the same image; pin by digest
          # once the image is mirrored (see "Version pin" in the file header).
          image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
          command: [ "/usr/src/multus-cni/bin/multus-daemon" ]
          # 2026-05-11: upstream default of 50Mi memory limit OOM-cascades when
          # an operator-owned namespace accumulates >100 pending pods retrying
          # CNI ADD. RemoteDesktop emitted 219 orphan rd-browser-only pods
          # (missing OwnerReferences), kubelet's CNI ADD avalanche pushed multus
          # over 50Mi, OOMKilled, restarted with even bigger backlog → loop.
          # 21h cluster outage. See FlowerCore.Notes:
          #   feedback_multus_50mi_limit_oom_orphan_pod_avalanche.md
          # 1Gi limit / 512Mi request comfortably handles a 200+ pod CNI
          # catchup burst on 64GB nodes (nodes are <25% used in steady-state).
          # Drop back toward 256Mi only after MultusMemoryPressure alert
          # proves steady-state working set sits well below 200Mi.
          #
          # NOTE(review): the cpu limit still equals the cpu request (100m);
          # confirm CPU throttling is acceptable during the same CNI ADD
          # bursts the memory bump was sized for.
          resources:
            requests:
              cpu: "100m"
              memory: "512Mi"
            limits:
              cpu: "100m"
              memory: "1Gi"
          securityContext:
            privileged: true
          terminationMessagePolicy: FallbackToLogsOnError
          volumeMounts:
            - name: cni
              mountPath: /host/etc/cni/net.d
            # multus-daemon expects the cnibin path to be identical between
            # the pod and the container host. E.g. if the CNI binaries are in
            # '/opt/cni/bin' on the host, they must be mounted at
            # '/opt/cni/bin' in multus-daemon, not at any other directory
            # such as '/opt/bin' or '/usr/bin'.
            - name: cnibin
              mountPath: /opt/cni/bin
            - name: host-run
              mountPath: /host/run
            - name: host-var-lib-cni-multus
              mountPath: /var/lib/cni/multus
            - name: host-var-lib-kubelet
              mountPath: /var/lib/kubelet
              mountPropagation: HostToContainer
            - name: host-run-k8s-cni-cncf-io
              mountPath: /run/k8s.cni.cncf.io
            - name: host-run-netns
              mountPath: /run/netns
              mountPropagation: HostToContainer
            - name: multus-daemon-config
              mountPath: /etc/cni/net.d/multus.d
              readOnly: true
            - name: hostroot
              mountPath: /hostroot
              mountPropagation: HostToContainer
            - mountPath: /etc/cni/multus/net.d
              name: multus-conf-dir
          env:
            # Exposes the scheduling node's name to the daemon.
            - name: MULTUS_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
      initContainers:
        # Copies the shim and passthru binaries from the image into the host's
        # /opt/cni/bin (mounted here at /host/opt/cni/bin).
        - name: install-multus-binary
          image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
          command:
            - "sh"
            - "-c"
            - "cp /usr/src/multus-cni/bin/multus-shim /host/opt/cni/bin/multus-shim && cp /usr/src/multus-cni/bin/passthru /host/opt/cni/bin/passthru"
          resources:
            requests:
              cpu: "10m"
              memory: "15Mi"
          securityContext:
            privileged: true
          terminationMessagePolicy: FallbackToLogsOnError
          volumeMounts:
            - name: cnibin
              mountPath: /host/opt/cni/bin
              mountPropagation: Bidirectional
      terminationGracePeriodSeconds: 10
      volumes:
        - name: cni
          hostPath:
            path: /etc/cni/net.d
        - name: cnibin
          hostPath:
            path: /opt/cni/bin
        - name: hostroot
          hostPath:
            path: /
        - name: multus-daemon-config
          configMap:
            name: multus-daemon-config
            items:
              - key: daemon-config.json
                path: daemon-config.json
        - name: host-run
          hostPath:
            path: /run
        - name: host-var-lib-cni-multus
          hostPath:
            path: /var/lib/cni/multus
        - name: host-var-lib-kubelet
          hostPath:
            path: /var/lib/kubelet
        - name: host-run-k8s-cni-cncf-io
          hostPath:
            path: /run/k8s.cni.cncf.io
        - name: host-run-netns
          hostPath:
            path: /run/netns/
        - name: multus-conf-dir
          hostPath:
            path: /etc/cni/multus/net.d