bluejay-infra/apps/multus/multus.yaml

# =============================================================================
# Multus CNI — Meta-CNI for multi-network attachment to pods/VMs
# =============================================================================
# Purpose: enable KubeVirt VMs (and any future workload) to attach additional
# network interfaces beyond the default Calico-managed pod network. Required
# for ci1 (Windows Server 2025 KubeVirt VM) to bridge onto PROD VLAN 57.
#
# Source: upstream k8snetworkplumbingwg/multus-cni v4.2.2
#   https://github.com/k8snetworkplumbingwg/multus-cni/blob/v4.2.2/deployments/multus-daemonset-thick.yml
#
# Inlined verbatim (with project header + version pin annotation) for
# reproducibility and air-gap safety. Bumping versions = edit this file +
# git push. ArgoCD picks up via the bluejay-infra ApplicationSet
# (apps/* directory generator on main).
#
# Why thick plugin (not thin):
#   - Thick = daemon + thin shim binary; daemon handles NAD watch + CRD reads
#     centrally so each pod's CNI ADD doesn't hit the K8s API server. Better
#     for clusters with many NAD-using pods.
#   - Thin = each CNI ADD process directly contacts K8s API. Simpler but
#     scales worse and has more failure modes.
#   - KubeVirt + multi-VM workload pattern fits thick perfectly.
#
# Cluster context (verified 2026-05-08):
#   - RKE2 v1.34.5 on 3 nodes (rke2-server, rke2-agent1, rke2-agent2)
#   - Calico CNI (Tigera-managed) at /etc/cni/net.d + /opt/cni/bin (default)
#   - openSUSE Leap 16, kernel 6.12, containerd 2.1.5
#   - host bridge for PROD VLAN 57 = `br-prod` (PUPPET HOST WORK — see Phase 1.5
#     in docs/infrastructure/windows-server-build-runner-plan.md)
#
# Version pin: snapshot-thick → pinning to v4.2.2 release tag at deploy time
# would require a private mirror of the image. Upstream `snapshot-thick` tag
# is updated on every release, so for now we trust upstream + Calico's
# established pattern. Pin to a specific SHA256 once we mirror to Gitea OCI.
#
# Apply (once committed to bluejay-infra main, ApplicationSet auto-syncs):
#   git add apps/multus/multus.yaml && git commit && git push origin main
#   # ArgoCD `infra-multus` Application appears within 3 min via ApplicationSet
#
# Verify:
#   kubectl -n kube-system get ds kube-multus-ds
#   kubectl -n kube-system rollout status ds kube-multus-ds
#   kubectl get crd network-attachment-definitions.k8s.cni.cncf.io
# =============================================================================

---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: network-attachment-definitions.k8s.cni.cncf.io
  annotations:
    bluejay.iamworkin.lan/source: "k8snetworkplumbingwg/multus-cni v4.2.2"
spec:
  group: k8s.cni.cncf.io
  scope: Namespaced
  names:
    plural: network-attachment-definitions
    singular: network-attachment-definition
    kind: NetworkAttachmentDefinition
    shortNames:
      - net-attach-def
  versions:
    - name: v1
      served: true
      storage: true
      schema:
        openAPIV3Schema:
          description: 'NetworkAttachmentDefinition is a CRD schema specified by the Network Plumbing
            Working Group to express the intent for attaching pods to one or more logical or physical
            networks. More information available at: https://github.com/k8snetworkplumbingwg/multi-net-spec'
          type: object
          properties:
            apiVersion:
              type: string
            kind:
              type: string
            metadata:
              type: object
            spec:
              description: 'NetworkAttachmentDefinition spec defines the desired state of a network attachment'
              type: object
              properties:
                config:
                  description: 'NetworkAttachmentDefinition config is a JSON-formatted CNI configuration'
                  type: string
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: multus
rules:
  - apiGroups: ["k8s.cni.cncf.io"]
    resources:
      - '*'
    verbs:
      - '*'
  - apiGroups:
      - ""
    resources:
      - pods
      - pods/status
    verbs:
      - get
      - list
      - update
      - watch
  - apiGroups:
      - ""
      - events.k8s.io
    resources:
      - events
    verbs:
      - create
      - patch
      - update
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: multus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: multus
subjects:
  - kind: ServiceAccount
    name: multus
    namespace: kube-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: multus
  namespace: kube-system
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: multus-daemon-config
  namespace: kube-system
  labels:
    tier: node
    app: multus
data:
  daemon-config.json: |
    {
        "chrootDir": "/hostroot",
        "cniVersion": "0.3.1",
        "logLevel": "verbose",
        "logToStderr": true,
        "cniConfigDir": "/host/etc/cni/net.d",
        "multusAutoconfigDir": "/host/etc/cni/net.d",
        "multusConfigFile": "auto",
        "socketDir": "/host/run/multus/"
    }
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kube-multus-ds
  namespace: kube-system
  labels:
    tier: node
    app: multus
    name: multus
spec:
  selector:
    matchLabels:
      name: multus
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        tier: node
        app: multus
        name: multus
    spec:
      hostNetwork: true
      hostPID: true
      tolerations:
        - operator: Exists
          effect: NoSchedule
        - operator: Exists
          effect: NoExecute
      serviceAccountName: multus
      containers:
        - name: kube-multus
          image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
          command: [ "/usr/src/multus-cni/bin/multus-daemon" ]
          # 2026-05-11: upstream default of 50Mi memory limit OOM-cascades when
          # an operator-owned namespace accumulates >100 pending pods retrying
          # CNI ADD. RemoteDesktop emitted 219 orphan rd-browser-only pods
          # (missing OwnerReferences), kubelet's CNI ADD avalanche pushed multus
          # over 50Mi, OOMKilled, restarted with even bigger backlog → loop.
          # 21h cluster outage. See FlowerCore.Notes:
          #   feedback_multus_50mi_limit_oom_orphan_pod_avalanche.md
          # 1Gi limit / 512Mi request comfortably handles a 200+ pod CNI
          # catchup burst on 64GB nodes (nodes are <25% used in steady-state).
          # Drop back toward 256Mi only after MultusMemoryPressure alert
          # proves steady-state working set sits well below 200Mi.
          resources:
            requests:
              cpu: "100m"
              memory: "512Mi"
            limits:
              cpu: "100m"
              memory: "1Gi"
          securityContext:
            privileged: true
          terminationMessagePolicy: FallbackToLogsOnError
          volumeMounts:
            - name: cni
              mountPath: /host/etc/cni/net.d
            # multus-daemon expects that cnibin path must be identical between pod and container host.
            # e.g. if the cni bin is in '/opt/cni/bin' on the container host side, then it should be mount to '/opt/cni/bin' in multus-daemon,
            # not to any other directory, like '/opt/bin' or '/usr/bin'.
            - name: cnibin
              mountPath: /opt/cni/bin
            - name: host-run
              mountPath: /host/run
            - name: host-var-lib-cni-multus
              mountPath: /var/lib/cni/multus
            - name: host-var-lib-kubelet
              mountPath: /var/lib/kubelet
              mountPropagation: HostToContainer
            - name: host-run-k8s-cni-cncf-io
              mountPath: /run/k8s.cni.cncf.io
            - name: host-run-netns
              mountPath: /run/netns
              mountPropagation: HostToContainer
            - name: multus-daemon-config
              mountPath: /etc/cni/net.d/multus.d
              readOnly: true
            - name: hostroot
              mountPath: /hostroot
              mountPropagation: HostToContainer
            - mountPath: /etc/cni/multus/net.d
              name: multus-conf-dir
          env:
            - name: MULTUS_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
      initContainers:
        - name: install-multus-binary
          image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
          command:
            - "sh"
            - "-c"
            - "cp /usr/src/multus-cni/bin/multus-shim /host/opt/cni/bin/multus-shim && cp /usr/src/multus-cni/bin/passthru /host/opt/cni/bin/passthru"
          resources:
            requests:
              cpu: "10m"
              memory: "15Mi"
          securityContext:
            privileged: true
          terminationMessagePolicy: FallbackToLogsOnError
          volumeMounts:
            - name: cnibin
              mountPath: /host/opt/cni/bin
              mountPropagation: Bidirectional
      terminationGracePeriodSeconds: 10
      volumes:
        - name: cni
          hostPath:
            path: /etc/cni/net.d
        - name: cnibin
          hostPath:
            path: /opt/cni/bin
        - name: hostroot
          hostPath:
            path: /
        - name: multus-daemon-config
          configMap:
            name: multus-daemon-config
            items:
            - key: daemon-config.json
              path: daemon-config.json
        - name: host-run
          hostPath:
            path: /run
        - name: host-var-lib-cni-multus
          hostPath:
            path: /var/lib/cni/multus
        - name: host-var-lib-kubelet
          hostPath:
            path: /var/lib/kubelet
        - name: host-run-k8s-cni-cncf-io
          hostPath:
            path: /run/k8s.cni.cncf.io
        - name: host-run-netns
          hostPath:
            path: /run/netns/
        - name: multus-conf-dir
          hostPath:
            path: /etc/cni/multus/net.d