diff --git a/apps/kubevirt-vms/ci1.yaml b/apps/kubevirt-vms/ci1.yaml
index 3f9e459..5708ab5 100644
--- a/apps/kubevirt-vms/ci1.yaml
+++ b/apps/kubevirt-vms/ci1.yaml
@@ -1,51 +1,9 @@
# =============================================================================
-# ci1 — Windows Server 2025 KubeVirt VM (GitHub Actions Self-Hosted Runner)
+# ci1 - Windows Server 2025 KubeVirt VM (GitHub Actions Self-Hosted Runner)
# =============================================================================
-# Purpose: dedicated CI runner for FlowerCore.Updater Sandbox E2E nightly +
-# future fleet WPF AAT lanes. Replaces the never-registered
-# `bluejay-ws-sandbox-1` runner placeholder. Andrew explicitly does NOT want
-# BLUEJAY-WS registered as a runner (workstation has personal/operator state).
-#
-# Storage layout (2026-05-08):
-# * ISO is now sourced from Synology NFS (Path B) — see
-# win2025-iso-nfs-pv.yaml. The Longhorn Filesystem PVC
-# `windows-server-2025-iso` below is RETAINED but UNUSED so the prior
-# CDI upload state is preserved as a fallback (and so ArgoCD doesn't
-# prune it on this commit). It can be deleted in a follow-up commit
-# after the NFS path is proven on a successful Windows install.
-#
-# Status (2026-05-08): LIVE — Phase 1 prereqs satisfied:
-# * Multus CNI v4.2.2 thick-plugin DaemonSet running on all 3 RKE2 nodes
-# (apps/multus/multus.yaml; ApplicationSet `infra-multus` Synced/Healthy)
-# * CDI v1.65.0 operator + CR Deployed (apps/cdi/; ApplicationSet
-# `infra-cdi` Synced/Healthy; uploadproxy reachable via kubectl port-forward)
-# * Windows Server 2025 ISO uploaded via CDI virtctl image-upload to
-# PVC windows-server-2025-iso (7.7 GiB → 10Gi PVC, Bound, Upload Complete)
-# * Local Administrator password generated, stored in 1Password vault
-# IAmWorkin (qaphopopkryhbg353ukzhhuqoq) item id h3ix4mgfk65gmkcmvh6ly3d3hu
-# * NetworkAttachmentDefinition prod-vlan57 registered (apps/kubevirt-vms/
-# prod-vlan57-nad.yaml). VM still uses pod-network masquerade until Phase 1.5
-# host bridge work lands (Puppet br-prod + enp86s0.57); switching is a
-# one-line YAML edit + git push.
-#
-# See docs/infrastructure/windows-server-build-runner-plan.md "Phase 1 readiness gate".
-#
-# Network choice in this draft: **pod-network fallback** (Calico default).
-# Outbound-only is fine for the Updater Sandbox E2E runner workload (the runner
-# polls GitHub Actions over HTTPS; no inbound listener needed). Switch to a
-# Multus PROD VLAN NetworkAttachmentDefinition once Multus is installed and the
-# operator wants L2 access from `ci1` to other PROD VLAN services.
-#
-# Sizing: 8 vCPU / 16 GB RAM / 200 GB disk on Longhorn (default storageClass).
-# Capacity check 2026-05-08: each RKE2 node has 16 vCPU / ~64Gi allocatable;
-# 8 vCPU is ~17% of one node's allocatable, fits comfortably.
-#
-# Apply (after operator approval + ISO loaded):
-# kubectl --kubeconfig $env:USERPROFILE\.kube\rke2.yaml apply -f apps/kubevirt-vms/ci1.yaml
-#
-# Connect to console for Windows install:
-# virtctl --kubeconfig $env:USERPROFILE\.kube\rke2.yaml vnc ci1 -n kubevirt-vms
-# (Or via Guacamole once a connection profile is added.)
+# Boots from the sysprepped containerDisk template built by the Windows VM
+# sysprep pipeline (docs/infrastructure/windows-vm-sysprep-pipeline.md).
+# The earlier ISO-install attempts (Paths A/B/C) survive only in git history.
# =============================================================================
apiVersion: v1
@@ -57,248 +15,6 @@ metadata:
pod-security.kubernetes.io/enforce: privileged
---
-# ISO PVC — populated via CDI virtctl image-upload (CDI is now installed).
-#
-# **Volume mode (2026-05-08 status):** Filesystem-mode PVC. A migration to
-# `volumeMode: Block` via DataVolume was attempted to address an OVMF SATA
-# CDROM read timeout, but CDI v1.65.0's upload-target pod runs as uid 107
-# with `capabilities.drop: [ALL]` and cannot open the underlying block
-# device (`blockdev: cannot open /dev/cdi-block-volume: Permission denied`).
-# Reverted to Filesystem PVC pending one of:
-# - CDI deployment override granting CAP_SYS_RAWIO to upload pod
-# - Pre-populated PVC via privileged init pod that dd's the ISO directly
-# - Migration to a different storage class that exposes block devices
-# differently (e.g. iSCSI, where Longhorn's CSI mount path may behave
-# differently)
-#
-# Population workflow (this PVC, Filesystem mode):
-# 1. virtctl --kubeconfig $env:USERPROFILE\.kube\rke2.yaml image-upload pvc \
-# windows-server-2025-iso -n kubevirt-vms \
-# --image-path "$env:USERPROFILE\Downloads\en-us_windows_server_2025_updated_march_2026_x64_dvd_8e06425a.iso" \
-# --size 10Gi --storage-class longhorn --access-mode ReadWriteOnce \
-# --uploadproxy-url https://localhost:8443 --insecure
-# (--uploadproxy-url uses port-forward in practice: `kubectl port-forward
-# -n cdi service/cdi-uploadproxy 8443:443 &` first.)
-#
-# **Open boot issue:** even with the ISO at bootOrder:1, OVMF console showed:
-# BdsDxe: starting Boot0001 "UEFI QEMU DVD-ROM QM00001 " from ... Sata(...)
-# BdsDxe: failed to start Boot0001 ... Time out
-# Diagnosis confirmed PVC content IS a valid bootable ISO9660 image — the
-# timeout is in OVMF reading from the SATA-CDROM-backed-by-filesystem-PVC.
-# Block mode would likely fix it; see CDI permission issue above.
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
- name: windows-server-2025-iso
- namespace: kubevirt-vms
- labels:
- app: ci-runner
- flowercore.io/managed-by: bluejay-infra
-spec:
- accessModes:
- - ReadWriteOnce # Bump to ReadOnlyMany after population for multi-VM use
- resources:
- requests:
- storage: 10Gi # Server 2025 ISO is 7.7GB; 10Gi for headroom
- storageClassName: longhorn
-
----
-# Root disk PVC — empty 200Gi volume that Windows installs into.
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
- name: ci1-rootdisk
- namespace: kubevirt-vms
-spec:
- accessModes:
- - ReadWriteOnce
- resources:
- requests:
- storage: 200Gi
- storageClassName: longhorn
-
----
-# Sysprep ConfigMap — autounattend.xml for hands-off Windows install.
-# Sets local Administrator password (REPLACE the placeholder), enables RDP,
-# enables WinRM, sets hostname, and configures static-ish networking via DHCP.
-# The ISO + VirtIO drivers handle the rest.
-apiVersion: v1
-kind: ConfigMap
-metadata:
- name: ci1-autounattend
- namespace: kubevirt-vms
-data:
- autounattend.xml: |
-
-
-
-
-
-
-
- en-US
-
- en-US
- en-US
- en-US
- en-US
-
-
-
-
-
- E:\amd64\2k25
-
-
-
-
-
-
-
- 0
- true
-
-
- 1
- 260
- EFI
-
-
- 2
- 128
- MSR
-
-
- 3
- true
- Primary
-
-
-
-
- 1
- 1
- FAT32
-
-
-
- 2
- 2
-
-
- 3
- 3
- NTFS
-
-
-
-
-
-
-
-
-
- 0
- 3
-
-
-
-
- /IMAGE/INDEX
- 2
-
-
-
-
-
-
- true
- FlowerCore CI Runner
- FlowerCore
-
-
-
-
-
-
-
-
- CI1
- Central Standard Time
-
-
-
- false
-
-
-
-
-
-
-
- true
- true
- true
- true
- true
- 3
-
-
-
-
- bAA3AGsANABOAHcAcgBMAG4AeQBTAHUAYgBBAHQAaQBzAFUAcAB6AEMAWQAhADkAYQBCAEEAZABtAGkAbgBpAHMAdAByAGEAdABvAHIAUABhAHMAcwB3AG8AcgBkAA==
- false
-
-
-
-
- 1
- powershell.exe -ExecutionPolicy Bypass -Command "Set-NetFirewallRule -DisplayGroup 'Remote Desktop' -Enabled True"
- Enable RDP firewall rule
-
-
- 2
- powershell.exe -ExecutionPolicy Bypass -Command "Enable-PSRemoting -Force; Set-Item WSMan:\localhost\Service\Auth\Basic $true; Set-Item WSMan:\localhost\Service\AllowUnencrypted $true"
- Enable WinRM (Phase 2 will pivot to HTTPS via step-ca cert)
-
-
- 3
- cmd.exe /c reg add "HKLM\SOFTWARE\Microsoft\Windows\CurrentVersion\Policies\System" /v EnableLUA /t REG_DWORD /d 0 /f
- Disable UAC (Phase 2 Puppet will re-evaluate)
-
-
-
-
-
-
----
-# VirtualMachine — Windows Server 2025 CI runner.
apiVersion: kubevirt.io/v1
kind: VirtualMachine
metadata:
@@ -309,33 +25,7 @@ metadata:
role: github-actions-runner
flowercore.io/managed-by: bluejay-infra
spec:
- # `running: true` is deprecated in favor of `runStrategy`. They are mutually
- # exclusive — KubeVirt's validating webhook rejects any VM that sets both:
- # admission webhook "virtualmachine-validator.kubevirt.io" denied the request:
- # Running and RunStrategy are mutually exclusive.
- # `Always` keeps a VMI running and restarts it if it crashes/exits — same
- # semantics as the old `running: true`.
- #
- # **2026-05-08 status: VM cannot start due to a stale QEMU flock on the
- # rootdisk PVC** (qemu reports `Failed to get "write" lock` on
- # `/var/run/kubevirt-private/vmi-disks/rootdisk/disk.img`). The flock was
- # left by a previous QEMU process during a force-deleted launcher pod
- # cycle. Recovery requires either (a) a Longhorn engine restart on
- # rke2-agent2, (b) a Longhorn volume detach via the longhorn-manager API
- # (kubectl patch on `volume.longhorn.io/` does not work — the
- # spec.nodeID is reconciled back), or (c) a node reboot of rke2-agent2.
- #
- # **Confirmed working:** the bootOrder swap (windows-iso=1, rootdisk=2)
- # and the runStrategy migration (above). The ISO PVC was successfully
- # repopulated via virtctl image-upload pvc on the Filesystem-mode PVC.
- #
- # **Open: SATA CDROM read timeout** — even with bootOrder=1, OVMF reported
- # `BdsDxe: failed to start Boot0001 ... Time out` reading the SATA CDROM
- # backed by the Filesystem-mode PVC. A switch to Block-mode DataVolume
- # was attempted but blocked by a CDI v1.65.0 upload-pod permission issue
- # (capability drop prevents writing to the underlying block device).
- # See header docstring on the ISO PVC.
- runStrategy: Always # LIVE — ISO uploaded 2026-05-08, password in 1P
+ runStrategy: Always
template:
metadata:
labels:
@@ -377,73 +67,16 @@ spec:
firmware:
bootloader:
efi:
- # 2026-05-08: SecureBoot=false during initial install. With SecureBoot
- # enabled, OVMF's BdsDxe times out reading Boot0001 from the SCSI
- # CDROM ("BdsDxe: failed to start Boot0001 ... Time out") before the
- # EFI bootloader signature can verify against the OVMF VARS trust DB.
- # KubeVirt's `/usr/share/OVMF/OVMF_VARS.secboot.fd` template doesn't
- # appear to include the Microsoft KEK/DB by default, so signed
- # Windows EFI bootloaders fail validation. Disabling SecureBoot lets
- # OVMF skip the chain check and boot directly. This is acceptable for
- # a CI runner — TPM 2.0 is still emulated (`tpm: {}` below) so
- # BitLocker / Hyper-V / WSL still work.
- # When the operator wants SecureBoot back, the path is:
- # 1. Custom-build OVMF_VARS.fd with Microsoft KEK/DB enrolled
- # 2. Mount it into the VM via firmware.bootloader.efi.persistent
- # 3. Set secureBoot: true again
- # Tracked separately from the install unblock.
secureBoot: false
devices:
- tpm: {} # Non-persistent vTPM — sufficient for runner; no BitLocker
+ tpm: {}
disks:
- # bootOrder: ISO must be 1 for first-boot install (the rootdisk has no
- # EFI bootloader yet). After Windows installs, it writes its own UEFI
- # Boot#### entries pointing at the rootdisk's EFI partition; UEFI then
- # boots from rootdisk going forward and the ISO at bootOrder:2 acts as
- # a fallback for re-install scenarios.
- #
- # Original (broken) order had rootdisk=1, windows-iso=2 — UEFI tried
- # the empty virtio disk first, got nothing, fell back to the SATA
- # CDROM at Boot0001 with a short timeout, and timed out before the
- # CDROM enumerated. Console showed:
- # BdsDxe: failed to start Boot0001 ... Time out
- # BdsDxe: No bootable option or device was found.
- # Confirmed via debug pod: PVC content IS a real bootable ISO9660
- # (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the
- # only bug was boot priority.
- # 2026-05-08 PM: cdrom bus SCSI + containerDisk delivery. This
- # combination boots qemu cleanly and reaches OVMF, but OVMF
- # BdsDxe still hits "starting Boot0001 ... Time out" on the
- # cdrom — see HANDOFF.md / CODEX-STATUS.md "OPEN — ci1" for the
- # full diagnostic chain. virtio-blk disk swap was attempted as a
- # workaround but introduced a separate QEMU rootdisk flock issue
- # without fixing the underlying OVMF cdrom problem; reverted.
- # Operator decision needed for next architectural step (OVMF
- # custom build with extended timeout, KubeVirt version bump,
- # Hyper-V/VirtualBox-and-export, or BIOS legacy boot). The
- # containerDisk distribution pipeline (build/save/scp/ctr import)
- # is proven and ready to reuse for any of those.
- - name: windows-iso
- bootOrder: 1
- cdrom:
- bus: scsi
- name: rootdisk
- bootOrder: 2
disk:
bus: virtio
- - name: virtio-drivers
- cdrom:
- bus: sata
- - name: sysprep
- cdrom:
- bus: sata
interfaces:
- # Pod-network fallback for Phase 1. To switch to PROD VLAN once Multus
- # + the prod-vlan57 NAD exist, replace this block with:
- # - name: prod-net
- # bridge: {}
- # model: virtio
- # and update the networks: stanza to use multus.networkName: kubevirt-vms/prod-vlan57
+ # Pod-network fallback for CI runner outbound traffic. Switch to
+ # prod-vlan57 once the bridge/NAD lane is ready for L2 access.
- name: default
masquerade: {}
model: virtio
@@ -454,55 +87,7 @@ spec:
pod: {}
volumes:
- name: rootdisk
- persistentVolumeClaim:
- claimName: ci1-rootdisk
- - name: windows-iso
- # 2026-05-08 PM (Path C, CONTAINERDISK): the ISO is now packaged as
- # a KubeVirt containerDisk OCI image baked from
- # `FROM scratch ; ADD --chown=107:107 disk.img /disk/disk.img`.
- # The qemu user (uid 107) reads the ISO directly from a tmpfs view
- # of the OCI layer, bypassing both:
- # - Synology NFS export ACL (Path B failed: uid 107 denied at
- # directory level even with mode 0777, see memory
- # feedback_synology_iso_export_root_only_uid_107_denied)
- # - OVMF cdrom read-window timeout (Path A and Path B's SCSI
- # retry both hit `BdsDxe: failed to start Boot0001 ... Time out`
- # when the cdrom was backed by a PVC the storage controller
- # couldn't satisfy reads from fast enough).
- #
- # Image build (one-time, per ISO version):
- # 1. Copy ISO to disk.img, write Dockerfile
- # 2. podman build --tag localhost/win-server-2025:1.0 . (on noc1)
- # 3. podman save -o win-server-2025-1.0.tar localhost/win-server-2025:1.0
- # 4. SCP tar to all 3 RKE2 nodes (rke2-server, rke2-agent1, rke2-agent2)
- # 5. sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock \
- # -n k8s.io images import /tmp/win-server-2025-1.0.tar
- # Standard FC pattern per `feedback_rke2_localhost_imagepullpolicy`.
- #
- # When a new Windows ISO version ships, bump the tag (1.1, 1.2, ...),
- # rebuild + redistribute, and update the image: line below in a new
- # commit. KubeVirt picks up the new image via a VM restart.
- #
- # The legacy NFS PVC + PV (apps/kubevirt-vms/win2025-iso-nfs-pv.yaml)
- # and CDI Longhorn PVC (`windows-server-2025-iso`) are RETAINED for
- # this commit so the prior states are recoverable. Once the
- # containerDisk path proves on a successful Windows install, both
- # legacy artifacts can be pruned in a follow-up commit.
containerDisk:
- image: localhost/win-server-2025:1.0
+ image: localhost/fc-win-server-2025:v1
imagePullPolicy: Never
- - name: virtio-drivers
- containerDisk:
- # Pinned to v1.8.2 (latest stable as of 2026-05-08).
- # The :latest tag uses Docker manifest v1 schema which containerd
- # 2.1 (RKE2 v1.34.5) refuses to pull with:
- # "media type application/vnd.docker.distribution.manifest.v1+prettyjws
- # is no longer supported since containerd v2.1"
- # v1.8.2 is rebuilt with manifest v2/OCI and works on containerd 2.1.
- # Bump available: https://quay.io/repository/kubevirt/virtio-container-disk?tab=tags
- image: quay.io/kubevirt/virtio-container-disk:v1.8.2
- - name: sysprep
- sysprep:
- configMap:
- name: ci1-autounattend
terminationGracePeriodSeconds: 3600
diff --git a/apps/kubevirt-vms/kustomization.yaml b/apps/kubevirt-vms/kustomization.yaml
new file mode 100644
index 0000000..e599c50
--- /dev/null
+++ b/apps/kubevirt-vms/kustomization.yaml
@@ -0,0 +1,5 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - ci1.yaml
+  - prod-vlan57-nad.yaml