diff --git a/apps/kubevirt-vms/ci1.yaml b/apps/kubevirt-vms/ci1.yaml index 3f9e459..5708ab5 100644 --- a/apps/kubevirt-vms/ci1.yaml +++ b/apps/kubevirt-vms/ci1.yaml @@ -1,51 +1,9 @@ # ============================================================================= -# ci1 — Windows Server 2025 KubeVirt VM (GitHub Actions Self-Hosted Runner) +# ci1 - Windows Server 2025 KubeVirt VM (GitHub Actions Self-Hosted Runner) # ============================================================================= -# Purpose: dedicated CI runner for FlowerCore.Updater Sandbox E2E nightly + -# future fleet WPF AAT lanes. Replaces the never-registered -# `bluejay-ws-sandbox-1` runner placeholder. Andrew explicitly does NOT want -# BLUEJAY-WS registered as a runner (workstation has personal/operator state). -# -# Storage layout (2026-05-08): -# * ISO is now sourced from Synology NFS (Path B) — see -# win2025-iso-nfs-pv.yaml. The Longhorn Filesystem PVC -# `windows-server-2025-iso` below is RETAINED but UNUSED so the prior -# CDI upload state is preserved as a fallback (and so ArgoCD doesn't -# prune it on this commit). It can be deleted in a follow-up commit -# after the NFS path is proven on a successful Windows install. -# -# Status (2026-05-08): LIVE — Phase 1 prereqs satisfied: -# * Multus CNI v4.2.2 thick-plugin DaemonSet running on all 3 RKE2 nodes -# (apps/multus/multus.yaml; ApplicationSet `infra-multus` Synced/Healthy) -# * CDI v1.65.0 operator + CR Deployed (apps/cdi/; ApplicationSet -# `infra-cdi` Synced/Healthy; uploadproxy reachable via kubectl port-forward) -# * Windows Server 2025 ISO uploaded via CDI virtctl image-upload to -# PVC windows-server-2025-iso (7.7 GiB → 10Gi PVC, Bound, Upload Complete) -# * Local Administrator password generated, stored in 1Password vault -# IAmWorkin (qaphopopkryhbg353ukzhhuqoq) item id h3ix4mgfk65gmkcmvh6ly3d3hu -# * NetworkAttachmentDefinition prod-vlan57 registered (apps/kubevirt-vms/ -# prod-vlan57-nad.yaml). VM still uses pod-network masquerade until Phase 1.5 -# host bridge work lands (Puppet br-prod + enp86s0.57); switching is a -# one-line YAML edit + git push. -# -# See docs/infrastructure/windows-server-build-runner-plan.md "Phase 1 readiness gate". -# -# Network choice in this draft: **pod-network fallback** (Calico default). -# Outbound-only is fine for the Updater Sandbox E2E runner workload (the runner -# polls GitHub Actions over HTTPS; no inbound listener needed). Switch to a -# Multus PROD VLAN NetworkAttachmentDefinition once Multus is installed and the -# operator wants L2 access from `ci1` to other PROD VLAN services. -# -# Sizing: 8 vCPU / 16 GB RAM / 200 GB disk on Longhorn (default storageClass). -# Capacity check 2026-05-08: each RKE2 node has 16 vCPU / ~64Gi allocatable; -# 8 vCPU is ~17% of one node's allocatable, fits comfortably. -# -# Apply (after operator approval + ISO loaded): -# kubectl --kubeconfig $env:USERPROFILE\.kube\rke2.yaml apply -f apps/kubevirt-vms/ci1.yaml -# -# Connect to console for Windows install: -# virtctl --kubeconfig $env:USERPROFILE\.kube\rke2.yaml vnc ci1 -n kubevirt-vms -# (Or via Guacamole once a connection profile is added.) +# Boots from the sysprepped containerDisk template built by the Windows VM +# sysprep pipeline. See docs/infrastructure/windows-vm-sysprep-pipeline.md. +# Path A/B/C install history is preserved in git log only. # ============================================================================= apiVersion: v1 @@ -57,248 +15,6 @@ metadata: pod-security.kubernetes.io/enforce: privileged --- -# ISO PVC — populated via CDI virtctl image-upload (CDI is now installed). -# -# **Volume mode (2026-05-08 status):** Filesystem-mode PVC. A migration to -# `volumeMode: Block` via DataVolume was attempted to address an OVMF SATA -# CDROM read timeout, but CDI v1.65.0's upload-target pod runs as uid 107 -# with `capabilities.drop: [ALL]` and cannot open the underlying block -# device (`blockdev: cannot open /dev/cdi-block-volume: Permission denied`). -# Reverted to Filesystem PVC pending one of: -# - CDI deployment override granting CAP_SYS_RAWIO to upload pod -# - Pre-populated PVC via privileged init pod that dd's the ISO directly -# - Migration to a different storage class that exposes block devices -# differently (e.g. iSCSI, where Longhorn's CSI mount path may behave -# differently) -# -# Population workflow (this PVC, Filesystem mode): -# 1. virtctl --kubeconfig $env:USERPROFILE\.kube\rke2.yaml image-upload pvc \ -# windows-server-2025-iso -n kubevirt-vms \ -# --image-path "$env:USERPROFILE\Downloads\en-us_windows_server_2025_updated_march_2026_x64_dvd_8e06425a.iso" \ -# --size 10Gi --storage-class longhorn --access-mode ReadWriteOnce \ -# --uploadproxy-url https://localhost:8443 --insecure -# (--uploadproxy-url uses port-forward in practice: `kubectl port-forward -# -n cdi service/cdi-uploadproxy 8443:443 &` first.) -# -# **Open boot issue:** even with the ISO at bootOrder:1, OVMF console showed: -# BdsDxe: starting Boot0001 "UEFI QEMU DVD-ROM QM00001 " from ... Sata(...) -# BdsDxe: failed to start Boot0001 ... Time out -# Diagnosis confirmed PVC content IS a valid bootable ISO9660 image — the -# timeout is in OVMF reading from the SATA-CDROM-backed-by-filesystem-PVC. -# Block mode would likely fix it; see CDI permission issue above. -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: windows-server-2025-iso - namespace: kubevirt-vms - labels: - app: ci-runner - flowercore.io/managed-by: bluejay-infra -spec: - accessModes: - - ReadWriteOnce # Bump to ReadOnlyMany after population for multi-VM use - resources: - requests: - storage: 10Gi # Server 2025 ISO is 7.7GB; 10Gi for headroom - storageClassName: longhorn - ---- -# Root disk PVC — empty 200Gi volume that Windows installs into. -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: ci1-rootdisk - namespace: kubevirt-vms -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 200Gi - storageClassName: longhorn - ---- -# Sysprep ConfigMap — autounattend.xml for hands-off Windows install. -# Sets local Administrator password (REPLACE the placeholder), enables RDP, -# enables WinRM, sets hostname, and configures static-ish networking via DHCP. -# The ISO + VirtIO drivers handle the rest. -apiVersion: v1 -kind: ConfigMap -metadata: - name: ci1-autounattend - namespace: kubevirt-vms -data: - autounattend.xml: | - - - - - - - - en-US - - en-US - en-US - en-US - en-US - - - - - - E:\amd64\2k25 - - - - - - - - 0 - true - - - 1 - 260 - EFI - - - 2 - 128 - MSR - - - 3 - true - Primary - - - - - 1 - 1 - FAT32 - - - - 2 - 2 - - - 3 - 3 - NTFS - - - - - - - - - - 0 - 3 - - - - - /IMAGE/INDEX - 2 - - - - - - - true - FlowerCore CI Runner - FlowerCore - - - - - - - - - CI1 - Central Standard Time - - - - false - - - - - - - - true - true - true - true - true - 3 - - - - - bAA3AGsANABOAHcAcgBMAG4AeQBTAHUAYgBBAHQAaQBzAFUAcAB6AEMAWQAhADkAYQBCAEEAZABtAGkAbgBpAHMAdAByAGEAdABvAHIAUABhAHMAcwB3AG8AcgBkAA== - false</PlainText> - </AdministratorPassword> - </UserAccounts> - <FirstLogonCommands> - <SynchronousCommand wcm:action="add"> - <Order>1</Order> - <CommandLine>powershell.exe -ExecutionPolicy Bypass -Command "Set-NetFirewallRule -DisplayGroup 'Remote Desktop' -Enabled True"</CommandLine> - <Description>Enable RDP firewall rule</Description> - </SynchronousCommand> - <SynchronousCommand wcm:action="add"> - <Order>2</Order> - <CommandLine>powershell.exe -ExecutionPolicy Bypass -Command "Enable-PSRemoting -Force; Set-Item WSMan:\localhost\Service\Auth\Basic $true; Set-Item WSMan:\localhost\Service\AllowUnencrypted $true"</CommandLine> - <Description>Enable WinRM (Phase 2 will pivot to HTTPS via step-ca cert)</Description> - </SynchronousCommand> - <SynchronousCommand wcm:action="add"> - <Order>3</Order> - <CommandLine>cmd.exe /c reg add "HKLM\SOFTWARE\Microsoft\Windows\CurrentVersion\Policies\System" /v EnableLUA /t REG_DWORD /d 0 /f</CommandLine> - <Description>Disable UAC (Phase 2 Puppet will re-evaluate)</Description> - </SynchronousCommand> - </FirstLogonCommands> - </component> - </settings> - </unattend> - ---- -# VirtualMachine — Windows Server 2025 CI runner. apiVersion: kubevirt.io/v1 kind: VirtualMachine metadata: @@ -309,33 +25,7 @@ metadata: role: github-actions-runner flowercore.io/managed-by: bluejay-infra spec: - # `running: true` is deprecated in favor of `runStrategy`. They are mutually - # exclusive — KubeVirt's validating webhook rejects any VM that sets both: - # admission webhook "virtualmachine-validator.kubevirt.io" denied the request: - # Running and RunStrategy are mutually exclusive. - # `Always` keeps a VMI running and restarts it if it crashes/exits — same - # semantics as the old `running: true`. - # - # **2026-05-08 status: VM cannot start due to a stale QEMU flock on the - # rootdisk PVC** (qemu reports `Failed to get "write" lock` on - # `/var/run/kubevirt-private/vmi-disks/rootdisk/disk.img`). The flock was - # left by a previous QEMU process during a force-deleted launcher pod - # cycle. Recovery requires either (a) a Longhorn engine restart on - # rke2-agent2, (b) a Longhorn volume detach via the longhorn-manager API - # (kubectl patch on `volume.longhorn.io/<pvc-name>` does not work — the - # spec.nodeID is reconciled back), or (c) a node reboot of rke2-agent2. - # - # **Confirmed working:** the bootOrder swap (windows-iso=1, rootdisk=2) - # and the runStrategy migration (above). The ISO PVC was successfully - # repopulated via virtctl image-upload pvc on the Filesystem-mode PVC. - # - # **Open: SATA CDROM read timeout** — even with bootOrder=1, OVMF reported - # `BdsDxe: failed to start Boot0001 ... Time out` reading the SATA CDROM - # backed by the Filesystem-mode PVC. A switch to Block-mode DataVolume - # was attempted but blocked by a CDI v1.65.0 upload-pod permission issue - # (capability drop prevents writing to the underlying block device). - # See header docstring on the ISO PVC. - runStrategy: Always # LIVE — ISO uploaded 2026-05-08, password in 1P + runStrategy: Always template: metadata: labels: @@ -377,73 +67,16 @@ spec: firmware: bootloader: efi: - # 2026-05-08: SecureBoot=false during initial install. With SecureBoot - # enabled, OVMF's BdsDxe times out reading Boot0001 from the SCSI - # CDROM ("BdsDxe: failed to start Boot0001 ... Time out") before the - # EFI bootloader signature can verify against the OVMF VARS trust DB. - # KubeVirt's `/usr/share/OVMF/OVMF_VARS.secboot.fd` template doesn't - # appear to include the Microsoft KEK/DB by default, so signed - # Windows EFI bootloaders fail validation. Disabling SecureBoot lets - # OVMF skip the chain check and boot directly. This is acceptable for - # a CI runner — TPM 2.0 is still emulated (`tpm: {}` below) so - # BitLocker / Hyper-V / WSL still work. - # When the operator wants SecureBoot back, the path is: - # 1. Custom-build OVMF_VARS.fd with Microsoft KEK/DB enrolled - # 2. Mount it into the VM via firmware.bootloader.efi.persistent - # 3. Set secureBoot: true again - # Tracked separately from the install unblock. secureBoot: false devices: - tpm: {} # Non-persistent vTPM — sufficient for runner; no BitLocker + tpm: {} disks: - # bootOrder: ISO must be 1 for first-boot install (the rootdisk has no - # EFI bootloader yet). After Windows installs, it writes its own UEFI - # Boot#### entries pointing at the rootdisk's EFI partition; UEFI then - # boots from rootdisk going forward and the ISO at bootOrder:2 acts as - # a fallback for re-install scenarios. - # - # Original (broken) order had rootdisk=1, windows-iso=2 — UEFI tried - # the empty virtio disk first, got nothing, fell back to the SATA - # CDROM at Boot0001 with a short timeout, and timed out before the - # CDROM enumerated. Console showed: - # BdsDxe: failed to start Boot0001 ... Time out - # BdsDxe: No bootable option or device was found. - # Confirmed via debug pod: PVC content IS a real bootable ISO9660 - # (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the - # only bug was boot priority. - # 2026-05-08 PM: cdrom bus SCSI + containerDisk delivery. This - # combination boots qemu cleanly and reaches OVMF, but OVMF - # BdsDxe still hits "starting Boot0001 ... Time out" on the - # cdrom — see HANDOFF.md / CODEX-STATUS.md "OPEN — ci1" for the - # full diagnostic chain. virtio-blk disk swap was attempted as a - # workaround but introduced a separate QEMU rootdisk flock issue - # without fixing the underlying OVMF cdrom problem; reverted. - # Operator decision needed for next architectural step (OVMF - # custom build with extended timeout, KubeVirt version bump, - # Hyper-V/VirtualBox-and-export, or BIOS legacy boot). The - # containerDisk distribution pipeline (build/save/scp/ctr import) - # is proven and ready to reuse for any of those. - - name: windows-iso - bootOrder: 1 - cdrom: - bus: scsi - name: rootdisk - bootOrder: 2 disk: bus: virtio - - name: virtio-drivers - cdrom: - bus: sata - - name: sysprep - cdrom: - bus: sata interfaces: - # Pod-network fallback for Phase 1. To switch to PROD VLAN once Multus - # + the prod-vlan57 NAD exist, replace this block with: - # - name: prod-net - # bridge: {} - # model: virtio - # and update the networks: stanza to use multus.networkName: kubevirt-vms/prod-vlan57 + # Pod-network fallback for CI runner outbound traffic. Switch to + # prod-vlan57 once the bridge/NAD lane is ready for L2 access. - name: default masquerade: {} model: virtio @@ -454,55 +87,7 @@ spec: pod: {} volumes: - name: rootdisk - persistentVolumeClaim: - claimName: ci1-rootdisk - - name: windows-iso - # 2026-05-08 PM (Path C, CONTAINERDISK): the ISO is now packaged as - # a KubeVirt containerDisk OCI image baked from - # `FROM scratch ; ADD --chown=107:107 disk.img /disk/disk.img`. - # The qemu user (uid 107) reads the ISO directly from a tmpfs view - # of the OCI layer, bypassing both: - # - Synology NFS export ACL (Path B failed: uid 107 denied at - # directory level even with mode 0777, see memory - # feedback_synology_iso_export_root_only_uid_107_denied) - # - OVMF cdrom read-window timeout (Path A and Path B's SCSI - # retry both hit `BdsDxe: failed to start Boot0001 ... Time out` - # when the cdrom was backed by a PVC the storage controller - # couldn't satisfy reads from fast enough). - # - # Image build (one-time, per ISO version): - # 1. Copy ISO to disk.img, write Dockerfile - # 2. podman build --tag localhost/win-server-2025:1.0 . (on noc1) - # 3. podman save -o win-server-2025-1.0.tar localhost/win-server-2025:1.0 - # 4. SCP tar to all 3 RKE2 nodes (rke2-server, rke2-agent1, rke2-agent2) - # 5. sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock \ - # -n k8s.io images import /tmp/win-server-2025-1.0.tar - # Standard FC pattern per `feedback_rke2_localhost_imagepullpolicy`. - # - # When a new Windows ISO version ships, bump the tag (1.1, 1.2, ...), - # rebuild + redistribute, and update the image: line below in a new - # commit. KubeVirt picks up the new image via a VM restart. - # - # The legacy NFS PVC + PV (apps/kubevirt-vms/win2025-iso-nfs-pv.yaml) - # and CDI Longhorn PVC (`windows-server-2025-iso`) are RETAINED for - # this commit so the prior states are recoverable. Once the - # containerDisk path proves on a successful Windows install, both - # legacy artifacts can be pruned in a follow-up commit. containerDisk: - image: localhost/win-server-2025:1.0 + image: localhost/fc-win-server-2025:v1 imagePullPolicy: Never - - name: virtio-drivers - containerDisk: - # Pinned to v1.8.2 (latest stable as of 2026-05-08). - # The :latest tag uses Docker manifest v1 schema which containerd - # 2.1 (RKE2 v1.34.5) refuses to pull with: - # "media type application/vnd.docker.distribution.manifest.v1+prettyjws - # is no longer supported since containerd v2.1" - # v1.8.2 is rebuilt with manifest v2/OCI and works on containerd 2.1. - # Bump available: https://quay.io/repository/kubevirt/virtio-container-disk?tab=tags - image: quay.io/kubevirt/virtio-container-disk:v1.8.2 - - name: sysprep - sysprep: - configMap: - name: ci1-autounattend terminationGracePeriodSeconds: 3600 diff --git a/apps/kubevirt-vms/kustomization.yaml b/apps/kubevirt-vms/kustomization.yaml new file mode 100644 index 0000000..e599c50 --- /dev/null +++ b/apps/kubevirt-vms/kustomization.yaml @@ -0,0 +1,3 @@ +resources: + - ci1.yaml + - prod-vlan57-nad.yaml