Compare commits
6 Commits
codex/sign
...
claude/ci1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
667777a653 | ||
|
|
84c9feb893 | ||
|
|
427dbfcef2 | ||
|
|
b651a4e2d0 | ||
|
|
b998f50f48 | ||
|
|
8fd9ae1cd3 |
@@ -58,7 +58,7 @@ spec:
|
|||||||
nodeName: rke2-server
|
nodeName: rke2-server
|
||||||
containers:
|
containers:
|
||||||
- name: web
|
- name: web
|
||||||
image: localhost/fc-updater-web:v20260508-pub3-deepening-2bdf108
|
image: localhost/fc-updater-web:v20260509-4162dca-authgate
|
||||||
imagePullPolicy: Never
|
imagePullPolicy: Never
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 8080
|
- containerPort: 8080
|
||||||
|
|||||||
@@ -377,7 +377,22 @@ spec:
|
|||||||
firmware:
|
firmware:
|
||||||
bootloader:
|
bootloader:
|
||||||
efi:
|
efi:
|
||||||
secureBoot: true
|
# 2026-05-08: SecureBoot=false during initial install. With SecureBoot
|
||||||
|
# enabled, OVMF's BdsDxe times out reading Boot0001 from the SCSI
|
||||||
|
# CDROM ("BdsDxe: failed to start Boot0001 ... Time out") before the
|
||||||
|
# EFI bootloader signature can verify against the OVMF VARS trust DB.
|
||||||
|
# KubeVirt's `/usr/share/OVMF/OVMF_VARS.secboot.fd` template doesn't
|
||||||
|
# appear to include the Microsoft KEK/DB by default, so signed
|
||||||
|
# Windows EFI bootloaders fail validation. Disabling SecureBoot lets
|
||||||
|
# OVMF skip the chain check and boot directly. This is acceptable for
|
||||||
|
# a CI runner — TPM 2.0 is still emulated (`tpm: {}` below) so
|
||||||
|
# Hyper-V / WSL still work (no BitLocker: the vTPM is non-persistent).
|
||||||
|
# When the operator wants SecureBoot back, the path is:
|
||||||
|
# 1. Custom-build OVMF_VARS.fd with Microsoft KEK/DB enrolled
|
||||||
|
# 2. Mount it into the VM via firmware.bootloader.efi.persistent
|
||||||
|
# 3. Set secureBoot: true again
|
||||||
|
# Tracked separately from the install unblock.
|
||||||
|
secureBoot: false
|
||||||
devices:
|
devices:
|
||||||
tpm: {} # Non-persistent vTPM — sufficient for runner; no BitLocker
|
tpm: {} # Non-persistent vTPM — sufficient for runner; no BitLocker
|
||||||
disks:
|
disks:
|
||||||
@@ -396,10 +411,22 @@ spec:
|
|||||||
# Confirmed via debug pod: PVC content IS a real bootable ISO9660
|
# Confirmed via debug pod: PVC content IS a real bootable ISO9660
|
||||||
# (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the
|
# (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the
|
||||||
# only bug was boot priority.
|
# only bug was boot priority.
|
||||||
|
# 2026-05-08 PM: cdrom bus SCSI + containerDisk delivery. This
|
||||||
|
# combination boots qemu cleanly and reaches OVMF, but OVMF
|
||||||
|
# BdsDxe still hits "starting Boot0001 ... Time out" on the
|
||||||
|
# cdrom — see HANDOFF.md / CODEX-STATUS.md "OPEN — ci1" for the
|
||||||
|
# full diagnostic chain. virtio-blk disk swap was attempted as a
|
||||||
|
# workaround but introduced a separate QEMU rootdisk flock issue
|
||||||
|
# without fixing the underlying OVMF cdrom problem; reverted.
|
||||||
|
# Operator decision needed for next architectural step (OVMF
|
||||||
|
# custom build with extended timeout, KubeVirt version bump,
|
||||||
|
# Hyper-V/VirtualBox-and-export, or BIOS legacy boot). The
|
||||||
|
# containerDisk distribution pipeline (build/save/scp/ctr import)
|
||||||
|
# is proven and ready to reuse for any of those.
|
||||||
- name: windows-iso
|
- name: windows-iso
|
||||||
bootOrder: 1
|
bootOrder: 1
|
||||||
cdrom:
|
cdrom:
|
||||||
bus: sata
|
bus: scsi
|
||||||
- name: rootdisk
|
- name: rootdisk
|
||||||
bootOrder: 2
|
bootOrder: 2
|
||||||
disk:
|
disk:
|
||||||
@@ -430,17 +457,40 @@ spec:
|
|||||||
persistentVolumeClaim:
|
persistentVolumeClaim:
|
||||||
claimName: ci1-rootdisk
|
claimName: ci1-rootdisk
|
||||||
- name: windows-iso
|
- name: windows-iso
|
||||||
# Path B (2026-05-08): mount ISO from Synology NFS instead of
|
# 2026-05-08 PM (Path C, CONTAINERDISK): the ISO is now packaged as
|
||||||
# Longhorn Filesystem PVC. The Filesystem-PVC path was confirmed to
|
# a KubeVirt containerDisk OCI image baked from
|
||||||
# contain a valid bootable ISO9660 image but caused OVMF's
|
# `FROM scratch ; ADD --chown=107:107 disk.img /disk/disk.img`.
|
||||||
# SATA-CDROM read window to time out:
|
# The qemu user (uid 107) reads the ISO directly from a tmpfs view
|
||||||
# BdsDxe: failed to start Boot0001 ... Time out
|
# of the OCI layer, bypassing both:
|
||||||
# Block-mode DataVolume was attempted as Path A but blocked by CDI
|
# - Synology NFS export ACL (Path B failed: uid 107 denied at
|
||||||
# v1.65.0's upload pod capability drop. NFS-mounted ISO bypasses
|
# directory level even with mode 0777, see memory
|
||||||
# both issues. See win2025-iso-nfs-pv.yaml header for full rationale
|
# feedback_synology_iso_export_root_only_uid_107_denied)
|
||||||
# and Synology layout.
|
# - OVMF cdrom read-window timeout (Path A and Path B's SCSI
|
||||||
persistentVolumeClaim:
|
# retry both hit `BdsDxe: failed to start Boot0001 ... Time out`
|
||||||
claimName: windows-server-2025-iso-nfs
|
# when the cdrom was backed by a PVC that the storage controller
|
||||||
|
# couldn't read from fast enough).
|
||||||
|
#
|
||||||
|
# Image build (one-time, per ISO version):
|
||||||
|
# 1. Copy ISO to disk.img, write Dockerfile
|
||||||
|
# 2. podman build --tag localhost/win-server-2025:1.0 . (on noc1)
|
||||||
|
# 3. podman save -o win-server-2025-1.0.tar localhost/win-server-2025:1.0
|
||||||
|
# 4. SCP tar to all 3 RKE2 nodes (rke2-server, rke2-agent1, rke2-agent2)
|
||||||
|
# 5. sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock \
|
||||||
|
# -n k8s.io images import /tmp/win-server-2025-1.0.tar
|
||||||
|
# Standard FC pattern per `feedback_rke2_localhost_imagepullpolicy`.
|
||||||
|
#
|
||||||
|
# When a new Windows ISO version ships, bump the tag (1.1, 1.2, ...),
|
||||||
|
# rebuild + redistribute, and update the image: line below in a new
|
||||||
|
# commit. KubeVirt picks up the new image via a VM restart.
|
||||||
|
#
|
||||||
|
# The legacy NFS PVC + PV (apps/kubevirt-vms/win2025-iso-nfs-pv.yaml)
|
||||||
|
# and CDI Longhorn PVC (`windows-server-2025-iso`) are RETAINED for
|
||||||
|
# this commit so the prior states are recoverable. Once the
|
||||||
|
# containerDisk path proves on a successful Windows install, both
|
||||||
|
# legacy artifacts can be pruned in a follow-up commit.
|
||||||
|
containerDisk:
|
||||||
|
image: localhost/win-server-2025:1.0
|
||||||
|
imagePullPolicy: Never
|
||||||
- name: virtio-drivers
|
- name: virtio-drivers
|
||||||
containerDisk:
|
containerDisk:
|
||||||
# Pinned to v1.8.2 (latest stable as of 2026-05-08).
|
# Pinned to v1.8.2 (latest stable as of 2026-05-08).
|
||||||
|
|||||||
@@ -3362,92 +3362,6 @@ data:
|
|||||||
relativeTimeRange: {from: 120, to: 0}
|
relativeTimeRange: {from: 120, to: 0}
|
||||||
datasourceUid: __expr__
|
datasourceUid: __expr__
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
||||||
- orgId: 1
|
|
||||||
name: Signage Marquee
|
|
||||||
folder: AI Stack Alerts
|
|
||||||
interval: 1m
|
|
||||||
rules:
|
|
||||||
- uid: marquee-dropped-frames-high
|
|
||||||
title: MarqueeDroppedFramesHigh
|
|
||||||
condition: C
|
|
||||||
for: 5m
|
|
||||||
noDataState: OK
|
|
||||||
execErrState: OK
|
|
||||||
annotations:
|
|
||||||
summary: Marquee dropped-frame rate above 5%
|
|
||||||
description: "Dropped frames exceeded the IR-21 budget for a renderer/phase/node tuple. Grafana owns alert delivery to IRC #alerts; Prometheus rules remain only the visibility source."
|
|
||||||
runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Filter renderer/node/phase 3. Compare latest AAT baseline diff 4. Restart only the affected player if the issue is node-local"
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
service: signage
|
|
||||||
alert_channel: irc
|
|
||||||
data:
|
|
||||||
- refId: A
|
|
||||||
relativeTimeRange: {from: 300, to: 0}
|
|
||||||
datasourceUid: prometheus
|
|
||||||
model: {expr: '(sum by (renderer, node_id, phase) (rate(marquee_dropped_frames_total[5m])) / sum by (renderer, node_id, phase) (rate(marquee_render_latency_ms_count[5m]))) * 100', instant: true, refId: A}
|
|
||||||
- refId: B
|
|
||||||
relativeTimeRange: {from: 300, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
|
||||||
- refId: C
|
|
||||||
relativeTimeRange: {from: 300, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [5], type: gt}}], refId: C}
|
|
||||||
- uid: marquee-render-latency-p99-high
|
|
||||||
title: MarqueeRenderLatencyP99High
|
|
||||||
condition: C
|
|
||||||
for: 5m
|
|
||||||
noDataState: OK
|
|
||||||
execErrState: OK
|
|
||||||
annotations:
|
|
||||||
summary: Marquee render latency p99 above 16ms
|
|
||||||
description: "Renderer p99 latency exceeded the Pi-class 16ms budget. Grafana delivers this alert to IRC #alerts."
|
|
||||||
runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Check render latency p99 by renderer/node/phase 3. Compare with dropped frames and node CPU 4. If isolated to WPF, capture current Player.Wpf frame set before restart"
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
service: signage
|
|
||||||
alert_channel: irc
|
|
||||||
data:
|
|
||||||
- refId: A
|
|
||||||
relativeTimeRange: {from: 300, to: 0}
|
|
||||||
datasourceUid: prometheus
|
|
||||||
model: {expr: 'histogram_quantile(0.99, sum by (renderer, node_id, phase, le) (rate(marquee_render_latency_ms_bucket[5m])))', instant: true, refId: A}
|
|
||||||
- refId: B
|
|
||||||
relativeTimeRange: {from: 300, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
|
||||||
- refId: C
|
|
||||||
relativeTimeRange: {from: 300, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [16], type: gt}}], refId: C}
|
|
||||||
- uid: marquee-animation-duration-drift
|
|
||||||
title: MarqueeAnimationDurationDrift
|
|
||||||
condition: C
|
|
||||||
for: 10m
|
|
||||||
noDataState: OK
|
|
||||||
execErrState: OK
|
|
||||||
annotations:
|
|
||||||
summary: Marquee animation duration drift above 10%
|
|
||||||
description: "Observed cycle duration has drifted more than 10% from target for a renderer/phase pair. Grafana delivers this alert to IRC #alerts."
|
|
||||||
runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Compare observed vs target duration 3. Check recent theme/preset changes 4. Re-run MarqueeHolidayBrandTrajectoryTests before promoting a baseline"
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
service: signage
|
|
||||||
alert_channel: irc
|
|
||||||
data:
|
|
||||||
- refId: A
|
|
||||||
relativeTimeRange: {from: 900, to: 0}
|
|
||||||
datasourceUid: prometheus
|
|
||||||
model: {expr: 'abs((histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m]))) - avg by (renderer, phase) (marquee_animation_duration_target_ms)) / avg by (renderer, phase) (marquee_animation_duration_target_ms))', instant: true, refId: A}
|
|
||||||
- refId: B
|
|
||||||
relativeTimeRange: {from: 900, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
|
||||||
- refId: C
|
|
||||||
relativeTimeRange: {from: 900, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.1], type: gt}}], refId: C}
|
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
name: Infrastructure
|
name: Infrastructure
|
||||||
folder: AI Stack Alerts
|
folder: AI Stack Alerts
|
||||||
|
|||||||
Reference in New Issue
Block a user