Compare commits
1 Commit
claude/ci1
...
codex/sign
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5cf665e77d |
@@ -396,15 +396,10 @@ spec:
|
||||
# Confirmed via debug pod: PVC content IS a real bootable ISO9660
|
||||
# (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the
|
||||
# only bug was boot priority.
|
||||
# 2026-05-08 PM: cdrom bus flipped sata→scsi for windows-iso to address
|
||||
# the OVMF SATA-CDROM read timeout (`BdsDxe: failed to start Boot0001 ...
|
||||
# Time out`). The SCSI CDROM uses virtio-scsi controller which has a
|
||||
# longer read window and works cleanly on Filesystem-backed PVCs.
|
||||
# See diagnostic chain in HANDOFF.md / CODEX-STATUS.md "OPEN — ci1".
|
||||
- name: windows-iso
|
||||
bootOrder: 1
|
||||
cdrom:
|
||||
bus: scsi
|
||||
bus: sata
|
||||
- name: rootdisk
|
||||
bootOrder: 2
|
||||
disk:
|
||||
@@ -435,25 +430,17 @@ spec:
|
||||
persistentVolumeClaim:
|
||||
claimName: ci1-rootdisk
|
||||
- name: windows-iso
|
||||
# 2026-05-08 PM: REVERTED from NFS Path B back to the original CDI
|
||||
# Longhorn Filesystem PVC. NFS Path B (commit fc2aca0) failed at the
|
||||
# storage layer because the Synology export `/volume1/ISOs` denies
|
||||
# non-root client UIDs at the directory level (qemu uid 107 cannot
|
||||
# `ls /iso/` even with file mode 0777). Confirmed via uid-107 +
|
||||
# uid-0 busybox probe pods on rke2-agent2 — same export-only-root
|
||||
# pattern as `/volume1/kubernetes` documented in
|
||||
# `feedback_synology_nfs_kubernetes_export_root_only`. Memory:
|
||||
# `feedback_synology_iso_export_root_only_uid_107_denied.md`.
|
||||
#
|
||||
# The Longhorn PVC `windows-server-2025-iso` (CDI Filesystem mode,
|
||||
# 10Gi) was confirmed to contain valid ISO bytes that uid 107 CAN
|
||||
# read (mode 0660 root:107). The OVMF SATA-CDROM read timeout from
|
||||
# the original Path A is now addressed by the `bus: scsi` swap on
|
||||
# the disks block above. The NFS PVC + PV are RETAINED on disk so
|
||||
# the Path B state is recoverable; they can be pruned in a
|
||||
# follow-up commit once SCSI boot is proven.
|
||||
# Path B (2026-05-08): mount ISO from Synology NFS instead of
|
||||
# Longhorn Filesystem PVC. The Filesystem-PVC path was confirmed to
|
||||
# contain a valid bootable ISO9660 image but caused OVMF's
|
||||
# SATA-CDROM read window to time out:
|
||||
# BdsDxe: failed to start Boot0001 ... Time out
|
||||
# Block-mode DataVolume was attempted as Path A but blocked by CDI
|
||||
# v1.65.0's upload pod capability drop. NFS-mounted ISO bypasses
|
||||
# both issues. See win2025-iso-nfs-pv.yaml header for full rationale
|
||||
# and Synology layout.
|
||||
persistentVolumeClaim:
|
||||
claimName: windows-server-2025-iso
|
||||
claimName: windows-server-2025-iso-nfs
|
||||
- name: virtio-drivers
|
||||
containerDisk:
|
||||
# Pinned to v1.8.2 (latest stable as of 2026-05-08).
|
||||
|
||||
@@ -3362,6 +3362,92 @@ data:
|
||||
relativeTimeRange: {from: 120, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
||||
- orgId: 1
|
||||
name: Signage Marquee
|
||||
folder: AI Stack Alerts
|
||||
interval: 1m
|
||||
rules:
|
||||
- uid: marquee-dropped-frames-high
|
||||
title: MarqueeDroppedFramesHigh
|
||||
condition: C
|
||||
for: 5m
|
||||
noDataState: OK
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: Marquee dropped-frame rate above 5%
|
||||
description: "Dropped frames exceeded the IR-21 budget for a renderer/phase/node tuple. Grafana owns alert delivery to IRC #alerts; Prometheus rules remain only the visibility source."
|
||||
runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Filter renderer/node/phase 3. Compare latest AAT baseline diff 4. Restart only the affected player if the issue is node-local"
|
||||
labels:
|
||||
severity: warning
|
||||
service: signage
|
||||
alert_channel: irc
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: '(sum by (renderer, node_id, phase) (rate(marquee_dropped_frames_total[5m])) / sum by (renderer, node_id, phase) (rate(marquee_render_latency_ms_count[5m]))) * 100', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [5], type: gt}}], refId: C}
|
||||
- uid: marquee-render-latency-p99-high
|
||||
title: MarqueeRenderLatencyP99High
|
||||
condition: C
|
||||
for: 5m
|
||||
noDataState: OK
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: Marquee render latency p99 above 16ms
|
||||
description: "Renderer p99 latency exceeded the Pi-class 16ms budget. Grafana delivers this alert to IRC #alerts."
|
||||
runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Check render latency p99 by renderer/node/phase 3. Compare with dropped frames and node CPU 4. If isolated to WPF, capture current Player.Wpf frame set before restart"
|
||||
labels:
|
||||
severity: warning
|
||||
service: signage
|
||||
alert_channel: irc
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: 'histogram_quantile(0.99, sum by (renderer, node_id, phase, le) (rate(marquee_render_latency_ms_bucket[5m])))', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [16], type: gt}}], refId: C}
|
||||
- uid: marquee-animation-duration-drift
|
||||
title: MarqueeAnimationDurationDrift
|
||||
condition: C
|
||||
for: 10m
|
||||
noDataState: OK
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: Marquee animation duration drift above 10%
|
||||
description: "Observed cycle duration has drifted more than 10% from target for a renderer/phase pair. Grafana delivers this alert to IRC #alerts."
|
||||
runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Compare observed vs target duration 3. Check recent theme/preset changes 4. Re-run MarqueeHolidayBrandTrajectoryTests before promoting a baseline"
|
||||
labels:
|
||||
severity: warning
|
||||
service: signage
|
||||
alert_channel: irc
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 900, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: 'abs((histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m]))) - avg by (renderer, phase) (marquee_animation_duration_target_ms)) / avg by (renderer, phase) (marquee_animation_duration_target_ms))', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 900, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 900, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.1], type: gt}}], refId: C}
|
||||
- orgId: 1
|
||||
name: Infrastructure
|
||||
folder: AI Stack Alerts
|
||||
|
||||
Reference in New Issue
Block a user