Compare commits
3 Commits
df02b4c3c3
...
2489464d4f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2489464d4f | ||
|
|
4b777b16ac | ||
|
|
8c60e3a4d3 |
@@ -1024,6 +1024,72 @@ data:
|
|||||||
summary: "Longhorn node {{ $labels.node }} not Ready"
|
summary: "Longhorn node {{ $labels.node }} not Ready"
|
||||||
description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."
|
description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# FC Signage Marquee Performance — Track 3 + 8 (2026-05-06)
|
||||||
|
# Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
|
||||||
|
# Source-of-truth for the live Podman Prometheus on noc1 is the
|
||||||
|
# Notes file; this K8s ConfigMap exists so a future migration to
|
||||||
|
# in-cluster Prometheus inherits the ruleset automatically.
|
||||||
|
# See feedback_monitoring_k8s_target_vs_live_podman.
|
||||||
|
# ============================================================
|
||||||
|
- name: fc-signage-marquee
|
||||||
|
rules:
|
||||||
|
- alert: MarqueeDroppedFramesHigh
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
sum by (renderer, phase, node_id) (rate(marquee_dropped_frames_total[5m]))
|
||||||
|
/
|
||||||
|
sum by (renderer, phase, node_id) (rate(marquee_render_latency_ms_count[5m]))
|
||||||
|
) > 0.05
|
||||||
|
unless on()
|
||||||
|
absent_over_time(marquee_dropped_frames_total[7d])
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: signage
|
||||||
|
alert_channel: irc
|
||||||
|
annotations:
|
||||||
|
summary: "Marquee dropped-frame rate >5% on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
|
||||||
|
description: "Renderer {{ $labels.renderer }} on {{ $labels.node_id }} drops >5% of frames during {{ $labels.phase }}. Animation visibly stuttery."
|
||||||
|
|
||||||
|
- alert: MarqueeRenderLatencyP99High
|
||||||
|
expr: |
|
||||||
|
histogram_quantile(
|
||||||
|
0.99,
|
||||||
|
sum by (renderer, phase, node_id, le) (rate(marquee_render_latency_ms_bucket[5m]))
|
||||||
|
) > 16
|
||||||
|
unless on()
|
||||||
|
absent_over_time(marquee_render_latency_ms_bucket[7d])
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: signage
|
||||||
|
alert_channel: irc
|
||||||
|
annotations:
|
||||||
|
summary: "Marquee render latency p99 > 16ms on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
|
||||||
|
description: "Per-frame render latency p99 has exceeded the Pi-class 16ms budget for 10 minutes."
|
||||||
|
|
||||||
|
- alert: MarqueeAnimationDurationDrift
|
||||||
|
expr: |
|
||||||
|
abs(
|
||||||
|
histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m])))
|
||||||
|
-
|
||||||
|
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
|
||||||
|
)
|
||||||
|
/
|
||||||
|
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
|
||||||
|
> 0.10
|
||||||
|
unless on()
|
||||||
|
absent_over_time(marquee_animation_duration_ms_bucket[7d])
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
service: signage
|
||||||
|
alert_channel: irc
|
||||||
|
annotations:
|
||||||
|
summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
|
||||||
|
description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# ConfigMap: Blackbox Exporter Configuration
|
# ConfigMap: Blackbox Exporter Configuration
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -98,8 +98,13 @@ spec:
|
|||||||
- name: FlowerCore__WorldBuilder__ImageGeneration__ClientMode
|
- name: FlowerCore__WorldBuilder__ImageGeneration__ClientMode
|
||||||
value: "comfyui"
|
value: "comfyui"
|
||||||
resources:
|
resources:
|
||||||
|
# Cluster CPU-request budget runs hot (99% on all 3 nodes at deploy
|
||||||
|
# time) while actual CPU usage is well below capacity. Idle Blazor
|
||||||
|
# Server + SignalR + a single ComfyUI poller uses ~5m, so 25m is
|
||||||
|
# generous. Re-evaluate if active rendering/export workers ever
|
||||||
|
# push past the limit.
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 25m
|
||||||
memory: 256Mi
|
memory: 256Mi
|
||||||
limits:
|
limits:
|
||||||
cpu: 1000m
|
cpu: 1000m
|
||||||
|
|||||||
Reference in New Issue
Block a user