diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml
index ec79546..8efd051 100644
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -642,6 +642,42 @@ data:
               summary: "Print queue backlog on edge2 ({{ $value }} active jobs)"
               description: "CUPS has {{ $value }} active jobs queued. Possible printer jam, USB disconnect, or paper out."
 
+          # Paper-roll lifecycle alerts (XL Track I, 2026-04-26).
+          # Both rules read the print_paper_remaining_percent gauge, which Print.Web
+          # exports over OTEL and re-hydrates at startup from the active PaperRoll row.
+          # The alert_channel=thermal_print label sends the alert via irc-notify to
+          # Print.Web /api/print/alert, so the printer prints its own paper warning on
+          # whatever paper is left: a deliberate in-joke that doubles as an operator nudge.
+          - alert: PrintPaperRollLow
+            expr: print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5
+            for: 5m
+            labels:
+              severity: warning
+              alert_channel: thermal_print
+            annotations:
+              summary: 'Print roll low on edge2 ({{ $value | printf "%.1f" }}% remaining)'
+              description: 'NuPrint 210 paper roll has {{ $value | printf "%.1f" }}% remaining. Operator should load a fresh roll soon. Run /api/paper/status for the precise mm + estimated jobs left.'
+
+          - alert: PrintPaperRollCritical
+            expr: print_paper_remaining_percent{job="printweb-otel"} <= 5
+            for: 2m
+            labels:
+              severity: critical
+              alert_channel: thermal_print
+            annotations:
+              summary: 'Print roll critical on edge2 ({{ $value | printf "%.1f" }}% remaining)'
+              description: 'NuPrint 210 paper roll at {{ $value | printf "%.1f" }}% — load a new roll NOW. The 50ft roll has a ~12% red-stripe zone; once paper passes that, the printer can run dry mid-job.'
+
+          - alert: PrintJobDeadLetter
+            expr: increase(print_jobs_dead_letter_total[15m]) > 0  # NOTE(review): no job= matcher, unlike the paper-roll rules; confirm intended
+            for: 1m
+            labels:
+              severity: warning
+              alert_channel: thermal_print  # NOTE(review): delivered via the printer, yet the causes listed below are printer faults; confirm
+            annotations:
+              summary: 'Print job(s) entered dead-letter on edge2 ({{ $value | printf "%.0f" }} in last 15m)'
+              description: '{{ $value | printf "%.0f" }} print job(s) exhausted MaxRetries and need operator action. Open /print-log, filter Status=DeadLetter, click ''Retry From Start'' after fixing the underlying cause (paper jam, USB disconnect, printer power-cycle).'
+
           - alert: CUPSHighJobRate
             expr: rate(cups_job_total[5m]) * 60 > 30
             for: 5m