# Patch summary: adds printer hardware-state alerts (Prometheus rules plus a Grafana "Print Services" rule group) to noc-monitoring.yaml and extends the infra lint test to assert the new rule names and expressions.
diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index 8ab91e1..69a1286 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -656,14 +656,15 @@ data: summary: "Print queue backlog on edge2 ({{ $value }} active jobs)" description: "CUPS has {{ $value }} active jobs queued. Possible printer jam, USB disconnect, or paper out." - # Paper roll lifecycle alerts (XL Track I, 2026-04-26). - # Source-of-truth gauge: print_paper_remaining_percent (Print.Web OTEL, - # hydrated on startup from the active PaperRoll row). - # alert_channel=thermal_print routes through irc-notify -> Print.Web - # /api/print/alert so the printer announces its own paper-out warning - # on its remaining paper. Self-referential humor + operator nudge. + # Printer hardware and paper-roll lifecycle alerts. + # print_printer_online: 1 when the transport is reachable/selected. + # print_printer_state enum: 0 unknown, 1 online, 2 offline, + # 3 paper_depleted, 4 jam, 5 head_error, 6 cover_open. + # Offline/jam/cover alerts stay IRC-only. Paper depleted routes to the + # thermal digest only while print_printer_online == 1; head error also + # uses alert_channel=thermal_print but its expr has no online guard. - alert: PrintPaperRollLow - expr: print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5 + expr: (print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5) and print_printer_online{job="printweb-otel"} == 1 for: 5m labels: severity: warning @@ -672,15 +673,59 @@ data: summary: "Print roll low on edge2 ({{ $value | printf \"%.1f\" }}% remaining)" description: "NuPrint 210 paper roll has {{ $value | printf \"%.1f\" }}% remaining. Operator should load a fresh roll soon. Run /api/paper/status for the precise mm + estimated jobs left." 
+ - alert: PrinterOfflineWarning + expr: print_printer_state{job="printweb-otel"} == 2 + for: 2m + labels: + severity: warning + service: print-web + alert_channel: irc + annotations: + summary: "Print.Web printer offline on edge2" + description: "Print.Web reports the NuPrint 210 transport is offline or unreachable. IRC-only by design: do not thermal-print an alert when the thermal printer itself is offline." + - alert: PrintPaperRollCritical - expr: print_paper_remaining_percent{job="printweb-otel"} <= 5 + expr: print_printer_state{job="printweb-otel"} == 3 and print_printer_online{job="printweb-otel"} == 1 for: 2m labels: severity: critical alert_channel: thermal_print annotations: - summary: "Print roll critical on edge2 ({{ $value | printf \"%.1f\" }}% remaining)" - description: "NuPrint 210 paper roll at {{ $value | printf \"%.1f\" }}% — load a new roll NOW. The 50ft roll has a ~12% red-stripe zone; once paper passes that, the printer can run dry mid-job." + summary: "Print paper depleted on edge2" + description: "NuPrint 210 reports paper depleted while the printer is still online. Load a new roll, drain the hardware buffer if needed, then replay DeadLetter jobs from /print-log." + + - alert: PrinterJamWarning + expr: print_printer_state{job="printweb-otel"} == 4 + for: 2m + labels: + severity: warning + service: print-web + alert_channel: irc + annotations: + summary: "Print.Web printer jam on edge2" + description: "Print.Web reports a paper/cutter jam state. IRC-only: clear the jam, drain the hardware buffer if bytes were queued, then retry affected jobs." + + - alert: PrinterHeadErrorCritical + expr: print_printer_state{job="printweb-otel"} == 5 + for: 2m + labels: + severity: critical + service: print-web + alert_channel: thermal_print + annotations: + summary: "Print.Web printer head error on edge2" + description: "Print.Web reports a thermal head or unrecoverable printer error. 
Critical routing may enter the thermal digest per existing policy; IRC remains the primary triage stream." + + - alert: PrinterCoverOpenWarning + expr: print_printer_state{job="printweb-otel"} == 6 + for: 2m + labels: + severity: warning + service: print-web + alert_channel: irc + annotations: + summary: "Print.Web printer cover open on edge2" + description: "Print.Web reports the printer cover/lid is open. IRC-only: close the cover and verify /api/print/status before retrying jobs." - alert: PrintJobDeadLetter expr: increase(print_jobs_dead_letter_total[15m]) > 0 @@ -3635,6 +3680,146 @@ data: relativeTimeRange: {from: 120, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C} + - orgId: 1 + name: Print Services + folder: Print Alerts + interval: 1m + rules: + - uid: printer-offline-warning + title: PrinterOfflineWarning + condition: C + for: 2m + noDataState: OK + execErrState: OK + annotations: + summary: "Print.Web printer offline on edge2" + description: "Print.Web reports the NuPrint 210 transport is offline or unreachable. IRC-only by design: do not thermal-print an alert when the thermal printer itself is offline." + runbook: "1. Check edge2 power/network 2. Check USB/CUPS queue 3. Open https://print.iamworkin.lan/admin 4. Do not force thermal routing for offline alerts." 
+ labels: + severity: warning + service: print-web + alert_channel: irc + data: + - refId: A + relativeTimeRange: {from: 120, to: 0} + datasourceUid: prometheus + model: {expr: 'print_printer_state{job="printweb-otel"} == 2', instant: true, refId: A} + - refId: B + relativeTimeRange: {from: 120, to: 0} + datasourceUid: __expr__ + model: {type: reduce, expression: A, reducer: last, refId: B} + - refId: C + relativeTimeRange: {from: 120, to: 0} + datasourceUid: __expr__ + model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C} + - uid: print-paper-roll-critical + title: PrintPaperRollCritical + condition: C + for: 2m + noDataState: OK + execErrState: OK + annotations: + summary: "Print paper depleted on edge2" + description: "NuPrint 210 reports paper depleted while the printer is still online. Load a new roll, drain the hardware buffer if needed, then replay DeadLetter jobs from /print-log." + runbook: "1. Load a fresh roll 2. Drain the hardware buffer if paper-out happened mid-job 3. Open https://print.iamworkin.lan/print-log 4. Retry DeadLetter jobs after the state clears." 
+ labels: + severity: critical + service: print-web + alert_channel: thermal_print + data: + - refId: A + relativeTimeRange: {from: 120, to: 0} + datasourceUid: prometheus + model: {expr: 'print_printer_state{job="printweb-otel"} == 3 and print_printer_online{job="printweb-otel"} == 1', instant: true, refId: A} + - refId: B + relativeTimeRange: {from: 120, to: 0} + datasourceUid: __expr__ + model: {type: reduce, expression: A, reducer: last, refId: B} + - refId: C + relativeTimeRange: {from: 120, to: 0} + datasourceUid: __expr__ + model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C} + - uid: printer-jam-warning + title: PrinterJamWarning + condition: C + for: 2m + noDataState: OK + execErrState: OK + annotations: + summary: "Print.Web printer jam on edge2" + description: "Print.Web reports a paper/cutter jam state. IRC-only: clear the jam, drain the hardware buffer if bytes were queued, then retry affected jobs." + runbook: "1. Clear paper/cutter path 2. Drain hardware buffer if CUPS queued bytes 3. Verify /api/print/status 4. Retry affected jobs." + labels: + severity: warning + service: print-web + alert_channel: irc + data: + - refId: A + relativeTimeRange: {from: 120, to: 0} + datasourceUid: prometheus + model: {expr: 'print_printer_state{job="printweb-otel"} == 4', instant: true, refId: A} + - refId: B + relativeTimeRange: {from: 120, to: 0} + datasourceUid: __expr__ + model: {type: reduce, expression: A, reducer: last, refId: B} + - refId: C + relativeTimeRange: {from: 120, to: 0} + datasourceUid: __expr__ + model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C} + - uid: printer-head-error-critical + title: PrinterHeadErrorCritical + condition: C + for: 2m + noDataState: OK + execErrState: OK + annotations: + summary: "Print.Web printer head error on edge2" + description: "Print.Web reports a thermal head or unrecoverable printer error. 
Critical routing may enter the thermal digest per existing policy; IRC remains the primary triage stream." + runbook: "1. Let the printer cool if overheated 2. Power-cycle only after checking queued jobs 3. Verify /api/print/status 4. Retry jobs after the state clears." + labels: + severity: critical + service: print-web + alert_channel: thermal_print + data: + - refId: A + relativeTimeRange: {from: 120, to: 0} + datasourceUid: prometheus + model: {expr: 'print_printer_state{job="printweb-otel"} == 5', instant: true, refId: A} + - refId: B + relativeTimeRange: {from: 120, to: 0} + datasourceUid: __expr__ + model: {type: reduce, expression: A, reducer: last, refId: B} + - refId: C + relativeTimeRange: {from: 120, to: 0} + datasourceUid: __expr__ + model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C} + - uid: printer-cover-open-warning + title: PrinterCoverOpenWarning + condition: C + for: 2m + noDataState: OK + execErrState: OK + annotations: + summary: "Print.Web printer cover open on edge2" + description: "Print.Web reports the printer cover/lid is open. IRC-only: close the cover and verify /api/print/status before retrying jobs." + runbook: "1. Close the printer cover 2. Verify /api/print/status returns online 3. Retry affected jobs only after the state clears." 
+ labels: + severity: warning + service: print-web + alert_channel: irc + data: + - refId: A + relativeTimeRange: {from: 120, to: 0} + datasourceUid: prometheus + model: {expr: 'print_printer_state{job="printweb-otel"} == 6', instant: true, refId: A} + - refId: B + relativeTimeRange: {from: 120, to: 0} + datasourceUid: __expr__ + model: {type: reduce, expression: A, reducer: last, refId: B} + - refId: C + relativeTimeRange: {from: 120, to: 0} + datasourceUid: __expr__ + model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C} - orgId: 1 name: CI Runners folder: CI Alerts diff --git a/tests/bluejay-infra-lint/FleetManifestLintTests.cs b/tests/bluejay-infra-lint/FleetManifestLintTests.cs index eb9683d..388f929 100644 --- a/tests/bluejay-infra-lint/FleetManifestLintTests.cs +++ b/tests/bluejay-infra-lint/FleetManifestLintTests.cs @@ -304,7 +304,7 @@ public sealed class FleetManifestLintTests } [Fact] - public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable() + public void Monitoring_MustIncludeRequiredAlertRoutingGuards() { var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml")); @@ -315,6 +315,15 @@ public sealed class FleetManifestLintTests monitoring.Should().Contain("folder: CI Alerts"); monitoring.Should().Contain("uid: linux-runner-offline"); monitoring.Should().Contain("alert_channel: irc"); + + monitoring.Should().Contain("PrinterOfflineWarning"); + monitoring.Should().Contain("expr: print_printer_state{job=\"printweb-otel\"} == 2"); + monitoring.Should().Contain("IRC-only by design: do not thermal-print an alert when the thermal printer itself is offline."); + monitoring.Should().Contain("PrintPaperRollCritical"); + monitoring.Should().Contain("expr: print_printer_state{job=\"printweb-otel\"} == 3 and print_printer_online{job=\"printweb-otel\"} == 1"); + monitoring.Should().Contain("PrinterJamWarning"); + 
monitoring.Should().Contain("PrinterHeadErrorCritical"); + monitoring.Should().Contain("PrinterCoverOpenWarning"); } [Fact]