diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index 24a9bb2..1671d0d 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -143,7 +143,7 @@ data: metrics_path: /snmp params: module: [synology] - auth: [public_v2] + auth: [bluejay_v2] relabel_configs: - source_labels: [__address__] target_label: __param_target @@ -247,7 +247,7 @@ data: device_type: "printer" printer_model: "NuPrint 210" - # Print.Web OTEL metrics (counters: jobs enqueued/completed/failed, bytes, duration histograms) + # Print.Web OTEL metrics (print counters/histograms plus Ollama runner gauges) - job_name: "printweb-otel" scrape_interval: 30s metrics_path: /metrics/prometheus @@ -433,6 +433,8 @@ data: expr: rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m]) - record: print:job_duration_p95:5m expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m])) + - record: print:ollama_runner_keepalive_remaining_seconds:max + expr: max by(instance, model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"}) - name: relay-rates interval: 15s rules: @@ -528,6 +530,16 @@ data: summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)" description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop." + - alert: PrintOllamaRunnerLongKeepAlive + expr: ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"} > 600 + for: 2m + labels: + severity: warning + alert_channel: thermal_print + annotations: + summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})" + description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes." + - name: pi-fleet rules: - alert: PiManagerDown @@ -2896,6 +2908,33 @@ data: relativeTimeRange: {from: 120, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C} + - uid: print-ollama-runner-long-keepalive + title: Print.Web Ollama runner keep-alive >10m + condition: C + for: 2m + noDataState: NoData + execErrState: OK + annotations: + summary: Print.Web Ollama runner held too long + description: "A Print.Web Ollama runner has more than 10 minutes of keep-alive remaining. Check active AI requests before the Pi 5 Ollama lane thrashes." + runbook: "1. Open https://print.iamworkin.lan/admin 2. Check Ollama Fleet model + remaining keep-alive 3. Query Prometheus: ai_ollama_runner_keepalive_remaining_seconds{job=\"printweb-otel\"} 4. Stop duplicate model callers before restarting Ollama." + labels: + severity: warning + service: print-web + alert_channel: thermal_print + data: + - refId: A + relativeTimeRange: {from: 120, to: 0} + datasourceUid: prometheus + model: {expr: 'max by(model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})', instant: true, refId: A} + - refId: B + relativeTimeRange: {from: 120, to: 0} + datasourceUid: __expr__ + model: {type: reduce, expression: A, reducer: last, refId: B} + - refId: C + relativeTimeRange: {from: 120, to: 0} + datasourceUid: __expr__ + model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C} - orgId: 1 name: Infrastructure folder: AI Stack Alerts