fix(monitoring): use bluejay_v2 auth for snmp-nas (not public_v2)

The Synology NAS is configured with community bluejay_monitor (→ snmp.yml auth 'bluejay_v2'), not public; with public_v2, snmp-exporter was returning HTTP 500 for this target. Verified that bluejay_v2 returns metrics. Keeps the printer (10.0.58.107) on public_v2 — the Epson ET-3750 uses community "public", as documented in its SNMP settings. This commit also adds the Print.Web Ollama runner keep-alive recording rule and long-keep-alive alerts.
2026-04-22 21:32:14 -05:00
parent 59efc460fd
commit 93f77c1844
1 changed files with 41 additions and 2 deletions
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -143,7 +143,7 @@ data:
        metrics_path: /snmp
        params:
          module: [synology]
-          auth: [public_v2]
+          auth: [bluejay_v2]
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
@@ -247,7 +247,7 @@ data:
              device_type: "printer"
              printer_model: "NuPrint 210"
-      # Print.Web OTEL metrics (counters: jobs enqueued/completed/failed, bytes, duration histograms)
+      # Print.Web OTEL metrics (print counters/histograms plus Ollama runner gauges)
      - job_name: "printweb-otel"
        scrape_interval: 30s
        metrics_path: /metrics/prometheus
@@ -433,6 +433,8 @@ data:
            expr: rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m])
          - record: print:job_duration_p95:5m
            expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m]))
          # Largest remaining keep-alive reported by the printweb-otel job,
          # grouped by instance and model (all other labels are dropped).
          - record: print:ollama_runner_keepalive_remaining_seconds:max
            expr: max by(instance, model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})
      - name: relay-rates
        interval: 15s
        rules:
@@ -528,6 +530,16 @@ data:
              summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)"
              description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop."
          # Warns when a Print.Web Ollama runner reports more than 600 s of
          # keep-alive remaining continuously for 2 minutes.
          - alert: PrintOllamaRunnerLongKeepAlive
            expr: ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"} > 600
            for: 2m
            labels:
              severity: warning
              alert_channel: thermal_print
            annotations:
              # The metric is the keep-alive time still remaining, not the time
              # the runner has already been held, so the summary says "remaining".
              summary: "Print.Web Ollama runner keep-alive >10m remaining ({{ $labels.model }})"
              description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
      - name: pi-fleet
        rules:
          - alert: PiManagerDown
@@ -2896,6 +2908,33 @@ data:
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          # Alert rule: query A -> reduce B -> threshold C; condition C must hold
          # for 2m, i.e. some model reports more than 600 s of keep-alive remaining.
          - uid: print-ollama-runner-long-keepalive
            title: Print.Web Ollama runner keep-alive >10m
            condition: C
            for: 2m
            # NOTE(review): if the gauge is not exported while no model is loaded,
            # this rule will sit in NoData — confirm that is acceptable for the
            # thermal_print channel.
            noDataState: NoData
            # Evaluation errors are mapped to OK rather than raising an alert.
            execErrState: OK
            annotations:
              summary: Print.Web Ollama runner held too long
              description: "A Print.Web Ollama runner has more than 10 minutes of keep-alive remaining. Check active AI requests before the Pi 5 Ollama lane thrashes."
              runbook: "1. Open https://print.iamworkin.lan/admin 2. Check Ollama Fleet model + remaining keep-alive 3. Query Prometheus: ai_ollama_runner_keepalive_remaining_seconds{job=\"printweb-otel\"} 4. Stop duplicate model callers before restarting Ollama."
            labels:
              severity: warning
              service: print-web
              alert_channel: thermal_print
            data:
              # A: instant query over the last 120 s; max by(model) keeps only the
              # model label, so instances are collapsed into one series per model.
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max by(model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})', instant: true, refId: A}
              # B: reduce each series from A to its last value.
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              # C: threshold — true when B is greater than 600 (seconds).
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
      - orgId: 1
        name: Infrastructure
        folder: AI Stack Alerts