fix(monitoring): use bluejay_v2 auth for snmp-nas (not public_v2)

The Synology NAS is configured with the SNMP community bluejay_monitor
(which corresponds to auth 'bluejay_v2' in snmp.yml), not public. With
public_v2, snmp-exporter returned HTTP 500 for this target. Verified
that bluejay_v2 returns metrics.

Keeps printer (10.0.58.107) on public_v2 — Epson ET-3750 uses
community "public" as documented in its SNMP settings.
This commit is contained in:
Andrew Stoltz
2026-04-22 21:32:14 -05:00
parent 59efc460fd
commit 93f77c1844

View File

@@ -143,7 +143,7 @@ data:
metrics_path: /snmp metrics_path: /snmp
params: params:
module: [synology] module: [synology]
auth: [public_v2] auth: [bluejay_v2]
relabel_configs: relabel_configs:
- source_labels: [__address__] - source_labels: [__address__]
target_label: __param_target target_label: __param_target
@@ -247,7 +247,7 @@ data:
device_type: "printer" device_type: "printer"
printer_model: "NuPrint 210" printer_model: "NuPrint 210"
# Print.Web OTEL metrics (counters: jobs enqueued/completed/failed, bytes, duration histograms) # Print.Web OTEL metrics (print counters/histograms plus Ollama runner gauges)
- job_name: "printweb-otel" - job_name: "printweb-otel"
scrape_interval: 30s scrape_interval: 30s
metrics_path: /metrics/prometheus metrics_path: /metrics/prometheus
@@ -433,6 +433,8 @@ data:
expr: rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m]) expr: rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m])
- record: print:job_duration_p95:5m - record: print:job_duration_p95:5m
expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m])) expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m]))
# Recording rule: stores, for each (instance, model) label pair, the maximum
# ai_ollama_runner_keepalive_remaining_seconds sample among the series scraped
# by the "printweb-otel" job. All other labels are aggregated away.
- record: print:ollama_runner_keepalive_remaining_seconds:max
expr: max by(instance, model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})
- name: relay-rates - name: relay-rates
interval: 15s interval: 15s
rules: rules:
@@ -528,6 +530,16 @@ data:
summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)" summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)"
description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop." description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop."
# Warning alert on the Print.Web Ollama runner keep-alive gauge: becomes active
# when a "printweb-otel" series stays above 600 for the whole 2m hold period.
- alert: PrintOllamaRunnerLongKeepAlive
# Compares the raw metric (not an aggregate), so every matching series is
# evaluated, and can alert, on its own.
expr: ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"} > 600
# The condition must hold continuously for 2 minutes before the alert fires.
for: 2m
labels:
severity: warning
# NOTE(review): presumably selects the thermal-printer notification route;
# the routing config is not visible here — confirm.
alert_channel: thermal_print
annotations:
# Includes the offending series' "model" label in the headline.
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
# $value is the sample that tripped the rule, printed with no decimals.
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
- name: pi-fleet - name: pi-fleet
rules: rules:
- alert: PiManagerDown - alert: PiManagerDown
@@ -2896,6 +2908,33 @@ data:
relativeTimeRange: {from: 120, to: 0} relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__ datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C} model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
# Provisioned alert rule (the fields appear to follow Grafana's alert-rule
# provisioning schema — confirm): warns when the per-model maximum of
# ai_ollama_runner_keepalive_remaining_seconds exceeds 600.
- uid: print-ollama-runner-long-keepalive
title: Print.Web Ollama runner keep-alive >10m
# The rule's state is taken from the expression with refId C below.
condition: C
# The condition must hold for 2 minutes before the rule fires.
for: 2m
# NOTE(review): if the metric is absent whenever no runner is loaded, this
# rule will sit in NoData during normal idle periods — confirm that is intended.
noDataState: NoData
# NOTE(review): query/evaluation errors are reported as OK, so a broken
# datasource will not surface through this rule — confirm that is intended.
execErrState: OK
annotations:
summary: Print.Web Ollama runner held too long
description: "A Print.Web Ollama runner has more than 10 minutes of keep-alive remaining. Check active AI requests before the Pi 5 Ollama lane thrashes."
# Operator steps, stored as a single-line string.
runbook: "1. Open https://print.iamworkin.lan/admin 2. Check Ollama Fleet model + remaining keep-alive 3. Query Prometheus: ai_ollama_runner_keepalive_remaining_seconds{job=\"printweb-otel\"} 4. Stop duplicate model callers before restarting Ollama."
labels:
severity: warning
service: print-web
# NOTE(review): presumably selects the thermal-printer notification route;
# the routing config is not visible here — confirm.
alert_channel: thermal_print
# Evaluation pipeline: A (query) -> B (reduce) -> C (threshold).
data:
# A: instant query against the datasource with uid "prometheus"; keeps only
# the "model" label, so series from different instances are merged per model.
- refId: A
relativeTimeRange: {from: 120, to: 0}
datasourceUid: prometheus
model: {expr: 'max by(model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})', instant: true, refId: A}
# B: server-side expression that reduces A to its last value.
- refId: B
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: reduce, expression: A, reducer: last, refId: B}
# C: threshold on B; true when the reduced value is greater than 600.
- refId: C
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
- orgId: 1 - orgId: 1
name: Infrastructure name: Infrastructure
folder: AI Stack Alerts folder: AI Stack Alerts