fix(monitoring): use bluejay_v2 auth for snmp-nas (not public_v2)
Synology NAS is configured with community bluejay_monitor (→ snmp.yml auth 'bluejay_v2'), not public. public_v2 was returning HTTP 500 from snmp-exporter for this target. Verified bluejay_v2 returns metrics. Keeps printer (10.0.58.107) on public_v2 — Epson ET-3750 uses community "public" as documented in its SNMP settings.
This commit is contained in:
@@ -143,7 +143,7 @@ data:
|
|||||||
metrics_path: /snmp
|
metrics_path: /snmp
|
||||||
params:
|
params:
|
||||||
module: [synology]
|
module: [synology]
|
||||||
auth: [public_v2]
|
auth: [bluejay_v2]
|
||||||
relabel_configs:
|
relabel_configs:
|
||||||
- source_labels: [__address__]
|
- source_labels: [__address__]
|
||||||
target_label: __param_target
|
target_label: __param_target
|
||||||
@@ -247,7 +247,7 @@ data:
|
|||||||
device_type: "printer"
|
device_type: "printer"
|
||||||
printer_model: "NuPrint 210"
|
printer_model: "NuPrint 210"
|
||||||
|
|
||||||
# Print.Web OTEL metrics (counters: jobs enqueued/completed/failed, bytes, duration histograms)
|
# Print.Web OTEL metrics (print counters/histograms plus Ollama runner gauges)
|
||||||
- job_name: "printweb-otel"
|
- job_name: "printweb-otel"
|
||||||
scrape_interval: 30s
|
scrape_interval: 30s
|
||||||
metrics_path: /metrics/prometheus
|
metrics_path: /metrics/prometheus
|
||||||
@@ -433,6 +433,8 @@ data:
|
|||||||
expr: rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m])
|
expr: rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m])
|
||||||
- record: print:job_duration_p95:5m
|
- record: print:job_duration_p95:5m
|
||||||
expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m]))
|
expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m]))
|
||||||
|
- record: print:ollama_runner_keepalive_remaining_seconds:max
|
||||||
|
expr: max by(instance, model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})
|
||||||
- name: relay-rates
|
- name: relay-rates
|
||||||
interval: 15s
|
interval: 15s
|
||||||
rules:
|
rules:
|
||||||
@@ -528,6 +530,16 @@ data:
|
|||||||
summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)"
|
summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)"
|
||||||
description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop."
|
description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop."
|
||||||
|
|
||||||
|
- alert: PrintOllamaRunnerLongKeepAlive
|
||||||
|
expr: ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"} > 600
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
alert_channel: thermal_print
|
||||||
|
annotations:
|
||||||
|
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
|
||||||
|
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
|
||||||
|
|
||||||
- name: pi-fleet
|
- name: pi-fleet
|
||||||
rules:
|
rules:
|
||||||
- alert: PiManagerDown
|
- alert: PiManagerDown
|
||||||
@@ -2896,6 +2908,33 @@ data:
|
|||||||
relativeTimeRange: {from: 120, to: 0}
|
relativeTimeRange: {from: 120, to: 0}
|
||||||
datasourceUid: __expr__
|
datasourceUid: __expr__
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||||
|
- uid: print-ollama-runner-long-keepalive
|
||||||
|
title: Print.Web Ollama runner keep-alive >10m
|
||||||
|
condition: C
|
||||||
|
for: 2m
|
||||||
|
noDataState: NoData
|
||||||
|
execErrState: OK
|
||||||
|
annotations:
|
||||||
|
summary: Print.Web Ollama runner held too long
|
||||||
|
description: "A Print.Web Ollama runner has more than 10 minutes of keep-alive remaining. Check active AI requests before the Pi 5 Ollama lane thrashes."
|
||||||
|
runbook: "1. Open https://print.iamworkin.lan/admin 2. Check Ollama Fleet model + remaining keep-alive 3. Query Prometheus: ai_ollama_runner_keepalive_remaining_seconds{job=\"printweb-otel\"} 4. Stop duplicate model callers before restarting Ollama."
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: print-web
|
||||||
|
alert_channel: thermal_print
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange: {from: 120, to: 0}
|
||||||
|
datasourceUid: prometheus
|
||||||
|
model: {expr: 'max by(model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})', instant: true, refId: A}
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange: {from: 120, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 120, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
name: Infrastructure
|
name: Infrastructure
|
||||||
folder: AI Stack Alerts
|
folder: AI Stack Alerts
|
||||||
|
|||||||
Reference in New Issue
Block a user