Files
bluejay-infra/apps/monitoring/noc-monitoring.yaml
Andrew Stoltz e641ceab48 monitoring(irc-notify): criticals also batch hourly — fix per-fire spam
The first batching pass (bacac06) left critical-severity alerts on the
immediate-print path. That's still per-event spam for any persistent
critical (e.g. PrintPaperRollCritical fires every 30s Grafana evaluation
cycle when paper is <5%). Caught immediately after deploy: CUPS queue grew
0 → 8 jobs in 8 minutes from a single firing PrintPaperRollCritical.

This commit aligns with the operator's verbatim ask ("one alert an hour"):

- Critical-severity alerts now go into the digest buffer, NOT the
  immediate-print path. The digest payload already shows severity tags
  per alertname, so the operator still sees "[critical] X" in the printout.
- The explicit `alert_channel=thermal_print_immediate` label still bypasses
  batching, but only on NEW fingerprint arrival — it triggers a flush of
  the CURRENT digest (with the new alert included), then clears. Repeat
  webhooks for the same fingerprint dedupe in the buffer until the next
  hourly tick OR until the alert resolves. No fingerprint can spam.
- `add_to_digest` now returns bool (True = buffer grew, False = dedup /
  resolution / disabled) so the immediate-label path can flush only on
  state transitions.

Net effect: max 1 thermal print per BATCH_INTERVAL_MIN per alert fingerprint,
regardless of severity. Rules that genuinely need same-second paper opt in
via `alert_channel=thermal_print_immediate` (currently zero rules use this).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 10:22:25 -05:00

4819 lines
175 KiB
YAML

# =============================================================================
# NOC Monitoring Stack — K8s Migration Target
# =============================================================================
# Migrates the noc1 Podman monitoring pod to RKE2 K8s.
# Source: noc1 (10.0.56.10) /opt/monitoring/
#
# Components:
# - Prometheus (metrics, alerting)
# - Grafana (dashboards)
# - Blackbox Exporter (HTTP probes)
# - SNMP Exporter (network device metrics)
# - Node Exporter (host metrics, DaemonSet)
# - IRC Notify (alert relay to UnrealIRCd)
#
# Note: SNMP exporter config (snmp.yml) is ~2MB, exceeding the 1MB ConfigMap
# limit. It is stored in a separate file (snmp-config.yaml) and must be
# applied as a standalone ConfigMap or mounted via an init container that
# downloads it from Gitea.
# =============================================================================
---
# Namespace that holds the monitoring-stack resources defined in this manifest.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
# =============================================================================
# ConfigMap: Prometheus Configuration
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitoring
data:
prometheus.yml: |
global:
scrape_interval: 30s
evaluation_interval: 30s
rule_files:
- /etc/prometheus/alerts.yml
- /etc/prometheus/recording-rules.yml
scrape_configs:
# noc1 host metrics (external to cluster)
- job_name: "node-exporter"
static_configs:
- targets: ["10.0.56.10:9100"]
labels:
instance: "noc1"
vlan: "mgmt"
# RKE2 bare-metal cluster (openSUSE Leap 16 on NUCs)
- job_name: "rke2-nodes"
scrape_timeout: 15s
static_configs:
- targets: ["10.0.56.11:9100"]
labels:
instance: "rke2-server"
vlan: "mgmt"
cluster: "rke2"
role: "server"
- targets: ["10.0.56.12:9100"]
labels:
instance: "rke2-agent1"
vlan: "mgmt"
cluster: "rke2"
role: "agent"
- targets: ["10.0.56.13:9100"]
labels:
instance: "rke2-agent2"
vlan: "mgmt"
cluster: "rke2"
role: "agent"
# Mac mini macOS runner node (INFRA VLAN)
- job_name: "macmini-node"
scrape_timeout: 15s
static_configs:
- targets: ["10.0.56.115:9100"]
labels:
instance: "macmini"
host: "macmini.iamworkin.lan"
vlan: "infra"
arch: "arm64"
role: "macos-runner"
puppet_managed: "true"
puppet_server: "puppet.iamworkin.lan"
# In-cluster node-exporter DaemonSet
- job_name: "k8s-node-exporter"
kubernetes_sd_configs:
- role: endpoints
namespaces:
names: ["monitoring"]
relabel_configs:
- source_labels: [__meta_kubernetes_endpoints_name]
action: keep
regex: node-exporter
- source_labels: [__meta_kubernetes_endpoint_node_name]
target_label: instance
# pfSense SNMP via snmp-exporter
- job_name: "snmp-pfsense"
static_configs:
- targets: ["10.0.56.1"]
metrics_path: /snmp
params:
module: [if_mib]
auth: [bluejay_v2]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: snmp-exporter.monitoring.svc:9116
# UniFi Cloud Key SNMP — DISABLED 2026-04-26
# The Cloud Key Gen2+ runs unifi-core (controller) only — not a network
# device — and does NOT run an SNMP agent on UDP/161. Scrapes were
# silently failing with "connection refused" from 10.42.x.x:161 every
# 30s, polluting up{} = 0 and lastError on the Targets page. Hardware
# health (CPU/mem/disk) for the Cloud Key host should come from
# node_exporter via SSH — not SNMP.
# - job_name: "snmp-cloudkey"
# static_configs:
# - targets: ["10.0.56.3"]
# metrics_path: /snmp
# params:
# module: [if_mib]
# auth: [bluejay_v2]
# relabel_configs:
# - source_labels: [__address__]
# target_label: __param_target
# - source_labels: [__param_target]
# target_label: instance
# - target_label: __address__
# replacement: snmp-exporter.monitoring.svc:9116
# UniFi Switch SNMP
- job_name: "snmp-switch"
static_configs:
- targets: ["10.0.56.2"]
metrics_path: /snmp
params:
module: [if_mib]
auth: [bluejay_v2]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: snmp-exporter.monitoring.svc:9116
# Synology NAS SNMP
- job_name: "snmp-nas"
static_configs:
- targets: ["10.0.58.3"]
metrics_path: /snmp
params:
module: [synology]
auth: [bluejay_v2]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: snmp-exporter.monitoring.svc:9116
# Prometheus self-monitoring
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
# Edge nodes (PROD VLAN)
- job_name: "edge-nodes"
static_configs:
- targets: ["10.0.57.17:9100"]
labels:
instance: "edge1"
vlan: "prod"
arch: "arm64"
role: "ai-inference"
puppet_managed: "true"
puppet_server: "puppet.iamworkin.lan"
- targets: ["10.0.57.16:9100"]
labels:
instance: "edge2"
vlan: "prod"
arch: "arm64"
role: "ci-runner"
puppet_managed: "true"
puppet_server: "puppet.iamworkin.lan"
- targets: ["10.0.58.25:9100"]
labels:
instance: "piez"
vlan: "home"
arch: "arm64"
role: "prototyping"
- targets: ["10.0.58.113:9100"]
labels:
instance: "pirelay"
vlan: "home"
arch: "arm64"
role: "relay-controller"
# =======================================================================
# PiManager Application Metrics (relay states, temps, automation)
# =======================================================================
- job_name: "pimanager-app"
scrape_interval: 15s
metrics_path: /metrics
static_configs:
- targets: ["10.0.58.25:5000"]
labels:
instance: "piez"
service: "pimanager"
vlan: "home"
device: "pi4-ezconnect"
- targets: ["10.0.58.113:5100"]
labels:
instance: "pirelay"
service: "pimanager"
vlan: "home"
device: "pi3-ks0212"
# Epson ET-3750 EcoTank Printer SNMP
# scrape_interval is kept below Prometheus' default 5m lookback delta. At
# exactly 5m (plus collection latency of up to scrape_timeout) the newest
# sample ages out before the next one is ingested, so up{job="snmp-printer"}
# and prtMarkerSuppliesLevel intermittently disappear from instant queries.
# That resets the 30m `for:` timer of EpsonPrinterDown and makes the
# `for: 0m` EpsonInk* alerts flap. 4m + 30s timeout keeps the worst-case
# sample gap inside the lookback window.
# NOTE(review): confirm 4m polling does not interfere with the printer's
# sleep cycle.
- job_name: "snmp-printer"
  scrape_interval: 4m
  scrape_timeout: 30s
  static_configs:
    - targets: ["10.0.58.107"]
      labels:
        # The relabel_configs below overwrite `instance` with the target
        # address, so scraped series carry instance="10.0.58.107" rather
        # than this value.
        instance: "epson-ecotank"
        vlan: "home"
        device_type: "printer"
  metrics_path: /snmp
  params:
    module: [printer_mib]
    auth: [public_v2]
  relabel_configs:
    # Pass the printer address to snmp-exporter as ?target=...
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    # Scrape the exporter itself, not the printer.
    - target_label: __address__
      replacement: snmp-exporter.monitoring.svc:9116
# =============================================================================
# Print Services (CUPS + Print.Web on edge2)
# =============================================================================
# CUPS Prometheus exporter (cups_exporter on edge2:9628)
- job_name: "cups"
scrape_interval: 30s
static_configs:
- targets: ["10.0.57.16:9628"]
labels:
instance: "edge2"
service: "cups"
device_type: "printer"
printer_model: "NuPrint 210"
# Print.Web OTEL metrics (print counters/histograms plus Ollama runner gauges)
- job_name: "printweb-otel"
scrape_interval: 30s
metrics_path: /metrics/prometheus
static_configs:
- targets: ["10.0.57.16:5200"]
labels:
instance: "print-web"
service: "print-web"
device_type: "printer"
printer_model: "NuPrint 210"
# Print.Web health (Blazor app on edge2:5200)
- job_name: "probe-printweb"
metrics_path: /probe
params:
module: [http_2xx]
scrape_interval: 30s
static_configs:
- targets: ["http://10.0.57.16:5200/"]
labels:
instance: "print-web"
service: "print-web"
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter.monitoring.svc:9115
# FlowerCore.RemoteDesktop web health (public cluster VIP)
# Module is https_internal — desktop.iamworkin.lan uses a step-ca leaf
# cert; blackbox does NOT trust step-ca root, so http_2xx fails with
# x509 unknown authority and probe_success=0 even when /health 200s.
- job_name: "probe-remotedesktop"
metrics_path: /probe
params:
module: [https_internal]
scrape_interval: 30s
static_configs:
- targets: ["https://desktop.iamworkin.lan/health"]
labels:
instance: "https://desktop.iamworkin.lan/health"
service: "remotedesktop-web"
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- target_label: __address__
replacement: blackbox-exporter.monitoring.svc:9115
# FlowerCore.RemoteDesktop /metrics (direct scrape for counters)
- job_name: "fc-remotedesktop"
metrics_path: /metrics
scheme: https
scrape_interval: 30s
tls_config:
insecure_skip_verify: true
static_configs:
- targets: ["desktop.iamworkin.lan"]
labels:
service: "remotedesktop-web"
# CUPS web UI health (port 631)
- job_name: "probe-cups"
metrics_path: /probe
params:
module: [http_2xx]
scrape_interval: 60s
static_configs:
- targets: ["http://10.0.57.16:631/"]
labels:
instance: "cups-edge2"
service: "cups"
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter.monitoring.svc:9115
# =============================================================================
# AI Stack Health Probes (Blackbox Exporter)
# =============================================================================
# NOTE: probe-ollama-local and probe-agentzero-local were REMOVED
# 2026-04-26. They pointed at 10.0.58.100 (HOME VLAN) which is not
# reachable from cluster pods (firewalled). They had been firing as
# OllamaDown / AgentZeroDown since 2026-04-24. Workstation/AI-laptop
# Ollama and Agent Zero should be monitored via host-side Puppet
# (node_exporter on the box) once the AI laptop is running 24/7.
# Ollama API — edge1 Pi 5 (NUC Agent Zero)
- job_name: "probe-ollama-edge1"
metrics_path: /probe
params:
module: [http_ollama]
scrape_interval: 30s
static_configs:
- targets: ["http://10.0.57.17:11434/api/tags"]
labels:
instance: "ollama-edge1"
service: "ollama"
deployment: "nuc"
gpu: "cpu"
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter.monitoring.svc:9115
# Agent Zero Web UI — in-cluster (RKE2)
# Target uses short svc form (agent-zero.agent-zero.svc) NOT
# cluster.local FQDN — the *.cluster.local form gets rewritten to
# 10.0.56.200 (Traefik VIP) by the CoreDNS iamworkin.lan template +
# ndots:5 search-suffix expansion. Memory: feedback_coredns_ndots_template_collision.
- job_name: "probe-agentzero-nuc"
metrics_path: /probe
params:
module: [http_2xx]
scrape_interval: 30s
static_configs:
- targets: ["http://agent-zero.agent-zero.svc:80/"]
labels:
instance: "agent-zero-nuc"
service: "agent-zero"
deployment: "nuc"
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter.monitoring.svc:9115
# =============================================================================
# K8s Cluster State (kube-state-metrics, cert-manager, traefik)
# =============================================================================
# Use in-cluster ClusterIP service DNS — NOT NodePorts — so a same-node
# NodePort hairpin doesn't break the scrape (hit on rke2-agent1 hosting
# both prometheus and traefik on 2026-04-26: 10.0.56.12:30900 timed out
# from prometheus while .11/.13 worked). NodePorts at 30900-30902 are
# still useful for noc1-Podman-style external scrapers, but in-cluster
# we should always use the svc DNS form.
# kube-state-metrics — exposes K8s object state (pods, deployments, nodes)
# Required for KubeContainerRestartingFrequently / KubePodNotReady alerts.
- job_name: "kube-state-metrics"
scrape_interval: 30s
static_configs:
- targets: ["kube-state-metrics.kube-system.svc:8080"]
labels:
cluster: "rke2"
# cert-manager — exposes certmanager_certificate_ready_status,
# certmanager_certificate_expiration_timestamp_seconds, etc. Drives the
# CertManagerCertificateNotReady / CertManagerCertificateRenewalFailed
# alerts. Memory: project_cert_manager_prometheus_scrape.
- job_name: "cert-manager"
scrape_interval: 30s
static_configs:
- targets: ["cert-manager-metrics.cert-manager.svc:9402"]
labels:
cluster: "rke2"
# Traefik — request rates, latency, TLS cert metadata, router state.
# ClusterIP svc routes to one of the traefik pods; per-pod scrape via
# the headless `traefik-metrics` selector would be nicer for failover
# visibility but the single-replica scrape is enough for steady-state.
- job_name: "traefik"
scrape_interval: 15s
static_configs:
- targets: ["traefik-metrics.traefik-system.svc:9100"]
labels:
service: "traefik"
cluster: "rke2"
# Longhorn — exposes longhorn_volume_robustness, longhorn_backup_*,
# longhorn_node_status_*. Enables LonghornVolumeUnhealthy +
# LonghornBackupFailed alerts (no real visibility into Longhorn
# health before this — was relying on K8s events which are noisy
# transient lifecycle messages, not actionable signals).
- job_name: "longhorn"
scrape_interval: 30s
static_configs:
- targets: ["longhorn-backend.longhorn-system.svc:9500"]
labels:
service: "longhorn"
cluster: "rke2"
# FC web services through Traefik — single probe surface to spot any
# iamworkin.lan host returning non-200. Uses https_internal because all
# certs are step-ca leaves; blackbox would x509-fail with http_2xx.
# Some services need explicit healthcheck paths because root returns
# 404 (acme, guac) or 401 (grafana, prometheus). Drop them or point at
# the right endpoint — don't lower valid_status_codes globally because
# 401 from a healthy pod and 401 from an outage look identical.
- job_name: "probe-traefik-services"
metrics_path: /probe
params:
module: [https_internal]
scrape_interval: 60s
static_configs:
- targets:
# Root-reachable services (200 or 3xx)
- "https://gitea.iamworkin.lan/"
- "https://argocd.iamworkin.lan/"
- "https://intranet.iamworkin.lan/"
- "https://signage.iamworkin.lan/"
- "https://kiosk.iamworkin.lan/"
- "https://media.iamworkin.lan/"
- "https://mysql.iamworkin.lan/"
- "https://php.iamworkin.lan/"
- "https://zabbix.iamworkin.lan/"
- "https://desktop.iamworkin.lan/"
- "https://print.iamworkin.lan/"
- "https://dns.iamworkin.lan/"
- "https://chat.iamworkin.lan/"
- "https://dist.iamworkin.lan/"
- "https://dms.iamworkin.lan/"
- "https://menuboard.iamworkin.lan/"
- "https://messageboard.iamworkin.lan/"
- "https://presentations.iamworkin.lan/"
- "https://retail.iamworkin.lan/"
- "https://ttsreader.iamworkin.lan/"
# Explicit healthcheck paths
- "https://fc-llm-bridge.iamworkin.lan/healthz"
- "https://acme.iamworkin.lan/health"
# NOTE: services intentionally NOT in this probe surface
# - grafana.iamworkin.lan: every endpoint (incl. /api/health
# and /login) returns 401 behind Traefik basic-auth.
# Health covered by in-cluster monitoring-grafana scrape.
# - prometheus.iamworkin.lan: same auth pattern. Health covered
# by the prometheus self-scrape job.
# - guac.iamworkin.lan: deprecated — Guacamole moved to
# desktop.iamworkin.lan/guacamole/ (memory:
# feedback_traefik_cross_namespace_refs_disabled).
labels:
probe_type: "traefik-service"
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
regex: "https?://([^/:]+).*"
target_label: instance
- target_label: __address__
replacement: blackbox-exporter.monitoring.svc:9115
# =============================================================================
# Self-monitoring (K8s monitoring namespace)
# =============================================================================
- job_name: "monitoring-grafana"
metrics_path: /metrics
static_configs:
- targets: ["grafana.monitoring.svc:3000"]
labels:
instance: "grafana-k8s"
service: "grafana"
- job_name: "monitoring-blackbox"
static_configs:
- targets: ["blackbox-exporter.monitoring.svc:9115"]
labels:
instance: "blackbox-k8s"
service: "blackbox"
recording-rules.yml: |
groups:
- name: node-aggregations
interval: 30s
rules:
- record: instance:node_cpu_usage:avg5m
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
- record: instance:node_memory_usage:percent
expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
- record: instance:node_disk_usage:percent
expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100
- record: instance:node_network_receive:rate5m
expr: rate(node_network_receive_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8
- record: instance:node_network_transmit:rate5m
expr: rate(node_network_transmit_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8
- name: probe-aggregations
interval: 30s
rules:
- record: service:probe_success:min
expr: min by(service) (probe_success)
- record: service:probe_duration:avg
expr: avg by(service) (probe_duration_seconds)
- name: print-rates
interval: 30s
rules:
- record: print:jobs_per_minute:rate5m
expr: rate(print_jobs_enqueued_total[5m]) * 60
- record: print:success_rate:ratio5m
expr: rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m])
- record: print:job_duration_p95:5m
expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m]))
- record: print:ollama_runner_keepalive_remaining_seconds:max
expr: max by(instance, model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})
- name: relay-rates
interval: 15s
rules:
- record: relay:state_changes:1h
expr: changes(pimanager_relay_state[1h])
- record: epson:pages_per_day:rate24h
expr: increase(prtMarkerLifeCount{job="snmp-printer"}[24h])
alerts.yml: |
groups:
- name: ai-stack
rules:
- alert: OllamaDown
expr: probe_success{service="ollama"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: "Ollama is down on {{ $labels.deployment }}"
description: "Ollama API at {{ $labels.instance }} has been unreachable for 2 minutes. Agent Zero FAISS memory will fail."
- alert: AgentZeroDown
expr: probe_success{service="agent-zero"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: "Agent Zero is down on {{ $labels.deployment }}"
description: "Agent Zero web UI at {{ $labels.instance }} has been unreachable for 2 minutes."
- alert: OllamaSlowResponse
expr: probe_duration_seconds{service="ollama"} > 3
for: 5m
labels:
severity: info
annotations:
summary: "Ollama responding slowly on {{ $labels.deployment }}"
description: "Ollama API response time exceeds 3s for 5 minutes. GPU may be overloaded."
- name: print-services
rules:
- alert: CUPSExporterDown
expr: up{job="cups"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: "CUPS exporter unreachable on edge2"
description: "cups_exporter at edge2:9628 has been down for 2 minutes. CUPS monitoring offline."
- alert: CUPSWebUIDown
expr: probe_success{job="probe-cups"} == 0
for: 3m
labels:
severity: warning
annotations:
summary: "CUPS web UI down on edge2"
description: "CUPS port 631 unreachable for 3 minutes. Network printing unavailable."
- alert: PrintWebDown
expr: probe_success{job="probe-printweb"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: "Print.Web is down on edge2"
description: "FlowerCore Print.Web at edge2:5200 unreachable. API/MCP/Blazor printing unavailable."
- alert: CUPSPrinterStopped
expr: cups_printer_state_total{state="stopped"} > 0
for: 5m
labels:
severity: warning
annotations:
summary: "CUPS printer stopped on edge2"
description: "A CUPS printer has been in stopped state for 5 minutes. Check USB connection or paper."
- alert: CUPSJobBacklog
expr: cups_job_active_total > 10
for: 2m
labels:
severity: warning
annotations:
summary: "Print queue backlog on edge2 ({{ $value }} active jobs)"
description: "CUPS has {{ $value }} active jobs queued. Possible printer jam, USB disconnect, or paper out."
# Paper roll lifecycle alerts (XL Track I, 2026-04-26).
# Source-of-truth gauge: print_paper_remaining_percent (Print.Web OTEL,
# hydrated on startup from the active PaperRoll row).
# alert_channel=thermal_print routes through irc-notify -> Print.Web
# /api/print/alert so the printer announces its own paper-out warning
# on its remaining paper. Self-referential humor + operator nudge.
- alert: PrintPaperRollLow
expr: print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5
for: 5m
labels:
severity: warning
alert_channel: thermal_print
annotations:
summary: "Print roll low on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
description: "NuPrint 210 paper roll has {{ $value | printf \"%.1f\" }}% remaining. Operator should load a fresh roll soon. Run /api/paper/status for the precise mm + estimated jobs left."
- alert: PrintPaperRollCritical
expr: print_paper_remaining_percent{job="printweb-otel"} <= 5
for: 2m
labels:
severity: critical
alert_channel: thermal_print
annotations:
summary: "Print roll critical on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
description: "NuPrint 210 paper roll at {{ $value | printf \"%.1f\" }}% — load a new roll NOW. The 50ft roll has a ~12% red-stripe zone; once paper passes that, the printer can run dry mid-job."
- alert: PrintJobDeadLetter
expr: increase(print_jobs_dead_letter_total[15m]) > 0
for: 1m
labels:
severity: warning
alert_channel: thermal_print
annotations:
summary: "Print job(s) entered dead-letter on edge2 ({{ $value | printf \"%.0f\" }} in last 15m)"
description: "{{ $value | printf \"%.0f\" }} print job(s) exhausted MaxRetries and need operator action. Open /print-log, filter Status=DeadLetter, click 'Retry From Start' after fixing the underlying cause (paper jam, USB disconnect, printer power-cycle)."
- alert: CUPSHighJobRate
expr: rate(cups_job_total[5m]) * 60 > 30
for: 5m
labels:
severity: info
annotations:
summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)"
description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop."
- alert: PrintOllamaRunnerLongKeepAlive
expr: ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"} > 600
for: 2m
labels:
severity: warning
alert_channel: thermal_print
annotations:
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
- name: macmini-runners
rules:
# Fires when a macmini-* runner reports offline (gauge == 0) or when no
# macmini-* runner series exists at all.
# The absent() branch produces a series without a `runner` label, so the
# summary falls back to a fixed text instead of rendering "()".
# absent() only triggers when every macmini-* series is missing; a single
# runner whose series disappears is not detected by this rule.
- alert: MacMiniRunnerOffline
  expr: (flowercore_github_runner_online{runner=~"macmini-.*"} == 0) or absent(flowercore_github_runner_online{runner=~"macmini-.*"})
  for: 10m
  labels:
    severity: warning
    service: github-runner
  annotations:
    summary: "Mac mini GitHub runner offline ({{ or $labels.runner \"no macmini-* series reported\" }})"
    description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-<slug>.plist; runners survive reboot and do not require a GUI session."
- name: linux-runners
rules:
- alert: LinuxRunnerOffline
expr: |
kube_deployment_status_replicas_ready{
namespace="github-runner",
deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"
} == 0
for: 5m
labels:
severity: warning
alert_channel: irc
service: github-runner
team: ci
annotations:
summary: "Linux CI runner offline: {{ $labels.deployment }}"
description: "Deployment {{ $labels.deployment }} in namespace github-runner has 0 ready replicas for more than 5 minutes. CI jobs targeting this repo will queue until the runner pod restarts and re-registers with GitHub. Check pods with: kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }}. Check logs with: kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50. Common causes: PAT missing repo access, runner CrashLoopBackOff, or node/resource pressure."
- name: remote-desktop
rules:
- alert: RemoteDesktopWebDown
expr: probe_success{job="probe-remotedesktop",instance="https://desktop.iamworkin.lan/health"} == 0
for: 3m
labels:
severity: warning
annotations:
summary: "FlowerCore RemoteDesktop web is down"
description: "https://desktop.iamworkin.lan/health probe has failed for 3 minutes. Catalog + session launch surface offline."
- alert: RemoteDesktopMetricsStale
expr: absent(fc_desktop_session_events_total)
for: 10m
labels:
severity: warning
annotations:
summary: "RemoteDesktop /metrics scrape returning no data"
description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape target is misconfigured or the web deployment stopped exporting metrics. Zabbix template carries the same 10m no-data trigger for cross-monitor parity."
# PUBLISHER QUIRK: fc_desktop_pool_depleted / _deficit emit one
# series per template per status (Ready/Warming/BelowDesiredSize/
# Disabled), and the historical series for non-current statuses
# stay at their last value. So just `_depleted > 0` fires forever
# on any template that ever entered a bad state.
#
# SAFE PATTERN: alert only when the canonical "Ready" status
# gauge does NOT report ready=1 for the enabled template. This
# is the publisher's own canary — _ready{status="Ready"}==1 is
# always the current "everything is fine" signal.
- alert: RemoteDesktopPoolDepleted
expr: |
group by(template) (fc_desktop_pool_ready{enabled="true"})
unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1)
for: 5m
labels:
severity: warning
annotations:
summary: "RemoteDesktop pool depleted ({{ $labels.template }})"
description: "Pool for template {{ $labels.template }} has no Ready warm pod for 5 minutes. New launches will cold-start. Check pod-scheduling failures, image pull issues, or exhausted node capacity."
# Same pattern, but only fires when template explicitly reports
# a sustained Warning-level alert state (current-status series).
- alert: RemoteDesktopPoolDeficitSustained
expr: |
fc_desktop_pool_deficit{enabled="true",alert_level="Warning"} > 0
unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1)
for: 10m
labels:
severity: info
annotations:
summary: "RemoteDesktop pool {{ $labels.template }} below desired for 10m"
description: "Pool {{ $labels.template }} has a persistent deficit of {{ $value }} warm pods AND no Ready series. Likely image pull, NFS affinity, or claim-init issue."
- alert: RemoteDesktopSessionChurnSpike
expr: sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20
for: 5m
labels:
severity: info
annotations:
summary: "RemoteDesktop launch rate high ({{ $value | printf \"%.0f\" }}/min)"
description: "Launch events exceed 20/min for 5 minutes. Could be a user-facing feature launch, a pooled template thrashing, or a runaway automation loop."
- alert: RemoteDesktopRecordingEventsDropped
expr: absent_over_time(fc_desktop_session_events_total{event="recording"}[30m]) and on() (sum(fc_desktop_session_events_total{event="launch"}) > 0)
for: 15m
labels:
severity: info
annotations:
summary: "RemoteDesktop recording events silent for 30m despite active launches"
description: "No recording events in 30 minutes while launches are happening. Recording may be silently disabled on all templates (SessionRecordingEnabled=false), the guacd NFS mount may be unhealthy, or the retention sweep isn't emitting events. Not an error by itself — worth checking."
# Match by job — instance label carries full URL incl. /health,
# not just hostname, so a hostname-only match never fires.
- alert: RemoteDesktopTlsExpiry
expr: probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time() < 2 * 86400
for: 6h
labels:
severity: critical
annotations:
summary: "desktop.iamworkin.lan TLS cert expires within 2 days"
description: "The desktop.iamworkin.lan cert is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, and pfSense DNS overrides per the ACME DNS-01 gate."
- name: pi-fleet
rules:
- alert: PiManagerDown
expr: up{job="pimanager-app"} == 0
for: 3m
labels:
severity: warning
annotations:
summary: "PiManager down on {{ $labels.instance }}"
description: "PiManager app on {{ $labels.instance }} ({{ $labels.device }}) unreachable for 3 minutes."
- alert: PiCpuTempHigh
expr: pimanager_cpu_temperature_celsius > 75
for: 5m
labels:
severity: warning
annotations:
summary: "Pi CPU temperature high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"
- alert: PiCpuTempCritical
expr: pimanager_cpu_temperature_celsius > 82
for: 2m
labels:
severity: critical
annotations:
summary: "Pi CPU temperature CRITICAL on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"
- alert: PiMemoryHigh
expr: pimanager_memory_usage_percent > 90
for: 5m
labels:
severity: warning
annotations:
summary: "Pi memory usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
- alert: PiDiskHigh
expr: pimanager_disk_usage_percent > 85
for: 10m
labels:
severity: warning
annotations:
summary: "Pi disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
# Fires when the per-instance sum of pimanager_relay_state is 0 while the
# same instance reports a non-zero relay channel count.
# `on(instance)` is required: `sum by (instance)` leaves only the `instance`
# label on the left-hand side, while pimanager_relay_channel_count keeps its
# scrape labels (at least `job`). A bare `and` matches on the full label
# set, so without `on(instance)` no pair matches and the alert never fires.
- alert: RelayAllOff
  expr: (sum by (instance) (pimanager_relay_state) == 0) and on(instance) (pimanager_relay_channel_count > 0)
  for: 0m
  labels:
    severity: info
  annotations:
    summary: "All relay channels OFF on {{ $labels.instance }}"
- alert: PiWifiWeak
expr: pimanager_wifi_signal_dbm < -75 and pimanager_wifi_signal_dbm != 0
for: 10m
labels:
severity: warning
annotations:
summary: "Weak WiFi signal on {{ $labels.instance }} ({{ $value }}dBm)"
- name: snmp-devices
rules:
# Warning band only: 5 <= level < 15. Levels below 5 belong to
# EpsonInkCritical; keeping the bands disjoint (same pattern as
# PrintPaperRollLow / PrintPaperRollCritical) avoids firing both a warning
# and a critical thermal_print alert for the same cartridge.
# The lower bound also excludes zero and negative readings, which the
# previous `> 0` clause filtered out.
- alert: EpsonInkLow
  expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 15 and prtMarkerSuppliesLevel{job="snmp-printer"} >= 5
  for: 0m
  labels:
    severity: warning
    alert_channel: thermal_print
  annotations:
    summary: "Epson ink low: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
- alert: EpsonInkCritical
expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 5 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0
for: 0m
labels:
severity: critical
alert_channel: thermal_print
annotations:
summary: "Epson ink CRITICAL: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
# `for: 30m` absorbs sleep cycles. The EcoTank sleeps after ~5 min
# of idle and SNMP times out, so a 5m `for:` would page nightly. A
# genuine printer outage (jam, disconnected) lasts well over 30m.
- alert: EpsonPrinterDown
expr: up{job="snmp-printer"} == 0
for: 30m
labels:
severity: warning
annotations:
summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)"
- alert: SynologyDiskLow
expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85
for: 10m
labels:
severity: warning
alert_channel: thermal_print
annotations:
summary: "Synology NAS storage high ({{ $value | printf \"%.1f\" }}%)"
- alert: SynologyDown
expr: up{job="snmp-nas"} == 0
for: 3m
labels:
severity: critical
alert_channel: thermal_print
annotations:
summary: "Synology NAS SNMP unreachable"
# Host-level alerts driven by node-exporter style metrics.
- name: infrastructure
  rules:
    # Any node scrape job reporting up == 0 for 2 minutes.
    - alert: NodeDown
      expr: up{job=~"node-exporter|rke2-nodes|edge-nodes|k8s-node-exporter"} == 0
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: 'Node {{ $labels.instance }} is down'
    # Non-idle CPU share (5m rate, averaged per instance) above 85% for 10 minutes.
    - alert: HighCPU
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: 'High CPU on {{ $labels.instance }} ({{ $value | printf "%.1f" }}%)'
    # Share of memory that is not MemAvailable above 90% for 5 minutes.
    - alert: HighMemory
      expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'High memory usage on {{ $labels.instance }} ({{ $value | printf "%.1f" }}%)'
    # Root filesystem usage above 85% for 10 minutes.
    - alert: DiskSpaceLow
      expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 85
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: 'Disk usage high on {{ $labels.instance }} ({{ $value | printf "%.1f" }}%)'
# Puppet agent + service alerts.
# Mirror of FlowerCore.Notes/scripts/monitoring/alerts.yml `puppet` group
# so a future migration to in-cluster Prometheus inherits the ruleset.
# Source-of-truth for the live Podman Prometheus on noc1 is the Notes file.
# See feedback_monitoring_k8s_target_vs_live_podman.
# Puppet agent freshness and service-state alerts; all route to IRC via
# the alert_channel label.
- name: puppet
  rules:
    # Last-run age above 2h (7200s), held for 30m before firing.
    - alert: PuppetAgentReportStale
      expr: puppet_last_run_age_seconds > 7200
      for: 30m
      labels:
        severity: warning
        alert_channel: irc
      annotations:
        summary: "Puppet agent {{ $labels.instance }} hasn't reported in over 2h"
        description: "Last run age: {{ $value | humanizeDuration }}. The puppet agent on {{ $labels.instance }} may be stopped, the node may be powered off, or noc1 may be unreachable from this node."
        runbook: "1. SSH to node (via noc1 jumpbox if needed) 2. sudo systemctl status puppet 3. sudo puppet agent -t --noop to force a run 4. Check r10k: ssh fcadmin@10.0.56.10 'sudo podman logs openvoxserver --tail 50' 5. Verify noc1 reachability: ping puppet.iamworkin.lan"
    # Last-run age above 24h (86400s), held for 1h; escalates the stale alert.
    - alert: PuppetAgentReportCritical
      expr: puppet_last_run_age_seconds > 86400
      for: 1h
      labels:
        severity: critical
        alert_channel: irc
      annotations:
        summary: "Puppet agent {{ $labels.instance }} silent for over 24h — node is unmanaged"
        description: "Last run age: {{ $value | humanizeDuration }}. Node {{ $labels.instance }} has not submitted a Puppet report in over 24 hours. Config drift is accumulating — investigate immediately. If intentional (maintenance), add to the exclusion filter or silence in Grafana."
        runbook: "URGENT: 1. Check node power state 2. SSH via noc1 jumpbox: ssh fcadmin@10.0.56.10 then ssh <node> 3. sudo systemctl status puppet 4. sudo systemctl start puppet + sudo puppet agent -t 5. Check for network partitions (VLAN connectivity to 10.0.56.10) 6. If node was recently reimaged: sudo puppet agent -t to re-register with new SSL cert"
    # Sprint 33 Cx-7 Phase B (2026-05-25 postmortem follow-up):
    # Detects puppet.service in failed state — distinct from PuppetAgentReportStale
    # which catches "agent hasn't run." This catches "systemd gave up restarting it"
    # (CA-verify loop or other fatal exit). Requires node-exporter systemd collector
    # enabled with --collector.systemd. If `node_systemd_unit_state` has no series
    # for a node, the collector is disabled there — flag in postmortem follow-up.
    - alert: PuppetServiceFailed
      expr: node_systemd_unit_state{name="puppet.service",state="failed"} == 1
      for: 5m
      labels:
        severity: warning
        alert_channel: irc
      annotations:
        summary: "Puppet service failed on {{ $labels.instance }}"
        description: "puppet.service on {{ $labels.instance }} has been in failed state for 5+ minutes. systemd has stopped auto-restarting (CA-verify-loop or other exit). Manual `systemctl status puppet` confirms. Run `sudo systemctl start puppet` to recover; investigate journal for root cause."
        runbook_url: "https://github.com/astoltz/FlowerCore.Notes/blob/master/memory/feedback_puppet_service_dead_after_ca_loop_alert_misreads.md"
# K8s pod-state alerts. Require kube-state-metrics scrape (added
# 2026-04-26 — see scrape_configs above). Would have surfaced the
# agent-zero ollama-proxy 172x crash-loop instead of letting it
# silently churn for ~3 days.
# Pod/deployment state alerts built on kube-state-metrics and cAdvisor series.
- name: kubernetes-state
  rules:
    # More than 5 restarts within the trailing hour, sustained for 15m.
    - alert: KubeContainerRestartingFrequently
      expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: "{{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} restarting >5x/hr"
        description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
    # More than 3 restarts within 15 minutes, sustained for 5m.
    - alert: KubeContainerCrashLooping
      expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
      for: 5m
      labels:
        severity: critical
        alert_channel: thermal_print
      annotations:
        summary: "{{ $labels.namespace }}/{{ $labels.pod }} crashlooping ({{ $value | printf \"%.0f\" }} restarts/15m)"
        description: "Container {{ $labels.container }} restarted {{ $value | printf \"%.0f\" }} times in 15 minutes — actively crashlooping."
    # Pod stuck in Pending, Failed or Unknown phase for 15m.
    - alert: KubePodNotReady
      expr: sum by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Failed|Unknown"}) > 0
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: "{{ $labels.namespace }}/{{ $labels.pod }} not Ready for >15m"
        description: "Pod is in a non-Running, non-Succeeded phase for over 15 minutes. Common causes: ImagePullBackOff (registry/Nexus down, wrong image tag), pending PVC, scheduling failure (taint/resources)."
    # Any container waiting on ImagePullBackOff/ErrImagePull for 10m.
    - alert: KubePodImagePullBackOff
      expr: sum by(namespace, pod) (kube_pod_container_status_waiting_reason{reason=~"ImagePullBackOff|ErrImagePull"}) > 0
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "{{ $labels.namespace }}/{{ $labels.pod }} ImagePullBackOff for >10m"
        description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
    # Desired vs. available replicas differ for 15m.
    # For `a != b` the resulting sample keeps the LEFT-hand value, so $value
    # here is the spec (desired) replica count, not the available count, and
    # no `spec_replicas` label exists on the result. The description below
    # therefore reports $value as the desired count.
    - alert: KubeDeploymentReplicasMismatch
      expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
        description: "Spec wants {{ $value }} replicas but the available replica count has not matched for 15m. Likely a rollout stuck on probe failure, scheduling, or PVC."
    # Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
    # cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
    # outage (21h) hit because no alert fired on the rising multus working
    # set — only downstream blackbox / Traefik / service alerts. With
    # 1Gi limit (bluejay-infra@eb8693e), 80% = ~800MiB; steady-state
    # runs ~150-250MiB so this only fires when an avalanche starts.
    - alert: MultusMemoryPressure
      expr: |
        container_memory_working_set_bytes{container="kube-multus"}
        / container_spec_memory_limit_bytes{container="kube-multus"} > 0.8
      for: 5m
      labels:
        severity: critical
        alert_channel: thermal_print
      annotations:
        summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m"
        description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)."
    # Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the
    # operator-leak avalanche pattern BEFORE it cascades into a multus
    # CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder)
    # emitting pods without ownerReferences will accumulate them when
    # the operator crashes. >25 pending pods in any namespace for 30m
    # is the signal to investigate the reconciler.
    - alert: NamespacePendingPodBacklog
      expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25
      for: 30m
      labels:
        severity: warning
      annotations:
        summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m"
        description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade."
# Longhorn storage health alerts. Required: longhorn scrape job
# (added 2026-04-26 — see scrape_configs above). The K8s events
# for "snapshot becomes not ready to use" are transient lifecycle
# noise, not actionable — these alerts use the actual Longhorn
# gauges that reflect persistent state.
# Longhorn volume, backup and node health alerts.
- name: longhorn-storage
  rules:
    # Volume robustness is encoded in the sample VALUE:
    # 0=unknown, 1=healthy, 2=degraded, 3=faulted.
    # Detached volumes report 0 — that's normal for unattached PVCs, and
    # comparing against 2 / 3 excludes them automatically.
    # The previous selectors (`{robustness="degraded"} == 1`) relied on a
    # `robustness` label, which contradicts the value encoding above, so
    # neither alert could ever fire.
    - alert: LonghornVolumeDegraded
      expr: longhorn_volume_robustness == 2
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: "Longhorn volume {{ $labels.volume }} degraded for >15m"
        description: "Volume {{ $labels.volume }} on node {{ $labels.node }} has been degraded (one or more replicas unhealthy) for 15+ minutes. Auto-rebuild may need help — check 'kubectl describe volume.longhorn.io {{ $labels.volume }} -n longhorn-system'."
    - alert: LonghornVolumeFaulted
      expr: longhorn_volume_robustness == 3
      for: 5m
      labels:
        severity: critical
        alert_channel: thermal_print
      annotations:
        summary: "Longhorn volume {{ $labels.volume }} FAULTED"
        description: "Volume {{ $labels.volume }} on node {{ $labels.node }} is faulted — all replicas unavailable. Data inaccessible. Manual intervention required."
    # No backup in 36h indicates the daily-backup recurringJob is
    # silently failing. Allows for one missed run + slack.
    # NOTE(review): this expression subtracts (backup state x backup size in
    # bytes) from time(), which is not a timestamp, and it selects on a
    # `state` label. Verify against the series Longhorn actually exports
    # before relying on this alert; it is left unchanged here.
    - alert: LonghornBackupStale
      expr: |
        (time() - max by(volume) (longhorn_backup_state{state="Completed"} * on(backup) group_left() longhorn_backup_actual_size_bytes)) > 36 * 3600
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: "Longhorn volume {{ $labels.volume }} has no completed backup in >36h"
        description: "Daily backup recurringJob (cron 0 2 * * *) appears to have skipped this volume. Check 'kubectl get backups.longhorn.io -n longhorn-system' and the daily-backup CronJob logs."
    # Node "ready" condition reporting 0 with a non-empty reason for 5m.
    - alert: LonghornNodeUnhealthy
      expr: longhorn_node_status{condition="ready",condition_reason!=""} == 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Longhorn node {{ $labels.node }} not Ready"
        description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."
# ============================================================
# FC Signage Marquee Performance — Track 3 + 8 (2026-05-06)
# Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
# Source-of-truth for the live Podman Prometheus on noc1 is the
# Notes file; this K8s ConfigMap exists so a future migration to
# in-cluster Prometheus inherits the ruleset automatically.
# See feedback_monitoring_k8s_target_vs_live_podman.
# ============================================================
# Marquee renderer performance alerts. Every rule ends with
# `unless on() absent_over_time(<metric>[7d])`, which suppresses the alert
# while the metric has produced no samples at all in the last 7 days.
- name: fc-signage-marquee
  rules:
    # Dropped frames divided by rendered frames (5m rates) above 5%.
    - alert: MarqueeDroppedFramesHigh
      expr: |
        (
          sum by (renderer, phase, node_id) (rate(marquee_dropped_frames_total[5m]))
          /
          sum by (renderer, phase, node_id) (rate(marquee_render_latency_ms_count[5m]))
        ) > 0.05
        unless on()
        absent_over_time(marquee_dropped_frames_total[7d])
      for: 5m
      labels:
        severity: warning
        service: signage
        alert_channel: irc
      annotations:
        summary: "Marquee dropped-frame rate >5% on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
        description: "Renderer {{ $labels.renderer }} on {{ $labels.node_id }} drops >5% of frames during {{ $labels.phase }}. Animation visibly stuttery."
    # p99 of the render-latency histogram above 16 (ms, per the metric name).
    - alert: MarqueeRenderLatencyP99High
      expr: |
        histogram_quantile(
          0.99,
          sum by (renderer, phase, node_id, le) (rate(marquee_render_latency_ms_bucket[5m]))
        ) > 16
        unless on()
        absent_over_time(marquee_render_latency_ms_bucket[7d])
      for: 10m
      labels:
        severity: warning
        service: signage
        alert_channel: irc
      annotations:
        summary: "Marquee render latency p99 > 16ms on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
        description: "Per-frame render latency p99 has exceeded the Pi-class 16ms budget for 10 minutes."
    # Relative deviation of the median observed animation duration from the
    # per-phase target, above 10%.
    - alert: MarqueeAnimationDurationDrift
      expr: |
        abs(
          histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m])))
          -
          on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
        )
        /
        on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
        > 0.10
        unless on()
        absent_over_time(marquee_animation_duration_ms_bucket[7d])
      for: 15m
      labels:
        severity: info
        service: signage
        alert_channel: irc
      annotations:
        summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
        description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
# =============================================================================
# ConfigMap: Blackbox Exporter Configuration
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: blackbox-config
  namespace: monitoring
data:
  blackbox.yml: |
    modules:
      # Plain HTTP probe: GET, must answer 200 over IPv4 within 5s.
      http_2xx:
        prober: http
        timeout: 5s
        http:
          valid_http_versions:
            - "HTTP/1.1"
            - "HTTP/2.0"
          valid_status_codes:
            - 200
          method: GET
          fail_if_body_not_matches_regexp: []
          preferred_ip_protocol: ip4
      # Same as http_2xx, but the body must also contain the literal "models".
      http_ollama:
        prober: http
        timeout: 5s
        http:
          valid_http_versions:
            - "HTTP/1.1"
            - "HTTP/2.0"
          valid_status_codes:
            - 200
          method: GET
          fail_if_body_not_matches_regexp:
            - '"models"'
          preferred_ip_protocol: ip4
      # https_internal — for Traefik-fronted services with step-ca leaf
      # certs. blackbox does not trust the step-ca root CA, so http_2xx
      # against any *.iamworkin.lan host fails with x509 unknown authority.
      # Redirects + multiple status codes are accepted because some hosts
      # 302 to /login or /scalar.
      https_internal:
        prober: http
        timeout: 10s
        http:
          valid_http_versions:
            - "HTTP/1.1"
            - "HTTP/2.0"
          valid_status_codes:
            - 200
            - 301
            - 302
            - 303
            - 307
            - 308
          method: GET
          follow_redirects: true
          preferred_ip_protocol: ip4
          tls_config:
            insecure_skip_verify: true
# =============================================================================
# ConfigMap: IRC Notify Script
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
name: irc-notify-script
namespace: monitoring
data:
notify.py: |
#!/usr/bin/env python3
"""HTTP->IRC alert relay with thermal-printer DIGEST forwarding.

Listens on :9119, posts to #alerts on UnrealIRCd, forwards to Print.Web
/api/print/alert. Thermal printing is BATCHED into hourly digests by
default so the printer no longer spam-fires per Grafana webhook.

Routing (per Grafana webhook alert):
  - IRC: always per-event (operator likes the stream)
  - Thermal printer (FIRING alerts only):
      * label alert_channel=thermal_print_immediate -> added to the digest
        buffer; when the fingerprint is NEW the current digest (including
        this alert) is flushed right away. Repeat webhooks for the same
        fingerprint only dedupe in the buffer.
      * severity in {critical,disaster,page} OR
        label alert_channel=thermal_print -> enqueue into hourly digest
      * everything else -> IRC only
  - RESOLVED webhooks remove the alert from the digest buffer

Env vars (defaults preserve old behavior on first deploy):
  THERMAL_PRINT_ENABLED default "true" - master kill switch
  BATCH_INTERVAL_MIN    default "60"   - minutes between digest prints
  BATCH_MAX_PENDING     default "50"   - force-flush threshold

HTTP surface:
  POST /      - Grafana webhook entry
  POST /flush - manual digest flush (idempotent)
  GET /       - status + config + buffer depth + stats
"""
import json, os, socket, sys, threading, time
from collections import defaultdict
from datetime import datetime, timezone
from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.request import Request, urlopen
# --- Configuration (read once from the environment at import time) ---------
# Thermal printing is on only when the variable equals "true" (case-insensitive).
THERMAL_PRINT_ENABLED = os.environ.get("THERMAL_PRINT_ENABLED", "true").lower() == "true"
# Minutes between scheduled digest flushes.
BATCH_INTERVAL_MIN = int(os.environ.get("BATCH_INTERVAL_MIN", "60"))
# Buffer depth at which digest_loop flushes without waiting for the interval.
BATCH_MAX_PENDING = int(os.environ.get("BATCH_MAX_PENDING", "50"))
IRC_HOST = os.environ.get("IRC_HOST", "unrealircd.irc.svc")
IRC_PORT = int(os.environ.get("IRC_PORT", "6667"))
IRC_NICK = os.environ.get("IRC_NICK", "grafana-bot")
IRC_CHANNEL = os.environ.get("IRC_CHANNEL", "#alerts")
PRINT_WEB_URL = os.environ.get("PRINT_WEB_URL", "http://10.0.57.16:5200/api/print/alert")
# --- Shared mutable state ---------------------------------------------------
# _buffer_lock guards structural changes to _buffer (add / remove / snapshot).
_buffer_lock = threading.Lock()
_buffer = {}  # fingerprint -> {"alert": dict, "first_seen": float, "last_seen": float}
# Wall-clock time of the last digest flush; initialised to process start.
_last_flush_time = time.time()
# Counters reported by GET /; updated without a lock.
_stats = {"webhooks_received": 0, "irc_sent": 0, "print_immediate": 0,
          "digest_flushed": 0, "buffer_dedup": 0, "buffer_added": 0,
          "buffer_resolved": 0, "started_at": time.time()}
def send_irc(message):
    """Deliver ``message`` to IRC_CHANNEL over a short-lived IRC connection.

    Opens a fresh connection, registers as IRC_NICK, joins the channel, sends
    each non-blank line of ``message`` as its own PRIVMSG, then quits.

    Args:
        message: text to post; may contain several lines separated by "\\n".

    Returns:
        True when every line was handed to the socket, False when registration
        did not complete within ~10s or any error occurred (errors are logged
        to stderr, never raised).
    """
    try:
        # The context manager closes the socket on every exit path, including
        # exceptions raised mid-handshake.
        with socket.create_connection((IRC_HOST, IRC_PORT), timeout=15) as sock:
            sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
            sock.sendall(f"USER {IRC_NICK} 0 * :Grafana Alert Bot\r\n".encode())
            registered = False
            deadline = time.time() + 10
            pending = ""
            while not registered and time.time() < deadline:
                try:
                    data = sock.recv(4096).decode("utf-8", errors="replace")
                except socket.timeout:
                    break
                if not data:
                    break
                pending += data
                # Handle each complete line exactly once; keep a trailing
                # partial line for the next recv so old PINGs are not
                # answered again on every read.
                *complete, pending = pending.split("\r\n")
                for line in complete:
                    if line.startswith("PING"):
                        # Tolerate a PING without an argument instead of
                        # raising IndexError.
                        parts = line.split(" ", 1)
                        token = parts[1] if len(parts) > 1 else ""
                        sock.sendall(("PONG " + token + "\r\n").encode())
                    if " 001 " in line:  # RPL_WELCOME: registration complete
                        registered = True
            if not registered:
                return False
            sock.sendall(f"JOIN {IRC_CHANNEL}\r\n".encode())
            time.sleep(0.5)
            try:
                sock.recv(4096)  # drain the JOIN burst; content is irrelevant
            except socket.timeout:
                # A quiet server must not cause the alert to be dropped.
                pass
            # Treat CR like LF so text embedded in an alert cannot smuggle
            # extra IRC commands into the stream.
            for line in message.replace("\r", "\n").split("\n"):
                if line.strip():
                    sock.sendall(f"PRIVMSG {IRC_CHANNEL} :{line}\r\n".encode())
                    time.sleep(0.3)  # pace lines before sending the next one
            time.sleep(0.5)
            sock.sendall(b"QUIT :alert delivered\r\n")
        _stats["irc_sent"] += 1
        return True
    except Exception as e:
        print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
        return False
def post_thermal(payload, kind):
    """POST ``payload`` as JSON to PRINT_WEB_URL.

    Args:
        payload: JSON-serialisable dict; its "title" is used for log lines.
        kind: tag used in log output. The value "immediate" also bumps the
            ``print_immediate`` counter.

    Returns:
        True when the request completed without raising, False when thermal
        printing is disabled or the request failed (failures are logged to
        stderr, never raised).
    """
    if not THERMAL_PRINT_ENABLED:
        print(f"[irc-notify] thermal disabled; skip {kind} ({payload.get('title','?')[:40]})", file=sys.stderr)
        return False
    try:
        req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
                      headers={"Content-Type": "application/json"}, method="POST")
        # Close the response explicitly so the connection is released on
        # every call in this long-running process.
        with urlopen(req, timeout=10):
            pass
        if kind == "immediate":
            _stats["print_immediate"] += 1
        print(f"[irc-notify] thermal {kind} sent: {payload.get('title','?')[:50]}", file=sys.stderr)
        return True
    except Exception as e:
        print(f"[irc-notify] thermal {kind} failed: {e}", file=sys.stderr)
        return False
def fingerprint_of(alert):
    """Return the dedup key for ``alert``.

    The webhook-supplied "fingerprint" wins when it is non-empty. Otherwise a
    key of the form "<alertname>/<namespace>/<target>" is synthesised, where
    target is the first truthy label among pod, instance, deployment,
    statefulset and namespace (empty string when none is set).
    """
    supplied = alert.get("fingerprint", "")
    if supplied:
        return supplied
    labels = alert.get("labels", {})
    target = ""
    for key in ("pod", "instance", "deployment", "statefulset", "namespace"):
        candidate = labels.get(key)
        if candidate:
            target = candidate
            break
    return "{}/{}/{}".format(labels.get("alertname", "?"), labels.get("namespace", ""), target)
def is_critical(alert):
    """True when the alert's severity label is critical, disaster or page (any case)."""
    severity = alert.get("labels", {}).get("severity", "")
    return severity.lower() in ("critical", "disaster", "page")
def is_immediate_label(alert):
    """True when the alert carries alert_channel=thermal_print_immediate."""
    channel = alert.get("labels", {}).get("alert_channel")
    return channel == "thermal_print_immediate"
def is_batched_label(alert):
    """True when the alert carries alert_channel=thermal_print."""
    channel = alert.get("labels", {}).get("alert_channel")
    return channel == "thermal_print"
def add_to_digest(alert):
    """Apply one webhook alert to the digest buffer.

    A resolved alert removes its fingerprint from the buffer; a firing alert
    either refreshes the existing entry or creates a new one.

    Returns:
        True only when a NEW fingerprint was inserted. False for a refresh of
        an existing entry, a resolution, or when thermal printing is disabled.
    """
    if not THERMAL_PRINT_ENABLED:
        return False
    key = fingerprint_of(alert)
    resolved = alert.get("status", "firing").lower() == "resolved"
    with _buffer_lock:
        entry = _buffer.get(key)
        if resolved:
            # Only count a resolution when something was actually buffered.
            if entry is not None:
                del _buffer[key]
                _stats["buffer_resolved"] += 1
            return False
        if entry is not None:
            # Repeat webhook for a known fingerprint: keep first_seen,
            # refresh the payload and last_seen.
            entry["last_seen"] = time.time()
            entry["alert"] = alert
            _stats["buffer_dedup"] += 1
            return False
        _buffer[key] = {"alert": alert, "first_seen": time.time(), "last_seen": time.time()}
        _stats["buffer_added"] += 1
        return True
def build_digest_payload(items=None):
    """Render buffered alerts into a single Print.Web digest payload.

    Args:
        items: optional list of buffer entries ({"alert": dict, ...}) to
            render. When omitted, a snapshot of the shared buffer is taken
            under the buffer lock.

    Returns:
        The payload dict, or None when there is nothing to render. Alerts are
        grouped by alertname; each group line shows the distinct severities,
        the group size and up to five targets.
    """
    if items is None:
        with _buffer_lock:
            items = list(_buffer.values())
    if not items:
        return None
    by_name = defaultdict(list)
    for item in items:
        labels = item["alert"].get("labels", {})
        by_name[labels.get("alertname", "Unknown")].append(item)
    lines = []
    for name, group in sorted(by_name.items()):
        targets = []
        for it in group[:5]:
            labels = it["alert"].get("labels", {})
            t = (labels.get("pod") or labels.get("instance") or labels.get("deployment")
                 or labels.get("statefulset") or labels.get("namespace") or "?")
            targets.append(t)
        more = f" (+{len(group)-5})" if len(group) > 5 else ""
        sevs = sorted({it["alert"].get("labels", {}).get("severity", "warning") for it in group})
        lines.append(f"[{'/'.join(sevs)}] {name} x{len(group)}: {', '.join(targets)}{more}")
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    title = f"Alert digest: {len(items)} firing"
    body = "\n".join([
        f"=== {title} ===",
        f"as of {now}",
        "",
        *lines,
        "",
        "Stream: #alerts (IRC) | Triage: grafana-noc1.iamworkin.lan",
        "Force-flush: POST irc-notify.monitoring.svc:9119/flush",
    ])
    return {"title": title, "severity": "Warning", "host": "monitoring",
            "message": body, "eventId": f"digest-{int(time.time())}",
            "source": "Grafana digest", "status": "PROBLEM", "acknowledged": False}
def flush_digest():
    """Print the current digest and drop the printed entries from the buffer.

    The buffer is snapshotted once; only the snapshotted entries are removed
    afterwards. Alerts that arrive while the HTTP post is in flight therefore
    stay buffered for the next digest instead of being wiped unprinted.
    Printed entries are removed even when the post fails, so a dead printer
    cannot make the buffer grow without bound.

    Returns:
        True when a digest was sent, False when the buffer was empty or the
        post failed.
    """
    with _buffer_lock:
        snapshot = dict(_buffer)
    payload = build_digest_payload(list(snapshot.values()))
    if payload is None:
        print("[irc-notify] flush: buffer empty, no digest sent", file=sys.stderr)
        return False
    sent = post_thermal(payload, "digest")
    with _buffer_lock:
        for fp, entry in snapshot.items():
            # Identity check: an alert that resolved and re-fired during the
            # post is a new entry object and must survive this flush.
            if _buffer.get(fp) is entry:
                del _buffer[fp]
    if sent:
        _stats["digest_flushed"] += 1
    return sent
def digest_loop():
    """Background loop that flushes the digest on schedule or when it is full.

    Every 15 seconds it checks whether BATCH_INTERVAL_MIN minutes have passed
    since the last flush, or whether the buffer holds at least
    BATCH_MAX_PENDING entries, and flushes in either case. Any exception is
    logged and followed by a 60 second pause; the loop never exits.
    """
    global _last_flush_time
    while True:
        try:
            tick = time.time()
            if tick - _last_flush_time >= BATCH_INTERVAL_MIN * 60:
                notice = f"[irc-notify] digest tick: interval reached ({BATCH_INTERVAL_MIN}m); buffer={len(_buffer)}"
            elif len(_buffer) >= BATCH_MAX_PENDING:
                notice = f"[irc-notify] digest tick: buffer full ({len(_buffer)}); force flush"
            else:
                notice = None
            if notice is not None:
                print(notice, file=sys.stderr)
                flush_digest()
                # The timestamp taken before the flush restarts the interval.
                _last_flush_time = tick
            time.sleep(15)
        except Exception as exc:
            print(f"[irc-notify] digest loop error: {exc}", file=sys.stderr)
            time.sleep(60)
class Handler(BaseHTTPRequestHandler):
    """HTTP entry point: Grafana webhooks (POST), manual flush, status (GET)."""

    def _send_json(self, code, body_bytes):
        """Write a complete JSON response with the given status code."""
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(body_bytes)

    def do_POST(self):
        """Handle POST /flush (manual digest flush) and webhook deliveries.

        Each alert in the webhook body is relayed to IRC, then routed to the
        thermal digest buffer according to its status and labels. A body that
        cannot be parsed as a JSON object is answered with HTTP 400 instead
        of raising and closing the connection without a response.
        """
        global _last_flush_time
        if self.path == "/flush":
            ok = flush_digest()
            self._send_json(200, json.dumps({"flushed": ok, "buffer_after": len(_buffer)}).encode())
            return
        _stats["webhooks_received"] += 1
        try:
            length = int(self.headers.get("Content-Length", 0))
            if length < 0:
                raise ValueError("negative Content-Length")
            body = json.loads(self.rfile.read(length)) if length else {}
            if not isinstance(body, dict):
                raise ValueError("webhook body is not a JSON object")
        except ValueError as e:
            # Covers malformed Content-Length, invalid JSON and bad encodings
            # (JSONDecodeError and UnicodeDecodeError are ValueError subclasses).
            print(f"[irc-notify] bad webhook request: {e}", file=sys.stderr)
            self._send_json(400, b'{"status":"bad request"}')
            return
        alerts = body.get("alerts", [])
        if not isinstance(alerts, list):
            alerts = []
        for alert in alerts:
            if not isinstance(alert, dict):
                continue
            status = alert.get("status", "unknown").upper()
            labels = alert.get("labels", {})
            name = labels.get("alertname", "Unknown")
            summary = alert.get("annotations", {}).get("summary", "")
            desc = alert.get("annotations", {}).get("description", "")
            severity = labels.get("severity", "")
            # \x03NN is an IRC colour code (04 for FIRING, 03 for everything else).
            icon = "\x0304[FIRING]\x03" if status == "FIRING" else "\x0303[RESOLVED]\x03"
            sev_tag = f" [{severity}]" if severity else ""
            msg = f"{icon}{sev_tag} {name}: {summary}"
            if desc:
                msg += f"\n    {desc}"
            send_irc(msg)
            # Thermal routing — EVERYTHING (including criticals) goes into
            # the hourly digest. Only the explicit `alert_channel=thermal_print_immediate`
            # label bypasses, and even that flushes-the-current-digest rather
            # than printing a standalone job, so the same fingerprint can't
            # spam the printer per webhook cycle.
            if status == "RESOLVED":
                add_to_digest(alert)  # removes from buffer
                continue
            if is_immediate_label(alert):
                # Explicit opt-in for "paper this NOW" — first arrival of a
                # new fingerprint triggers an immediate digest flush; repeat
                # webhooks for the same fingerprint dedupe in the buffer
                # until the next interval or until the alert resolves.
                new_in_buffer = add_to_digest(alert)
                if new_in_buffer:
                    flush_digest()
                    _last_flush_time = time.time()
            elif is_critical(alert) or is_batched_label(alert):
                add_to_digest(alert)
            # else: IRC-only (warnings without thermal_print label)
        self._send_json(200, b'{"status":"ok"}')

    def do_GET(self):
        """Return service status, effective config, buffer summary and counters as JSON."""
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        with _buffer_lock:
            alertnames = sorted({it["alert"].get("labels", {}).get("alertname", "?") for it in _buffer.values()})
            depth = len(_buffer)
        info = {
            "service": "irc-notify",
            "config": {"thermal_print_enabled": THERMAL_PRINT_ENABLED,
                       "batch_interval_min": BATCH_INTERVAL_MIN,
                       "batch_max_pending": BATCH_MAX_PENDING,
                       "irc_target": f"{IRC_HOST}:{IRC_PORT} {IRC_CHANNEL}",
                       "print_web_url": PRINT_WEB_URL},
            "buffer": {"depth": depth, "alertnames": alertnames,
                       "seconds_since_last_flush": int(time.time() - _last_flush_time),
                       "seconds_until_next_flush": max(0, int(BATCH_INTERVAL_MIN*60 - (time.time() - _last_flush_time)))},
            "stats": _stats,
        }
        self.wfile.write(json.dumps(info, indent=2).encode())

    def log_message(self, format, *args):
        """Log the first format argument to stderr with the service prefix.

        Falls back to the raw format string when no arguments were supplied,
        instead of raising IndexError.
        """
        detail = args[0] if args else format
        print(f"[irc-notify] {detail}", file=sys.stderr)
# Entry point: start the digest flusher as a daemon thread, then serve
# HTTP on all interfaces, port 9119, until the process is stopped.
if __name__ == "__main__":
    # daemon=True so the flusher thread never keeps the process alive on exit.
    threading.Thread(target=digest_loop, daemon=True).start()
    server = HTTPServer(("0.0.0.0", 9119), Handler)
    print(f"[irc-notify] :9119 -> IRC {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} | thermal={'ON' if THERMAL_PRINT_ENABLED else 'OFF'} | digest={BATCH_INTERVAL_MIN}m max={BATCH_MAX_PENDING}", file=sys.stderr)
    server.serve_forever()
# =============================================================================
# SNMP Exporter Auth Secret
# =============================================================================
# The full snmp.yml (~2MB, auto-generated) exceeds the 1MB ConfigMap limit.
# Strategy: store SNMP auth credentials in a Secret, and use an init container
# to download the full snmp.yml from Gitea, or split into multiple ConfigMaps.
# For now, we mount a minimal auth-only config and rely on the default modules
# bundled in the snmp-exporter image. To use custom modules, apply
# snmp-config.yaml separately (see comments in that file).
---
# NOTE(review): these SNMP credentials are committed in plaintext. The
# Grafana credentials in this same file are synced through a OnePasswordItem;
# consider moving snmp-auth to the same mechanism and rotating the values
# below, since they are already in VCS history.
apiVersion: v1
kind: Secret
metadata:
  name: snmp-auth
  namespace: monitoring
type: Opaque
stringData:
  # SNMP v2 community string used by prometheus scrape configs
  SNMP_COMMUNITY_BLUEJAY: bluejay_monitor
  # SNMP v3 user plus authentication and privacy passphrases
  SNMP_V3_USER: bluejay_snmpv3
  SNMP_V3_AUTH_PASS: BlueJay-SNMP-Auth-2026
  SNMP_V3_PRIV_PASS: BlueJay-SNMP-Priv-2026
# =============================================================================
# Grafana Credentials — synced from 1Password via Operator
# =============================================================================
# 1Password vault: IAmWorkin > "Grafana"
# Creates K8s Secret "grafana-credentials" with fields: username, password
# Grafana Deployment uses secretKeyRef to map username→GF_SECURITY_ADMIN_USER, password→GF_SECURITY_ADMIN_PASSWORD
---
# 1Password Operator item; per the header above it materialises the K8s
# Secret "grafana-credentials" from the referenced vault item.
apiVersion: onepassword.com/v1
kind: OnePasswordItem
metadata:
  name: grafana-credentials
  namespace: monitoring
spec:
  # Path format: vaults/<vault>/items/<item>
  itemPath: vaults/IAmWorkin/items/Grafana
# =============================================================================
# RBAC: ServiceAccount + ClusterRole for Prometheus K8s SD
# =============================================================================
---
# Identity the Prometheus Deployment runs as (serviceAccountName: prometheus);
# bound to the "prometheus" ClusterRole by the ClusterRoleBinding below.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
# Read-only, cluster-wide access for Prometheus service discovery and scraping.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core API group: discovery targets plus the node proxy subresource.
  - apiGroups:
      - ""
    resources:
      - "nodes"
      - "nodes/proxy"
      - "services"
      - "endpoints"
      - "pods"
    verbs:
      - "get"
      - "list"
      - "watch"
  # Ingress objects in both the legacy and current API groups.
  - apiGroups:
      - "extensions"
      - "networking.k8s.io"
    resources:
      - "ingresses"
    verbs:
      - "get"
      - "list"
      - "watch"
  # Raw /metrics endpoint (non-resource URL).
  - nonResourceURLs:
      - "/metrics"
    verbs:
      - "get"
---
# Grants the "prometheus" ClusterRole to the monitoring/prometheus ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitoring
# =============================================================================
# PVC: Prometheus Data (10Gi, Longhorn)
# =============================================================================
---
# TSDB volume mounted at /prometheus by the Prometheus Deployment.
# NOTE(review): the Deployment sets 90d retention; confirm 10Gi is enough
# for that window at the current series count.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-data
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      storage: 10Gi
# =============================================================================
# PVC: Grafana Data (2Gi, Longhorn)
# =============================================================================
---
# Persistent volume claim for Grafana data: 2Gi, Longhorn, single-node read/write.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana-data
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      storage: 2Gi
# =============================================================================
# Deployment: Prometheus
# =============================================================================
---
# Single-replica Prometheus server backed by the prometheus-data PVC.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  replicas: 1
  # Recreate stops the old pod before starting the new one; the data PVC is
  # ReadWriteOnce.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      securityContext:
        fsGroup: 65534 # nobody
        runAsUser: 65534
        runAsGroup: 65534
      containers:
        - name: prometheus
          # NOTE(review): `:latest` is a floating tag, so a pod restart can
          # silently change the Prometheus version. Consider pinning a
          # specific release tag or digest.
          image: docker.io/prom/prometheus:latest
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention.time=90d"
            - "--web.enable-lifecycle"
          ports:
            - containerPort: 9090
              name: http
          # The three config files come from the same ConfigMap, each mounted
          # individually via subPath.
          # NOTE(review): subPath mounts are not refreshed when the ConfigMap
          # changes — confirm that rollouts restart the pod after rule edits.
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus/prometheus.yml
              subPath: prometheus.yml
              readOnly: true
            - name: config
              mountPath: /etc/prometheus/alerts.yml
              subPath: alerts.yml
              readOnly: true
            - name: config
              mountPath: /etc/prometheus/recording-rules.yml
              subPath: recording-rules.yml
              readOnly: true
            - name: data
              mountPath: /prometheus
          resources:
            requests:
              cpu: 200m
              memory: 512Mi
            limits:
              cpu: "1"
              memory: 2Gi
          # Probes use Prometheus' built-in health and readiness endpoints.
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 15
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: config
          configMap:
            name: prometheus-config
        - name: data
          persistentVolumeClaim:
            claimName: prometheus-data
# =============================================================================
# ConfigMap: Grafana Dashboard Provider
# =============================================================================
---
# Grafana dashboard provisioning provider: a single file-based provider named
# 'default' that reads dashboards from /var/lib/grafana/dashboards for org 1.
# With foldersFromFilesStructure enabled and an empty `folder`, Grafana derives
# dashboard folders from the directory layout under that path;
# updateIntervalSeconds sets the rescan period (30s) and disableDeletion: false
# lets dashboards be removed when their file disappears.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-provider
  namespace: monitoring
data:
  default.yml: |
    apiVersion: 1
    providers:
      - name: 'default'
        orgId: 1
        folder: ''
        type: file
        disableDeletion: false
        updateIntervalSeconds: 30
        options:
          path: /var/lib/grafana/dashboards
          foldersFromFilesStructure: true
# =============================================================================
# ConfigMap: Grafana Dashboards (AI Stack Health)
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboards
namespace: monitoring
data:
ai-stack-health.json: |
{
"id": null,
"panels": [
{
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": { "color": "red", "text": "DOWN" },
"1": { "color": "green", "text": "UP" }
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
"id": 1,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "probe_success{job=\"probe-ollama-local\"}",
"legendFormat": "Status"
}
],
"title": "Ollama (Local)",
"type": "stat"
},
{
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": { "color": "red", "text": "DOWN" },
"1": { "color": "green", "text": "UP" }
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
"id": 2,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "probe_success{job=\"probe-ollama-edge1\"}",
"legendFormat": "Status"
}
],
"title": "Ollama (Edge1)",
"type": "stat"
},
{
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": { "color": "red", "text": "DOWN" },
"1": { "color": "green", "text": "UP" }
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
"id": 3,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "probe_success{job=\"probe-agentzero-local\"}",
"legendFormat": "Status"
}
],
"title": "Agent Zero (Local)",
"type": "stat"
},
{
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": { "color": "red", "text": "DOWN" },
"1": { "color": "green", "text": "UP" }
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
"id": 4,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "probe_success{job=\"probe-agentzero-nuc\"}",
"legendFormat": "Status"
}
],
"title": "Agent Zero (NUC)",
"type": "stat"
},
{
"fieldConfig": {
"defaults": {
"custom": { "fillOpacity": 15, "lineWidth": 2 },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 3 }
]
},
"unit": "s"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"id": 5,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "probe_duration_seconds{service=\"ollama\"}",
"legendFormat": "{{ deployment }}"
}
],
"title": "Ollama Response Time",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"custom": { "fillOpacity": 15, "lineWidth": 2 },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 3 }
]
},
"unit": "s"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"id": 6,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "probe_duration_seconds{service=\"agent-zero\"}",
"legendFormat": "{{ deployment }}"
}
],
"title": "Agent Zero Response Time",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"custom": { "fillOpacity": 30, "lineWidth": 1, "stacking": { "mode": "none" } },
"mappings": [
{
"options": {
"0": { "text": "DOWN" },
"1": { "text": "UP" }
},
"type": "value"
}
],
"max": 1,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 12 },
"id": 7,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "probe_success{service=\"ollama\"}",
"legendFormat": "Ollama ({{ deployment }})"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "probe_success{service=\"agent-zero\"}",
"legendFormat": "Agent Zero ({{ deployment }})"
}
],
"title": "Uptime History",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"custom": { "fillOpacity": 10, "lineWidth": 2 },
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 75 },
{ "color": "red", "value": 90 }
]
},
"unit": "percent"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
"id": 8,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "100 - (avg(rate(node_cpu_seconds_total{instance=\"edge1\",mode=\"idle\"}[5m])) * 100)",
"legendFormat": "CPU %"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "(1 - node_memory_MemAvailable_bytes{instance=\"edge1\"} / node_memory_MemTotal_bytes{instance=\"edge1\"}) * 100",
"legendFormat": "Memory %"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "(1 - node_filesystem_avail_bytes{instance=\"edge1\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=\"edge1\",mountpoint=\"/\"}) * 100",
"legendFormat": "Disk %"
}
],
"title": "Edge1 Pi 5 \u2014 CPU / Memory / Disk",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"custom": { "fillOpacity": 10, "lineWidth": 2 },
"unit": "s"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
"id": 9,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "probe_dns_lookup_time_seconds",
"legendFormat": "{{ job }}"
}
],
"title": "Probe DNS Lookup Time",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": ["ai", "ollama", "agent-zero", "blue-jay"],
"time": { "from": "now-1h", "to": "now" },
"timezone": "browser",
"title": "AI Stack Health",
"uid": "ai-stack-health",
"version": 1
}
# =============================================================================
# ConfigMap: Grafana Dashboard — Edge Nodes
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-edge-nodes
namespace: monitoring
data:
bluejay-edge-nodes.json: |
{
"id": null,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": {
"color": "red",
"text": "DOWN"
},
"1": {
"color": "green",
"text": "UP"
}
},
"type": "value"
}
]
}
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
"targets": [
{
"expr": "up{instance=~\"edge.*\"}",
"legendFormat": "{{instance}}",
"refId": "A"
}
],
"title": "Edge Node Status",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"max": 100,
"min": 0,
"unit": "percent"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"targets": [
{
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge1.*\"}[5m])) * 100)",
"legendFormat": "CPU %",
"refId": "A"
},
{
"expr": "node_load1{instance=~\"edge1.*\"}",
"legendFormat": "Load 1m",
"refId": "B"
}
],
"title": "edge1 (Pi5 + Hailo) CPU",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"max": 100,
"min": 0,
"unit": "percent"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"targets": [
{
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge2.*\"}[5m])) * 100)",
"legendFormat": "CPU %",
"refId": "A"
},
{
"expr": "node_load1{instance=~\"edge2.*\"}",
"legendFormat": "Load 1m",
"refId": "B"
}
],
"title": "edge2 (Pi4) CPU",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"max": 100,
"min": 0,
"unit": "percent"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
"targets": [
{
"expr": "(1 - node_memory_MemAvailable_bytes{instance=~\"edge.*\"} / node_memory_MemTotal_bytes{instance=~\"edge.*\"}) * 100",
"legendFormat": "{{instance}}",
"refId": "A"
}
],
"title": "Edge Memory Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"max": 100,
"min": 0,
"unit": "percent"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
"targets": [
{
"expr": "100 - (node_filesystem_avail_bytes{instance=~\"edge.*\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=~\"edge.*\",mountpoint=\"/\"} * 100)",
"legendFormat": "{{instance}}",
"refId": "A"
}
],
"title": "Edge Disk Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "celsius"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
"targets": [
{
"expr": "node_hwmon_temp_celsius{instance=~\"edge.*\"}",
"legendFormat": "{{instance}} {{chip}} {{sensor}}",
"refId": "A"
}
],
"title": "Edge CPU Temperature",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "bps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
"targets": [
{
"expr": "rate(node_network_receive_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8",
"legendFormat": "{{instance}} {{device}} RX",
"refId": "A"
},
{
"expr": "-rate(node_network_transmit_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8",
"legendFormat": "{{instance}} {{device}} TX",
"refId": "B"
}
],
"title": "Edge Network Traffic",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 40,
"tags": ["bluejay", "edge"],
"timezone": "browser",
"title": "BlueJay Edge Nodes",
"uid": "7e1603b9-e918-4b3f-a22b-163132fd5cee"
}
# =============================================================================
# ConfigMap: Grafana Dashboard — Network Overview
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-network-overview
namespace: monitoring
data:
bluejay-network-overview.json: |
{
"id": null,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{ "color": "green", "value": null }
]
}
}
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
"targets": [
{
"expr": "count(up == 1)",
"legendFormat": "Up",
"refId": "A"
},
{
"expr": "count(up == 0)",
"legendFormat": "Down",
"refId": "B"
}
],
"title": "Target Health",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"max": 4,
"min": 0,
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 2 },
{ "color": "red", "value": 3 }
]
}
}
},
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 4 },
"targets": [
{
"expr": "laLoad{instance=\"10.0.56.1\", laNames=\"Load-1\"}",
"refId": "A"
}
],
"title": "pfSense CPU Load (1m)",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"max": 100,
"min": 0,
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 90 }
]
},
"unit": "percent"
}
},
"gridPos": { "h": 6, "w": 6, "x": 6, "y": 4 },
"targets": [
{
"expr": "100 - (memAvailReal{instance=\"10.0.56.1\"} / memTotalReal{instance=\"10.0.56.1\"} * 100)",
"refId": "A"
}
],
"title": "pfSense Memory Used %",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"max": 100,
"min": 0,
"unit": "percent"
}
},
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 4 },
"targets": [
{
"expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=\"noc1\"}[5m])) * 100)",
"legendFormat": "CPU %",
"refId": "A"
}
],
"title": "noc1 CPU Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"max": 100,
"min": 0,
"unit": "percent"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 },
"targets": [
{
"expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
"legendFormat": "{{instance}}",
"refId": "A"
}
],
"title": "Node Memory Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"max": 100,
"min": 0,
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 90 }
]
},
"unit": "percent"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 },
"targets": [
{
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
"legendFormat": "{{instance}}",
"refId": "A"
}
],
"title": "Node Disk Usage %",
"type": "bargauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "bps"
}
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 },
"targets": [
{
"expr": "rate(node_network_receive_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8",
"legendFormat": "{{instance}} {{device}} RX",
"refId": "A"
},
{
"expr": "-rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8",
"legendFormat": "{{instance}} {{device}} TX",
"refId": "B"
}
],
"title": "Network Traffic",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 26 },
"targets": [
{
"expr": "up",
"format": "table",
"instant": true,
"refId": "A"
}
],
"title": "Prometheus Targets",
"type": "table"
}
],
"refresh": "30s",
"schemaVersion": 40,
"tags": ["bluejay", "network"],
"timezone": "browser",
"title": "BlueJay Network Overview",
"uid": "b83a122c-6eb5-4fae-a632-77cbf753ad05"
}
# =============================================================================
# ConfigMap: Grafana Dashboard — Operations
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-operations
namespace: monitoring
data:
bluejay-operations.json: |
{
"annotations": {
"list": []
},
"id": null,
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"title": "Infrastructure Overview",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"noValue": "0",
"thresholds": {
"steps": [
{ "color": "green", "value": null }
]
}
}
},
"gridPos": { "h": 4, "w": 8, "x": 0, "y": 1 },
"targets": [
{
"expr": "count(up == 1)",
"legendFormat": "Up",
"refId": "A"
},
{
"expr": "count(up == 0)",
"legendFormat": "Down",
"refId": "B"
}
],
"title": "All Targets Up/Down",
"type": "stat"
},
{
"datasource": {
"type": "alexanderzobnin-zabbix-datasource",
"uid": "bffjila3zkdfka"
},
"gridPos": { "h": 8, "w": 16, "x": 8, "y": 1 },
"targets": [
{
"application": { "filter": "" },
"group": { "filter": "/.*/" },
"host": { "filter": "/.*/" },
"queryType": 5,
"refId": "A",
"trigger": { "filter": "/.*/" }
}
],
"title": "Zabbix Active Problems",
"type": "alexanderzobnin-zabbix-triggers-panel"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 },
"targets": [
{
"expr": "node_load1{instance=\"noc1\"}",
"legendFormat": "1m",
"refId": "A"
},
{
"expr": "node_load5{instance=\"noc1\"}",
"legendFormat": "5m",
"refId": "B"
},
{
"expr": "node_load15{instance=\"noc1\"}",
"legendFormat": "15m",
"refId": "C"
}
],
"title": "noc1 Load Average",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 },
"title": "Kubernetes & Services",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {},
"overrides": [
{
"matcher": { "id": "byName", "options": "Value" },
"properties": [
{
"id": "mappings",
"value": [
{
"options": {
"0": { "color": "red", "text": "DOWN" },
"1": { "color": "green", "text": "UP" }
},
"type": "value"
}
]
}
]
}
]
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 },
"targets": [
{
"expr": "up",
"format": "table",
"instant": true,
"refId": "A"
}
],
"title": "K8s Services Uptime (Prometheus Targets)",
"type": "table"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 },
"title": "Network & SNMP",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "bps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 },
"targets": [
{
"expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8",
"legendFormat": "WAN In",
"refId": "A"
},
{
"expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8",
"legendFormat": "WAN Out",
"refId": "B"
}
],
"title": "pfSense WAN Traffic",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "bps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 },
"targets": [
{
"expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8",
"legendFormat": "{{ifAlias}} In",
"refId": "A"
},
{
"expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8",
"legendFormat": "{{ifAlias}} Out",
"refId": "B"
}
],
"title": "pfSense LAN Traffic",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"max": 100,
"min": 0,
"unit": "percent"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 },
"targets": [
{
"expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
"legendFormat": "{{instance}}",
"refId": "A"
}
],
"title": "All Nodes Memory",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"max": 100,
"min": 0,
"unit": "percent"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 },
"targets": [
{
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
"legendFormat": "{{instance}}",
"refId": "A"
}
],
"title": "All Nodes Disk",
"type": "timeseries"
}
],
"refresh": "1m",
"schemaVersion": 40,
"tags": ["bluejay", "operations", "zabbix"],
"timezone": "browser",
"title": "BlueJay Operations",
"uid": "a781c4e4-c3fe-4ac7-be93-21363a41b97d"
}
# =============================================================================
# ConfigMap: Grafana Dashboard — Epson Printer
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-printer
namespace: monitoring
data:
epson-ecotank-printer.json: |
{
"id": null,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "orange", "value": 10 },
{ "color": "yellow", "value": 20 },
{ "color": "green", "value": 40 }
]
},
"unit": "percent"
},
"overrides": [
{
"matcher": { "id": "byName", "options": "Black Ink Bottle" },
"properties": [
{ "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } }
]
},
{
"matcher": { "id": "byName", "options": "Cyan Ink Bottle" },
"properties": [
{ "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } }
]
},
{
"matcher": { "id": "byName", "options": "Magenta Ink Bottle" },
"properties": [
{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }
]
},
{
"matcher": { "id": "byName", "options": "Yellow Ink Bottle" },
"properties": [
{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }
]
}
]
},
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 0 },
"id": 1,
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": ["lastNotNull"]
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"targets": [
{
"expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}",
"legendFormat": "{{prtMarkerSuppliesDescription}}",
"refId": "A"
}
],
"title": "Ink Levels",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 20,
"lineWidth": 2,
"spanNulls": true
},
"max": 100,
"min": 0,
"unit": "percent"
},
"overrides": [
{
"matcher": { "id": "byName", "options": "Black Ink Bottle" },
"properties": [
{ "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } }
]
},
{
"matcher": { "id": "byName", "options": "Cyan Ink Bottle" },
"properties": [
{ "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } }
]
},
{
"matcher": { "id": "byName", "options": "Magenta Ink Bottle" },
"properties": [
{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }
]
},
{
"matcher": { "id": "byName", "options": "Yellow Ink Bottle" },
"properties": [
{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }
]
}
]
},
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 0 },
"id": 2,
"targets": [
{
"expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}",
"legendFormat": "{{prtMarkerSuppliesDescription}}",
"refId": "A"
}
],
"title": "Ink Level History",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 10000 },
{ "color": "red", "value": 50000 }
]
},
"unit": "short"
}
},
"gridPos": { "h": 5, "w": 6, "x": 0, "y": 10 },
"id": 3,
"options": {
"colorMode": "background",
"reduceOptions": {
"calcs": ["lastNotNull"]
},
"textMode": "value_and_name"
},
"targets": [
{
"expr": "prtMarkerLifeCount{job=\"snmp-printer\"}",
"legendFormat": "Pages",
"refId": "A"
}
],
"title": "Lifetime Page Count",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"1": { "text": "Online" }
},
"type": "value"
}
],
"thresholds": {
"steps": [
{ "color": "blue", "value": null }
]
}
}
},
"gridPos": { "h": 5, "w": 6, "x": 6, "y": 10 },
"id": 4,
"options": {
"colorMode": "background",
"reduceOptions": {
"calcs": ["lastNotNull"]
},
"textMode": "name"
},
"targets": [
{
"expr": "prtGeneralPrinterName{job=\"snmp-printer\"}",
"legendFormat": "{{prtGeneralPrinterName}}",
"refId": "A"
}
],
"title": "Printer Model",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
},
"unit": "short"
}
},
"gridPos": { "h": 5, "w": 6, "x": 12, "y": 10 },
"id": 5,
"options": {
"colorMode": "background",
"reduceOptions": {
"calcs": ["lastNotNull"]
}
},
"targets": [
{
"expr": "prtAlertCriticalEvents{job=\"snmp-printer\"}",
"legendFormat": "Critical Alerts",
"refId": "A"
}
],
"title": "Critical Events",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{ "color": "blue", "value": null }
]
}
}
},
"gridPos": { "h": 5, "w": 6, "x": 18, "y": 10 },
"id": 6,
"options": {
"colorMode": "background",
"reduceOptions": {
"calcs": ["lastNotNull"]
},
"textMode": "name"
},
"targets": [
{
"expr": "prtGeneralSerialNumber{job=\"snmp-printer\"}",
"legendFormat": "{{prtGeneralSerialNumber}}",
"refId": "A"
}
],
"title": "Serial Number",
"type": "stat"
}
],
"refresh": "5m",
"schemaVersion": 39,
"tags": ["printer", "snmp", "bluejay"],
"time": { "from": "now-24h", "to": "now" },
"timezone": "browser",
"title": "Epson ET-3750 EcoTank Printer",
"uid": "epson-ecotank"
}
# =============================================================================
# ConfigMap: Grafana Dashboard — Infrastructure Overview
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-infra-overview
namespace: monitoring
data:
infra-overview.json: |
{
"id": null,
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"title": "AI Stack",
"type": "row"
},
{
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": { "color": "red", "text": "DOWN" },
"1": { "color": "green", "text": "UP" }
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
"id": 1,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "probe_success{job=\"probe-ollama-local\"}",
"legendFormat": "Status"
}
],
"title": "Ollama (Local)",
"type": "stat"
},
{
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": { "color": "red", "text": "DOWN" },
"1": { "color": "green", "text": "UP" }
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
"id": 2,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "probe_success{job=\"probe-ollama-edge1\"}",
"legendFormat": "Status"
}
],
"title": "Ollama (Edge1)",
"type": "stat"
},
{
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": { "color": "red", "text": "DOWN" },
"1": { "color": "green", "text": "UP" }
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
"id": 3,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "probe_success{job=\"probe-agentzero-local\"}",
"legendFormat": "Status"
}
],
"title": "Agent Zero (Local)",
"type": "stat"
},
{
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": { "color": "red", "text": "DOWN" },
"1": { "color": "green", "text": "UP" }
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
"id": 4,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "probe_success{job=\"probe-agentzero-nuc\"}",
"legendFormat": "Status"
}
],
"title": "Agent Zero (NUC)",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 101,
"title": "K8s Cluster",
"type": "row"
},
{
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 30 },
{ "color": "red", "value": 50 }
]
}
}
},
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 6 },
"id": 5,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "count(up{job=\"node-exporter\"} == 1)",
"legendFormat": "Nodes Up"
}
],
"title": "Nodes Up (node-exporter)",
"type": "stat"
},
{
"fieldConfig": {
"defaults": {
"custom": { "fillOpacity": 15, "lineWidth": 2 },
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 90 }
]
},
"unit": "percent"
}
},
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 6 },
"id": 6,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) by (instance) * 100)",
"legendFormat": "{{ instance }}"
}
],
"title": "Node CPU Usage %",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"custom": { "fillOpacity": 15, "lineWidth": 2 },
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 90 }
]
},
"unit": "percent"
}
},
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 6 },
"id": 7,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
"legendFormat": "{{ instance }}"
}
],
"title": "Node Memory Usage %",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 },
"id": 102,
"title": "Network",
"type": "row"
},
{
"fieldConfig": {
"defaults": {
"custom": { "fillOpacity": 10, "lineWidth": 2 },
"unit": "Bps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 },
"id": 8,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "rate(ifHCInOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])",
"legendFormat": "WAN In"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "rate(ifHCOutOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])",
"legendFormat": "WAN Out"
}
],
"title": "pfSense WAN Bandwidth",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 },
"id": 9,
"options": {
"showHeader": true,
"sortBy": [{ "displayName": "Value", "desc": false }]
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "up",
"format": "table",
"instant": true,
"legendFormat": ""
}
],
"title": "Target Health (up)",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": { "Time": true, "__name__": true },
"renameByName": { "job": "Job", "instance": "Instance", "Value": "Up" }
}
}
],
"type": "table"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
"id": 103,
"title": "Services",
"type": "row"
},
{
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 22 },
"id": 10,
"options": {
"content": "### ArgoCD Applications\n\nView sync status and health of all 11 ArgoCD-managed applications:\n\n[Open ArgoCD Dashboard](https://argocd.iamworkin.lan)\n\n| App | Expected Status |\n|-----|----------------|\n| gitea | Healthy / Synced |\n| zabbix | Healthy / Synced |\n| guacamole | Healthy / Synced |\n| irc | Healthy / Synced |\n| mail | Healthy / Synced |\n| matrix | Healthy / Synced |\n| teamspeak | Healthy / Synced |\n| pki-web | Healthy / Synced |\n| intranet | Healthy / Synced |\n| telephony | Healthy / Synced |\n| infra-noc-services | Healthy / Synced |",
"mode": "markdown"
},
"title": "ArgoCD App Status",
"type": "text"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 },
"id": 104,
"title": "Alerting",
"type": "row"
},
{
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "orange", "value": 1 },
{ "color": "red", "value": 3 }
]
}
}
},
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 27 },
"id": 11,
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "count(ALERTS{alertstate=\"firing\"}) or vector(0)",
"legendFormat": "Firing Alerts"
}
],
"title": "Firing Alerts",
"type": "stat"
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": ["infrastructure", "blue-jay", "overview"],
"time": { "from": "now-1h", "to": "now" },
"timezone": "browser",
"title": "Infrastructure Overview",
"uid": "infra-overview",
"version": 1
}
# =============================================================================
# ConfigMap: Grafana Datasource Provisioning
# =============================================================================
---
# Grafana datasource provisioning: registers the in-cluster Prometheus as the
# default datasource.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasource-provisioning
  namespace: monitoring
data:
  datasource.yml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        # Pin the UID. The provisioned dashboards and alert rules in this file
        # address the datasource as uid "prometheus"; without an explicit uid
        # Grafana generates a random one and those references do not resolve.
        uid: prometheus
        access: proxy
        url: http://prometheus.monitoring.svc:9090
        isDefault: true
        editable: true
# =============================================================================
# ConfigMap: Grafana Alerting Provisioning (contact points + policies + rules)
# =============================================================================
# Makes alert rules declarative — survives pod rebuilds without API recreation
---
# Grafana alerting provisioning: contact points, the notification policy tree
# and every alert rule group, loaded from /etc/grafana/provisioning/alerting.
# Every rule follows the same three-step pipeline:
#   A = Prometheus instant query, B = reduce(last) of A, C = threshold on B.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting-provisioning
  namespace: monitoring
data:
  alerting.yml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        # Quoted: an unquoted " #" starts a YAML comment, which silently
        # truncated this name to "IRC". The effective contact point name
        # therefore changes from "IRC" to "IRC #alerts".
        name: "IRC #alerts"
        receivers:
          - uid: irc-alerts-webhook
            type: webhook
            settings:
              url: http://irc-notify.monitoring.svc:9119
              httpMethod: POST
            disableResolveMessage: false
      - orgId: 1
        name: Thermal Printer
        receivers:
          - uid: thermal-print-001
            type: webhook
            settings:
              url: http://irc-notify.monitoring.svc:9119
              httpMethod: POST
            disableResolveMessage: true
    policies:
      - orgId: 1
        # Must match the contact point name above exactly (quoted for the
        # same " #" reason).
        receiver: "IRC #alerts"
        group_by: ['alertname']
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 1h
        routes:
          - receiver: Thermal Printer
            matchers: ['alert_channel = thermal_print']
            group_wait: 1m
            group_interval: 10m
            repeat_interval: 4h
            # NOTE(review): `continue` only lets matching proceed to *later
            # sibling* routes. There are none, so alerts matched here appear
            # to be delivered to Thermal Printer only (no resolve messages),
            # not additionally to the root receiver - confirm this is intended.
            continue: true
    groups:
      - orgId: 1
        name: AI Stack
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: ollama-down-local
            title: Ollama DOWN (Local)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Ollama DOWN on workstation (R9700)
              description: Agent Zero LOCAL cannot reach Ollama. FAISS memory will fail.
              runbook: "1. Check PC power/wake 2. ssh stoltz@10.0.58.100 3. systemctl status ollama 4. journalctl -u ollama --since 5min"
            labels:
              severity: warning
              service: ollama
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-ollama-local"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: ollama-down-edge1
            title: Ollama DOWN (Edge1)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Ollama DOWN on edge1 Pi 5
              description: Agent Zero NUC cannot reach Ollama.
              runbook: "1. ssh stoltz@10.0.57.17 2. systemctl status ollama 3. journalctl -u ollama --since 5min 4. cat /sys/class/thermal/thermal_zone0/temp"
            labels:
              severity: warning
              service: ollama
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-ollama-edge1"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: a0-down-local
            title: Agent Zero DOWN (Local)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero LOCAL DOWN
              description: K3s web UI unreachable.
              runbook: "1. ssh stoltz@10.0.58.100 2. docker ps | grep agent-zero 3. docker logs agent-zero --tail 20 4. Check Ollama first (A0 crashes if Ollama down)"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-agentzero-local"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: a0-down-nuc
            title: Agent Zero DOWN (NUC)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero NUC DOWN
              description: RKE2 web UI unreachable.
              runbook: "1. ssh root@10.0.56.200 via noc1 2. kubectl get pods -n agent-zero 3. kubectl logs -n agent-zero deploy/agent-zero --tail 20"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-agentzero-nuc"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: print-ollama-runner-long-keepalive
            title: Print.Web Ollama runner keep-alive >10m
            condition: C
            for: 2m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: Print.Web Ollama runner held too long
              description: "A Print.Web Ollama runner has more than 10 minutes of keep-alive remaining. Check active AI requests before the Pi 5 Ollama lane thrashes."
              runbook: "1. Open https://print.iamworkin.lan/admin 2. Check Ollama Fleet model + remaining keep-alive 3. Query Prometheus: ai_ollama_runner_keepalive_remaining_seconds{job=\"printweb-otel\"} 4. Stop duplicate model callers before restarting Ollama."
            labels:
              severity: warning
              service: print-web
              alert_channel: thermal_print
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max by(model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
      - orgId: 1
        name: CI Runners
        folder: CI Alerts
        interval: 1m
        rules:
          - uid: linux-runner-offline
            title: LinuxRunnerOffline
            condition: C
            for: 5m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Linux CI runner offline: {{ $labels.deployment }}"
              description: "A github-runner namespace Deployment has 0 ready replicas for more than 5 minutes. CI jobs targeting that repo will queue until the runner pod restarts and re-registers."
              runbook: "1. kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }} 2. kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50 3. Verify PAT repo access if registration returns 404 4. Verify no RWO PVC is shared by scaled runners"
            labels:
              severity: warning
              service: github-runner
              alert_channel: irc
              team: ci
            data:
              # Query the raw ready-replica count and alert when it is below 1.
              # The previous `== 0` filter only ever returned samples whose
              # value is 0, which can never satisfy a `gt 0` threshold, so the
              # rule could not fire.
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
      - orgId: 1
        name: Infrastructure
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: node-down
            title: Node DOWN
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Node down
              description: Node exporter unreachable for 2 minutes. Host may be down or network issue.
              runbook: "1. Ping host IP 2. SSH via noc1 jumpbox (root@10.0.56.10) 3. systemctl status node_exporter 4. Check network/VLAN 5. Physical power check if unreachable"
            labels:
              severity: critical
              service: infrastructure
              alert_channel: thermal_print
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'up{job=~"node-exporter|rke2-nodes|edge-nodes"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: macmini-runner-offline
            title: MacMiniRunnerOffline
            condition: C
            for: 10m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Mac mini GitHub runner offline
              description: "One or more macmini-* GitHub Actions runners have not reported online for more than 10 minutes. LaunchDaemons survive reboot and do not require the bluejay GUI session."
              runbook: "1. ssh fcadmin@macmini.iamworkin.lan 2. launchctl print system/io.flowercore.github-runner-<slug> 3. Check /Users/fcadmin/Library/Logs/github-runners/<slug>/stderr.log 4. Re-register the repo runner if .runner is missing"
            labels:
              severity: warning
              service: github-runner
            data:
              # min() is applied before the `or vector(0)` fallback. With the
              # fallback inside the aggregation, the label-less 0 sample was
              # always unioned in (its label set never matches a real series),
              # so min() was always 0 and the rule fired permanently.
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'min(flowercore_github_runner_online{runner=~"macmini-.*"}) or vector(0)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: high-cpu
            title: High CPU (>85%)
            condition: C
            for: 10m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: High CPU
              description: CPU above 85% for 10 minutes. Performance degradation likely.
              runbook: "1. SSH to host 2. top -b -n1 | head -20 3. Check for runaway process 4. kubectl top pods (if K8s node)"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_cpu_usage:avg5m', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
          - uid: high-memory
            title: High Memory (>90%)
            condition: C
            for: 5m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: High memory usage
              description: Memory above 90% for 5 minutes. OOM kills imminent.
              runbook: "1. SSH to host 2. free -h 3. ps aux --sort=-%mem | head 10 4. Check for memory leak (growing RSS)"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_memory_usage:percent', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [90], type: gt}}], refId: C}
          - uid: disk-low
            title: Disk Space Low (>85%)
            condition: C
            for: 10m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: Disk usage high
              description: Root disk above 85% for 10 minutes. Service disruption if full.
              runbook: "1. SSH to host 2. df -h / 3. du -sh /* | sort -rh | head 4. journalctl --vacuum-size=100M 5. podman system prune"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_disk_usage:percent', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
      - orgId: 1
        name: RemoteDesktop
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: remotedesktop-web-down
            title: RemoteDesktop Web DOWN
            condition: C
            for: 3m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: FlowerCore RemoteDesktop /health probe failing
              description: "https://desktop.iamworkin.lan/health has failed for 3 minutes. Catalog + session launch surface offline."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remotedesktop-web 2. kubectl -n fc-desktop logs deploy/remotedesktop-web --tail=50 3. Check Traefik IngressRoute + step-ca cert 4. Rollout restart if pod is stuck"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-remotedesktop"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: remotedesktop-metrics-stale
            title: RemoteDesktop metrics stale
            condition: C
            for: 10m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: RemoteDesktop /metrics returning no series
              description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape is misconfigured or the web deployment stopped exporting metrics. Cross-checked by Zabbix template's identical 10m no-data trigger."
              runbook: "1. curl -sk https://desktop.iamworkin.lan/metrics | head 2. kubectl -n monitoring exec deploy/prometheus -- wget -qO- localhost:9090/api/v1/targets?scrapePool=fc-remotedesktop 3. Check monitoring-netpol egress allows to fc-desktop:8080"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'count(fc_desktop_session_events_total) or vector(0)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: remotedesktop-pool-depleted
            title: RemoteDesktop pool depleted
            condition: C
            for: 5m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop warm pool depleted for 5m
              description: "A RemoteDesktop warm pool has fc_desktop_pool_depleted=1 for 5 minutes. New launches will cold-start. Check pod scheduling, image pull, node capacity."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop --sort-by=.status.startTime 2. kubectl -n fc-desktop describe desktoppool <name> 3. Verify localhost/fc-desktop:* images imported on all 3 RKE2 nodes"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max(fc_desktop_pool_depleted)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.5], type: gt}}], refId: C}
          - uid: remotedesktop-pool-deficit-sustained
            title: RemoteDesktop pool below desired
            condition: C
            for: 10m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop pool sustained deficit
              description: "A pool has fc_desktop_pool_deficit>0 for 10 minutes. Operator is reconciling but can't reach desired size — likely image pull, NFS affinity, or claim-init issue."
              runbook: "1. kubectl -n fc-desktop get pods -l flowercore.io/pool=<pool> 2. kubectl logs -n fc-desktop deploy/remotedesktop-operator 3. Check claim-init hook env on template"
            labels:
              severity: info
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max(fc_desktop_pool_deficit)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
          - uid: remotedesktop-session-churn-spike
            title: RemoteDesktop launch rate spike
            condition: C
            for: 5m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop launch rate exceeds 20/min
              description: "Launch events >20/min for 5 minutes. Could be a user-facing feature launch, pooled template thrashing, or runaway automation loop."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop -o wide | wc -l 2. curl -sk https://desktop.iamworkin.lan/api/sessions/active 3. Check operator logs for reconcile loops"
            labels:
              severity: info
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model: {expr: 'sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [20], type: gt}}], refId: C}
          - uid: remotedesktop-tls-expiry
            title: RemoteDesktop TLS cert expiring
            condition: C
            for: 6h
            noDataState: OK
            execErrState: OK
            annotations:
              summary: desktop.iamworkin.lan cert <2d to expiry
              description: "The desktop.iamworkin.lan certificate is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, FlowerCore.DNS preflight for dnsNames."
              runbook: "1. kubectl -n fc-desktop get certificate remotedesktop-web-tls 2. kubectl -n cert-manager logs deploy/cert-manager --tail=50 3. Verify pfSense DNS override for desktop.iamworkin.lan"
            labels:
              severity: critical
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: prometheus
                model: {expr: '(probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time()) / 86400', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [2], type: lt}}], refId: C}
# =============================================================================
# Deployment: Grafana
# =============================================================================
---
# Single-replica Grafana. State lives on the grafana-data PVC; dashboards,
# the dashboard provider, datasources and alerting are all mounted read-only
# from ConfigMaps.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  # Recreate: the old pod is stopped before the replacement starts.
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        runAsUser: 472
        runAsGroup: 472
        fsGroup: 472  # grafana group
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: grafana-data
        - name: dashboard-provider
          configMap:
            name: grafana-dashboard-provider
        - name: dashboards-ai-stack
          configMap:
            name: grafana-dashboards
        - name: dashboards-edge-nodes
          configMap:
            name: grafana-dashboard-edge-nodes
        - name: dashboards-network
          configMap:
            name: grafana-dashboard-network-overview
        - name: dashboards-operations
          configMap:
            name: grafana-dashboard-operations
        - name: dashboards-printer
          configMap:
            name: grafana-dashboard-printer
        - name: dashboards-infra-overview
          configMap:
            name: grafana-dashboard-infra-overview
        - name: dashboards-remotedesktop
          configMap:
            name: grafana-dashboard-remotedesktop
        - name: datasource-provisioning
          configMap:
            name: grafana-datasource-provisioning
        - name: alerting-provisioning
          configMap:
            name: grafana-alerting-provisioning
      containers:
        - name: grafana
          image: docker.io/grafana/grafana:latest
          ports:
            - name: http
              containerPort: 3000
          env:
            # Admin credentials are read from the "grafana-credentials" Secret,
            # which the 1Password Operator materialises from a OnePasswordItem.
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: username
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: password
            - name: GF_SERVER_ROOT_URL
              value: 'https://grafana.iamworkin.lan'
            - name: GF_SERVER_SERVE_FROM_SUB_PATH
              value: 'false'
            # The Zabbix plugin is deliberately not installed at startup:
            # GF_INSTALL_PLUGINS needs internet access during boot, which the
            # restrictive NetworkPolicy blocks. Install it by hand after the
            # first boot if it is needed:
            #   kubectl exec -n monitoring deploy/grafana -- grafana cli plugins install alexanderzobnin-zabbix-app
          readinessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 10
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 30
          resources:
            requests:
              cpu: 100m
              memory: 128Mi
            limits:
              cpu: 500m
              memory: 512Mi
          volumeMounts:
            # Persistent Grafana state.
            - name: data
              mountPath: /var/lib/grafana
            # Dashboard provider definition plus one directory per dashboard set.
            - name: dashboard-provider
              mountPath: /etc/grafana/provisioning/dashboards
              readOnly: true
            - name: dashboards-ai-stack
              mountPath: /var/lib/grafana/dashboards/ai-stack
              readOnly: true
            - name: dashboards-edge-nodes
              mountPath: /var/lib/grafana/dashboards/edge-nodes
              readOnly: true
            - name: dashboards-network
              mountPath: /var/lib/grafana/dashboards/network
              readOnly: true
            - name: dashboards-operations
              mountPath: /var/lib/grafana/dashboards/operations
              readOnly: true
            - name: dashboards-printer
              mountPath: /var/lib/grafana/dashboards/printer
              readOnly: true
            - name: dashboards-infra-overview
              mountPath: /var/lib/grafana/dashboards/infra-overview
              readOnly: true
            - name: dashboards-remotedesktop
              mountPath: /var/lib/grafana/dashboards/remotedesktop
              readOnly: true
            # Declarative datasources and alerting (contact points, policies, rules).
            - name: datasource-provisioning
              mountPath: /etc/grafana/provisioning/datasources
              readOnly: true
            - name: alerting-provisioning
              mountPath: /etc/grafana/provisioning/alerting
              readOnly: true
# =============================================================================
# Deployment: Blackbox Exporter
# =============================================================================
---
# Blackbox exporter: one replica serving probes on :9115, configured from the
# blackbox-config ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: blackbox-exporter
  namespace: monitoring
  labels:
    app: blackbox-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      volumes:
        - name: config
          configMap:
            name: blackbox-config
      containers:
        - name: blackbox-exporter
          image: quay.io/prometheus/blackbox-exporter:latest
          args:
            - '--config.file=/config/blackbox.yml'
          ports:
            - name: http
              containerPort: 9115
          volumeMounts:
            # Only the single blackbox.yml key is projected, as a file.
            - name: config
              mountPath: /config/blackbox.yml
              subPath: blackbox.yml
              readOnly: true
          readinessProbe:
            httpGet:
              path: /
              port: 9115
            initialDelaySeconds: 3
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /
              port: 9115
            initialDelaySeconds: 5
            periodSeconds: 30
          resources:
            requests:
              cpu: 50m
              memory: 32Mi
            limits:
              cpu: 200m
              memory: 128Mi
# =============================================================================
# PVC: SNMP Exporter Config (100Mi, Longhorn)
# =============================================================================
# The custom snmp.yml (~2MB) exceeds the 1MB ConfigMap limit.
# This PVC stores the config file. To load a custom config:
# kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
# Then restart the pod to pick up the new config.
---
# 100Mi Longhorn volume that holds snmp.yml for the SNMP exporter.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: snmp-config
  namespace: monitoring
spec:
  storageClassName: longhorn
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Mi
# =============================================================================
# Deployment: SNMP Exporter
# =============================================================================
# Uses a PVC-mounted config at /config/snmp.yml. An init container copies the
# default config from the image if the PVC is empty (first deploy).
# To load the custom noc1 snmp.yml (~2MB):
# kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
# Then restart: kubectl rollout restart deploy/snmp-exporter -n monitoring
---
# SNMP exporter: one replica on :9116 reading /config/snmp.yml from the
# snmp-config PVC. An init container seeds the PVC with the image's default
# config when no snmp.yml exists yet.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: snmp-exporter
  namespace: monitoring
  labels:
    app: snmp-exporter
spec:
  replicas: 1
  # The pod mounts a ReadWriteOnce PVC. With the default RollingUpdate
  # strategy the replacement pod is started while the old one still holds the
  # volume, so a rollout can hang on volume attach when the new pod lands on
  # another node. Recreate stops the old pod first, matching the Grafana
  # Deployment in this file.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: snmp-exporter
  template:
    metadata:
      labels:
        app: snmp-exporter
    spec:
      initContainers:
        # Copy default snmp.yml from image if PVC is empty (first deploy).
        # NOTE(review): this needs write access to the volume root; confirm the
        # image's runtime user can write to a fresh Longhorn volume (a pod-level
        # fsGroup may be required).
        - name: init-config
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              if [ ! -f /config/snmp.yml ]; then
                echo "No custom config found, copying default from image..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default snmp.yml copied to PVC."
              else
                echo "Custom snmp.yml already exists on PVC, skipping copy."
              fi
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      containers:
        - name: snmp-exporter
          image: docker.io/prom/snmp-exporter:latest
          args:
            - "--config.file=/config/snmp.yml"
          ports:
            - containerPort: 9116
              name: http
          volumeMounts:
            # NOTE(review): /config is mounted read-only in this container, so
            # a `kubectl cp` of a custom snmp.yml into this container looks
            # like it would be rejected - verify the documented load procedure.
            - name: snmp-config
              mountPath: /config
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          livenessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 5
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 3
            periodSeconds: 10
      volumes:
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config
# =============================================================================
# Deployment: IRC Notify (alert relay)
# =============================================================================
---
# Alert relay: runs /app/notify.py (from the irc-notify-script ConfigMap) on a
# stock Python image and listens on :9119, the URL both Grafana webhook
# contact points post to.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: irc-notify
  namespace: monitoring
  labels:
    app: irc-notify
spec:
  replicas: 1
  selector:
    matchLabels:
      app: irc-notify
  template:
    metadata:
      labels:
        app: irc-notify
    spec:
      volumes:
        - name: script
          configMap:
            name: irc-notify-script
      containers:
        - name: irc-notify
          image: docker.io/library/python:3.12-slim
          command:
            - python3
            - /app/notify.py
          ports:
            - name: http
              containerPort: 9119
          volumeMounts:
            # subPath mount of a single file: ConfigMap edits are not
            # propagated into a running pod, so restart the pod after
            # changing the script.
            - name: script
              mountPath: /app/notify.py
              subPath: notify.py
              readOnly: true
          # Health is judged purely by the TCP port accepting connections.
          readinessProbe:
            tcpSocket:
              port: 9119
            initialDelaySeconds: 3
            periodSeconds: 10
          livenessProbe:
            tcpSocket:
              port: 9119
            initialDelaySeconds: 5
            periodSeconds: 30
          resources:
            requests:
              cpu: 25m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
# =============================================================================
# DaemonSet: Node Exporter (runs on every RKE2 node)
# =============================================================================
# Port 9101 avoids conflict with host-level node-exporters already on :9100.
# The rke2-nodes Prometheus job scrapes the host instances on :9100; this
# DaemonSet provides K8s service-discovery-based scraping on :9101.
---
# Node exporter on every node: tolerates all taints, shares the host network
# and PID namespaces, and serves metrics on :9101. The host's /, /proc and
# /sys are mounted read-only under /host.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  # Replace pods one node at a time.
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      hostNetwork: true
      hostPID: true
      # A bare `Exists` toleration matches every taint.
      tolerations:
        - operator: Exists
      securityContext:
        runAsUser: 0
        runAsNonRoot: false
      volumes:
        - name: rootfs
          hostPath:
            path: /
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
      containers:
        - name: node-exporter
          image: docker.io/prom/node-exporter:latest
          args:
            # Point the collectors at the host filesystem views mounted below.
            - '--path.rootfs=/host'
            - '--path.sysfs=/host/sys'
            - '--path.procfs=/host/proc'
            # Skip pseudo-filesystems and container/Longhorn/RKE2 mounts.
            - '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/containers|run/containers|var/lib/rancher|var/lib/longhorn)($|/)'
            # Skip virtual pod/CNI interfaces.
            - '--collector.netclass.ignored-devices=^(veth|cali|flannel|cni).*$'
            - '--no-collector.btrfs'
            - '--web.listen-address=:9101'
          ports:
            - name: metrics
              containerPort: 9101
              hostPort: 9101
          securityContext:
            privileged: true
            readOnlyRootFilesystem: true
          volumeMounts:
            - name: rootfs
              mountPath: /host
              mountPropagation: HostToContainer
              readOnly: true
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 32Mi
            limits:
              cpu: 200m
              memory: 128Mi
# =============================================================================
# Service: Prometheus (ClusterIP :9090)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  type: ClusterIP
  # Backed by pods labelled app=prometheus.
  selector:
    app: prometheus
  ports:
    - name: http
      protocol: TCP
      port: 9090
      targetPort: 9090
# =============================================================================
# Service: Grafana (ClusterIP :3000)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  type: ClusterIP
  # Backed by pods labelled app=grafana.
  selector:
    app: grafana
  ports:
    - name: http
      protocol: TCP
      port: 3000
      targetPort: 3000
# =============================================================================
# Service: Blackbox Exporter (ClusterIP :9115)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: blackbox-exporter
  namespace: monitoring
  labels:
    app: blackbox-exporter
spec:
  type: ClusterIP
  # Backed by pods labelled app=blackbox-exporter.
  selector:
    app: blackbox-exporter
  ports:
    - name: http
      protocol: TCP
      port: 9115
      targetPort: 9115
# =============================================================================
# Service: SNMP Exporter (ClusterIP :9116)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: snmp-exporter
  namespace: monitoring
  labels:
    app: snmp-exporter
spec:
  type: ClusterIP
  # Backed by pods labelled app=snmp-exporter.
  selector:
    app: snmp-exporter
  ports:
    - name: http
      protocol: TCP
      port: 9116
      targetPort: 9116
# =============================================================================
# Service: Node Exporter (headless for Prometheus SD)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  # Headless: no virtual IP is allocated, so the Service only publishes the
  # individual pod endpoints (used for Prometheus service discovery).
  type: ClusterIP
  clusterIP: None
  selector:
    app: node-exporter
  ports:
    - name: metrics
      protocol: TCP
      port: 9101
      targetPort: 9101
# =============================================================================
# Service: IRC Notify (ClusterIP :9119)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: irc-notify
  namespace: monitoring
  labels:
    app: irc-notify
spec:
  type: ClusterIP
  # Backed by pods labelled app=irc-notify.
  selector:
    app: irc-notify
  ports:
    - name: http
      protocol: TCP
      port: 9119
      targetPort: 9119
# =============================================================================
# TLS Certificates (cert-manager + step-ca ACME)
# =============================================================================
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: grafana-tls
  namespace: monitoring
spec:
  # Hostname matched by the grafana IngressRoute.
  dnsNames:
    - grafana.iamworkin.lan
  # Requested from the cluster-wide step-ca ACME issuer.
  issuerRef:
    kind: ClusterIssuer
    name: step-ca-acme
  # Secret that receives the issued key pair; the grafana IngressRoute's
  # tls block refers to it by this name.
  secretName: grafana-tls
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: prometheus-tls
  namespace: monitoring
spec:
  # Hostname matched by the prometheus IngressRoute.
  dnsNames:
    - prometheus.iamworkin.lan
  # Requested from the cluster-wide step-ca ACME issuer.
  issuerRef:
    kind: ClusterIssuer
    name: step-ca-acme
  # Secret that receives the issued key pair; the prometheus IngressRoute's
  # tls block refers to it by this name.
  secretName: prometheus-tls
# =============================================================================
# Traefik IngressRoute: Grafana
# =============================================================================
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: grafana
  namespace: monitoring
spec:
  entryPoints:
    - websecure
  # Serve the certificate stored in the grafana-tls Secret.
  tls:
    secretName: grafana-tls
  routes:
    # Requests for the Grafana hostname go to the grafana Service on 3000.
    - match: 'Host(`grafana.iamworkin.lan`)'
      kind: Rule
      services:
        - name: grafana
          port: 3000
# =============================================================================
# Traefik IngressRoute: Prometheus
# =============================================================================
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: prometheus
  namespace: monitoring
spec:
  entryPoints:
    - websecure
  # Serve the certificate stored in the prometheus-tls Secret.
  tls:
    secretName: prometheus-tls
  routes:
    # Requests for the Prometheus hostname go to the prometheus Service on 9090.
    - match: 'Host(`prometheus.iamworkin.lan`)'
      kind: Rule
      services:
        - name: prometheus
          port: 9090
# =============================================================================
# NetworkPolicy: monitoring namespace
# =============================================================================
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: monitoring-netpol
  namespace: monitoring
spec:
  # Empty selector: the policy applies to every pod in the namespace, in
  # both directions.
  podSelector: {}
  policyTypes:
    - Ingress
    - Egress
  ingress:
    # Traefik namespace — covers IngressRoutes AND ACME solver pods.
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
    # Same-namespace traffic (prometheus→exporters, grafana→prometheus,
    # grafana→irc-notify).
    - from:
        - podSelector: {}
    # cert-manager namespace (ACME HTTP-01 self-check).
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: cert-manager
  egress:
    # DNS over UDP and TCP, towards pods in any namespace.
    - to:
        - namespaceSelector: {}
      ports:
        - protocol: UDP
          port: 53
        - protocol: TCP
          port: 53
    # MGMT VLAN (noc1, pfSense, switches, SNMP, node-exporter) — all ports.
    - to:
        - ipBlock:
            cidr: 10.0.56.0/24
    # PROD VLAN (edge nodes) — all ports.
    - to:
        - ipBlock:
            cidr: 10.0.57.0/24
    # HOME VLAN (workstation, printer, NAS) — all ports.
    - to:
        - ipBlock:
            cidr: 10.0.58.0/24
    # Same-namespace traffic.
    - to:
        - podSelector: {}
    # Blackbox probes into other namespaces (agent-zero, etc).
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: agent-zero
      ports:
        - protocol: TCP
          port: 80
    # FlowerCore.RemoteDesktop /metrics scrape through the fc-desktop
    # ClusterIP Service (remotedesktop-web:8080). This also covers the
    # Traefik VIP hairpin path: after kube-proxy DNAT the egress
    # destination is the backend pod IP on the service port (see
    # feedback_netpol_dnat_backend_port).
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: fc-desktop
      ports:
        - protocol: TCP
          port: 8080
    # Traefik backend ports — needed for in-cluster egress to public
    # iamworkin.lan hostnames, which the CoreDNS wildcard resolves to the
    # LoadBalancer VIP. Post-DNAT the destination is a Traefik pod on
    # 8080/8443. Namespace and pod selectors share one list entry, so a
    # destination must match both.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
          podSelector:
            matchLabels:
              app.kubernetes.io/name: traefik
      ports:
        - protocol: TCP
          port: 8080
        - protocol: TCP
          port: 8443
    # Traefik /metrics endpoint (port 9100) — distinct from the data-path
    # ports above. Required by the in-cluster `traefik` scrape job.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
      ports:
        - protocol: TCP
          port: 9100
    # kube-state-metrics — required by the kubernetes-state alert group.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: kube-system
      ports:
        - protocol: TCP
          port: 8080
    # cert-manager metrics — required by the CertManagerCertificate* alerts.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: cert-manager
      ports:
        - protocol: TCP
          port: 9402
    # Longhorn manager metrics — required by the Longhorn* alerts.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: longhorn-system
      ports:
        - protocol: TCP
          port: 9500
    # IRC (irc-notify → UnrealIRCd in the irc namespace via K8s DNS).
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: irc
      ports:
        - protocol: TCP
          port: 6667
        - protocol: TCP
          port: 6697
    # Step-CA ACME (cert renewal).
    - to:
        - ipBlock:
            cidr: 10.0.56.10/32
      ports:
        - protocol: TCP
          port: 9443
    # Internet (optional: Grafana plugin install, ACME). Any address that
    # is not in the RFC 1918 private ranges listed under `except`.
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.0.0.0/8
              - 172.16.0.0/12
              - 192.168.0.0/16
# =============================================================================
# Job: SNMP Config Loader (ArgoCD PostSync hook)
# =============================================================================
# Runs once after the main deployment to populate the SNMP config PVC.
# Attempts to download custom snmp.yml from noc1; falls back to the default
# config bundled in the snmp-exporter image.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: snmp-config-loader
  namespace: monitoring
  annotations:
    # Run after each ArgoCD sync; the Job object is removed once it succeeds.
    argocd.argoproj.io/hook: PostSync
    argocd.argoproj.io/hook-delete-policy: HookSucceeded
spec:
  # One attempt only: no Job-level retries and no container restarts.
  backoffLimit: 0
  template:
    metadata:
      labels:
        app: snmp-config-loader
    spec:
      restartPolicy: Never
      initContainers:
        # Step 1: try to fetch the custom snmp.yml from noc1.
        #
        # The download goes to a temp file on the same volume and is renamed
        # into place only when it succeeded and is non-empty. A failed or
        # empty download therefore never deletes or truncates a config that
        # an earlier run left on the PVC. This step always exits 0 so that
        # the fallback container still runs.
        #
        # NOTE(review): snmp_exporter's /config endpoint may mask auth
        # secrets (community strings, passwords) in its output — confirm the
        # downloaded file is usable as-is.
        - name: download-config
          image: docker.io/curlimages/curl:latest
          command:
            - sh
            - -c
            - |
              echo "Attempting to download custom snmp.yml from noc1..."
              tmp=/config/snmp.yml.tmp
              # -f: fail on HTTP errors; -sS: no progress meter, but keep
              # the error message so a failure can be diagnosed from logs.
              if curl -fsS --connect-timeout 10 --max-time 30 \
                   http://10.0.56.10:9116/config -o "$tmp" \
                 && [ -s "$tmp" ]; then
                mv -f "$tmp" /config/snmp.yml
                echo "Custom snmp.yml downloaded from noc1 successfully."
              else
                echo "Download failed or empty; keeping any existing config."
                rm -f "$tmp"
              fi
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      containers:
        # Step 2: if the PVC still holds no usable config, seed it with the
        # default shipped in the snmp-exporter image. `set -e` makes a failed
        # copy fail the Job instead of reporting success without a config.
        #
        # NOTE(review): `latest` is a floating tag; the copied default should
        # match the exporter version that consumes this PVC — confirm.
        - name: fallback-default
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              set -e
              if [ -f /config/snmp.yml ] && [ -s /config/snmp.yml ]; then
                echo "Existing config present, nothing to do."
              else
                echo "Copying default snmp.yml from image to PVC..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default config copied."
              fi
              echo "SNMP config loader complete."
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      volumes:
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config