Files
bluejay-infra/apps/monitoring/noc-monitoring.yaml
2026-05-19 17:40:34 -05:00

4825 lines
175 KiB
YAML

# =============================================================================
# NOC Monitoring Stack — K8s Migration Target
# =============================================================================
# Migrates the noc1 Podman monitoring pod to RKE2 K8s.
# Source: noc1 (10.0.56.10) /opt/monitoring/
#
# Components:
# - Prometheus (metrics, alerting)
# - Grafana (dashboards)
# - Blackbox Exporter (HTTP probes)
# - SNMP Exporter (network device metrics)
# - Node Exporter (host metrics, DaemonSet)
# - IRC Notify (alert relay to UnrealIRCd)
#
# Note: SNMP exporter config (snmp.yml) is ~2MB, exceeding the 1MB ConfigMap
# limit. It is stored in a separate file (snmp-config.yaml) and must be
# applied as a standalone ConfigMap or mounted via an init container that
# downloads it from Gitea.
# =============================================================================
---
# Namespace referenced by the monitoring resources and service DNS names in this file.
kind: Namespace
apiVersion: v1
metadata:
  name: monitoring
# =============================================================================
# ConfigMap: Prometheus Configuration
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitoring
data:
prometheus.yml: |
# Defaults for every scrape job / rule group that does not set its own interval.
global:
  evaluation_interval: 30s
  scrape_interval: 30s
# Rule files Prometheus loads. NOTE(review): presumably the alerts.yml and
# recording-rules.yml keys of this ConfigMap are mounted under
# /etc/prometheus — confirm against the Prometheus Deployment.
rule_files:
  - /etc/prometheus/alerts.yml
  - /etc/prometheus/recording-rules.yml
scrape_configs:
# Host metrics for noc1, which sits outside the cluster and is scraped by IP.
- job_name: "node-exporter"
  static_configs:
    - labels:
        instance: "noc1"
        vlan: "mgmt"
      targets:
        - "10.0.56.10:9100"
# Node exporters on the RKE2 bare-metal cluster (openSUSE Leap 16 on NUCs):
# one server and two agents, each target group labelled cluster="rke2".
- job_name: "rke2-nodes"
  scrape_timeout: 15s
  static_configs:
    - labels:
        cluster: "rke2"
        instance: "rke2-server"
        role: "server"
        vlan: "mgmt"
      targets:
        - "10.0.56.11:9100"
    - labels:
        cluster: "rke2"
        instance: "rke2-agent1"
        role: "agent"
        vlan: "mgmt"
      targets:
        - "10.0.56.12:9100"
    - labels:
        cluster: "rke2"
        instance: "rke2-agent2"
        role: "agent"
        vlan: "mgmt"
      targets:
        - "10.0.56.13:9100"
# Node exporter on the Mac mini macOS runner host (labelled INFRA VLAN).
# NOTE(review): the other 10.0.56.x targets in this file carry vlan="mgmt";
# confirm that "infra" is the intended value for 10.0.56.115.
- job_name: "macmini-node"
  scrape_timeout: 15s
  static_configs:
    - labels:
        arch: "arm64"
        host: "macmini.iamworkin.lan"
        instance: "macmini"
        puppet_managed: "true"
        puppet_server: "puppet.iamworkin.lan"
        role: "macos-runner"
        vlan: "infra"
      targets:
        - "10.0.56.115:9100"
# In-cluster node-exporter DaemonSet, discovered through Kubernetes endpoints
# in the monitoring namespace.
- job_name: "k8s-node-exporter"
  kubernetes_sd_configs:
    - role: endpoints
      namespaces:
        names:
          - "monitoring"
  relabel_configs:
    # Keep only the endpoints object named node-exporter.
    - action: keep
      regex: node-exporter
      source_labels: [__meta_kubernetes_endpoints_name]
    # Report the Kubernetes node name as the instance label.
    - source_labels: [__meta_kubernetes_endpoint_node_name]
      target_label: instance
# pfSense SNMP (if_mib module), proxied through snmp-exporter.
- job_name: "snmp-pfsense"
  metrics_path: /snmp
  params:
    auth: [bluejay_v2]
    module: [if_mib]
  static_configs:
    - targets:
        - "10.0.56.1"
  relabel_configs:
    # The device address becomes the exporter's ?target= parameter ...
    - source_labels: [__address__]
      target_label: __param_target
    # ... is copied into the instance label ...
    - source_labels: [__param_target]
      target_label: instance
    # ... and the HTTP scrape itself is sent to the exporter service.
    - replacement: snmp-exporter.monitoring.svc:9116
      target_label: __address__
# UniFi Cloud Key SNMP — DISABLED 2026-04-26
# The Cloud Key Gen2+ runs unifi-core (controller) only — not a network
# device — and does NOT run an SNMP agent on UDP/161. Scrapes were
# silently failing with "connection refused" from 10.42.x.x:161 every
# 30s, polluting up{} = 0 and lastError on the Targets page. Hardware
# health (CPU/mem/disk) for the Cloud Key host should come from
# node_exporter via SSH — not SNMP.
# - job_name: "snmp-cloudkey"
# static_configs:
# - targets: ["10.0.56.3"]
# metrics_path: /snmp
# params:
# module: [if_mib]
# auth: [bluejay_v2]
# relabel_configs:
# - source_labels: [__address__]
# target_label: __param_target
# - source_labels: [__param_target]
# target_label: instance
# - target_label: __address__
# replacement: snmp-exporter.monitoring.svc:9116
# UniFi switch SNMP (if_mib module), proxied through snmp-exporter.
- job_name: "snmp-switch"
  metrics_path: /snmp
  params:
    auth: [bluejay_v2]
    module: [if_mib]
  static_configs:
    - targets:
        - "10.0.56.2"
  relabel_configs:
    # Switch address -> ?target= parameter for the exporter.
    - source_labels: [__address__]
      target_label: __param_target
    # Same address is reported as the instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Scrape the exporter service instead of the device.
    - replacement: snmp-exporter.monitoring.svc:9116
      target_label: __address__
# Synology NAS SNMP (synology module), proxied through snmp-exporter.
- job_name: "snmp-nas"
  metrics_path: /snmp
  params:
    auth: [bluejay_v2]
    module: [synology]
  static_configs:
    - targets:
        - "10.0.58.3"
  relabel_configs:
    # NAS address -> ?target= parameter for the exporter.
    - source_labels: [__address__]
      target_label: __param_target
    # Same address is reported as the instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Scrape the exporter service instead of the device.
    - replacement: snmp-exporter.monitoring.svc:9116
      target_label: __address__
# Prometheus scraping its own /metrics endpoint over localhost.
- job_name: "prometheus"
  static_configs:
    - targets:
        - "localhost:9090"
# Node exporters on the arm64 edge hosts: edge1 and edge2 (PROD VLAN) plus
# piez and pirelay (HOME VLAN). pirelay additionally carries maintenance_*
# labels marking it as planned-offline.
- job_name: "edge-nodes"
  static_configs:
    - labels:
        arch: "arm64"
        instance: "edge1"
        puppet_managed: "true"
        puppet_server: "puppet.iamworkin.lan"
        role: "ai-inference"
        vlan: "prod"
      targets:
        - "10.0.57.17:9100"
    - labels:
        arch: "arm64"
        instance: "edge2"
        puppet_managed: "true"
        puppet_server: "puppet.iamworkin.lan"
        role: "ci-runner"
        vlan: "prod"
      targets:
        - "10.0.57.16:9100"
    - labels:
        arch: "arm64"
        instance: "piez"
        role: "prototyping"
        vlan: "home"
      targets:
        - "10.0.58.25:9100"
    - labels:
        arch: "arm64"
        instance: "pirelay"
        maintenance_reason: "traffic-light-rig-awaiting-power"
        maintenance_state: "planned-offline"
        maintenance_ticket: "Q-SP41-PIRELAY"
        role: "relay-controller"
        vlan: "home"
      targets:
        - "10.0.58.113:9100"
# =======================================================================
# PiManager Application Metrics (relay states, temps, automation)
# =======================================================================
# PiManager application endpoints on piez and pirelay, scraped every 15s
# (faster than the 30s global default).
- job_name: "pimanager-app"
  metrics_path: /metrics
  scrape_interval: 15s
  static_configs:
    - labels:
        device: "pi4-ezconnect"
        instance: "piez"
        service: "pimanager"
        vlan: "home"
      targets:
        - "10.0.58.25:5000"
    # pirelay is tagged planned-offline through the maintenance_* labels.
    - labels:
        device: "pi3-ks0212"
        instance: "pirelay"
        maintenance_reason: "traffic-light-rig-awaiting-power"
        maintenance_state: "planned-offline"
        maintenance_ticket: "Q-SP41-PIRELAY"
        service: "pimanager"
        vlan: "home"
      targets:
        - "10.0.58.113:5100"
# Epson ET-3750 EcoTank printer SNMP (printer_mib module) via snmp-exporter.
- job_name: "snmp-printer"
  # Keep the interval well below Prometheus' default 5-minute lookback window.
  # At the previous 5m interval each sample aged out before the next one was
  # stored, so up{job="snmp-printer"} and prtMarker* briefly vanished on every
  # cycle; that resets the pending state of EpsonPrinterDown (for: 30m) and
  # makes the for: 0m ink alerts flap.
  # NOTE(review): confirm that polling every 2m does not keep the printer
  # from entering its sleep mode.
  scrape_interval: 2m
  # A sleeping printer answers slowly or not at all, hence the generous timeout.
  scrape_timeout: 30s
  metrics_path: /snmp
  params:
    auth: [public_v2]
    module: [printer_mib]
  static_configs:
    - labels:
        device_type: "printer"
        # Overwritten by the instance relabel rule below, so series are
        # reported with instance="10.0.58.107".
        instance: "epson-ecotank"
        vlan: "home"
      targets:
        - "10.0.58.107"
  relabel_configs:
    # Printer address -> ?target= parameter for the exporter.
    - source_labels: [__address__]
      target_label: __param_target
    # Same address is written to the instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Scrape the exporter service instead of the device.
    - replacement: snmp-exporter.monitoring.svc:9116
      target_label: __address__
# =============================================================================
# Print Services (CUPS + Print.Web on edge2)
# =============================================================================
# cups_exporter on edge2 (port 9628).
- job_name: "cups"
  scrape_interval: 30s
  static_configs:
    - labels:
        device_type: "printer"
        instance: "edge2"
        printer_model: "NuPrint 210"
        service: "cups"
      targets:
        - "10.0.57.16:9628"
# Print.Web OTEL metrics endpoint: print counters/histograms plus the Ollama
# runner gauges, served under /metrics/prometheus.
- job_name: "printweb-otel"
  metrics_path: /metrics/prometheus
  scrape_interval: 30s
  static_configs:
    - labels:
        device_type: "printer"
        instance: "print-web"
        printer_model: "NuPrint 210"
        service: "print-web"
      targets:
        - "10.0.57.16:5200"
# Blackbox HTTP probe of the Print.Web Blazor app on edge2:5200.
- job_name: "probe-printweb"
  metrics_path: /probe
  scrape_interval: 30s
  params:
    module: [http_2xx]
  static_configs:
    - labels:
        # NOTE(review): this value is overwritten by the instance relabel
        # rule below, so series carry the probed URL as instance. Drop one
        # of the two if the friendly name is what dashboards should show.
        instance: "print-web"
        service: "print-web"
      targets:
        - "http://10.0.57.16:5200/"
  relabel_configs:
    # Probed URL -> ?target= parameter for blackbox-exporter.
    - source_labels: [__address__]
      target_label: __param_target
    # Probed URL is also written to the instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Scrape the exporter service, not the probed application.
    - replacement: blackbox-exporter.monitoring.svc:9115
      target_label: __address__
# FlowerCore.RemoteDesktop web health (public cluster VIP)
# Module is https_internal — desktop.iamworkin.lan uses a step-ca leaf
# cert; blackbox does NOT trust step-ca root, so http_2xx fails with
# x509 unknown authority and probe_success=0 even when /health 200s.
# Blackbox probe of the RemoteDesktop /health endpoint using https_internal.
- job_name: "probe-remotedesktop"
  metrics_path: /probe
  scrape_interval: 30s
  params:
    module: [https_internal]
  static_configs:
    - labels:
        # No relabel rule in this job writes instance, so this static value
        # (the full URL) is what alert selectors have to match.
        instance: "https://desktop.iamworkin.lan/health"
        service: "remotedesktop-web"
      targets:
        - "https://desktop.iamworkin.lan/health"
  relabel_configs:
    # Probed URL -> ?target= parameter for blackbox-exporter.
    - source_labels: [__address__]
      target_label: __param_target
    # Scrape the exporter service, not the probed application.
    - replacement: blackbox-exporter.monitoring.svc:9115
      target_label: __address__
# Direct HTTPS scrape of FlowerCore.RemoteDesktop /metrics (counters).
- job_name: "fc-remotedesktop"
  scheme: https
  metrics_path: /metrics
  scrape_interval: 30s
  # Certificate verification is switched off for this scrape.
  # NOTE(review): if the step-ca root is available inside the Prometheus pod,
  # a ca_file would be the safer option — confirm before changing.
  tls_config:
    insecure_skip_verify: true
  static_configs:
    - labels:
        service: "remotedesktop-web"
      targets:
        - "desktop.iamworkin.lan"
# Blackbox HTTP probe of the CUPS web UI on edge2 (port 631), once a minute.
- job_name: "probe-cups"
  metrics_path: /probe
  scrape_interval: 60s
  params:
    module: [http_2xx]
  static_configs:
    - labels:
        # NOTE(review): overwritten by the instance relabel rule below;
        # series carry the probed URL as instance.
        instance: "cups-edge2"
        service: "cups"
      targets:
        - "http://10.0.57.16:631/"
  relabel_configs:
    # Probed URL -> ?target= parameter for blackbox-exporter.
    - source_labels: [__address__]
      target_label: __param_target
    # Probed URL is also written to the instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Scrape the exporter service, not CUPS itself.
    - replacement: blackbox-exporter.monitoring.svc:9115
      target_label: __address__
# =============================================================================
# AI Stack Health Probes (Blackbox Exporter)
# =============================================================================
# NOTE: probe-ollama-local and probe-agentzero-local were REMOVED
# 2026-04-26. They pointed at 10.0.58.100 (HOME VLAN) which is not
# reachable from cluster pods (firewalled). They had been firing as
# OllamaDown / AgentZeroDown since 2026-04-24. Workstation/AI-laptop
# Ollama and Agent Zero should be monitored via host-side Puppet
# (node_exporter on the box) once the AI laptop is running 24/7.
# Ollama API on edge1, Pi 5 (NUC Agent Zero): blackbox probe of /api/tags
# using the http_ollama module.
- job_name: "probe-ollama-edge1"
  metrics_path: /probe
  scrape_interval: 30s
  params:
    module: [http_ollama]
  static_configs:
    - labels:
        deployment: "nuc"
        gpu: "cpu"
        # NOTE(review): overwritten by the instance relabel rule below;
        # series carry the probed URL as instance.
        instance: "ollama-edge1"
        service: "ollama"
      targets:
        - "http://10.0.57.17:11434/api/tags"
  relabel_configs:
    # Probed URL -> ?target= parameter for blackbox-exporter.
    - source_labels: [__address__]
      target_label: __param_target
    # Probed URL is also written to the instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Scrape the exporter service, not Ollama itself.
    - replacement: blackbox-exporter.monitoring.svc:9115
      target_label: __address__
# Agent Zero Web UI — in-cluster (RKE2)
# Target uses short svc form (agent-zero.agent-zero.svc) NOT
# cluster.local FQDN — the *.cluster.local form gets rewritten to
# 10.0.56.200 (Traefik VIP) by the CoreDNS iamworkin.lan template +
# ndots:5 search-suffix expansion. Memory: feedback_coredns_ndots_template_collision.
# Blackbox HTTP probe of the in-cluster Agent Zero web UI (short svc DNS form).
- job_name: "probe-agentzero-nuc"
  metrics_path: /probe
  scrape_interval: 30s
  params:
    module: [http_2xx]
  static_configs:
    - labels:
        deployment: "nuc"
        # NOTE(review): overwritten by the instance relabel rule below;
        # series carry the probed URL as instance.
        instance: "agent-zero-nuc"
        service: "agent-zero"
      targets:
        - "http://agent-zero.agent-zero.svc:80/"
  relabel_configs:
    # Probed URL -> ?target= parameter for blackbox-exporter.
    - source_labels: [__address__]
      target_label: __param_target
    # Probed URL is also written to the instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Scrape the exporter service, not Agent Zero itself.
    - replacement: blackbox-exporter.monitoring.svc:9115
      target_label: __address__
# =============================================================================
# K8s Cluster State (kube-state-metrics, cert-manager, traefik)
# =============================================================================
# Use in-cluster ClusterIP service DNS — NOT NodePorts — so a same-node
# NodePort hairpin doesn't break the scrape (hit on rke2-agent1 hosting
# both prometheus and traefik on 2026-04-26: 10.0.56.12:30900 timed out
# from prometheus while .11/.13 worked). NodePorts at 30900-30902 are
# still useful for noc1-Podman-style external scrapers, but in-cluster
# we should always use the svc DNS form.
# kube-state-metrics — exposes K8s object state (pods, deployments, nodes)
# Required for KubeContainerRestartingFrequently / KubePodNotReady alerts.
# kube-state-metrics, reached through its ClusterIP service DNS name.
- job_name: "kube-state-metrics"
  scrape_interval: 30s
  static_configs:
    - labels:
        cluster: "rke2"
      targets:
        - "kube-state-metrics.kube-system.svc:8080"
# cert-manager — exposes certmanager_certificate_ready_status,
# certmanager_certificate_expiration_timestamp_seconds, etc. Drives the
# CertManagerCertificateNotReady / CertManagerCertificateRenewalFailed
# alerts. Memory: project_cert_manager_prometheus_scrape.
# cert-manager metrics service (certificate readiness / expiry series).
- job_name: "cert-manager"
  scrape_interval: 30s
  static_configs:
    - labels:
        cluster: "rke2"
      targets:
        - "cert-manager-metrics.cert-manager.svc:9402"
# Traefik — request rates, latency, TLS cert metadata, router state.
# ClusterIP svc routes to one of the traefik pods; per-pod scrape via
# the headless `traefik-metrics` selector would be nicer for failover
# visibility but the single-replica scrape is enough for steady-state.
# Traefik metrics through the traefik-metrics service, scraped every 15s.
- job_name: "traefik"
  scrape_interval: 15s
  static_configs:
    - labels:
        cluster: "rke2"
        service: "traefik"
      targets:
        - "traefik-metrics.traefik-system.svc:9100"
# Longhorn — exposes longhorn_volume_robustness, longhorn_backup_*,
# longhorn_node_status_*. Enables LonghornVolumeUnhealthy +
# LonghornBackupFailed alerts (no real visibility into Longhorn
# health before this — was relying on K8s events which are noisy
# transient lifecycle messages, not actionable signals).
# Longhorn manager metrics via the longhorn-backend service.
- job_name: "longhorn"
  scrape_interval: 30s
  static_configs:
    - labels:
        cluster: "rke2"
        service: "longhorn"
      targets:
        - "longhorn-backend.longhorn-system.svc:9500"
# FC web services through Traefik — single probe surface to spot any
# iamworkin.lan host returning non-200. Uses https_internal because all
# certs are step-ca leaves; blackbox would x509-fail with http_2xx.
# Some services need explicit healthcheck paths because root returns
# 404 (acme, guac) or 401 (grafana, prometheus). Drop them or point at
# the right endpoint — don't lower valid_status_codes globally because
# 401 from a healthy pod and 401 from an outage look identical.
# One blackbox probe per iamworkin.lan web service published through Traefik.
- job_name: "probe-traefik-services"
  metrics_path: /probe
  scrape_interval: 60s
  params:
    module: [https_internal]
  static_configs:
    - labels:
        probe_type: "traefik-service"
      targets:
        # Services whose root path answers 200 or 3xx
        - "https://gitea.iamworkin.lan/"
        - "https://argocd.iamworkin.lan/"
        - "https://intranet.iamworkin.lan/"
        - "https://signage.iamworkin.lan/"
        - "https://kiosk.iamworkin.lan/"
        - "https://media.iamworkin.lan/"
        - "https://mysql.iamworkin.lan/"
        - "https://php.iamworkin.lan/"
        - "https://zabbix.iamworkin.lan/"
        - "https://desktop.iamworkin.lan/"
        - "https://print.iamworkin.lan/"
        - "https://dns.iamworkin.lan/"
        - "https://chat.iamworkin.lan/"
        - "https://dist.iamworkin.lan/"
        - "https://dms.iamworkin.lan/"
        - "https://menuboard.iamworkin.lan/"
        - "https://messageboard.iamworkin.lan/"
        - "https://presentations.iamworkin.lan/"
        - "https://retail.iamworkin.lan/"
        - "https://ttsreader.iamworkin.lan/"
        # Services probed on an explicit healthcheck path
        - "https://fc-llm-bridge.iamworkin.lan/healthz"
        - "https://acme.iamworkin.lan/health"
        # NOTE: services intentionally NOT in this probe surface
        # - grafana.iamworkin.lan: every endpoint (incl. /api/health
        #   and /login) returns 401 behind Traefik basic-auth.
        #   Health covered by in-cluster monitoring-grafana scrape.
        # - prometheus.iamworkin.lan: same auth pattern. Health covered
        #   by the prometheus self-scrape job.
        # - guac.iamworkin.lan: deprecated — Guacamole moved to
        #   desktop.iamworkin.lan/guacamole/ (memory:
        #   feedback_traefik_cross_namespace_refs_disabled).
  relabel_configs:
    # Probed URL -> ?target= parameter for blackbox-exporter.
    - source_labels: [__address__]
      target_label: __param_target
    # instance is reduced to the host part of the URL (first capture group).
    - source_labels: [__param_target]
      regex: "https?://([^/:]+).*"
      target_label: instance
    # Scrape the exporter service, not the probed host.
    - replacement: blackbox-exporter.monitoring.svc:9115
      target_label: __address__
# =============================================================================
# Self-monitoring (K8s monitoring namespace)
# =============================================================================
# Grafana's own /metrics endpoint inside the monitoring namespace.
- job_name: "monitoring-grafana"
  metrics_path: /metrics
  static_configs:
    - labels:
        instance: "grafana-k8s"
        service: "grafana"
      targets:
        - "grafana.monitoring.svc:3000"
# Blackbox exporter's own metrics (the exporter process, not a probe).
- job_name: "monitoring-blackbox"
  static_configs:
    - labels:
        instance: "blackbox-k8s"
        service: "blackbox"
      targets:
        - "blackbox-exporter.monitoring.svc:9115"
recording-rules.yml: |
groups:
# Host-level CPU, memory, root-disk and network series derived from node_exporter.
- name: node-aggregations
  interval: 30s
  rules:
    # Non-idle CPU percentage, averaged per instance over 5 minutes.
    - expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
      record: instance:node_cpu_usage:avg5m
    # Memory in use as a percentage, derived from MemAvailable / MemTotal.
    - expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
      record: instance:node_memory_usage:percent
    # Used percentage of the filesystem mounted at "/".
    - expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100
      record: instance:node_disk_usage:percent
    # Receive rate with the byte counter multiplied by 8; lo, veth*, cali*
    # and flannel* devices are excluded.
    - expr: rate(node_network_receive_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8
      record: instance:node_network_receive:rate5m
    # Transmit rate, same device filter and scaling as the receive rule.
    - expr: rate(node_network_transmit_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8
      record: instance:node_network_transmit:rate5m
# Per-service roll-ups of the blackbox probe results.
- name: probe-aggregations
  interval: 30s
  rules:
    # 0 as soon as any probe of the service fails, 1 when all succeed.
    - expr: min by(service) (probe_success)
      record: service:probe_success:min
    # Mean probe duration across all probes of the service.
    - expr: avg by(service) (probe_duration_seconds)
      record: service:probe_duration:avg
# Print throughput, success ratio and latency derived from the Print.Web metrics.
- name: print-rates
  interval: 30s
  rules:
    # Enqueued jobs per minute (per-second rate times 60).
    - expr: rate(print_jobs_enqueued_total[5m]) * 60
      record: print:jobs_per_minute:rate5m
    # Completed rate divided by enqueued rate over the same 5-minute window.
    - expr: rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m])
      record: print:success_rate:ratio5m
    # 95th percentile taken from the print_job_duration_ms histogram buckets.
    - expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m]))
      record: print:job_duration_p95:5m
    # Largest remaining keep-alive per instance and model, printweb-otel job only.
    - expr: max by(instance, model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})
      record: print:ollama_runner_keepalive_remaining_seconds:max
# Relay switching activity, plus the Epson page counter that shares this group.
- name: relay-rates
  interval: 15s
  rules:
    # Number of value changes of each relay state series within the last hour.
    - expr: changes(pimanager_relay_state[1h])
      record: relay:state_changes:1h
    # Growth of the printer's prtMarkerLifeCount counter over the last 24 hours.
    - expr: increase(prtMarkerLifeCount{job="snmp-printer"}[24h])
      record: epson:pages_per_day:rate24h
alerts.yml: |
groups:
# Availability and latency of Ollama and Agent Zero, based on blackbox probe
# series selected by their service label.
- name: ai-stack
  rules:
    # Any probe with service="ollama" failing for two minutes.
    - alert: OllamaDown
      for: 2m
      expr: probe_success{service="ollama"} == 0
      labels:
        severity: warning
      annotations:
        description: "Ollama API at {{ $labels.instance }} has been unreachable for 2 minutes. Agent Zero FAISS memory will fail."
        summary: "Ollama is down on {{ $labels.deployment }}"
    # Any probe with service="agent-zero" failing for two minutes.
    - alert: AgentZeroDown
      for: 2m
      expr: probe_success{service="agent-zero"} == 0
      labels:
        severity: warning
      annotations:
        description: "Agent Zero web UI at {{ $labels.instance }} has been unreachable for 2 minutes."
        summary: "Agent Zero is down on {{ $labels.deployment }}"
    # Ollama probe taking longer than three seconds for five minutes.
    - alert: OllamaSlowResponse
      for: 5m
      expr: probe_duration_seconds{service="ollama"} > 3
      labels:
        severity: info
      annotations:
        description: "Ollama API response time exceeds 3s for 5 minutes. GPU may be overloaded."
        summary: "Ollama responding slowly on {{ $labels.deployment }}"
# CUPS and Print.Web on edge2: reachability, queue health, paper-roll
# lifecycle, dead-letter jobs and Ollama keep-alive.
- name: print-services
  rules:
    # cups_exporter scrape failing.
    - alert: CUPSExporterDown
      for: 2m
      expr: up{job="cups"} == 0
      labels:
        severity: warning
      annotations:
        description: "cups_exporter at edge2:9628 has been down for 2 minutes. CUPS monitoring offline."
        summary: "CUPS exporter unreachable on edge2"
    # Blackbox probe of the CUPS web UI failing.
    - alert: CUPSWebUIDown
      for: 3m
      expr: probe_success{job="probe-cups"} == 0
      labels:
        severity: warning
      annotations:
        description: "CUPS port 631 unreachable for 3 minutes. Network printing unavailable."
        summary: "CUPS web UI down on edge2"
    # Blackbox probe of Print.Web failing.
    - alert: PrintWebDown
      for: 2m
      expr: probe_success{job="probe-printweb"} == 0
      labels:
        severity: warning
      annotations:
        description: "FlowerCore Print.Web at edge2:5200 unreachable. API/MCP/Blazor printing unavailable."
        summary: "Print.Web is down on edge2"
    # At least one printer reported in the "stopped" state.
    - alert: CUPSPrinterStopped
      for: 5m
      expr: cups_printer_state_total{state="stopped"} > 0
      labels:
        severity: warning
      annotations:
        description: "A CUPS printer has been in stopped state for 5 minutes. Check USB connection or paper."
        summary: "CUPS printer stopped on edge2"
    # More than ten active jobs in the queue.
    - alert: CUPSJobBacklog
      for: 2m
      expr: cups_job_active_total > 10
      labels:
        severity: warning
      annotations:
        description: "CUPS has {{ $value }} active jobs queued. Possible printer jam, USB disconnect, or paper out."
        summary: "Print queue backlog on edge2 ({{ $value }} active jobs)"
    # Paper roll lifecycle alerts (XL Track I, 2026-04-26).
    # The gauge print_paper_remaining_percent (Print.Web OTEL) is the source
    # of truth; it is hydrated on startup from the active PaperRoll row.
    # alert_channel=thermal_print routes through irc-notify -> Print.Web
    # /api/print/alert, so the printer announces its own paper-out warning
    # on its remaining paper. Self-referential humor + operator nudge.
    #
    # Low band: strictly between 5% and 10%, so it does not overlap the
    # critical rule that follows.
    - alert: PrintPaperRollLow
      for: 5m
      expr: print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5
      labels:
        alert_channel: thermal_print
        severity: warning
      annotations:
        description: "NuPrint 210 paper roll has {{ $value | printf \"%.1f\" }}% remaining. Operator should load a fresh roll soon. Run /api/paper/status for the precise mm + estimated jobs left."
        summary: "Print roll low on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
    # Critical band: 5% or less.
    - alert: PrintPaperRollCritical
      for: 2m
      expr: print_paper_remaining_percent{job="printweb-otel"} <= 5
      labels:
        alert_channel: thermal_print
        severity: critical
      annotations:
        description: "NuPrint 210 paper roll at {{ $value | printf \"%.1f\" }}% — load a new roll NOW. The 50ft roll has a ~12% red-stripe zone; once paper passes that, the printer can run dry mid-job."
        summary: "Print roll critical on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
    # Dead-letter counter increased within the last 15 minutes.
    - alert: PrintJobDeadLetter
      for: 1m
      expr: increase(print_jobs_dead_letter_total[15m]) > 0
      labels:
        alert_channel: thermal_print
        severity: warning
      annotations:
        description: "{{ $value | printf \"%.0f\" }} print job(s) exhausted MaxRetries and need operator action. Open /print-log, filter Status=DeadLetter, click 'Retry From Start' after fixing the underlying cause (paper jam, USB disconnect, printer power-cycle)."
        summary: "Print job(s) entered dead-letter on edge2 ({{ $value | printf \"%.0f\" }} in last 15m)"
    # Sustained job rate above 30 per minute.
    - alert: CUPSHighJobRate
      for: 5m
      expr: rate(cups_job_total[5m]) * 60 > 30
      labels:
        severity: info
      annotations:
        description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop."
        summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)"
    # Ollama runner reporting more than 600s of keep-alive remaining.
    - alert: PrintOllamaRunnerLongKeepAlive
      for: 2m
      expr: ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"} > 600
      labels:
        alert_channel: thermal_print
        severity: warning
      annotations:
        description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
        summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
# GitHub Actions runners hosted on the Mac mini.
- name: macmini-runners
  rules:
    # Fires when a macmini-* runner reports 0, or when no such series exists.
    # In the absent() case the result has no runner label, so
    # {{ $labels.runner }} renders empty in the summary.
    - alert: MacMiniRunnerOffline
      for: 10m
      expr: (flowercore_github_runner_online{runner=~"macmini-.*"} == 0) or absent(flowercore_github_runner_online{runner=~"macmini-.*"})
      labels:
        service: github-runner
        severity: warning
      annotations:
        description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-<slug>.plist; runners survive reboot and do not require a GUI session."
        summary: "Mac mini GitHub runner offline ({{ $labels.runner }})"
# In-cluster Linux CI runner Deployments in the github-runner namespace.
- name: linux-runners
  rules:
    # Matches the bare "github-runner" Deployment and each listed
    # "github-runner-<repo>" variant; fires when one has zero ready replicas.
    - alert: LinuxRunnerOffline
      for: 5m
      expr: |
        kube_deployment_status_replicas_ready{
          namespace="github-runner",
          deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"
        } == 0
      labels:
        alert_channel: irc
        service: github-runner
        severity: warning
        team: ci
      annotations:
        description: "Deployment {{ $labels.deployment }} in namespace github-runner has 0 ready replicas for more than 5 minutes. CI jobs targeting this repo will queue until the runner pod restarts and re-registers with GitHub. Check pods with: kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }}. Check logs with: kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50. Common causes: PAT missing repo access, runner CrashLoopBackOff, or node/resource pressure."
        summary: "Linux CI runner offline: {{ $labels.deployment }}"
# FlowerCore RemoteDesktop: web reachability, metric freshness, warm-pool
# health, launch churn, recording events and TLS expiry.
- name: remote-desktop
  rules:
    # /health probe failing; the selector pins the full-URL instance label.
    - alert: RemoteDesktopWebDown
      for: 3m
      expr: probe_success{job="probe-remotedesktop",instance="https://desktop.iamworkin.lan/health"} == 0
      labels:
        severity: warning
      annotations:
        description: "https://desktop.iamworkin.lan/health probe has failed for 3 minutes. Catalog + session launch surface offline."
        summary: "FlowerCore RemoteDesktop web is down"
    # No fc_desktop_session_events_total series at all.
    - alert: RemoteDesktopMetricsStale
      for: 10m
      expr: absent(fc_desktop_session_events_total)
      labels:
        severity: warning
      annotations:
        description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape target is misconfigured or the web deployment stopped exporting metrics. Zabbix template carries the same 10m no-data trigger for cross-monitor parity."
        summary: "RemoteDesktop /metrics scrape returning no data"
    # PUBLISHER QUIRK: fc_desktop_pool_depleted / _deficit emit one
    # series per template per status (Ready/Warming/BelowDesiredSize/
    # Disabled), and the historical series for non-current statuses
    # stay at their last value. So just `_depleted > 0` fires forever
    # on any template that ever entered a bad state.
    #
    # SAFE PATTERN: alert only when the canonical "Ready" status
    # gauge does NOT report ready=1 for the enabled template. This
    # is the publisher's own canary — _ready{status="Ready"}==1 is
    # always the current "everything is fine" signal.
    - alert: RemoteDesktopPoolDepleted
      for: 5m
      expr: |
        group by(template) (fc_desktop_pool_ready{enabled="true"})
        unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1)
      labels:
        severity: warning
      annotations:
        description: "Pool for template {{ $labels.template }} has no Ready warm pod for 5 minutes. New launches will cold-start. Check pod-scheduling failures, image pull issues, or exhausted node capacity."
        summary: "RemoteDesktop pool depleted ({{ $labels.template }})"
    # Same pattern, but only fires when template explicitly reports
    # a sustained Warning-level alert state (current-status series).
    - alert: RemoteDesktopPoolDeficitSustained
      for: 10m
      expr: |
        fc_desktop_pool_deficit{enabled="true",alert_level="Warning"} > 0
        unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1)
      labels:
        severity: info
      annotations:
        description: "Pool {{ $labels.template }} has a persistent deficit of {{ $value }} warm pods AND no Ready series. Likely image pull, NFS affinity, or claim-init issue."
        summary: "RemoteDesktop pool {{ $labels.template }} below desired for 10m"
    # Summed launch-event rate above 20 per minute.
    - alert: RemoteDesktopSessionChurnSpike
      for: 5m
      expr: sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20
      labels:
        severity: info
      annotations:
        description: "Launch events exceed 20/min for 5 minutes. Could be a user-facing feature launch, a pooled template thrashing, or a runaway automation loop."
        summary: "RemoteDesktop launch rate high ({{ $value | printf \"%.0f\" }}/min)"
    # No recording-event series for 30 minutes while the launch counter is
    # non-zero; "and on()" joins the two label-free sides.
    - alert: RemoteDesktopRecordingEventsDropped
      for: 15m
      expr: absent_over_time(fc_desktop_session_events_total{event="recording"}[30m]) and on() (sum(fc_desktop_session_events_total{event="launch"}) > 0)
      labels:
        severity: info
      annotations:
        description: "No recording events in 30 minutes while launches are happening. Recording may be silently disabled on all templates (SessionRecordingEnabled=false), the guacd NFS mount may be unhealthy, or the retention sweep isn't emitting events. Not an error by itself — worth checking."
        summary: "RemoteDesktop recording events silent for 30m despite active launches"
    # Match by job — instance label carries full URL incl. /health,
    # not just hostname, so a hostname-only match never fires.
    - alert: RemoteDesktopTlsExpiry
      for: 6h
      expr: probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time() < 2 * 86400
      labels:
        severity: critical
      annotations:
        description: "The desktop.iamworkin.lan cert is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, and pfSense DNS overrides per the ACME DNS-01 gate."
        summary: "desktop.iamworkin.lan TLS cert expires within 2 days"
# Raspberry Pi fleet alerts built on the PiManager application metrics:
# app reachability, temperature, memory, disk, relay state and WiFi signal.
- name: pi-fleet
  rules:
    # PiManager scrape failing for three minutes.
    # NOTE(review): the pirelay target carries maintenance_state=
    # "planned-offline" and is not excluded here — confirm it is muted
    # downstream (e.g. in Alertmanager).
    - alert: PiManagerDown
      expr: up{job="pimanager-app"} == 0
      for: 3m
      labels:
        severity: warning
      annotations:
        summary: "PiManager down on {{ $labels.instance }}"
        description: "PiManager app on {{ $labels.instance }} ({{ $labels.device }}) unreachable for 3 minutes."
    # Warning threshold; overlaps the critical rule below by design of the
    # thresholds (both fire above 82).
    - alert: PiCpuTempHigh
      expr: pimanager_cpu_temperature_celsius > 75
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Pi CPU temperature high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"
    - alert: PiCpuTempCritical
      expr: pimanager_cpu_temperature_celsius > 82
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "Pi CPU temperature CRITICAL on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"
    - alert: PiMemoryHigh
      expr: pimanager_memory_usage_percent > 90
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Pi memory usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
    - alert: PiDiskHigh
      expr: pimanager_disk_usage_percent > 85
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "Pi disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
    # Every relay channel of an instance is off while the instance reports at
    # least one channel.
    # The left side is aggregated down to the instance label only, whereas
    # the channel-count series still carries job and the static target
    # labels. A bare "and" requires identical label sets, so it never
    # matched and the alert could not fire; "on (instance)" restricts the
    # match to the one label both sides share.
    - alert: RelayAllOff
      expr: sum by (instance) (pimanager_relay_state) == 0 and on (instance) (pimanager_relay_channel_count > 0)
      for: 0m
      labels:
        severity: info
      annotations:
        summary: "All relay channels OFF on {{ $labels.instance }}"
    # Signal weaker than -75 dBm; the "!= 0" term additionally excludes a
    # zero reading.
    - alert: PiWifiWeak
      expr: pimanager_wifi_signal_dbm < -75 and pimanager_wifi_signal_dbm != 0
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "Weak WiFi signal on {{ $labels.instance }} ({{ $value }}dBm)"
# SNMP-monitored devices: Epson printer consumables/reachability and the
# Synology NAS.
- name: snmp-devices
  rules:
    # Supply level below 15; the "> 0" term drops zero and negative readings.
    - alert: EpsonInkLow
      for: 0m
      expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 15 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0
      labels:
        alert_channel: thermal_print
        severity: warning
      annotations:
        summary: "Epson ink low: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
    # Supply level below 5; fires together with EpsonInkLow because the two
    # ranges overlap.
    - alert: EpsonInkCritical
      for: 0m
      expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 5 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0
      labels:
        alert_channel: thermal_print
        severity: critical
      annotations:
        summary: "Epson ink CRITICAL: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
    # for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min
    # of idle and SNMP times out, so 5m for: would page nightly. A
    # genuine printer outage (jam, disconnected) lasts well over 30m.
    - alert: EpsonPrinterDown
      for: 30m
      expr: up{job="snmp-printer"} == 0
      labels:
        severity: warning
      annotations:
        summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)"
    # Used/size ratio of any hrStorage entry on the NAS above 85 percent.
    - alert: SynologyDiskLow
      for: 10m
      expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85
      labels:
        alert_channel: thermal_print
        severity: warning
      annotations:
        summary: "Synology NAS storage high ({{ $value | printf \"%.1f\" }}%)"
    # NAS SNMP scrape failing for three minutes.
    - alert: SynologyDown
      for: 3m
      expr: up{job="snmp-nas"} == 0
      labels:
        alert_channel: thermal_print
        severity: critical
      annotations:
        summary: "Synology NAS SNMP unreachable"
# Host-level alerts computed from node_exporter series.
- name: infrastructure
  rules:
    # Scrape failure on any of the four listed node-exporter jobs.
    # NOTE(review): the macmini-node job is not part of this selector, and
    # the edge-nodes job contains pirelay, which is labelled
    # maintenance_state="planned-offline" — confirm both are intended.
    - alert: NodeDown
      for: 2m
      expr: up{job=~"node-exporter|rke2-nodes|edge-nodes|k8s-node-exporter"} == 0
      labels:
        severity: critical
      annotations:
        summary: "Node {{ $labels.instance }} is down"
    # Non-idle CPU above 85 percent, averaged per instance.
    - alert: HighCPU
      for: 10m
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
      labels:
        severity: warning
      annotations:
        summary: "High CPU on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
    # Memory in use (from MemAvailable) above 90 percent.
    - alert: HighMemory
      for: 5m
      expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
      labels:
        severity: warning
      annotations:
        summary: "High memory usage on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
    # Filesystem mounted at "/" more than 85 percent used.
    - alert: DiskSpaceLow
      for: 10m
      expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 85
      labels:
        severity: warning
      annotations:
        summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
# Puppet agent + service alerts.
# Mirror of FlowerCore.Notes/scripts/monitoring/alerts.yml `puppet` group
# so a future migration to in-cluster Prometheus inherits the ruleset.
# Source-of-truth for the live Podman Prometheus on noc1 is the Notes file.
# See feedback_monitoring_k8s_target_vs_live_podman.
- name: puppet
rules:
- alert: PuppetAgentReportStale
expr: puppet_last_run_age_seconds > 7200
for: 30m
labels:
severity: warning
alert_channel: irc
annotations:
summary: "Puppet agent {{ $labels.instance }} hasn't reported in over 2h"
description: "Last run age: {{ $value | humanizeDuration }}. The puppet agent on {{ $labels.instance }} may be stopped, the node may be powered off, or noc1 may be unreachable from this node."
runbook: "1. SSH to node (via noc1 jumpbox if needed) 2. sudo systemctl status puppet 3. sudo puppet agent -t --noop to force a run 4. Check r10k: ssh fcadmin@10.0.56.10 'sudo podman logs openvoxserver --tail 50' 5. Verify noc1 reachability: ping puppet.iamworkin.lan"
- alert: PuppetAgentReportCritical
expr: puppet_last_run_age_seconds > 86400
for: 1h
labels:
severity: critical
alert_channel: irc
annotations:
summary: "Puppet agent {{ $labels.instance }} silent for over 24h — node is unmanaged"
description: "Last run age: {{ $value | humanizeDuration }}. Node {{ $labels.instance }} has not submitted a Puppet report in over 24 hours. Config drift is accumulating — investigate immediately. If intentional (maintenance), add to the exclusion filter or silence in Grafana."
runbook: "URGENT: 1. Check node power state 2. SSH via noc1 jumpbox: ssh fcadmin@10.0.56.10 then ssh <node> 3. sudo systemctl status puppet 4. sudo systemctl start puppet + sudo puppet agent -t 5. Check for network partitions (VLAN connectivity to 10.0.56.10) 6. If node was recently reimaged: sudo puppet agent -t to re-register with new SSL cert"
# Sprint 33 Cx-7 Phase B (2026-05-25 postmortem follow-up):
# Detects puppet.service in failed state — distinct from PuppetAgentReportStale
# which catches "agent hasn't run." This catches "systemd gave up restarting it"
# (CA-verify loop or other fatal exit). Requires node-exporter systemd collector
# enabled with --collector.systemd. If `node_systemd_unit_state` has no series
# for a node, the collector is disabled there — flag in postmortem follow-up.
- alert: PuppetServiceFailed
expr: node_systemd_unit_state{name="puppet.service",state="failed"} == 1
for: 5m
labels:
severity: warning
alert_channel: irc
annotations:
summary: "Puppet service failed on {{ $labels.instance }}"
description: "puppet.service on {{ $labels.instance }} has been in failed state for 5+ minutes. systemd has stopped auto-restarting (CA-verify-loop or other exit). Manual `systemctl status puppet` confirms. Run `sudo systemctl start puppet` to recover; investigate journal for root cause."
runbook_url: "https://github.com/astoltz/FlowerCore.Notes/blob/master/memory/feedback_puppet_service_dead_after_ca_loop_alert_misreads.md"
# K8s pod-state alerts. Require kube-state-metrics scrape (added
# 2026-04-26 — see scrape_configs above). Would have surfaced the
# agent-zero ollama-proxy 172x crash-loop instead of letting it
# silently churn for ~3 days.
- name: kubernetes-state
rules:
- alert: KubeContainerRestartingFrequently
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
for: 15m
labels:
severity: warning
annotations:
summary: "{{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} restarting >5x/hr"
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
- alert: KubeContainerCrashLooping
expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
for: 5m
labels:
severity: critical
alert_channel: thermal_print
annotations:
summary: "{{ $labels.namespace }}/{{ $labels.pod }} crashlooping ({{ $value | printf \"%.0f\" }} restarts/15m)"
description: "Container {{ $labels.container }} restarted {{ $value | printf \"%.0f\" }} times in 15 minutes — actively crashlooping."
- alert: KubePodNotReady
expr: sum by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Failed|Unknown"}) > 0
for: 15m
labels:
severity: warning
annotations:
summary: "{{ $labels.namespace }}/{{ $labels.pod }} not Ready for >15m"
description: "Pod is in a non-Running, non-Succeeded phase for over 15 minutes. Common causes: ImagePullBackOff (registry/Nexus down, wrong image tag), pending PVC, scheduling failure (taint/resources)."
- alert: KubePodImagePullBackOff
expr: sum by(namespace, pod) (kube_pod_container_status_waiting_reason{reason=~"ImagePullBackOff|ErrImagePull"}) > 0
for: 10m
labels:
severity: warning
annotations:
summary: "{{ $labels.namespace }}/{{ $labels.pod }} ImagePullBackOff for >10m"
description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
# A filtering comparison keeps the LEFT operand's sample, so the available
# count is placed on the left: {{ $value }} in the description is then the
# number of available replicas. The desired count is not a label on either
# series, so the description no longer references $labels.spec_replicas.
- alert: KubeDeploymentReplicasMismatch
  expr: kube_deployment_status_replicas_available != kube_deployment_spec_replicas
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
    description: "{{ $value }} replicas are available, which does not match the Deployment's spec.replicas. Likely a rollout stuck on probe failure, scheduling, or PVC."
# Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
# cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
# outage (21h) hit because no alert fired on the rising multus working
# set — only downstream blackbox / Traefik / service alerts. With
# 1Gi limit (bluejay-infra@eb8693e), 80% = ~800MiB; steady-state
# runs ~150-250MiB so this only fires when an avalanche starts.
- alert: MultusMemoryPressure
expr: |
container_memory_working_set_bytes{container="kube-multus"}
/ container_spec_memory_limit_bytes{container="kube-multus"} > 0.8
for: 5m
labels:
severity: critical
alert_channel: thermal_print
annotations:
summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m"
description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)."
# Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the
# operator-leak avalanche pattern BEFORE it cascades into a multus
# CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder)
# emitting pods without ownerReferences will accumulate them when
# the operator crashes. >25 pending pods in any namespace for 30m
# is the signal to investigate the reconciler.
- alert: NamespacePendingPodBacklog
expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25
for: 30m
labels:
severity: warning
annotations:
summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m"
description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade."
# Longhorn storage health alerts. Required: longhorn scrape job
# (added 2026-04-26 — see scrape_configs above). The K8s events
# for "snapshot becomes not ready to use" are transient lifecycle
# noise, not actionable — these alerts use the actual Longhorn
# gauges that reflect persistent state.
- name: longhorn-storage
rules:
# longhorn_volume_robustness encodes the state in the sample VALUE:
# 0=unknown, 1=healthy, 2=degraded, 3=faulted. Detached volumes report
# 0 — that's normal for unattached PVCs — so comparing against 2
# already excludes them; no extra "attached" filter is needed.
- alert: LonghornVolumeDegraded
  expr: longhorn_volume_robustness == 2
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "Longhorn volume {{ $labels.volume }} degraded for >15m"
    description: "Volume {{ $labels.volume }} on node {{ $labels.node }} has been degraded (one or more replicas unhealthy) for 15+ minutes. Auto-rebuild may need help — check 'kubectl describe volume.longhorn.io {{ $labels.volume }} -n longhorn-system'."
# The robustness state is the gauge value (3 = faulted), not a label.
- alert: LonghornVolumeFaulted
  expr: longhorn_volume_robustness == 3
  for: 5m
  labels:
    severity: critical
    alert_channel: thermal_print
  annotations:
    summary: "Longhorn volume {{ $labels.volume }} FAULTED"
    description: "Volume {{ $labels.volume }} on node {{ $labels.node }} is faulted — all replicas unavailable. Data inaccessible. Manual intervention required."
# No backup in 36h indicates the daily-backup recurringJob is
# silently failing. Allows for one missed run + slack.
# NOTE(review): the value subtracted from time() is
# longhorn_backup_state * longhorn_backup_actual_size_bytes, which (going
# by the metric name) is a size in bytes rather than a completion
# timestamp, so this may not actually measure backup age. Verify against
# the Longhorn metrics reference before relying on this alert.
- alert: LonghornBackupStale
expr: |
(time() - max by(volume) (longhorn_backup_state{state="Completed"} * on(backup) group_left() longhorn_backup_actual_size_bytes)) > 36 * 3600
for: 1h
labels:
severity: warning
annotations:
summary: "Longhorn volume {{ $labels.volume }} has no completed backup in >36h"
description: "Daily backup recurringJob (cron 0 2 * * *) appears to have skipped this volume. Check 'kubectl get backups.longhorn.io -n longhorn-system' and the daily-backup CronJob logs."
- alert: LonghornNodeUnhealthy
expr: longhorn_node_status{condition="ready",condition_reason!=""} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Longhorn node {{ $labels.node }} not Ready"
description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."
# ============================================================
# FC Signage Marquee Performance — Track 3 + 8 (2026-05-06)
# Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
# Source-of-truth for the live Podman Prometheus on noc1 is the
# Notes file; this K8s ConfigMap exists so a future migration to
# in-cluster Prometheus inherits the ruleset automatically.
# See feedback_monitoring_k8s_target_vs_live_podman.
# ============================================================
- name: fc-signage-marquee
rules:
- alert: MarqueeDroppedFramesHigh
expr: |
(
sum by (renderer, phase, node_id) (rate(marquee_dropped_frames_total[5m]))
/
sum by (renderer, phase, node_id) (rate(marquee_render_latency_ms_count[5m]))
) > 0.05
unless on()
absent_over_time(marquee_dropped_frames_total[7d])
for: 5m
labels:
severity: warning
service: signage
alert_channel: irc
annotations:
summary: "Marquee dropped-frame rate >5% on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
description: "Renderer {{ $labels.renderer }} on {{ $labels.node_id }} drops >5% of frames during {{ $labels.phase }}. Animation visibly stuttery."
- alert: MarqueeRenderLatencyP99High
expr: |
histogram_quantile(
0.99,
sum by (renderer, phase, node_id, le) (rate(marquee_render_latency_ms_bucket[5m]))
) > 16
unless on()
absent_over_time(marquee_render_latency_ms_bucket[7d])
for: 10m
labels:
severity: warning
service: signage
alert_channel: irc
annotations:
summary: "Marquee render latency p99 > 16ms on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
description: "Per-frame render latency p99 has exceeded the Pi-class 16ms budget for 10 minutes."
- alert: MarqueeAnimationDurationDrift
expr: |
abs(
histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m])))
-
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
)
/
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
> 0.10
unless on()
absent_over_time(marquee_animation_duration_ms_bucket[7d])
for: 15m
labels:
severity: info
service: signage
alert_channel: irc
annotations:
summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
# =============================================================================
# ConfigMap: Blackbox Exporter Configuration
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
name: blackbox-config
namespace: monitoring
data:
blackbox.yml: |
modules:
http_2xx:
prober: http
timeout: 5s
http:
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
valid_status_codes: [200]
method: GET
fail_if_body_not_matches_regexp: []
preferred_ip_protocol: ip4
http_ollama:
prober: http
timeout: 5s
http:
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
valid_status_codes: [200]
method: GET
fail_if_body_not_matches_regexp:
- '"models"'
preferred_ip_protocol: ip4
# https_internal — for Traefik-fronted services with step-ca leaf
# certs. blackbox does not trust the step-ca root CA, so http_2xx
# against any *.iamworkin.lan host fails with x509 unknown authority.
# Redirects + multiple status codes are accepted because some hosts
# 302 to /login or /scalar.
https_internal:
prober: http
timeout: 10s
http:
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
valid_status_codes: [200, 301, 302, 303, 307, 308]
method: GET
follow_redirects: true
preferred_ip_protocol: ip4
tls_config:
insecure_skip_verify: true
# =============================================================================
# ConfigMap: IRC Notify Script
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
name: irc-notify-script
namespace: monitoring
data:
notify.py: |
#!/usr/bin/env python3
"""HTTP->IRC alert relay with thermal-printer DIGEST forwarding.
Listens on :9119, posts to #alerts on UnrealIRCd, forwards to Print.Web
/api/print/alert. Thermal printing is BATCHED into hourly digests by
default so the printer no longer spam-fires per Grafana webhook.
Routing (per Grafana webhook alert):
- IRC: always per-event (operator likes the stream)
- Thermal printer:
* label alert_channel=thermal_print_immediate -> enqueue; the first
arrival of a new fingerprint flushes the whole digest NOW
* severity in {critical,disaster,page} OR
label alert_channel=thermal_print -> enqueue into hourly digest
* everything else -> IRC only
- RESOLVED webhooks remove the alert from the digest buffer
Env vars (defaults preserve old behavior on first deploy):
THERMAL_PRINT_ENABLED default "true" - master kill switch
BATCH_INTERVAL_MIN default "60" - minutes between digest prints
BATCH_MAX_PENDING default "50" - force-flush threshold
HTTP surface:
POST / - Grafana webhook entry
POST /flush - manual digest flush (idempotent)
GET / - status + config + buffer depth + stats
"""
import json, os, socket, sys, threading, time
from collections import defaultdict
from datetime import datetime, timezone
from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.request import Request, urlopen
THERMAL_PRINT_ENABLED = os.environ.get("THERMAL_PRINT_ENABLED", "true").lower() == "true"
BATCH_INTERVAL_MIN = int(os.environ.get("BATCH_INTERVAL_MIN", "60"))
BATCH_MAX_PENDING = int(os.environ.get("BATCH_MAX_PENDING", "50"))
IRC_HOST = os.environ.get("IRC_HOST", "unrealircd.irc.svc")
IRC_PORT = int(os.environ.get("IRC_PORT", "6667"))
IRC_NICK = os.environ.get("IRC_NICK", "grafana-bot")
IRC_CHANNEL = os.environ.get("IRC_CHANNEL", "#alerts")
PRINT_WEB_URL = os.environ.get("PRINT_WEB_URL", "http://10.0.57.16:5200/api/print/alert")
_buffer_lock = threading.Lock()
_buffer = {} # fingerprint -> {"alert": dict, "first_seen": float, "last_seen": float}
_last_flush_time = time.time()
_stats = {"webhooks_received": 0, "irc_sent": 0, "print_immediate": 0,
"digest_flushed": 0, "buffer_dedup": 0, "buffer_added": 0,
"buffer_resolved": 0, "started_at": time.time()}
def send_irc(message):
    """Deliver `message` to IRC_CHANNEL over a short-lived connection.

    Registers as IRC_NICK, waits up to 10 seconds for the 001 welcome
    numeric (answering server PINGs meanwhile), joins the channel, sends
    every non-blank line of `message` as its own PRIVMSG, then quits.

    Returns True when every line was written to the socket, False on any
    failure. Errors are logged to stderr and never raised. The socket is
    always closed, including on error paths.
    """
    sock = None
    try:
        sock = socket.create_connection((IRC_HOST, IRC_PORT), timeout=15)
        sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
        sock.sendall(f"USER {IRC_NICK} 0 * :Grafana Alert Bot\r\n".encode())
        registered = False
        deadline = time.time() + 10
        pending = ""
        while not registered and time.time() < deadline:
            try:
                data = sock.recv(4096).decode("utf-8", errors="replace")
            except socket.timeout:
                break
            if not data:
                break
            pending += data
            # Consume only complete lines; the trailing fragment stays in
            # `pending`, so each PING is answered exactly once.
            *complete, pending = pending.split("\r\n")
            for line in complete:
                if line.startswith("PING"):
                    # Slice instead of split(...)[1] so a bare "PING"
                    # cannot raise IndexError.
                    token = line[4:].strip()
                    sock.sendall(f"PONG {token}\r\n".encode())
                elif " 001 " in line:
                    registered = True
        if not registered:
            return False
        sock.sendall(f"JOIN {IRC_CHANNEL}\r\n".encode())
        time.sleep(0.5)
        try:
            sock.recv(4096)  # drain the JOIN reply; its content is not used
        except socket.timeout:
            pass  # a quiet server must not abort delivery
        for raw in message.split("\n"):
            # Alert text comes from webhook payloads: strip CR/NUL so it
            # cannot terminate the PRIVMSG early and inject IRC commands.
            line = raw.replace("\r", "").replace("\0", "")
            if line.strip():
                sock.sendall(f"PRIVMSG {IRC_CHANNEL} :{line}\r\n".encode())
                time.sleep(0.3)  # pacing between lines
        time.sleep(0.5)
        sock.sendall(b"QUIT :alert delivered\r\n")
        _stats["irc_sent"] += 1
        return True
    except Exception as e:
        print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
        return False
    finally:
        if sock is not None:
            sock.close()
def post_thermal(payload, kind):
    """POST `payload` as JSON to PRINT_WEB_URL.

    payload: dict serialised with json.dumps; its 'title' is used in logs.
    kind:    short tag used in log lines; the value "immediate" also
             increments the print_immediate counter.

    Returns True when the request completed, False when thermal printing
    is disabled or the request failed (the failure is logged, not raised).
    """
    if not THERMAL_PRINT_ENABLED:
        print(f"[irc-notify] thermal disabled; skip {kind} ({payload.get('title','?')[:40]})", file=sys.stderr)
        return False
    try:
        req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
                      headers={"Content-Type": "application/json"}, method="POST")
        # The response body is not needed, but the response object must
        # still be closed so its connection is released.
        with urlopen(req, timeout=10):
            pass
        if kind == "immediate":
            _stats["print_immediate"] += 1
        print(f"[irc-notify] thermal {kind} sent: {payload.get('title','?')[:50]}", file=sys.stderr)
        return True
    except Exception as e:
        print(f"[irc-notify] thermal {kind} failed: {e}", file=sys.stderr)
        return False
def fingerprint_of(alert):
    """Return a dedup key for `alert`.

    Uses the alert's own non-empty "fingerprint" when present. Otherwise
    builds "<alertname>/<namespace>/<target>", where target is the first
    non-empty label among pod, instance, deployment, statefulset and
    namespace (empty string when none is set).
    """
    explicit = alert.get("fingerprint", "")
    if explicit:
        return explicit
    labels = alert.get("labels", {})
    target = ""
    for key in ("pod", "instance", "deployment", "statefulset", "namespace"):
        candidate = labels.get(key)
        if candidate:
            target = candidate
            break
    alertname = labels.get("alertname", "?")
    namespace = labels.get("namespace", "")
    return f"{alertname}/{namespace}/{target}"
def is_critical(alert):
    """True when the severity label is critical, disaster or page (any case)."""
    severity = alert.get("labels", {}).get("severity", "")
    return severity.lower() in {"critical", "disaster", "page"}
def is_immediate_label(alert):
    """True when the alert carries alert_channel=thermal_print_immediate."""
    labels = alert.get("labels", {})
    channel = labels.get("alert_channel")
    return channel == "thermal_print_immediate"
def is_batched_label(alert):
    """True when the alert carries alert_channel=thermal_print."""
    labels = alert.get("labels", {})
    channel = labels.get("alert_channel")
    return channel == "thermal_print"
def add_to_digest(alert):
    """Track `alert` in the digest buffer, keyed by its fingerprint.

    - resolved alert: its entry (if any) is removed from the buffer
    - fingerprint already buffered: stored alert and last_seen refreshed
    - new fingerprint: a fresh entry is inserted

    Returns True only when a new entry was inserted; False for a removal,
    a refresh, or when thermal printing is disabled (nothing is buffered).
    """
    if not THERMAL_PRINT_ENABLED:
        return False
    key = fingerprint_of(alert)
    resolved = alert.get("status", "firing").lower() == "resolved"
    with _buffer_lock:
        entry = _buffer.get(key)
        if resolved:
            if entry is not None:
                del _buffer[key]
                _stats["buffer_resolved"] += 1
            return False
        if entry is not None:
            entry["last_seen"] = time.time()
            entry["alert"] = alert
            _stats["buffer_dedup"] += 1
            return False
        _buffer[key] = {"alert": alert, "first_seen": time.time(), "last_seen": time.time()}
        _stats["buffer_added"] += 1
        return True
def build_digest_payload(items=None):
    """Render buffer entries as a Print.Web digest payload.

    items: optional list of buffer entries ({"alert": ..., ...}). When
           omitted, a snapshot of the live buffer is taken under the lock.

    Returns the payload dict, or None when there is nothing to report.
    Alerts are grouped by alertname; each group lists at most five targets
    followed by a "(+N)" remainder marker.
    """
    if items is None:
        with _buffer_lock:
            items = list(_buffer.values())
    if not items:
        return None
    by_name = defaultdict(list)
    for item in items:
        labels = item["alert"].get("labels", {})
        by_name[labels.get("alertname", "Unknown")].append(item)
    lines = []
    for name, group in sorted(by_name.items()):
        targets = []
        for it in group[:5]:
            labels = it["alert"].get("labels", {})
            t = (labels.get("pod") or labels.get("instance") or labels.get("deployment")
                 or labels.get("statefulset") or labels.get("namespace") or "?")
            targets.append(t)
        more = f" (+{len(group)-5})" if len(group) > 5 else ""
        sevs = sorted({it["alert"].get("labels", {}).get("severity", "warning") for it in group})
        lines.append(f"[{'/'.join(sevs)}] {name} x{len(group)}: {', '.join(targets)}{more}")
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    title = f"Alert digest: {len(items)} firing"
    body = "\n".join([
        f"=== {title} ===",
        f"as of {now}",
        "",
        *lines,
        "",
        "Stream: #alerts (IRC) | Triage: grafana-noc1.iamworkin.lan",
        "Force-flush: POST irc-notify.monitoring.svc:9119/flush",
    ])
    return {"title": title, "severity": "Warning", "host": "monitoring",
            "message": body, "eventId": f"digest-{int(time.time())}",
            "source": "Grafana digest", "status": "PROBLEM", "acknowledged": False}
def flush_digest():
    """Print the buffered alerts as one digest and empty the buffer.

    Returns True when a digest was sent, False when the buffer was empty
    or the print request failed. As before, the flushed entries are
    dropped even when the print request fails.
    """
    # Snapshot and clear in ONE critical section. Clearing after the HTTP
    # POST would also wipe alerts buffered while the request was in
    # flight, and those would never appear in any digest.
    with _buffer_lock:
        items = list(_buffer.values())
        _buffer.clear()
    payload = build_digest_payload(items)
    if payload is None:
        print("[irc-notify] flush: buffer empty, no digest sent", file=sys.stderr)
        return False
    sent = post_thermal(payload, "digest")
    if sent:
        _stats["digest_flushed"] += 1
    return sent
def digest_loop():
    """Background worker that flushes the digest buffer.

    Every 15 seconds it flushes when BATCH_INTERVAL_MIN minutes have
    passed since the last flush, or else when the buffer holds at least
    BATCH_MAX_PENDING entries. Any exception is logged and followed by a
    60-second pause; the loop never exits.
    """
    global _last_flush_time
    while True:
        try:
            tick = time.time()
            interval_due = tick - _last_flush_time >= BATCH_INTERVAL_MIN * 60
            if interval_due:
                print(f"[irc-notify] digest tick: interval reached ({BATCH_INTERVAL_MIN}m); buffer={len(_buffer)}", file=sys.stderr)
            elif len(_buffer) >= BATCH_MAX_PENDING:
                print(f"[irc-notify] digest tick: buffer full ({len(_buffer)}); force flush", file=sys.stderr)
            else:
                # Nothing due on this tick.
                time.sleep(15)
                continue
            flush_digest()
            _last_flush_time = tick
            time.sleep(15)
        except Exception as err:
            print(f"[irc-notify] digest loop error: {err}", file=sys.stderr)
            time.sleep(60)
class Handler(BaseHTTPRequestHandler):
    """HTTP surface of the relay.

    POST /flush   - flush the digest buffer now
    POST <other>  - Grafana webhook: relay each alert to IRC, then route
                    it to the thermal digest
    GET  <any>    - JSON status: config, buffer depth, stats
    """

    def _reply_json(self, code, body_bytes):
        """Send a complete JSON response with the given status code."""
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(body_bytes)

    def do_POST(self):
        # Declared once up front; the immediate-print branch rebinds it.
        global _last_flush_time
        if self.path == "/flush":
            ok = flush_digest()
            self._reply_json(200, json.dumps({"flushed": ok, "buffer_after": len(_buffer)}).encode())
            return
        _stats["webhooks_received"] += 1
        # A malformed request gets a 400 instead of an unhandled exception,
        # which would end the request without any response being sent.
        try:
            length = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(length)) if length > 0 else {}
        except ValueError as e:  # bad Content-Length, bad UTF-8 or bad JSON
            print(f"[irc-notify] bad webhook body: {e}", file=sys.stderr)
            self._reply_json(400, b'{"status":"bad request"}')
            return
        if not isinstance(body, dict):
            print("[irc-notify] bad webhook body: not a JSON object", file=sys.stderr)
            self._reply_json(400, b'{"status":"bad request"}')
            return
        # `or []` also covers an explicit "alerts": null.
        for alert in body.get("alerts") or []:
            if not isinstance(alert, dict):
                continue
            status = alert.get("status", "unknown").upper()
            labels = alert.get("labels", {})
            name = labels.get("alertname", "Unknown")
            summary = alert.get("annotations", {}).get("summary", "")
            desc = alert.get("annotations", {}).get("description", "")
            severity = labels.get("severity", "")
            # \x03NN is an mIRC colour code; any status other than FIRING
            # is rendered with the RESOLVED tag.
            icon = "\x0304[FIRING]\x03" if status == "FIRING" else "\x0303[RESOLVED]\x03"
            sev_tag = f" [{severity}]" if severity else ""
            msg = f"{icon}{sev_tag} {name}: {summary}"
            if desc:
                msg += f"\n  {desc}"
            send_irc(msg)
            # Thermal routing — EVERYTHING (including criticals) goes into
            # the hourly digest. Only the explicit
            # `alert_channel=thermal_print_immediate` label bypasses, and
            # even that flushes-the-current-digest rather than printing a
            # standalone job, so the same fingerprint can't spam the
            # printer per webhook cycle.
            if status == "RESOLVED":
                add_to_digest(alert)  # removes from buffer
                continue
            if is_immediate_label(alert):
                # Explicit opt-in for "paper this NOW" — first arrival of a
                # new fingerprint triggers an immediate digest flush; repeat
                # webhooks for the same fingerprint dedupe in the buffer
                # until the next interval or until the alert resolves.
                new_in_buffer = add_to_digest(alert)
                if new_in_buffer:
                    flush_digest()
                    _last_flush_time = time.time()
            elif is_critical(alert) or is_batched_label(alert):
                add_to_digest(alert)
            # else: IRC-only (warnings without thermal_print label)
        self._reply_json(200, b'{"status":"ok"}')

    def do_GET(self):
        # Read the buffer under the lock so depth and names are consistent.
        with _buffer_lock:
            alertnames = sorted({it["alert"].get("labels", {}).get("alertname", "?") for it in _buffer.values()})
            depth = len(_buffer)
        info = {
            "service": "irc-notify",
            "config": {"thermal_print_enabled": THERMAL_PRINT_ENABLED,
                       "batch_interval_min": BATCH_INTERVAL_MIN,
                       "batch_max_pending": BATCH_MAX_PENDING,
                       "irc_target": f"{IRC_HOST}:{IRC_PORT} {IRC_CHANNEL}",
                       "print_web_url": PRINT_WEB_URL},
            "buffer": {"depth": depth, "alertnames": alertnames,
                       "seconds_since_last_flush": int(time.time() - _last_flush_time),
                       "seconds_until_next_flush": max(0, int(BATCH_INTERVAL_MIN*60 - (time.time() - _last_flush_time)))},
            "stats": _stats,
        }
        self._reply_json(200, json.dumps(info, indent=2).encode())

    def log_message(self, format, *args):
        # Only the first format argument is written to stderr.
        print(f"[irc-notify] {args[0]}", file=sys.stderr)
if __name__ == "__main__":
threading.Thread(target=digest_loop, daemon=True).start()
server = HTTPServer(("0.0.0.0", 9119), Handler)
print(f"[irc-notify] :9119 -> IRC {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} | thermal={'ON' if THERMAL_PRINT_ENABLED else 'OFF'} | digest={BATCH_INTERVAL_MIN}m max={BATCH_MAX_PENDING}", file=sys.stderr)
server.serve_forever()
# =============================================================================
# SNMP Exporter Auth Secret
# =============================================================================
# The full snmp.yml (~2MB, auto-generated) exceeds the 1MB ConfigMap limit.
# Strategy: store SNMP auth credentials in a Secret, and use an init container
# to download the full snmp.yml from Gitea, or split into multiple ConfigMaps.
# For now, we mount a minimal auth-only config and rely on the default modules
# bundled in the snmp-exporter image. To use custom modules, apply
# snmp-config.yaml separately (see comments in that file).
---
# NOTE(review): the SNMP community string and SNMPv3 passphrases below
# are stored in plaintext in this manifest. The Grafana credentials in
# this same file are synced from 1Password through a OnePasswordItem;
# consider sourcing these the same way and rotating the values — confirm
# with the owner.
apiVersion: v1
kind: Secret
metadata:
name: snmp-auth
namespace: monitoring
type: Opaque
stringData:
# SNMP v2 community string used by prometheus scrape configs
SNMP_COMMUNITY_BLUEJAY: bluejay_monitor
# SNMPv3 user name and its auth / priv passphrases
SNMP_V3_USER: bluejay_snmpv3
SNMP_V3_AUTH_PASS: BlueJay-SNMP-Auth-2026
SNMP_V3_PRIV_PASS: BlueJay-SNMP-Priv-2026
# =============================================================================
# Grafana Credentials — synced from 1Password via Operator
# =============================================================================
# 1Password vault: IAmWorkin > "Grafana"
# Creates K8s Secret "grafana-credentials" with fields: username, password
# Grafana Deployment uses secretKeyRef to map username→GF_SECURITY_ADMIN_USER, password→GF_SECURITY_ADMIN_PASSWORD
---
apiVersion: onepassword.com/v1
kind: OnePasswordItem
metadata:
name: grafana-credentials
namespace: monitoring
spec:
itemPath: vaults/IAmWorkin/items/Grafana
# =============================================================================
# RBAC: ServiceAccount + ClusterRole for Prometheus K8s SD
# =============================================================================
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources: ["nodes", "nodes/proxy", "services", "endpoints", "pods"]
verbs: ["get", "list", "watch"]
- apiGroups: ["extensions", "networking.k8s.io"]
resources: ["ingresses"]
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: monitoring
# =============================================================================
# PVC: Prometheus Data (10Gi, Longhorn)
# =============================================================================
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: prometheus-data
namespace: monitoring
spec:
accessModes:
- ReadWriteOnce
storageClassName: longhorn
resources:
requests:
storage: 10Gi
# =============================================================================
# PVC: Grafana Data (2Gi, Longhorn)
# =============================================================================
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana-data
namespace: monitoring
spec:
accessModes:
- ReadWriteOnce
storageClassName: longhorn
resources:
requests:
storage: 2Gi
# =============================================================================
# Deployment: Prometheus
# =============================================================================
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  replicas: 1
  # Presumably Recreate so two pods never contend for the ReadWriteOnce
  # data PVC during a rollout — confirm.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      securityContext:
        fsGroup: 65534  # nobody
        runAsUser: 65534
        runAsGroup: 65534
      containers:
        - name: prometheus
          # NOTE(review): ":latest" is a floating tag, so a pod restart can
          # silently change the Prometheus version. Pin to the release
          # validated on noc1.
          image: docker.io/prom/prometheus:latest
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention.time=90d"
            - "--web.enable-lifecycle"
          ports:
            - containerPort: 9090
              name: http
          volumeMounts:
            # The whole ConfigMap is mounted as a directory rather than as
            # three per-file subPath mounts. subPath mounts never receive
            # ConfigMap updates, so a POST to /-/reload (enabled by
            # --web.enable-lifecycle) could not pick up edited rules or
            # scrape configs without a pod restart. The ConfigMap keys
            # prometheus.yml, alerts.yml and recording-rules.yml appear at
            # the same paths as before. This hides any files the image
            # ships in /etc/prometheus; none are referenced by the args.
            - name: config
              mountPath: /etc/prometheus
              readOnly: true
            - name: data
              mountPath: /prometheus
          resources:
            requests:
              cpu: 200m
              memory: 512Mi
            limits:
              cpu: "1"
              memory: 2Gi
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 15
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: config
          configMap:
            name: prometheus-config
        - name: data
          persistentVolumeClaim:
            claimName: prometheus-data
# =============================================================================
# ConfigMap: Grafana Dashboard Provider
# =============================================================================
---
# Grafana dashboard provisioning: one file-based provider that loads
# dashboards from /var/lib/grafana/dashboards and re-scans every 30 seconds.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: monitoring
  name: grafana-dashboard-provider
data:
  default.yml: |
    apiVersion: 1
    providers:
      - name: "default"
        type: file
        orgId: 1
        folder: ""
        disableDeletion: false
        updateIntervalSeconds: 30
        options:
          path: /var/lib/grafana/dashboards
          foldersFromFilesStructure: true
# =============================================================================
# ConfigMap: Grafana Dashboards (AI Stack Health)
# =============================================================================
---
# Grafana dashboard "AI Stack Health" (uid: ai-stack-health).
# Panels 1-4: UP/DOWN stats from probe_success per probe job.
# Panels 5-6: probe_duration_seconds per service.
# Panel 7: probe_success history. Panel 8: edge1 CPU/memory/disk.
# Panel 9: probe DNS lookup time.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: monitoring
  name: grafana-dashboards
data:
  ai-stack-health.json: |
    {
      "id": null,
      "uid": "ai-stack-health",
      "title": "AI Stack Health",
      "tags": ["ai", "ollama", "agent-zero", "blue-jay"],
      "timezone": "browser",
      "time": { "from": "now-1h", "to": "now" },
      "refresh": "30s",
      "schemaVersion": 39,
      "version": 1,
      "panels": [
        {
          "id": 1,
          "title": "Ollama (Local)",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-ollama-local\"}",
              "legendFormat": "Status"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  }
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          }
        },
        {
          "id": 2,
          "title": "Ollama (Edge1)",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-ollama-edge1\"}",
              "legendFormat": "Status"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  }
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          }
        },
        {
          "id": 3,
          "title": "Agent Zero (Local)",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-agentzero-local\"}",
              "legendFormat": "Status"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  }
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          }
        },
        {
          "id": 4,
          "title": "Agent Zero (NUC)",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-agentzero-nuc\"}",
              "legendFormat": "Status"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  }
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          }
        },
        {
          "id": 5,
          "title": "Ollama Response Time",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_duration_seconds{service=\"ollama\"}",
              "legendFormat": "{{ deployment }}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "s",
              "custom": { "fillOpacity": 15, "lineWidth": 2 },
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 1 },
                  { "color": "red", "value": 3 }
                ]
              }
            }
          }
        },
        {
          "id": 6,
          "title": "Agent Zero Response Time",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_duration_seconds{service=\"agent-zero\"}",
              "legendFormat": "{{ deployment }}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "s",
              "custom": { "fillOpacity": 15, "lineWidth": 2 },
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 1 },
                  { "color": "red", "value": 3 }
                ]
              }
            }
          }
        },
        {
          "id": 7,
          "title": "Uptime History",
          "type": "timeseries",
          "gridPos": { "h": 6, "w": 24, "x": 0, "y": 12 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{service=\"ollama\"}",
              "legendFormat": "Ollama ({{ deployment }})"
            },
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{service=\"agent-zero\"}",
              "legendFormat": "Agent Zero ({{ deployment }})"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "min": 0,
              "max": 1,
              "custom": { "fillOpacity": 30, "lineWidth": 1, "stacking": { "mode": "none" } },
              "mappings": [
                {
                  "type": "value",
                  "options": {
                    "0": { "text": "DOWN" },
                    "1": { "text": "UP" }
                  }
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          }
        },
        {
          "id": 8,
          "title": "Edge1 Pi 5 \u2014 CPU / Memory / Disk",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "100 - (avg(rate(node_cpu_seconds_total{instance=\"edge1\",mode=\"idle\"}[5m])) * 100)",
              "legendFormat": "CPU %"
            },
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "(1 - node_memory_MemAvailable_bytes{instance=\"edge1\"} / node_memory_MemTotal_bytes{instance=\"edge1\"}) * 100",
              "legendFormat": "Memory %"
            },
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "(1 - node_filesystem_avail_bytes{instance=\"edge1\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=\"edge1\",mountpoint=\"/\"}) * 100",
              "legendFormat": "Disk %"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "min": 0,
              "max": 100,
              "custom": { "fillOpacity": 10, "lineWidth": 2 },
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 75 },
                  { "color": "red", "value": 90 }
                ]
              }
            }
          }
        },
        {
          "id": 9,
          "title": "Probe DNS Lookup Time",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_dns_lookup_time_seconds",
              "legendFormat": "{{ job }}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "s",
              "custom": { "fillOpacity": 10, "lineWidth": 2 }
            }
          }
        }
      ]
    }
# =============================================================================
# ConfigMap: Grafana Dashboard — Edge Nodes
# =============================================================================
---
# Grafana dashboard "BlueJay Edge Nodes": status, CPU/load, memory, disk,
# temperature and network panels for instances matching "edge.*".
# The two CPU panels plot a CPU percentage and node_load1 together; the
# "Load 1m" series carries a unit override so the load average is not
# rendered with the panel's default "percent" unit.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-edge-nodes
  namespace: monitoring
data:
  bluejay-edge-nodes.json: |
    {
      "id": null,
      "panels": [
        {
          "title": "Edge Node Status",
          "type": "stat",
          "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            { "expr": "up{instance=~\"edge.*\"}", "legendFormat": "{{instance}}", "refId": "A" }
          ],
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  }
                }
              ]
            }
          }
        },
        {
          "title": "edge1 (Pi5 + Hailo) CPU",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge1.*\"}[5m])) * 100)",
              "legendFormat": "CPU %",
              "refId": "A"
            },
            {
              "expr": "node_load1{instance=~\"edge1.*\"}",
              "legendFormat": "Load 1m",
              "refId": "B"
            }
          ],
          "fieldConfig": {
            "defaults": { "max": 100, "min": 0, "unit": "percent" },
            "overrides": [
              {
                "matcher": { "id": "byName", "options": "Load 1m" },
                "properties": [{ "id": "unit", "value": "short" }]
              }
            ]
          }
        },
        {
          "title": "edge2 (Pi4) CPU",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge2.*\"}[5m])) * 100)",
              "legendFormat": "CPU %",
              "refId": "A"
            },
            {
              "expr": "node_load1{instance=~\"edge2.*\"}",
              "legendFormat": "Load 1m",
              "refId": "B"
            }
          ],
          "fieldConfig": {
            "defaults": { "max": 100, "min": 0, "unit": "percent" },
            "overrides": [
              {
                "matcher": { "id": "byName", "options": "Load 1m" },
                "properties": [{ "id": "unit", "value": "short" }]
              }
            ]
          }
        },
        {
          "title": "Edge Memory Usage",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "(1 - node_memory_MemAvailable_bytes{instance=~\"edge.*\"} / node_memory_MemTotal_bytes{instance=~\"edge.*\"}) * 100",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": { "max": 100, "min": 0, "unit": "percent" }
          }
        },
        {
          "title": "Edge Disk Usage",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "100 - (node_filesystem_avail_bytes{instance=~\"edge.*\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=~\"edge.*\",mountpoint=\"/\"} * 100)",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": { "max": 100, "min": 0, "unit": "percent" }
          }
        },
        {
          "title": "Edge CPU Temperature",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "node_hwmon_temp_celsius{instance=~\"edge.*\"}",
              "legendFormat": "{{instance}} {{chip}} {{sensor}}",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "celsius" }
          }
        },
        {
          "title": "Edge Network Traffic",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "rate(node_network_receive_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8",
              "legendFormat": "{{instance}} {{device}} RX",
              "refId": "A"
            },
            {
              "expr": "-rate(node_network_transmit_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8",
              "legendFormat": "{{instance}} {{device}} TX",
              "refId": "B"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "bps" }
          }
        }
      ],
      "refresh": "30s",
      "schemaVersion": 40,
      "tags": ["bluejay", "edge"],
      "timezone": "browser",
      "title": "BlueJay Edge Nodes",
      "uid": "7e1603b9-e918-4b3f-a22b-163132fd5cee"
    }
# =============================================================================
# ConfigMap: Grafana Dashboard — Network Overview
# =============================================================================
---
# Grafana dashboard "BlueJay Network Overview": target health counts,
# pfSense load/memory gauges (instance 10.0.56.1), noc1 CPU, node memory/disk,
# network traffic and a table of all `up` series.
# The Target Health counts use "or vector(0)" because count() over an empty
# match returns no series at all; without it "Down" disappears when every
# target is healthy. "Down" turns red from 1 via a threshold override.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-network-overview
  namespace: monitoring
data:
  bluejay-network-overview.json: |
    {
      "id": null,
      "panels": [
        {
          "title": "Target Health",
          "type": "stat",
          "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            { "expr": "count(up == 1) or vector(0)", "legendFormat": "Up", "refId": "A" },
            { "expr": "count(up == 0) or vector(0)", "legendFormat": "Down", "refId": "B" }
          ],
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "steps": [{ "color": "green", "value": null }]
              }
            },
            "overrides": [
              {
                "matcher": { "id": "byName", "options": "Down" },
                "properties": [
                  {
                    "id": "thresholds",
                    "value": {
                      "mode": "absolute",
                      "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }]
                    }
                  }
                ]
              }
            ]
          }
        },
        {
          "title": "pfSense CPU Load (1m)",
          "type": "gauge",
          "gridPos": { "h": 6, "w": 6, "x": 0, "y": 4 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            { "expr": "laLoad{instance=\"10.0.56.1\", laNames=\"Load-1\"}", "refId": "A" }
          ],
          "fieldConfig": {
            "defaults": {
              "max": 4,
              "min": 0,
              "thresholds": {
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 2 },
                  { "color": "red", "value": 3 }
                ]
              }
            }
          }
        },
        {
          "title": "pfSense Memory Used %",
          "type": "gauge",
          "gridPos": { "h": 6, "w": 6, "x": 6, "y": 4 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "100 - (memAvailReal{instance=\"10.0.56.1\"} / memTotalReal{instance=\"10.0.56.1\"} * 100)",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "max": 100,
              "min": 0,
              "thresholds": {
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            }
          }
        },
        {
          "title": "noc1 CPU Usage",
          "type": "timeseries",
          "gridPos": { "h": 6, "w": 12, "x": 12, "y": 4 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=\"noc1\"}[5m])) * 100)",
              "legendFormat": "CPU %",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": { "max": 100, "min": 0, "unit": "percent" }
          }
        },
        {
          "title": "Node Memory Usage",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": { "max": 100, "min": 0, "unit": "percent" }
          }
        },
        {
          "title": "Node Disk Usage %",
          "type": "bargauge",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "max": 100,
              "min": 0,
              "thresholds": {
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            }
          }
        },
        {
          "title": "Network Traffic",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "rate(node_network_receive_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8",
              "legendFormat": "{{instance}} {{device}} RX",
              "refId": "A"
            },
            {
              "expr": "-rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8",
              "legendFormat": "{{instance}} {{device}} TX",
              "refId": "B"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "bps" }
          }
        },
        {
          "title": "Prometheus Targets",
          "type": "table",
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 26 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            { "expr": "up", "format": "table", "instant": true, "refId": "A" }
          ]
        }
      ],
      "refresh": "30s",
      "schemaVersion": 40,
      "tags": ["bluejay", "network"],
      "timezone": "browser",
      "title": "BlueJay Network Overview",
      "uid": "b83a122c-6eb5-4fae-a632-77cbf753ad05"
    }
# =============================================================================
# ConfigMap: Grafana Dashboard — Operations
# =============================================================================
---
# Grafana dashboard "BlueJay Operations": three rows (Infrastructure Overview,
# Kubernetes & Services, Network & SNMP) mixing Prometheus panels with one
# Zabbix triggers panel.
# The Up/Down counts use "or vector(0)" because count() over an empty match
# returns no series; without it the "Down" value vanishes when all targets
# are healthy. "Down" turns red from 1 via a threshold override.
# NOTE(review): the Zabbix panel references datasource uid "bffjila3zkdfka"
# and the alexanderzobnin-zabbix plugin; neither is provisioned in the part
# of this file reviewed here. Confirm both exist in the target Grafana.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-operations
  namespace: monitoring
data:
  bluejay-operations.json: |
    {
      "annotations": { "list": [] },
      "id": null,
      "panels": [
        {
          "title": "Infrastructure Overview",
          "type": "row",
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
        },
        {
          "title": "All Targets Up/Down",
          "type": "stat",
          "gridPos": { "h": 4, "w": 8, "x": 0, "y": 1 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            { "expr": "count(up == 1) or vector(0)", "legendFormat": "Up", "refId": "A" },
            { "expr": "count(up == 0) or vector(0)", "legendFormat": "Down", "refId": "B" }
          ],
          "fieldConfig": {
            "defaults": {
              "noValue": "0",
              "thresholds": {
                "steps": [{ "color": "green", "value": null }]
              }
            },
            "overrides": [
              {
                "matcher": { "id": "byName", "options": "Down" },
                "properties": [
                  {
                    "id": "thresholds",
                    "value": {
                      "mode": "absolute",
                      "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }]
                    }
                  }
                ]
              }
            ]
          }
        },
        {
          "title": "Zabbix Active Problems",
          "type": "alexanderzobnin-zabbix-triggers-panel",
          "gridPos": { "h": 8, "w": 16, "x": 8, "y": 1 },
          "datasource": { "type": "alexanderzobnin-zabbix-datasource", "uid": "bffjila3zkdfka" },
          "targets": [
            {
              "application": { "filter": "" },
              "group": { "filter": "/.*/" },
              "host": { "filter": "/.*/" },
              "queryType": 5,
              "refId": "A",
              "trigger": { "filter": "/.*/" }
            }
          ]
        },
        {
          "title": "noc1 Load Average",
          "type": "timeseries",
          "gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            { "expr": "node_load1{instance=\"noc1\"}", "legendFormat": "1m", "refId": "A" },
            { "expr": "node_load5{instance=\"noc1\"}", "legendFormat": "5m", "refId": "B" },
            { "expr": "node_load15{instance=\"noc1\"}", "legendFormat": "15m", "refId": "C" }
          ]
        },
        {
          "title": "Kubernetes & Services",
          "type": "row",
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }
        },
        {
          "title": "K8s Services Uptime (Prometheus Targets)",
          "type": "table",
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            { "expr": "up", "format": "table", "instant": true, "refId": "A" }
          ],
          "fieldConfig": {
            "defaults": {},
            "overrides": [
              {
                "matcher": { "id": "byName", "options": "Value" },
                "properties": [
                  {
                    "id": "mappings",
                    "value": [
                      {
                        "type": "value",
                        "options": {
                          "0": { "color": "red", "text": "DOWN" },
                          "1": { "color": "green", "text": "UP" }
                        }
                      }
                    ]
                  }
                ]
              }
            ]
          }
        },
        {
          "title": "Network & SNMP",
          "type": "row",
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }
        },
        {
          "title": "pfSense WAN Traffic",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8",
              "legendFormat": "WAN In",
              "refId": "A"
            },
            {
              "expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8",
              "legendFormat": "WAN Out",
              "refId": "B"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "bps" }
          }
        },
        {
          "title": "pfSense LAN Traffic",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8",
              "legendFormat": "{{ifAlias}} In",
              "refId": "A"
            },
            {
              "expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8",
              "legendFormat": "{{ifAlias}} Out",
              "refId": "B"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "bps" }
          }
        },
        {
          "title": "All Nodes Memory",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": { "max": 100, "min": 0, "unit": "percent" }
          }
        },
        {
          "title": "All Nodes Disk",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": { "max": 100, "min": 0, "unit": "percent" }
          }
        }
      ],
      "refresh": "1m",
      "schemaVersion": 40,
      "tags": ["bluejay", "operations", "zabbix"],
      "timezone": "browser",
      "title": "BlueJay Operations",
      "uid": "a781c4e4-c3fe-4ac7-be93-21363a41b97d"
    }
# =============================================================================
# ConfigMap: Grafana Dashboard — Epson Printer
# =============================================================================
---
# Grafana dashboard "Epson ET-3750 EcoTank Printer" (uid: epson-ecotank).
# All panels query series with job="snmp-printer": ink level gauge and
# history, lifetime page count, printer name, critical events, serial number.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: monitoring
  name: grafana-dashboard-printer
data:
  epson-ecotank-printer.json: |
    {
      "id": null,
      "uid": "epson-ecotank",
      "title": "Epson ET-3750 EcoTank Printer",
      "tags": ["printer", "snmp", "bluejay"],
      "timezone": "browser",
      "time": { "from": "now-24h", "to": "now" },
      "refresh": "5m",
      "schemaVersion": 39,
      "panels": [
        {
          "id": 1,
          "title": "Ink Levels",
          "type": "gauge",
          "gridPos": { "h": 10, "w": 12, "x": 0, "y": 0 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}",
              "legendFormat": "{{prtMarkerSuppliesDescription}}",
              "refId": "A"
            }
          ],
          "options": {
            "orientation": "horizontal",
            "reduceOptions": { "calcs": ["lastNotNull"] },
            "showThresholdLabels": false,
            "showThresholdMarkers": true
          },
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "min": 0,
              "max": 100,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "orange", "value": 10 },
                  { "color": "yellow", "value": 20 },
                  { "color": "green", "value": 40 }
                ]
              }
            },
            "overrides": [
              {
                "matcher": { "id": "byName", "options": "Black Ink Bottle" },
                "properties": [{ "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } }]
              },
              {
                "matcher": { "id": "byName", "options": "Cyan Ink Bottle" },
                "properties": [{ "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } }]
              },
              {
                "matcher": { "id": "byName", "options": "Magenta Ink Bottle" },
                "properties": [{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }]
              },
              {
                "matcher": { "id": "byName", "options": "Yellow Ink Bottle" },
                "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
              }
            ]
          }
        },
        {
          "id": 2,
          "title": "Ink Level History",
          "type": "timeseries",
          "gridPos": { "h": 10, "w": 12, "x": 12, "y": 0 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}",
              "legendFormat": "{{prtMarkerSuppliesDescription}}",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "min": 0,
              "max": 100,
              "custom": { "fillOpacity": 20, "lineWidth": 2, "spanNulls": true }
            },
            "overrides": [
              {
                "matcher": { "id": "byName", "options": "Black Ink Bottle" },
                "properties": [{ "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } }]
              },
              {
                "matcher": { "id": "byName", "options": "Cyan Ink Bottle" },
                "properties": [{ "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } }]
              },
              {
                "matcher": { "id": "byName", "options": "Magenta Ink Bottle" },
                "properties": [{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }]
              },
              {
                "matcher": { "id": "byName", "options": "Yellow Ink Bottle" },
                "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
              }
            ]
          }
        },
        {
          "id": 3,
          "title": "Lifetime Page Count",
          "type": "stat",
          "gridPos": { "h": 5, "w": 6, "x": 0, "y": 10 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "prtMarkerLifeCount{job=\"snmp-printer\"}",
              "legendFormat": "Pages",
              "refId": "A"
            }
          ],
          "options": {
            "colorMode": "background",
            "reduceOptions": { "calcs": ["lastNotNull"] },
            "textMode": "value_and_name"
          },
          "fieldConfig": {
            "defaults": {
              "unit": "short",
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 10000 },
                  { "color": "red", "value": 50000 }
                ]
              }
            }
          }
        },
        {
          "id": 4,
          "title": "Printer Model",
          "type": "stat",
          "gridPos": { "h": 5, "w": 6, "x": 6, "y": 10 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "prtGeneralPrinterName{job=\"snmp-printer\"}",
              "legendFormat": "{{prtGeneralPrinterName}}",
              "refId": "A"
            }
          ],
          "options": {
            "colorMode": "background",
            "reduceOptions": { "calcs": ["lastNotNull"] },
            "textMode": "name"
          },
          "fieldConfig": {
            "defaults": {
              "mappings": [
                { "type": "value", "options": { "1": { "text": "Online" } } }
              ],
              "thresholds": {
                "steps": [{ "color": "blue", "value": null }]
              }
            }
          }
        },
        {
          "id": 5,
          "title": "Critical Events",
          "type": "stat",
          "gridPos": { "h": 5, "w": 6, "x": 12, "y": 10 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "prtAlertCriticalEvents{job=\"snmp-printer\"}",
              "legendFormat": "Critical Alerts",
              "refId": "A"
            }
          ],
          "options": {
            "colorMode": "background",
            "reduceOptions": { "calcs": ["lastNotNull"] }
          },
          "fieldConfig": {
            "defaults": {
              "unit": "short",
              "thresholds": {
                "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }]
              }
            }
          }
        },
        {
          "id": 6,
          "title": "Serial Number",
          "type": "stat",
          "gridPos": { "h": 5, "w": 6, "x": 18, "y": 10 },
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "targets": [
            {
              "expr": "prtGeneralSerialNumber{job=\"snmp-printer\"}",
              "legendFormat": "{{prtGeneralSerialNumber}}",
              "refId": "A"
            }
          ],
          "options": {
            "colorMode": "background",
            "reduceOptions": { "calcs": ["lastNotNull"] },
            "textMode": "name"
          },
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "steps": [{ "color": "blue", "value": null }]
              }
            }
          }
        }
      ]
    }
# =============================================================================
# ConfigMap: Grafana Dashboard — Infrastructure Overview
# =============================================================================
---
# Grafana dashboard "Infrastructure Overview" (uid: infra-overview) with rows
# AI Stack, K8s Cluster, Network, Services and Alerting.
# - "Nodes Up" appends "or vector(0)" so an empty count renders 0 instead of
#   "No data", and is red at 0 / green from 1 like the other UP/DOWN panels.
# - The target table sorts by "Up", the name the organize transformation
#   gives the Value column.
# NOTE(review): "Nodes Up" sits in the K8s Cluster row but selects
# job="node-exporter", which the scrape config at the top of this file shows
# scraping only noc1. Confirm which job(s) this panel is meant to count.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-infra-overview
  namespace: monitoring
data:
  infra-overview.json: |
    {
      "id": null,
      "panels": [
        {
          "id": 100,
          "title": "AI Stack",
          "type": "row",
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
        },
        {
          "id": 1,
          "title": "Ollama (Local)",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-ollama-local\"}",
              "legendFormat": "Status"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  }
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          }
        },
        {
          "id": 2,
          "title": "Ollama (Edge1)",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-ollama-edge1\"}",
              "legendFormat": "Status"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  }
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          }
        },
        {
          "id": 3,
          "title": "Agent Zero (Local)",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-agentzero-local\"}",
              "legendFormat": "Status"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  }
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          }
        },
        {
          "id": 4,
          "title": "Agent Zero (NUC)",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-agentzero-nuc\"}",
              "legendFormat": "Status"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  }
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          }
        },
        {
          "id": 101,
          "title": "K8s Cluster",
          "type": "row",
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }
        },
        {
          "id": 5,
          "title": "Nodes Up (node-exporter)",
          "type": "stat",
          "gridPos": { "h": 6, "w": 8, "x": 0, "y": 6 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "count(up{job=\"node-exporter\"} == 1) or vector(0)",
              "legendFormat": "Nodes Up"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          }
        },
        {
          "id": 6,
          "title": "Node CPU Usage %",
          "type": "timeseries",
          "gridPos": { "h": 6, "w": 8, "x": 8, "y": 6 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) by (instance) * 100)",
              "legendFormat": "{{ instance }}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 15, "lineWidth": 2 },
              "max": 100,
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            }
          }
        },
        {
          "id": 7,
          "title": "Node Memory Usage %",
          "type": "timeseries",
          "gridPos": { "h": 6, "w": 8, "x": 16, "y": 6 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
              "legendFormat": "{{ instance }}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 15, "lineWidth": 2 },
              "max": 100,
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            }
          }
        },
        {
          "id": 102,
          "title": "Network",
          "type": "row",
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 }
        },
        {
          "id": 8,
          "title": "pfSense WAN Bandwidth",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "rate(ifHCInOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])",
              "legendFormat": "WAN In"
            },
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "rate(ifHCOutOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])",
              "legendFormat": "WAN Out"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 10, "lineWidth": 2 },
              "unit": "Bps"
            }
          }
        },
        {
          "id": 9,
          "title": "Target Health (up)",
          "type": "table",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "up",
              "format": "table",
              "instant": true,
              "legendFormat": ""
            }
          ],
          "transformations": [
            {
              "id": "organize",
              "options": {
                "excludeByName": { "Time": true, "__name__": true },
                "renameByName": { "job": "Job", "instance": "Instance", "Value": "Up" }
              }
            }
          ],
          "options": {
            "showHeader": true,
            "sortBy": [{ "displayName": "Up", "desc": false }]
          },
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            },
            "overrides": []
          }
        },
        {
          "id": 103,
          "title": "Services",
          "type": "row",
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }
        },
        {
          "id": 10,
          "title": "ArgoCD App Status",
          "type": "text",
          "gridPos": { "h": 4, "w": 24, "x": 0, "y": 22 },
          "options": {
            "content": "### ArgoCD Applications\n\nView sync status and health of all 11 ArgoCD-managed applications:\n\n[Open ArgoCD Dashboard](https://argocd.iamworkin.lan)\n\n| App | Expected Status |\n|-----|----------------|\n| gitea | Healthy / Synced |\n| zabbix | Healthy / Synced |\n| guacamole | Healthy / Synced |\n| irc | Healthy / Synced |\n| mail | Healthy / Synced |\n| matrix | Healthy / Synced |\n| teamspeak | Healthy / Synced |\n| pki-web | Healthy / Synced |\n| intranet | Healthy / Synced |\n| telephony | Healthy / Synced |\n| infra-noc-services | Healthy / Synced |",
            "mode": "markdown"
          }
        },
        {
          "id": 104,
          "title": "Alerting",
          "type": "row",
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }
        },
        {
          "id": 11,
          "title": "Firing Alerts",
          "type": "stat",
          "gridPos": { "h": 6, "w": 24, "x": 0, "y": 27 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "count(ALERTS{alertstate=\"firing\"}) or vector(0)",
              "legendFormat": "Firing Alerts"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "orange", "value": 1 },
                  { "color": "red", "value": 3 }
                ]
              }
            }
          }
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "tags": ["infrastructure", "blue-jay", "overview"],
      "time": { "from": "now-1h", "to": "now" },
      "timezone": "browser",
      "title": "Infrastructure Overview",
      "uid": "infra-overview",
      "version": 1
    }
# =============================================================================
# ConfigMap: Grafana Datasource Provisioning
# =============================================================================
# Registers the in-cluster Prometheus Service as Grafana's default datasource.
# The uid is pinned to "prometheus" because the dashboards and the provisioned
# alert rules in this file address the datasource by that uid
# ("uid": "prometheus" / datasourceUid: prometheus). Without an explicit uid
# Grafana assigns one itself and those references do not resolve.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasource-provisioning
  namespace: monitoring
data:
  datasource.yml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        uid: prometheus
        access: proxy
        url: http://prometheus.monitoring.svc:9090
        isDefault: true
        editable: true
# =============================================================================
# ConfigMap: Grafana Alerting Provisioning (contact points + policies + rules)
# =============================================================================
# Makes alert rules declarative — survives pod rebuilds without API recreation
#
# Every rule uses the same three-step pipeline:
#   A = instant Prometheus query, B = reduce(last) over A, C = threshold on B.
# C is the alert condition.
#
# Review notes:
#   - "name: IRC #alerts" and "receiver: IRC #alerts" are plain scalars, so the
#     YAML parser treats " #alerts" as a comment and the effective contact
#     point name is "IRC". Both sides agree, so routing works; the text is left
#     untouched to avoid renaming an already-provisioned contact point.
#   - NOTE(review): Grafana provisioning files support $VAR environment
#     expansion. Confirm that "{{ $labels.deployment }}" reaches the rule
#     unmodified; if it does not, it has to be escaped as "$$labels".
#   - Both contact points post to the same irc-notify URL; presumably the
#     relay fans out by label — verify against the notify script.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting-provisioning
  namespace: monitoring
data:
  alerting.yml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: IRC #alerts
        receivers:
          - uid: irc-alerts-webhook
            type: webhook
            settings:
              url: http://irc-notify.monitoring.svc:9119
              httpMethod: POST
            disableResolveMessage: false
      - orgId: 1
        name: Thermal Printer
        receivers:
          - uid: thermal-print-001
            type: webhook
            settings:
              url: http://irc-notify.monitoring.svc:9119
              httpMethod: POST
            disableResolveMessage: true
    policies:
      - orgId: 1
        receiver: IRC #alerts
        group_by: ['alertname']
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 1h
        routes:
          - receiver: Thermal Printer
            matchers: ['alert_channel = thermal_print']
            group_wait: 1m
            group_interval: 10m
            repeat_interval: 4h
            continue: true
    groups:
      - orgId: 1
        name: AI Stack
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: ollama-down-local
            title: Ollama DOWN (Local)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Ollama DOWN on workstation (R9700)
              description: Agent Zero LOCAL cannot reach Ollama. FAISS memory will fail.
              runbook: "1. Check PC power/wake 2. ssh stoltz@10.0.58.100 3. systemctl status ollama 4. journalctl -u ollama --since 5min"
            labels:
              severity: warning
              service: ollama
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-ollama-local"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: ollama-down-edge1
            title: Ollama DOWN (Edge1)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Ollama DOWN on edge1 Pi 5
              description: Agent Zero NUC cannot reach Ollama.
              runbook: "1. ssh stoltz@10.0.57.17 2. systemctl status ollama 3. journalctl -u ollama --since 5min 4. cat /sys/class/thermal/thermal_zone0/temp"
            labels:
              severity: warning
              service: ollama
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-ollama-edge1"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: a0-down-local
            title: Agent Zero DOWN (Local)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero LOCAL DOWN
              description: K3s web UI unreachable.
              runbook: "1. ssh stoltz@10.0.58.100 2. docker ps | grep agent-zero 3. docker logs agent-zero --tail 20 4. Check Ollama first (A0 crashes if Ollama down)"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-agentzero-local"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: a0-down-nuc
            title: Agent Zero DOWN (NUC)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero NUC DOWN
              description: RKE2 web UI unreachable.
              runbook: "1. ssh root@10.0.56.200 via noc1 2. kubectl get pods -n agent-zero 3. kubectl logs -n agent-zero deploy/agent-zero --tail 20"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-agentzero-nuc"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: print-ollama-runner-long-keepalive
            title: Print.Web Ollama runner keep-alive >10m
            condition: C
            for: 2m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: Print.Web Ollama runner held too long
              description: "A Print.Web Ollama runner has more than 10 minutes of keep-alive remaining. Check active AI requests before the Pi 5 Ollama lane thrashes."
              runbook: "1. Open https://print.iamworkin.lan/admin 2. Check Ollama Fleet model + remaining keep-alive 3. Query Prometheus: ai_ollama_runner_keepalive_remaining_seconds{job=\"printweb-otel\"} 4. Stop duplicate model callers before restarting Ollama."
            labels:
              severity: warning
              service: print-web
              alert_channel: thermal_print
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max by(model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
      - orgId: 1
        name: CI Runners
        folder: CI Alerts
        interval: 1m
        rules:
          - uid: linux-runner-offline
            title: LinuxRunnerOffline
            condition: C
            for: 5m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Linux CI runner offline: {{ $labels.deployment }}"
              description: "A github-runner namespace Deployment has 0 ready replicas for more than 5 minutes. CI jobs targeting that repo will queue until the runner pod restarts and re-registers."
              runbook: "1. kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }} 2. kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50 3. Verify PAT repo access if registration returns 404 4. Verify no RWO PVC is shared by scaled runners"
            labels:
              severity: warning
              service: github-runner
              alert_channel: irc
              team: ci
            data:
              # "== bool 0" yields 1 for a Deployment with zero ready replicas
              # and 0 otherwise. A plain "== 0" only filters, returning the
              # sample value 0, which the "gt 0" threshold below never matches.
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == bool 0', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
      - orgId: 1
        name: Infrastructure
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: node-down
            title: Node DOWN
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Node down
              description: Node exporter unreachable for 2 minutes. Host may be down or network issue.
              runbook: "1. Ping host IP 2. SSH via noc1 jumpbox (root@10.0.56.10) 3. systemctl status node_exporter 4. Check network/VLAN 5. Physical power check if unreachable"
            labels:
              severity: critical
              service: infrastructure
              alert_channel: thermal_print
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'up{job=~"node-exporter|rke2-nodes|edge-nodes"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: macmini-runner-offline
            title: MacMiniRunnerOffline
            condition: C
            for: 10m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Mac mini GitHub runner offline
              description: "One or more macmini-* GitHub Actions runners have not reported online for more than 10 minutes. LaunchDaemons survive reboot and do not require the bluejay GUI session."
              runbook: "1. ssh fcadmin@macmini.iamworkin.lan 2. launchctl print system/io.flowercore.github-runner-<slug> 3. Check /Users/fcadmin/Library/Logs/github-runners/<slug>/stderr.log 4. Re-register the repo runner if .runner is missing"
            labels:
              severity: warning
              service: github-runner
            data:
              # The vector(0) fallback must sit outside min(): inside it, the
              # label-less 0 sample never matches the runner series, is always
              # unioned in, and forces the minimum to 0 (rule always firing).
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'min(flowercore_github_runner_online{runner=~"macmini-.*"}) or vector(0)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: high-cpu
            title: High CPU (>85%)
            condition: C
            for: 10m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: High CPU
              description: CPU above 85% for 10 minutes. Performance degradation likely.
              runbook: "1. SSH to host 2. top -b -n1 | head -20 3. Check for runaway process 4. kubectl top pods (if K8s node)"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_cpu_usage:avg5m', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
          - uid: high-memory
            title: High Memory (>90%)
            condition: C
            for: 5m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: High memory usage
              description: Memory above 90% for 5 minutes. OOM kills imminent.
              runbook: "1. SSH to host 2. free -h 3. ps aux --sort=-%mem | head 10 4. Check for memory leak (growing RSS)"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_memory_usage:percent', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [90], type: gt}}], refId: C}
          - uid: disk-low
            title: Disk Space Low (>85%)
            condition: C
            for: 10m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: Disk usage high
              description: Root disk above 85% for 10 minutes. Service disruption if full.
              runbook: "1. SSH to host 2. df -h / 3. du -sh /* | sort -rh | head 4. journalctl --vacuum-size=100M 5. podman system prune"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_disk_usage:percent', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
      - orgId: 1
        name: RemoteDesktop
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: remotedesktop-web-down
            title: RemoteDesktop Web DOWN
            condition: C
            for: 3m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: FlowerCore RemoteDesktop /health probe failing
              description: "https://desktop.iamworkin.lan/health has failed for 3 minutes. Catalog + session launch surface offline."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remotedesktop-web 2. kubectl -n fc-desktop logs deploy/remotedesktop-web --tail=50 3. Check Traefik IngressRoute + step-ca cert 4. Rollout restart if pod is stuck"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-remotedesktop"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: remotedesktop-metrics-stale
            title: RemoteDesktop metrics stale
            condition: C
            for: 10m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: RemoteDesktop /metrics returning no series
              description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape is misconfigured or the web deployment stopped exporting metrics. Cross-checked by Zabbix template's identical 10m no-data trigger."
              runbook: "1. curl -sk https://desktop.iamworkin.lan/metrics | head 2. kubectl -n monitoring exec deploy/prometheus -- wget -qO- localhost:9090/api/v1/targets?scrapePool=fc-remotedesktop 3. Check monitoring-netpol egress allows to fc-desktop:8080"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'count(fc_desktop_session_events_total) or vector(0)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: remotedesktop-pool-depleted
            title: RemoteDesktop pool depleted
            condition: C
            for: 5m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop warm pool depleted for 5m
              description: "A RemoteDesktop warm pool has fc_desktop_pool_depleted=1 for 5 minutes. New launches will cold-start. Check pod scheduling, image pull, node capacity."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop --sort-by=.status.startTime 2. kubectl -n fc-desktop describe desktoppool <name> 3. Verify localhost/fc-desktop:* images imported on all 3 RKE2 nodes"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max(fc_desktop_pool_depleted)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.5], type: gt}}], refId: C}
          - uid: remotedesktop-pool-deficit-sustained
            title: RemoteDesktop pool below desired
            condition: C
            for: 10m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop pool sustained deficit
              description: "A pool has fc_desktop_pool_deficit>0 for 10 minutes. Operator is reconciling but can't reach desired size — likely image pull, NFS affinity, or claim-init issue."
              runbook: "1. kubectl -n fc-desktop get pods -l flowercore.io/pool=<pool> 2. kubectl logs -n fc-desktop deploy/remotedesktop-operator 3. Check claim-init hook env on template"
            labels:
              severity: info
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max(fc_desktop_pool_deficit)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
          - uid: remotedesktop-session-churn-spike
            title: RemoteDesktop launch rate spike
            condition: C
            for: 5m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop launch rate exceeds 20/min
              description: "Launch events >20/min for 5 minutes. Could be a user-facing feature launch, pooled template thrashing, or runaway automation loop."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop -o wide | wc -l 2. curl -sk https://desktop.iamworkin.lan/api/sessions/active 3. Check operator logs for reconcile loops"
            labels:
              severity: info
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model: {expr: 'sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [20], type: gt}}], refId: C}
          - uid: remotedesktop-tls-expiry
            title: RemoteDesktop TLS cert expiring
            condition: C
            for: 6h
            noDataState: OK
            execErrState: OK
            annotations:
              summary: desktop.iamworkin.lan cert <2d to expiry
              description: "The desktop.iamworkin.lan certificate is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, FlowerCore.DNS preflight for dnsNames."
              runbook: "1. kubectl -n fc-desktop get certificate remotedesktop-web-tls 2. kubectl -n cert-manager logs deploy/cert-manager --tail=50 3. Verify pfSense DNS override for desktop.iamworkin.lan"
            labels:
              severity: critical
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: prometheus
                model: {expr: '(probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time()) / 86400', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [2], type: lt}}], refId: C}
# =============================================================================
# Deployment: Grafana
# =============================================================================
# Single-replica Grafana. State lives on the grafana-data PVC; dashboards,
# the datasource and alerting are provisioned from read-only ConfigMap mounts.
# NOTE(review): the image uses the floating "latest" tag — consider pinning.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
  labels: {app: grafana}
spec:
  replicas: 1
  strategy: {type: Recreate}
  selector:
    matchLabels: {app: grafana}
  template:
    metadata:
      labels: {app: grafana}
    spec:
      securityContext:
        runAsUser: 472
        runAsGroup: 472
        fsGroup: 472  # grafana group
      volumes:
        - name: data
          persistentVolumeClaim: {claimName: grafana-data}
        - name: dashboard-provider
          configMap: {name: grafana-dashboard-provider}
        - name: dashboards-ai-stack
          configMap: {name: grafana-dashboards}
        - name: dashboards-edge-nodes
          configMap: {name: grafana-dashboard-edge-nodes}
        - name: dashboards-network
          configMap: {name: grafana-dashboard-network-overview}
        - name: dashboards-operations
          configMap: {name: grafana-dashboard-operations}
        - name: dashboards-printer
          configMap: {name: grafana-dashboard-printer}
        - name: dashboards-infra-overview
          configMap: {name: grafana-dashboard-infra-overview}
        - name: dashboards-remotedesktop
          configMap: {name: grafana-dashboard-remotedesktop}
        - name: datasource-provisioning
          configMap: {name: grafana-datasource-provisioning}
        - name: alerting-provisioning
          configMap: {name: grafana-alerting-provisioning}
      containers:
        - name: grafana
          image: docker.io/grafana/grafana:latest
          ports:
            - {name: http, containerPort: 3000}
          env:
            # Admin credentials come from the "grafana-credentials" Secret,
            # which the 1Password Operator creates from a OnePasswordItem.
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef: {name: grafana-credentials, key: username}
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef: {name: grafana-credentials, key: password}
            - name: GF_SERVER_ROOT_URL
              value: "https://grafana.iamworkin.lan"
            - name: GF_SERVER_SERVE_FROM_SUB_PATH
              value: "false"
            # The Zabbix plugin is not installed via GF_INSTALL_PLUGINS because
            # that needs internet access at startup, which a restrictive
            # NetworkPolicy blocks. Install it by hand after first boot:
            #   kubectl exec -n monitoring deploy/grafana -- grafana cli plugins install alexanderzobnin-zabbix-app
          resources:
            requests: {cpu: 100m, memory: 128Mi}
            limits: {cpu: 500m, memory: 512Mi}
          readinessProbe:
            httpGet: {path: /api/health, port: 3000}
            initialDelaySeconds: 10
            periodSeconds: 10
          livenessProbe:
            httpGet: {path: /api/health, port: 3000}
            initialDelaySeconds: 30
            periodSeconds: 30
          volumeMounts:
            # Writable Grafana state directory.
            - name: data
              mountPath: /var/lib/grafana
            # Dashboard provider definition, then one mount per dashboard set.
            - name: dashboard-provider
              mountPath: /etc/grafana/provisioning/dashboards
              readOnly: true
            - name: dashboards-ai-stack
              mountPath: /var/lib/grafana/dashboards/ai-stack
              readOnly: true
            - name: dashboards-edge-nodes
              mountPath: /var/lib/grafana/dashboards/edge-nodes
              readOnly: true
            - name: dashboards-network
              mountPath: /var/lib/grafana/dashboards/network
              readOnly: true
            - name: dashboards-operations
              mountPath: /var/lib/grafana/dashboards/operations
              readOnly: true
            - name: dashboards-printer
              mountPath: /var/lib/grafana/dashboards/printer
              readOnly: true
            - name: dashboards-infra-overview
              mountPath: /var/lib/grafana/dashboards/infra-overview
              readOnly: true
            - name: dashboards-remotedesktop
              mountPath: /var/lib/grafana/dashboards/remotedesktop
              readOnly: true
            # Datasource and alerting provisioning files.
            - name: datasource-provisioning
              mountPath: /etc/grafana/provisioning/datasources
              readOnly: true
            - name: alerting-provisioning
              mountPath: /etc/grafana/provisioning/alerting
              readOnly: true
# =============================================================================
# Deployment: Blackbox Exporter
# =============================================================================
# Runs one blackbox-exporter replica on :9115. Its configuration is the single
# blackbox.yml key of the blackbox-config ConfigMap, mounted as a file.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: blackbox-exporter
  namespace: monitoring
  labels: {app: blackbox-exporter}
spec:
  replicas: 1
  selector:
    matchLabels: {app: blackbox-exporter}
  template:
    metadata:
      labels: {app: blackbox-exporter}
    spec:
      volumes:
        - name: config
          configMap: {name: blackbox-config}
      containers:
        - name: blackbox-exporter
          image: quay.io/prometheus/blackbox-exporter:latest
          args: ["--config.file=/config/blackbox.yml"]
          ports:
            - {name: http, containerPort: 9115}
          resources:
            requests: {cpu: 50m, memory: 32Mi}
            limits: {cpu: 200m, memory: 128Mi}
          readinessProbe:
            httpGet: {path: "/", port: 9115}
            initialDelaySeconds: 3
            periodSeconds: 10
          livenessProbe:
            httpGet: {path: "/", port: 9115}
            initialDelaySeconds: 5
            periodSeconds: 30
          volumeMounts:
            # subPath mounts only the blackbox.yml key as a single file.
            - name: config
              mountPath: /config/blackbox.yml
              subPath: blackbox.yml
              readOnly: true
# =============================================================================
# PVC: SNMP Exporter Config (100Mi, Longhorn)
# =============================================================================
# Holds snmp.yml for the SNMP exporter. The custom file (~2MB) is larger than
# the 1MB ConfigMap limit, so it is kept on this volume instead.
# Loading a custom config:
#   kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
# Restart the pod afterwards so the new config is picked up.
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: snmp-config
  namespace: monitoring
spec:
  storageClassName: longhorn
  accessModes: [ReadWriteOnce]
  resources:
    requests: {storage: 100Mi}
# =============================================================================
# Deployment: SNMP Exporter
# =============================================================================
# Uses a PVC-mounted config at /config/snmp.yml. An init container copies the
# default config from the image if the PVC is empty (first deploy).
# To load the custom noc1 snmp.yml (~2MB):
#   kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
# Then restart: kubectl rollout restart deploy/snmp-exporter -n monitoring
#
# strategy: Recreate — the snmp-config PVC is ReadWriteOnce. With the default
# RollingUpdate the replacement pod starts while the old pod still holds the
# volume; if it is scheduled on another node the attach fails and the rollout
# never completes. Recreate stops the old pod first.
#
# NOTE(review): the snmp-exporter container mounts /config read-only, so the
# kubectl cp above (which writes through that container) is expected to fail
# with a read-only filesystem error. Confirm the intended load path, e.g. a
# temporary pod that mounts the PVC writable.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: snmp-exporter
  namespace: monitoring
  labels:
    app: snmp-exporter
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: snmp-exporter
  template:
    metadata:
      labels:
        app: snmp-exporter
    spec:
      initContainers:
        # Copy default snmp.yml from image if PVC is empty (first deploy).
        # Needs a writable mount, hence no readOnly here.
        - name: init-config
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              if [ ! -f /config/snmp.yml ]; then
                echo "No custom config found, copying default from image..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default snmp.yml copied to PVC."
              else
                echo "Custom snmp.yml already exists on PVC, skipping copy."
              fi
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      containers:
        - name: snmp-exporter
          image: docker.io/prom/snmp-exporter:latest
          args:
            - "--config.file=/config/snmp.yml"
          ports:
            - containerPort: 9116
              name: http
          volumeMounts:
            - name: snmp-config
              mountPath: /config
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          livenessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 5
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 3
            periodSeconds: 10
      volumes:
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config
# =============================================================================
# Deployment: IRC Notify (alert relay)
# =============================================================================
# Runs /app/notify.py on a stock python:3.12-slim image. The script is the
# notify.py key of the irc-notify-script ConfigMap; the container listens on
# :9119, which both probes check with a plain TCP connect.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: irc-notify
  namespace: monitoring
  labels: {app: irc-notify}
spec:
  replicas: 1
  selector:
    matchLabels: {app: irc-notify}
  template:
    metadata:
      labels: {app: irc-notify}
    spec:
      volumes:
        - name: script
          configMap: {name: irc-notify-script}
      containers:
        - name: irc-notify
          image: docker.io/library/python:3.12-slim
          command:
            - python3
            - /app/notify.py
          ports:
            - {name: http, containerPort: 9119}
          resources:
            requests: {cpu: 25m, memory: 32Mi}
            limits: {cpu: 100m, memory: 64Mi}
          readinessProbe:
            tcpSocket: {port: 9119}
            initialDelaySeconds: 3
            periodSeconds: 10
          livenessProbe:
            tcpSocket: {port: 9119}
            initialDelaySeconds: 5
            periodSeconds: 30
          volumeMounts:
            # subPath mounts only the notify.py key as a single file.
            - name: script
              mountPath: /app/notify.py
              subPath: notify.py
              readOnly: true
# =============================================================================
# DaemonSet: Node Exporter (runs on every RKE2 node)
# =============================================================================
# Listens on :9101 so it does not collide with the host-level node-exporters
# that already occupy :9100. Those host instances are scraped by the
# rke2-nodes Prometheus job; this DaemonSet is the target for K8s
# service-discovery-based scraping on :9101.
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
  labels: {app: node-exporter}
spec:
  selector:
    matchLabels: {app: node-exporter}
  updateStrategy:
    type: RollingUpdate
    rollingUpdate: {maxUnavailable: 1}
  template:
    metadata:
      labels: {app: node-exporter}
    spec:
      hostNetwork: true
      hostPID: true
      # A toleration with only "operator: Exists" matches every taint.
      tolerations:
        - {operator: Exists}
      securityContext:
        runAsUser: 0
        runAsNonRoot: false
      volumes:
        - name: rootfs
          hostPath: {path: "/"}
        - name: proc
          hostPath: {path: /proc}
        - name: sys
          hostPath: {path: /sys}
      containers:
        - name: node-exporter
          image: docker.io/prom/node-exporter:latest
          securityContext:
            privileged: true
            readOnlyRootFilesystem: true
          ports:
            - {name: metrics, containerPort: 9101, hostPort: 9101}
          args:
            # Host filesystem views are mounted under /host (see volumeMounts).
            - "--path.rootfs=/host"
            - "--path.sysfs=/host/sys"
            - "--path.procfs=/host/proc"
            - "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/containers|run/containers|var/lib/rancher|var/lib/longhorn)($|/)"
            - "--collector.netclass.ignored-devices=^(veth|cali|flannel|cni).*$"
            - "--no-collector.btrfs"
            - "--web.listen-address=:9101"
          resources:
            requests: {cpu: 50m, memory: 32Mi}
            limits: {cpu: 200m, memory: 128Mi}
          volumeMounts:
            - name: rootfs
              mountPath: /host
              mountPropagation: HostToContainer
              readOnly: true
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
# =============================================================================
# Service: Prometheus (ClusterIP :9090)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  type: ClusterIP
  # Backends: pods labelled app=prometheus.
  selector:
    app: prometheus
  ports:
    - name: http
      protocol: TCP
      port: 9090
      targetPort: 9090
# =============================================================================
# Service: Grafana (ClusterIP :3000)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  type: ClusterIP
  # Backends: pods labelled app=grafana.
  selector:
    app: grafana
  ports:
    - name: http
      protocol: TCP
      port: 3000
      targetPort: 3000
# =============================================================================
# Service: Blackbox Exporter (ClusterIP :9115)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: blackbox-exporter
  namespace: monitoring
  labels:
    app: blackbox-exporter
spec:
  type: ClusterIP
  # Backends: pods labelled app=blackbox-exporter.
  selector:
    app: blackbox-exporter
  ports:
    - name: http
      protocol: TCP
      port: 9115
      targetPort: 9115
# =============================================================================
# Service: SNMP Exporter (ClusterIP :9116)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: snmp-exporter
  namespace: monitoring
  labels:
    app: snmp-exporter
spec:
  type: ClusterIP
  # Backends: pods labelled app=snmp-exporter.
  selector:
    app: snmp-exporter
  ports:
    - name: http
      protocol: TCP
      port: 9116
      targetPort: 9116
# =============================================================================
# Service: Node Exporter (headless for Prometheus SD)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  type: ClusterIP
  # Headless (no virtual IP): the Service exposes each DaemonSet pod as its
  # own endpoint instead of load-balancing across them.
  clusterIP: None
  selector:
    app: node-exporter
  ports:
    - name: metrics
      protocol: TCP
      port: 9101
      targetPort: 9101
# =============================================================================
# Service: IRC Notify (ClusterIP :9119)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: irc-notify
  namespace: monitoring
  labels:
    app: irc-notify
spec:
  type: ClusterIP
  # Backends: pods labelled app=irc-notify.
  selector:
    app: irc-notify
  ports:
    - name: http
      protocol: TCP
      port: 9119
      targetPort: 9119
# =============================================================================
# TLS Certificates (cert-manager + step-ca ACME)
# =============================================================================
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: grafana-tls
  namespace: monitoring
spec:
  # Hostname the certificate is issued for.
  dnsNames:
    - grafana.iamworkin.lan
  # Signed by the cluster-scoped step-ca ACME issuer.
  issuerRef:
    kind: ClusterIssuer
    name: step-ca-acme
  # Secret that receives the key pair; the grafana IngressRoute serves it.
  secretName: grafana-tls
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: prometheus-tls
  namespace: monitoring
spec:
  # Hostname the certificate is issued for.
  dnsNames:
    - prometheus.iamworkin.lan
  # Signed by the cluster-scoped step-ca ACME issuer.
  issuerRef:
    kind: ClusterIssuer
    name: step-ca-acme
  # Secret that receives the key pair; the prometheus IngressRoute serves it.
  secretName: prometheus-tls
# =============================================================================
# Traefik IngressRoute: Grafana
# =============================================================================
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: grafana
  namespace: monitoring
spec:
  # HTTPS entry point only.
  entryPoints:
    - websecure
  # Certificate comes from the grafana-tls Secret.
  tls:
    secretName: grafana-tls
  routes:
    # Requests for the Grafana hostname go to the grafana Service on 3000.
    - match: Host(`grafana.iamworkin.lan`)
      kind: Rule
      services:
        - name: grafana
          port: 3000
# =============================================================================
# Traefik IngressRoute: Prometheus
# =============================================================================
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: prometheus
  namespace: monitoring
spec:
  # HTTPS entry point only.
  entryPoints:
    - websecure
  # Certificate comes from the prometheus-tls Secret.
  tls:
    secretName: prometheus-tls
  routes:
    # Requests for the Prometheus hostname go to the prometheus Service on
    # 9090.
    # NOTE(review): no middlewares are attached to this route, so any access
    # control must come from elsewhere; confirm that is intended.
    - match: Host(`prometheus.iamworkin.lan`)
      kind: Rule
      services:
        - name: prometheus
          port: 9090
# =============================================================================
# NetworkPolicy: monitoring namespace
# =============================================================================
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: monitoring-netpol
  namespace: monitoring
spec:
  # Selects every pod in the namespace. With both policy types listed, any
  # traffic not matched by a rule below is denied. Rules are additive; their
  # order has no effect.
  podSelector: {}
  policyTypes:
    - Ingress
    - Egress
  ingress:
    # Allow from Traefik (IngressRoutes AND ACME solver pods)
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
    # Allow intra-namespace (prometheus→exporters, grafana→prometheus,
    # grafana→irc-notify)
    - from:
        - podSelector: {}
    # Allow from cert-manager (ACME HTTP-01 self-check)
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: cert-manager
  egress:
    # DNS (any namespace, port 53 only)
    - to:
        - namespaceSelector: {}
      ports:
        - port: 53
          protocol: UDP
        - port: 53
          protocol: TCP
    # MGMT VLAN (noc1, pfSense, switches, SNMP, node-exporter); all ports
    - to:
        - ipBlock:
            cidr: 10.0.56.0/24
    # PROD VLAN (edge nodes); all ports
    - to:
        - ipBlock:
            cidr: 10.0.57.0/24
    # HOME VLAN (workstation, printer, NAS); all ports
    - to:
        - ipBlock:
            cidr: 10.0.58.0/24
    # Intra-namespace
    - to:
        - podSelector: {}
    # Blackbox probes to other namespaces (agent-zero, etc)
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: agent-zero
      ports:
        - port: 80
          protocol: TCP
    # FlowerCore.RemoteDesktop /metrics scrape via the fc-desktop
    # ClusterIP Service (remotedesktop-web:8080). Also covers the
    # Traefik VIP hairpin path since after kube-proxy DNAT, the egress
    # destination is the backend pod IP on the service port (see
    # feedback_netpol_dnat_backend_port).
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: fc-desktop
      ports:
        - port: 8080
          protocol: TCP
    # Traefik backend ports — needed for in-cluster egress to public
    # iamworkin.lan hostnames that CoreDNS wildcard resolves to the
    # LoadBalancer VIP. Post-DNAT destination is a Traefik pod on 8080/8443.
    # namespaceSelector and podSelector sit in the same list item, so both
    # must match (AND).
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
          podSelector:
            matchLabels:
              app.kubernetes.io/name: traefik
      ports:
        - port: 8080
          protocol: TCP
        - port: 8443
          protocol: TCP
    # Traefik /metrics endpoint (port 9100) — separate from the data-path
    # ports above. Required for the in-cluster `traefik` scrape job.
    # Scoped to Traefik pods, matching the data-path rule, rather than to
    # every pod in traefik-system.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
          podSelector:
            matchLabels:
              app.kubernetes.io/name: traefik
      ports:
        - port: 9100
          protocol: TCP
    # kube-state-metrics — required for kubernetes-state alert group.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: kube-system
      ports:
        - port: 8080
          protocol: TCP
    # cert-manager metrics — required for CertManagerCertificate* alerts.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: cert-manager
      ports:
        - port: 9402
          protocol: TCP
    # Longhorn manager metrics — required for Longhorn* alerts.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: longhorn-system
      ports:
        - port: 9500
          protocol: TCP
    # IRC (irc-notify → UnrealIRCd in irc namespace via K8s DNS)
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: irc
      ports:
        - port: 6667
          protocol: TCP
        - port: 6697
          protocol: TCP
    # Step-CA ACME (cert renewal). Already covered by the MGMT VLAN rule
    # (10.0.56.0/24, all ports); kept so the dependency stays explicit if
    # that rule is ever narrowed.
    - to:
        - ipBlock:
            cidr: 10.0.56.10/32
      ports:
        - port: 9443
          protocol: TCP
    # Internet (optional: Grafana plugin install, ACME). All ports; the
    # RFC 1918 private ranges are excluded so this rule does not open up
    # internal networks.
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.0.0.0/8
              - 172.16.0.0/12
              - 192.168.0.0/16
# =============================================================================
# Job: SNMP Config Loader (ArgoCD PostSync hook)
# =============================================================================
# Runs after each successful ArgoCD sync (PostSync hook) to populate the SNMP config PVC.
# Attempts to download custom snmp.yml from noc1; falls back to the default
# config bundled in the snmp-exporter image.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: snmp-config-loader
  namespace: monitoring
  annotations:
    # ArgoCD runs this Job as a PostSync hook and deletes it once it succeeds.
    argocd.argoproj.io/hook: PostSync
    argocd.argoproj.io/hook-delete-policy: HookSucceeded
spec:
  # No retries: a failed run stays failed until the next sync.
  backoffLimit: 0
  template:
    metadata:
      labels:
        app: snmp-config-loader
    spec:
      restartPolicy: Never
      initContainers:
        # Try to download custom snmp.yml from noc1.
        #
        # The download goes to a temporary file and is moved into place only
        # when it succeeds and is non-empty. On failure only the temporary
        # file is removed, so a config loaded by an earlier run survives a
        # noc1 outage instead of being deleted and replaced by the image
        # default.
        #
        # NOTE(review): the source URL is the exporter's /config endpoint,
        # which may mask secrets (communities, SNMPv3 passwords); verify the
        # downloaded file contains usable credentials.
        - name: download-config
          # Pinned instead of :latest so hook runs are reproducible.
          image: docker.io/curlimages/curl:8.10.1
          command:
            - sh
            - -c
            - |
              tmp=/config/snmp.yml.download
              echo "Attempting to download custom snmp.yml from noc1..."
              # -sS: no progress meter, but still print the error on failure.
              if curl -sSf --connect-timeout 10 --max-time 30 \
                   http://10.0.56.10:9116/config -o "$tmp" && [ -s "$tmp" ]; then
                mv -f "$tmp" /config/snmp.yml
                echo "Custom snmp.yml downloaded from noc1 successfully."
              else
                echo "Download failed or empty; keeping any existing config, otherwise the image default will be used."
                rm -f "$tmp"
              fi
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      containers:
        # If no config is present (fresh PVC and download failed), copy the
        # default config from the image.
        # NOTE(review): this tag is unpinned; it presumably should track the
        # version run by the snmp-exporter Deployment so the default snmp.yml
        # matches that exporter's config format. Confirm and pin.
        - name: fallback-default
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              if [ -s /config/snmp.yml ]; then
                echo "Config already present, nothing to do."
              else
                echo "Copying default snmp.yml from image to PVC..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default config copied."
              fi
              echo "SNMP config loader complete."
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      volumes:
        # Same PVC the snmp-exporter Deployment mounts at /config.
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config