The first batching pass (bacac06) left critical-severity alerts on the
immediate-print path. That is still per-event spam for any persistent
critical alert (e.g. PrintPaperRollCritical fires on every 30s Grafana evaluation
cycle while paper is ≤5%). Caught immediately after deploy: the CUPS queue grew
from 0 to 8 jobs in 8 minutes from a single firing PrintPaperRollCritical.
This commit aligns with the operator's verbatim ask ("one alert an hour"):
- Critical-severity alerts now go into the digest buffer, NOT the
immediate-print path. The digest payload already shows severity tags
per alertname, so the operator still sees "[critical] X" in the printout.
- The explicit `alert_channel=thermal_print_immediate` label still bypasses
batching, but only on NEW fingerprint arrival — it triggers a flush of
the CURRENT digest (with the new alert included), then clears. Repeat
webhooks for the same fingerprint dedupe in the buffer until the next
hourly tick OR until the alert resolves. No fingerprint can spam.
- `add_to_digest` now returns bool (True = buffer grew, False = dedup /
resolution / disabled) so the immediate-label path can flush only on
state transitions.
Net effect: max 1 thermal print per BATCH_INTERVAL_MIN per alert fingerprint,
regardless of severity. Rules that genuinely need same-second paper opt in
via `alert_channel=thermal_print_immediate` (currently zero rules use this).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
4819 lines
175 KiB
YAML
4819 lines
175 KiB
YAML
# =============================================================================
|
|
# NOC Monitoring Stack — K8s Migration Target
|
|
# =============================================================================
|
|
# Migrates the noc1 Podman monitoring pod to RKE2 K8s.
|
|
# Source: noc1 (10.0.56.10) /opt/monitoring/
|
|
#
|
|
# Components:
|
|
# - Prometheus (metrics, alerting)
|
|
# - Grafana (dashboards)
|
|
# - Blackbox Exporter (HTTP probes)
|
|
# - SNMP Exporter (network device metrics)
|
|
# - Node Exporter (host metrics, DaemonSet)
|
|
# - IRC Notify (alert relay to UnrealIRCd)
|
|
#
|
|
# Note: SNMP exporter config (snmp.yml) is ~2MB, exceeding the 1MB ConfigMap
|
|
# limit. It is stored in a separate file (snmp-config.yaml) and must be
|
|
# applied as a standalone ConfigMap or mounted via an init container that
|
|
# downloads it from Gitea.
|
|
# =============================================================================
|
|
|
|
---
|
|
apiVersion: v1
|
|
kind: Namespace
|
|
metadata:
|
|
name: monitoring
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Prometheus Configuration
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: prometheus-config
|
|
namespace: monitoring
|
|
data:
|
|
prometheus.yml: |
|
|
global:
|
|
scrape_interval: 30s
|
|
evaluation_interval: 30s
|
|
|
|
rule_files:
|
|
- /etc/prometheus/alerts.yml
|
|
- /etc/prometheus/recording-rules.yml
|
|
|
|
scrape_configs:
|
|
# noc1 host metrics (external to cluster)
|
|
- job_name: "node-exporter"
|
|
static_configs:
|
|
- targets: ["10.0.56.10:9100"]
|
|
labels:
|
|
instance: "noc1"
|
|
vlan: "mgmt"
|
|
|
|
# RKE2 bare-metal cluster (openSUSE Leap 16 on NUCs)
|
|
- job_name: "rke2-nodes"
|
|
scrape_timeout: 15s
|
|
static_configs:
|
|
- targets: ["10.0.56.11:9100"]
|
|
labels:
|
|
instance: "rke2-server"
|
|
vlan: "mgmt"
|
|
cluster: "rke2"
|
|
role: "server"
|
|
- targets: ["10.0.56.12:9100"]
|
|
labels:
|
|
instance: "rke2-agent1"
|
|
vlan: "mgmt"
|
|
cluster: "rke2"
|
|
role: "agent"
|
|
- targets: ["10.0.56.13:9100"]
|
|
labels:
|
|
instance: "rke2-agent2"
|
|
vlan: "mgmt"
|
|
cluster: "rke2"
|
|
role: "agent"
|
|
|
|
# Mac mini macOS runner node (INFRA VLAN)
|
|
- job_name: "macmini-node"
|
|
scrape_timeout: 15s
|
|
static_configs:
|
|
- targets: ["10.0.56.115:9100"]
|
|
labels:
|
|
instance: "macmini"
|
|
host: "macmini.iamworkin.lan"
|
|
vlan: "infra"
|
|
arch: "arm64"
|
|
role: "macos-runner"
|
|
puppet_managed: "true"
|
|
puppet_server: "puppet.iamworkin.lan"
|
|
|
|
# In-cluster node-exporter DaemonSet
|
|
- job_name: "k8s-node-exporter"
|
|
kubernetes_sd_configs:
|
|
- role: endpoints
|
|
namespaces:
|
|
names: ["monitoring"]
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_endpoints_name]
|
|
action: keep
|
|
regex: node-exporter
|
|
- source_labels: [__meta_kubernetes_endpoint_node_name]
|
|
target_label: instance
|
|
|
|
# pfSense SNMP via snmp-exporter
|
|
- job_name: "snmp-pfsense"
|
|
static_configs:
|
|
- targets: ["10.0.56.1"]
|
|
metrics_path: /snmp
|
|
params:
|
|
module: [if_mib]
|
|
auth: [bluejay_v2]
|
|
relabel_configs:
|
|
- source_labels: [__address__]
|
|
target_label: __param_target
|
|
- source_labels: [__param_target]
|
|
target_label: instance
|
|
- target_label: __address__
|
|
replacement: snmp-exporter.monitoring.svc:9116
|
|
|
|
# UniFi Cloud Key SNMP — DISABLED 2026-04-26
|
|
# The Cloud Key Gen2+ runs unifi-core (controller) only — not a network
|
|
# device — and does NOT run an SNMP agent on UDP/161. Scrapes were
|
|
# silently failing with "connection refused" from 10.42.x.x:161 every
|
|
# 30s, polluting up{} = 0 and lastError on the Targets page. Hardware
|
|
# health (CPU/mem/disk) for the Cloud Key host should come from
|
|
# node_exporter via SSH — not SNMP.
|
|
# - job_name: "snmp-cloudkey"
|
|
# static_configs:
|
|
# - targets: ["10.0.56.3"]
|
|
# metrics_path: /snmp
|
|
# params:
|
|
# module: [if_mib]
|
|
# auth: [bluejay_v2]
|
|
# relabel_configs:
|
|
# - source_labels: [__address__]
|
|
# target_label: __param_target
|
|
# - source_labels: [__param_target]
|
|
# target_label: instance
|
|
# - target_label: __address__
|
|
# replacement: snmp-exporter.monitoring.svc:9116
|
|
|
|
# UniFi Switch SNMP
|
|
- job_name: "snmp-switch"
|
|
static_configs:
|
|
- targets: ["10.0.56.2"]
|
|
metrics_path: /snmp
|
|
params:
|
|
module: [if_mib]
|
|
auth: [bluejay_v2]
|
|
relabel_configs:
|
|
- source_labels: [__address__]
|
|
target_label: __param_target
|
|
- source_labels: [__param_target]
|
|
target_label: instance
|
|
- target_label: __address__
|
|
replacement: snmp-exporter.monitoring.svc:9116
|
|
|
|
# Synology NAS SNMP
|
|
- job_name: "snmp-nas"
|
|
static_configs:
|
|
- targets: ["10.0.58.3"]
|
|
metrics_path: /snmp
|
|
params:
|
|
module: [synology]
|
|
auth: [bluejay_v2]
|
|
relabel_configs:
|
|
- source_labels: [__address__]
|
|
target_label: __param_target
|
|
- source_labels: [__param_target]
|
|
target_label: instance
|
|
- target_label: __address__
|
|
replacement: snmp-exporter.monitoring.svc:9116
|
|
|
|
# Prometheus self-monitoring
|
|
- job_name: "prometheus"
|
|
static_configs:
|
|
- targets: ["localhost:9090"]
|
|
|
|
# Edge nodes (PROD VLAN)
|
|
- job_name: "edge-nodes"
|
|
static_configs:
|
|
- targets: ["10.0.57.17:9100"]
|
|
labels:
|
|
instance: "edge1"
|
|
vlan: "prod"
|
|
arch: "arm64"
|
|
role: "ai-inference"
|
|
puppet_managed: "true"
|
|
puppet_server: "puppet.iamworkin.lan"
|
|
- targets: ["10.0.57.16:9100"]
|
|
labels:
|
|
instance: "edge2"
|
|
vlan: "prod"
|
|
arch: "arm64"
|
|
role: "ci-runner"
|
|
puppet_managed: "true"
|
|
puppet_server: "puppet.iamworkin.lan"
|
|
- targets: ["10.0.58.25:9100"]
|
|
labels:
|
|
instance: "piez"
|
|
vlan: "home"
|
|
arch: "arm64"
|
|
role: "prototyping"
|
|
- targets: ["10.0.58.113:9100"]
|
|
labels:
|
|
instance: "pirelay"
|
|
vlan: "home"
|
|
arch: "arm64"
|
|
role: "relay-controller"
|
|
|
|
# =======================================================================
|
|
# PiManager Application Metrics (relay states, temps, automation)
|
|
# =======================================================================
|
|
|
|
- job_name: "pimanager-app"
|
|
scrape_interval: 15s
|
|
metrics_path: /metrics
|
|
static_configs:
|
|
- targets: ["10.0.58.25:5000"]
|
|
labels:
|
|
instance: "piez"
|
|
service: "pimanager"
|
|
vlan: "home"
|
|
device: "pi4-ezconnect"
|
|
- targets: ["10.0.58.113:5100"]
|
|
labels:
|
|
instance: "pirelay"
|
|
service: "pimanager"
|
|
vlan: "home"
|
|
device: "pi3-ks0212"
|
|
|
|
# Epson ET-3750 EcoTank Printer SNMP
|
|
- job_name: "snmp-printer"
  # Scraped through snmp-exporter: the printer address is passed as the
  # `target` query parameter and the scrape itself goes to the exporter.
  # NOTE(review): a 5m interval matches Prometheus' default 5-minute
  # staleness lookback, so a late scrape may leave brief gaps in instant
  # queries — confirm this is acceptable or shorten the interval.
  scrape_interval: 5m
  scrape_timeout: 30s
  static_configs:
    - targets: ["10.0.58.107"]
      labels:
        # `instance` is intentionally not set here: the relabel rules below
        # always overwrite it with the target address, so the former static
        # value ("epson-ecotank") never reached any series.
        vlan: "home"
        device_type: "printer"
  metrics_path: /snmp
  params:
    module: [printer_mib]
    auth: [public_v2]
  relabel_configs:
    # 1. Pass the configured address to the exporter as ?target=<addr>.
    - source_labels: [__address__]
      target_label: __param_target
    # 2. Expose that address as the series' `instance` label.
    - source_labels: [__param_target]
      target_label: instance
    # 3. Point the actual scrape at the snmp-exporter service.
    - target_label: __address__
      replacement: snmp-exporter.monitoring.svc:9116
|
|
|
|
# =============================================================================
|
|
# Print Services (CUPS + Print.Web on edge2)
|
|
# =============================================================================
|
|
|
|
# CUPS Prometheus exporter (cups_exporter on edge2:9628)
|
|
- job_name: "cups"
|
|
scrape_interval: 30s
|
|
static_configs:
|
|
- targets: ["10.0.57.16:9628"]
|
|
labels:
|
|
instance: "edge2"
|
|
service: "cups"
|
|
device_type: "printer"
|
|
printer_model: "NuPrint 210"
|
|
|
|
# Print.Web OTEL metrics (print counters/histograms plus Ollama runner gauges)
|
|
- job_name: "printweb-otel"
|
|
scrape_interval: 30s
|
|
metrics_path: /metrics/prometheus
|
|
static_configs:
|
|
- targets: ["10.0.57.16:5200"]
|
|
labels:
|
|
instance: "print-web"
|
|
service: "print-web"
|
|
device_type: "printer"
|
|
printer_model: "NuPrint 210"
|
|
|
|
# Print.Web health (Blazor app on edge2:5200)
|
|
- job_name: "probe-printweb"
  # Blackbox probe: Prometheus scrapes the exporter's /probe endpoint and
  # passes the URL to check as the `target` query parameter.
  metrics_path: /probe
  params:
    module: [http_2xx]
  scrape_interval: 30s
  static_configs:
    - targets: ["http://10.0.57.16:5200/"]
      labels:
        # `instance` is intentionally not set here: the relabel rules below
        # always overwrite it with the probed URL, so the former static
        # value ("print-web") never reached any series.
        service: "print-web"
  relabel_configs:
    # 1. Hand the configured URL to blackbox as ?target=<url>.
    - source_labels: [__address__]
      target_label: __param_target
    # 2. Expose that URL as the series' `instance` label.
    - source_labels: [__param_target]
      target_label: instance
    # 3. Send the scrape itself to the blackbox exporter service.
    - target_label: __address__
      replacement: blackbox-exporter.monitoring.svc:9115
|
|
|
|
# FlowerCore.RemoteDesktop web health (public cluster VIP)
|
|
# Module is https_internal — desktop.iamworkin.lan uses a step-ca leaf
|
|
# cert; blackbox does NOT trust step-ca root, so http_2xx fails with
|
|
# x509 unknown authority and probe_success=0 even when /health 200s.
|
|
- job_name: "probe-remotedesktop"
  # Blackbox probe of the RemoteDesktop /health endpoint using the
  # https_internal module.
  scrape_interval: "30s"
  metrics_path: "/probe"
  params:
    module:
      - https_internal
  static_configs:
    - targets:
        - "https://desktop.iamworkin.lan/health"
      labels:
        # No relabel rule in this job writes `instance`, so this static
        # value (the full probed URL) is what the series carry.
        instance: "https://desktop.iamworkin.lan/health"
        service: "remotedesktop-web"
  relabel_configs:
    # Pass the configured URL to blackbox as ?target=<url> ...
    - source_labels:
        - __address__
      target_label: __param_target
    # ... and direct the scrape at the blackbox exporter service.
    - target_label: __address__
      replacement: "blackbox-exporter.monitoring.svc:9115"
|
|
|
|
# FlowerCore.RemoteDesktop /metrics (direct scrape for counters)
|
|
- job_name: "fc-remotedesktop"
  # Direct HTTPS scrape of the application's own /metrics endpoint.
  scheme: "https"
  metrics_path: "/metrics"
  scrape_interval: "30s"
  tls_config:
    # NOTE(review): server certificate verification is disabled for this
    # scrape. Presumably the issuing CA is not available to Prometheus;
    # mounting the CA and using `ca_file` would restore verification —
    # confirm.
    insecure_skip_verify: true
  static_configs:
    - targets:
        - "desktop.iamworkin.lan"
      labels:
        service: "remotedesktop-web"
|
|
|
|
# CUPS web UI health (port 631)
|
|
- job_name: "probe-cups"
  # Blackbox probe of the CUPS web UI: Prometheus scrapes the exporter's
  # /probe endpoint and passes the URL to check as the `target` parameter.
  metrics_path: /probe
  params:
    module: [http_2xx]
  scrape_interval: 60s
  static_configs:
    - targets: ["http://10.0.57.16:631/"]
      labels:
        # `instance` is intentionally not set here: the relabel rules below
        # always overwrite it with the probed URL, so the former static
        # value ("cups-edge2") never reached any series.
        service: "cups"
  relabel_configs:
    # 1. Hand the configured URL to blackbox as ?target=<url>.
    - source_labels: [__address__]
      target_label: __param_target
    # 2. Expose that URL as the series' `instance` label.
    - source_labels: [__param_target]
      target_label: instance
    # 3. Send the scrape itself to the blackbox exporter service.
    - target_label: __address__
      replacement: blackbox-exporter.monitoring.svc:9115
|
|
|
|
# =============================================================================
|
|
# AI Stack Health Probes (Blackbox Exporter)
|
|
# =============================================================================
|
|
|
|
# NOTE: probe-ollama-local and probe-agentzero-local were REMOVED
|
|
# 2026-04-26. They pointed at 10.0.58.100 (HOME VLAN) which is not
|
|
# reachable from cluster pods (firewalled). They had been firing as
|
|
# OllamaDown / AgentZeroDown since 2026-04-24. Workstation/AI-laptop
|
|
# Ollama and Agent Zero should be monitored via host-side Puppet
|
|
# (node_exporter on the box) once the AI laptop is running 24/7.
|
|
|
|
# Ollama API — edge1 Pi 5 (NUC Agent Zero)
|
|
- job_name: "probe-ollama-edge1"
  # Blackbox probe of the Ollama /api/tags endpoint using the http_ollama
  # module; the URL to check is passed as the `target` query parameter.
  metrics_path: /probe
  params:
    module: [http_ollama]
  scrape_interval: 30s
  static_configs:
    - targets: ["http://10.0.57.17:11434/api/tags"]
      labels:
        # `instance` is intentionally not set here: the relabel rules below
        # always overwrite it with the probed URL, so the former static
        # value ("ollama-edge1") never reached any series.
        service: "ollama"
        deployment: "nuc"
        gpu: "cpu"
  relabel_configs:
    # 1. Hand the configured URL to blackbox as ?target=<url>.
    - source_labels: [__address__]
      target_label: __param_target
    # 2. Expose that URL as the series' `instance` label.
    - source_labels: [__param_target]
      target_label: instance
    # 3. Send the scrape itself to the blackbox exporter service.
    - target_label: __address__
      replacement: blackbox-exporter.monitoring.svc:9115
|
|
|
|
# Agent Zero Web UI — in-cluster (RKE2)
|
|
# Target uses short svc form (agent-zero.agent-zero.svc) NOT
|
|
# cluster.local FQDN — the *.cluster.local form gets rewritten to
|
|
# 10.0.56.200 (Traefik VIP) by the CoreDNS iamworkin.lan template +
|
|
# ndots:5 search-suffix expansion. Memory: feedback_coredns_ndots_template_collision.
|
|
- job_name: "probe-agentzero-nuc"
  # Blackbox probe of the in-cluster Agent Zero service; the URL to check
  # is passed to the exporter as the `target` query parameter.
  metrics_path: /probe
  params:
    module: [http_2xx]
  scrape_interval: 30s
  static_configs:
    - targets: ["http://agent-zero.agent-zero.svc:80/"]
      labels:
        # `instance` is intentionally not set here: the relabel rules below
        # always overwrite it with the probed URL, so the former static
        # value ("agent-zero-nuc") never reached any series.
        service: "agent-zero"
        deployment: "nuc"
  relabel_configs:
    # 1. Hand the configured URL to blackbox as ?target=<url>.
    - source_labels: [__address__]
      target_label: __param_target
    # 2. Expose that URL as the series' `instance` label.
    - source_labels: [__param_target]
      target_label: instance
    # 3. Send the scrape itself to the blackbox exporter service.
    - target_label: __address__
      replacement: blackbox-exporter.monitoring.svc:9115
|
|
|
|
# =============================================================================
|
|
# K8s Cluster State (kube-state-metrics, cert-manager, traefik)
|
|
# =============================================================================
|
|
# Use in-cluster ClusterIP service DNS — NOT NodePorts — so a same-node
|
|
# NodePort hairpin doesn't break the scrape (hit on rke2-agent1 hosting
|
|
# both prometheus and traefik on 2026-04-26: 10.0.56.12:30900 timed out
|
|
# from prometheus while .11/.13 worked). NodePorts at 30900-30902 are
|
|
# still useful for noc1-Podman-style external scrapers, but in-cluster
|
|
# we should always use the svc DNS form.
|
|
|
|
# kube-state-metrics — exposes K8s object state (pods, deployments, nodes)
|
|
# Required for KubeContainerRestartingFrequently / KubePodNotReady alerts.
|
|
- job_name: "kube-state-metrics"
|
|
scrape_interval: 30s
|
|
static_configs:
|
|
- targets: ["kube-state-metrics.kube-system.svc:8080"]
|
|
labels:
|
|
cluster: "rke2"
|
|
|
|
# cert-manager — exposes certmanager_certificate_ready_status,
|
|
# certmanager_certificate_expiration_timestamp_seconds, etc. Drives the
|
|
# CertManagerCertificateNotReady / CertManagerCertificateRenewalFailed
|
|
# alerts. Memory: project_cert_manager_prometheus_scrape.
|
|
- job_name: "cert-manager"
|
|
scrape_interval: 30s
|
|
static_configs:
|
|
- targets: ["cert-manager-metrics.cert-manager.svc:9402"]
|
|
labels:
|
|
cluster: "rke2"
|
|
|
|
# Traefik — request rates, latency, TLS cert metadata, router state.
|
|
# ClusterIP svc routes to one of the traefik pods; per-pod scrape via
|
|
# the headless `traefik-metrics` selector would be nicer for failover
|
|
# visibility but the single-replica scrape is enough for steady-state.
|
|
- job_name: "traefik"
|
|
scrape_interval: 15s
|
|
static_configs:
|
|
- targets: ["traefik-metrics.traefik-system.svc:9100"]
|
|
labels:
|
|
service: "traefik"
|
|
cluster: "rke2"
|
|
|
|
# Longhorn — exposes longhorn_volume_robustness, longhorn_backup_*,
|
|
# longhorn_node_status_*. Enables LonghornVolumeUnhealthy +
|
|
# LonghornBackupFailed alerts (no real visibility into Longhorn
|
|
# health before this — was relying on K8s events which are noisy
|
|
# transient lifecycle messages, not actionable signals).
|
|
- job_name: "longhorn"
|
|
scrape_interval: 30s
|
|
static_configs:
|
|
- targets: ["longhorn-backend.longhorn-system.svc:9500"]
|
|
labels:
|
|
service: "longhorn"
|
|
cluster: "rke2"
|
|
|
|
# FC web services through Traefik — single probe surface to spot any
|
|
# iamworkin.lan host returning non-200. Uses https_internal because all
|
|
# certs are step-ca leaves; blackbox would x509-fail with http_2xx.
|
|
# Some services need explicit healthcheck paths because root returns
|
|
# 404 (acme, guac) or 401 (grafana, prometheus). Drop them or point at
|
|
# the right endpoint — don't lower valid_status_codes globally because
|
|
# 401 from a healthy pod and 401 from an outage look identical.
|
|
- job_name: "probe-traefik-services"
|
|
metrics_path: /probe
|
|
params:
|
|
module: [https_internal]
|
|
scrape_interval: 60s
|
|
static_configs:
|
|
- targets:
|
|
# Root-reachable services (200 or 3xx)
|
|
- "https://gitea.iamworkin.lan/"
|
|
- "https://argocd.iamworkin.lan/"
|
|
- "https://intranet.iamworkin.lan/"
|
|
- "https://signage.iamworkin.lan/"
|
|
- "https://kiosk.iamworkin.lan/"
|
|
- "https://media.iamworkin.lan/"
|
|
- "https://mysql.iamworkin.lan/"
|
|
- "https://php.iamworkin.lan/"
|
|
- "https://zabbix.iamworkin.lan/"
|
|
- "https://desktop.iamworkin.lan/"
|
|
- "https://print.iamworkin.lan/"
|
|
- "https://dns.iamworkin.lan/"
|
|
- "https://chat.iamworkin.lan/"
|
|
- "https://dist.iamworkin.lan/"
|
|
- "https://dms.iamworkin.lan/"
|
|
- "https://menuboard.iamworkin.lan/"
|
|
- "https://messageboard.iamworkin.lan/"
|
|
- "https://presentations.iamworkin.lan/"
|
|
- "https://retail.iamworkin.lan/"
|
|
- "https://ttsreader.iamworkin.lan/"
|
|
# Explicit healthcheck paths
|
|
- "https://fc-llm-bridge.iamworkin.lan/healthz"
|
|
- "https://acme.iamworkin.lan/health"
|
|
# NOTE: services intentionally NOT in this probe surface
|
|
# - grafana.iamworkin.lan: every endpoint (incl. /api/health
|
|
# and /login) returns 401 behind Traefik basic-auth.
|
|
# Health covered by in-cluster monitoring-grafana scrape.
|
|
# - prometheus.iamworkin.lan: same auth pattern. Health covered
|
|
# by the prometheus self-scrape job.
|
|
# - guac.iamworkin.lan: deprecated — Guacamole moved to
|
|
# desktop.iamworkin.lan/guacamole/ (memory:
|
|
# feedback_traefik_cross_namespace_refs_disabled).
|
|
labels:
|
|
probe_type: "traefik-service"
|
|
relabel_configs:
|
|
- source_labels: [__address__]
|
|
target_label: __param_target
|
|
- source_labels: [__param_target]
|
|
regex: "https?://([^/:]+).*"
|
|
target_label: instance
|
|
- target_label: __address__
|
|
replacement: blackbox-exporter.monitoring.svc:9115
|
|
|
|
# =============================================================================
|
|
# Self-monitoring (K8s monitoring namespace)
|
|
# =============================================================================
|
|
|
|
- job_name: "monitoring-grafana"
|
|
metrics_path: /metrics
|
|
static_configs:
|
|
- targets: ["grafana.monitoring.svc:3000"]
|
|
labels:
|
|
instance: "grafana-k8s"
|
|
service: "grafana"
|
|
|
|
- job_name: "monitoring-blackbox"
|
|
static_configs:
|
|
- targets: ["blackbox-exporter.monitoring.svc:9115"]
|
|
labels:
|
|
instance: "blackbox-k8s"
|
|
service: "blackbox"
|
|
|
|
recording-rules.yml: |
|
|
groups:
|
|
- name: node-aggregations
  interval: "30s"
  rules:
    # CPU busy percentage per instance: 100 minus the average idle share
    # over the trailing 5 minutes.
    - record: "instance:node_cpu_usage:avg5m"
      expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
    # Memory in use as a percentage of total, based on MemAvailable.
    - record: "instance:node_memory_usage:percent"
      expr: '(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100'
    # Used share of the root ("/") filesystem, in percent.
    - record: "instance:node_disk_usage:percent"
      expr: '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100'
    # Receive throughput in bits/s (byte rate * 8), skipping loopback and
    # veth/cali/flannel devices.
    - record: "instance:node_network_receive:rate5m"
      expr: 'rate(node_network_receive_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8'
    # Transmit throughput in bits/s, same device exclusions as above.
    - record: "instance:node_network_transmit:rate5m"
      expr: 'rate(node_network_transmit_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8'
|
|
- name: probe-aggregations
  interval: "30s"
  rules:
    # Worst probe result per service: 0 as soon as any probe of that
    # service reports probe_success == 0.
    - record: "service:probe_success:min"
      expr: 'min by(service) (probe_success)'
    # Mean probe duration per service, in seconds.
    - record: "service:probe_duration:avg"
      expr: 'avg by(service) (probe_duration_seconds)'
|
|
- name: print-rates
  interval: "30s"
  rules:
    # Enqueued print jobs per minute (per-second 5m rate scaled by 60).
    - record: "print:jobs_per_minute:rate5m"
      expr: 'rate(print_jobs_enqueued_total[5m]) * 60'
    # Ratio of completed to enqueued job rates over 5 minutes.
    - record: "print:success_rate:ratio5m"
      expr: 'rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m])'
    # 95th-percentile job duration from the print_job_duration_ms histogram
    # (presumably milliseconds, per the metric name).
    - record: "print:job_duration_p95:5m"
      expr: 'histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m]))'
    # Largest remaining Ollama keep-alive per instance and model, as
    # reported by the printweb-otel job.
    - record: "print:ollama_runner_keepalive_remaining_seconds:max"
      expr: 'max by(instance, model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})'
|
|
- name: relay-rates
  interval: "15s"
  rules:
    # Number of relay state transitions over the trailing hour.
    - record: "relay:state_changes:1h"
      expr: 'changes(pimanager_relay_state[1h])'
    # Growth of the printer's prtMarkerLifeCount over 24 hours. Despite
    # the `rate24h` suffix this is an increase, not a per-second rate.
    - record: "epson:pages_per_day:rate24h"
      expr: 'increase(prtMarkerLifeCount{job="snmp-printer"}[24h])'
|
|
|
|
alerts.yml: |
|
|
groups:
|
|
- name: ai-stack
|
|
rules:
|
|
- alert: OllamaDown
|
|
expr: probe_success{service="ollama"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Ollama is down on {{ $labels.deployment }}"
|
|
description: "Ollama API at {{ $labels.instance }} has been unreachable for 2 minutes. Agent Zero FAISS memory will fail."
|
|
|
|
- alert: AgentZeroDown
|
|
expr: probe_success{service="agent-zero"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Agent Zero is down on {{ $labels.deployment }}"
|
|
description: "Agent Zero web UI at {{ $labels.instance }} has been unreachable for 2 minutes."
|
|
|
|
- alert: OllamaSlowResponse
  # Informational: the blackbox probe of an Ollama endpoint has taken more
  # than 3 seconds continuously for 5 minutes.
  expr: probe_duration_seconds{service="ollama"} > 3
  for: 5m
  labels:
    severity: info
  annotations:
    summary: "Ollama responding slowly on {{ $labels.deployment }}"
    # The description no longer blames a GPU: the Ollama probe target
    # configured in this file is labelled gpu="cpu".
    description: "Ollama API response time at {{ $labels.instance }} exceeds 3s for 5 minutes. The inference host may be overloaded."
|
|
|
|
- name: print-services
|
|
rules:
|
|
- alert: CUPSExporterDown
|
|
expr: up{job="cups"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "CUPS exporter unreachable on edge2"
|
|
description: "cups_exporter at edge2:9628 has been down for 2 minutes. CUPS monitoring offline."
|
|
|
|
- alert: CUPSWebUIDown
|
|
expr: probe_success{job="probe-cups"} == 0
|
|
for: 3m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "CUPS web UI down on edge2"
|
|
description: "CUPS port 631 unreachable for 3 minutes. Network printing unavailable."
|
|
|
|
- alert: PrintWebDown
|
|
expr: probe_success{job="probe-printweb"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Print.Web is down on edge2"
|
|
description: "FlowerCore Print.Web at edge2:5200 unreachable. API/MCP/Blazor printing unavailable."
|
|
|
|
- alert: CUPSPrinterStopped
|
|
expr: cups_printer_state_total{state="stopped"} > 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "CUPS printer stopped on edge2"
|
|
description: "A CUPS printer has been in stopped state for 5 minutes. Check USB connection or paper."
|
|
|
|
- alert: CUPSJobBacklog
|
|
expr: cups_job_active_total > 10
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Print queue backlog on edge2 ({{ $value }} active jobs)"
|
|
description: "CUPS has {{ $value }} active jobs queued. Possible printer jam, USB disconnect, or paper out."
|
|
|
|
# Paper roll lifecycle alerts (XL Track I, 2026-04-26).
|
|
# Source-of-truth gauge: print_paper_remaining_percent (Print.Web OTEL,
|
|
# hydrated on startup from the active PaperRoll row).
|
|
# alert_channel=thermal_print routes through irc-notify -> Print.Web
|
|
# /api/print/alert so the printer announces its own paper-out warning
|
|
# on its remaining paper. Self-referential humor + operator nudge.
|
|
- alert: PrintPaperRollLow
  # Warning band: remaining paper strictly between 5% and 10% for 5
  # minutes. Values of 5% and below are left to PrintPaperRollCritical.
  expr: >-
    print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5
  for: "5m"
  labels:
    alert_channel: thermal_print
    severity: warning
  annotations:
    summary: "Print roll low on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
    description: "NuPrint 210 paper roll has {{ $value | printf \"%.1f\" }}% remaining. Operator should load a fresh roll soon. Run /api/paper/status for the precise mm + estimated jobs left."
|
|
|
|
- alert: PrintPaperRollCritical
  # Critical: remaining paper at or below 5% for 2 minutes.
  expr: >-
    print_paper_remaining_percent{job="printweb-otel"} <= 5
  for: "2m"
  labels:
    alert_channel: thermal_print
    severity: critical
  annotations:
    summary: "Print roll critical on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
    description: "NuPrint 210 paper roll at {{ $value | printf \"%.1f\" }}% — load a new roll NOW. The 50ft roll has a ~12% red-stripe zone; once paper passes that, the printer can run dry mid-job."
|
|
|
|
- alert: PrintJobDeadLetter
|
|
expr: increase(print_jobs_dead_letter_total[15m]) > 0
|
|
for: 1m
|
|
labels:
|
|
severity: warning
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "Print job(s) entered dead-letter on edge2 ({{ $value | printf \"%.0f\" }} in last 15m)"
|
|
description: "{{ $value | printf \"%.0f\" }} print job(s) exhausted MaxRetries and need operator action. Open /print-log, filter Status=DeadLetter, click 'Retry From Start' after fixing the underlying cause (paper jam, USB disconnect, printer power-cycle)."
|
|
|
|
- alert: CUPSHighJobRate
|
|
expr: rate(cups_job_total[5m]) * 60 > 30
|
|
for: 5m
|
|
labels:
|
|
severity: info
|
|
annotations:
|
|
summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)"
|
|
description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop."
|
|
|
|
- alert: PrintOllamaRunnerLongKeepAlive
|
|
expr: ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"} > 600
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
|
|
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
|
|
|
|
- name: macmini-runners
|
|
rules:
|
|
- alert: MacMiniRunnerOffline
  # Fires when a macmini-* runner reports online == 0, or when no
  # macmini-* runner series exists at all, for 10 minutes.
  expr: (flowercore_github_runner_online{runner=~"macmini-.*"} == 0) or absent(flowercore_github_runner_online{runner=~"macmini-.*"})
  for: 10m
  labels:
    severity: warning
    service: github-runner
  annotations:
    # The absent() branch yields a series without a `runner` label (the
    # selector only has a regex matcher), so fall back to explanatory
    # text instead of rendering empty parentheses.
    summary: "Mac mini GitHub runner offline ({{ if $labels.runner }}{{ $labels.runner }}{{ else }}no macmini-* runner series reported{{ end }})"
    description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-<slug>.plist; runners survive reboot and do not require a GUI session."
|
|
|
|
- name: linux-runners
|
|
rules:
|
|
- alert: LinuxRunnerOffline
|
|
expr: |
|
|
kube_deployment_status_replicas_ready{
|
|
namespace="github-runner",
|
|
deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"
|
|
} == 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
alert_channel: irc
|
|
service: github-runner
|
|
team: ci
|
|
annotations:
|
|
summary: "Linux CI runner offline: {{ $labels.deployment }}"
|
|
description: "Deployment {{ $labels.deployment }} in namespace github-runner has 0 ready replicas for more than 5 minutes. CI jobs targeting this repo will queue until the runner pod restarts and re-registers with GitHub. Check pods with: kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }}. Check logs with: kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50. Common causes: PAT missing repo access, runner CrashLoopBackOff, or node/resource pressure."
|
|
|
|
- name: remote-desktop
|
|
rules:
|
|
- alert: RemoteDesktopWebDown
|
|
expr: probe_success{job="probe-remotedesktop",instance="https://desktop.iamworkin.lan/health"} == 0
|
|
for: 3m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "FlowerCore RemoteDesktop web is down"
|
|
description: "https://desktop.iamworkin.lan/health probe has failed for 3 minutes. Catalog + session launch surface offline."
|
|
|
|
- alert: RemoteDesktopMetricsStale
|
|
expr: absent(fc_desktop_session_events_total)
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "RemoteDesktop /metrics scrape returning no data"
|
|
description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape target is misconfigured or the web deployment stopped exporting metrics. Zabbix template carries the same 10m no-data trigger for cross-monitor parity."
|
|
|
|
# PUBLISHER QUIRK: fc_desktop_pool_depleted / _deficit emit one
|
|
# series per template per status (Ready/Warming/BelowDesiredSize/
|
|
# Disabled), and the historical series for non-current statuses
|
|
# stay at their last value. So just `_depleted > 0` fires forever
|
|
# on any template that ever entered a bad state.
|
|
#
|
|
# SAFE PATTERN: alert only when the canonical "Ready" status
|
|
# gauge does NOT report ready=1 for the enabled template. This
|
|
# is the publisher's own canary — _ready{status="Ready"}==1 is
|
|
# always the current "everything is fine" signal.
|
|
- alert: RemoteDesktopPoolDepleted
|
|
expr: |
|
|
group by(template) (fc_desktop_pool_ready{enabled="true"})
|
|
unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1)
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "RemoteDesktop pool depleted ({{ $labels.template }})"
|
|
description: "Pool for template {{ $labels.template }} has no Ready warm pod for 5 minutes. New launches will cold-start. Check pod-scheduling failures, image pull issues, or exhausted node capacity."
|
|
|
|
# Same pattern, but only fires when template explicitly reports
|
|
# a sustained Warning-level alert state (current-status series).
|
|
- alert: RemoteDesktopPoolDeficitSustained
|
|
expr: |
|
|
fc_desktop_pool_deficit{enabled="true",alert_level="Warning"} > 0
|
|
unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1)
|
|
for: 10m
|
|
labels:
|
|
severity: info
|
|
annotations:
|
|
summary: "RemoteDesktop pool {{ $labels.template }} below desired for 10m"
|
|
description: "Pool {{ $labels.template }} has a persistent deficit of {{ $value }} warm pods AND no Ready series. Likely image pull, NFS affinity, or claim-init issue."
|
|
|
|
- alert: RemoteDesktopSessionChurnSpike
|
|
expr: sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20
|
|
for: 5m
|
|
labels:
|
|
severity: info
|
|
annotations:
|
|
summary: "RemoteDesktop launch rate high ({{ $value | printf \"%.0f\" }}/min)"
|
|
description: "Launch events exceed 20/min for 5 minutes. Could be a user-facing feature launch, a pooled template thrashing, or a runaway automation loop."
|
|
|
|
- alert: RemoteDesktopRecordingEventsDropped
  # Informational: no event="recording" series has existed for 30 minutes
  # while launches actually occurred in that same window.
  #
  # The launch guard uses increase(...[30m]) rather than the raw counter:
  # the cumulative total stays > 0 forever after the first launch, so it
  # could not distinguish "launches are happening" from "a launch happened
  # at some point in the past".
  #
  # NOTE(review): absent_over_time() only detects a missing series. If the
  # recording counter exists but has stopped increasing, this rule stays
  # silent — confirm whether that case should also alert.
  expr: |
    absent_over_time(fc_desktop_session_events_total{event="recording"}[30m])
    and on()
    (sum(increase(fc_desktop_session_events_total{event="launch"}[30m])) > 0)
  for: 15m
  labels:
    severity: info
  annotations:
    summary: "RemoteDesktop recording events silent for 30m despite active launches"
    description: "No recording events in 30 minutes while launches are happening. Recording may be silently disabled on all templates (SessionRecordingEnabled=false), the guacd NFS mount may be unhealthy, or the retention sweep isn't emitting events. Not an error by itself — worth checking."
|
|
|
|
# Match by job — instance label carries full URL incl. /health,
|
|
# not just hostname, so a hostname-only match never fires.
|
|
- alert: RemoteDesktopTlsExpiry
|
|
expr: probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time() < 2 * 86400
|
|
for: 6h
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "desktop.iamworkin.lan TLS cert expires within 2 days"
|
|
description: "The desktop.iamworkin.lan cert is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, and pfSense DNS overrides per the ACME DNS-01 gate."
|
|
|
|
- name: pi-fleet
|
|
rules:
|
|
- alert: PiManagerDown
|
|
expr: up{job="pimanager-app"} == 0
|
|
for: 3m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "PiManager down on {{ $labels.instance }}"
|
|
description: "PiManager app on {{ $labels.instance }} ({{ $labels.device }}) unreachable for 3 minutes."
|
|
|
|
- alert: PiCpuTempHigh
|
|
expr: pimanager_cpu_temperature_celsius > 75
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Pi CPU temperature high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"
|
|
|
|
- alert: PiCpuTempCritical
|
|
expr: pimanager_cpu_temperature_celsius > 82
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Pi CPU temperature CRITICAL on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"
|
|
|
|
- alert: PiMemoryHigh
|
|
expr: pimanager_memory_usage_percent > 90
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Pi memory usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
|
|
|
- alert: PiDiskHigh
|
|
expr: pimanager_disk_usage_percent > 85
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Pi disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
|
|
|
# Every relay channel reports OFF on a Pi that actually has relay channels.
# `on(instance)` is required: the left side is aggregated down to the
# `instance` label only, while the right side keeps its full label set, so a
# bare `and` would compare unequal label sets and never return a series.
- alert: RelayAllOff
  expr: sum by (instance) (pimanager_relay_state) == 0 and on(instance) pimanager_relay_channel_count > 0
  for: 0m
  labels:
    severity: info
  annotations:
    summary: "All relay channels OFF on {{ $labels.instance }}"
|
|
|
|
- alert: PiWifiWeak
|
|
expr: pimanager_wifi_signal_dbm < -75 and pimanager_wifi_signal_dbm != 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Weak WiFi signal on {{ $labels.instance }} ({{ $value }}dBm)"
|
|
|
|
- name: snmp-devices
|
|
rules:
|
|
- alert: EpsonInkLow
|
|
expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 15 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0
|
|
for: 0m
|
|
labels:
|
|
severity: warning
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "Epson ink low: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
|
|
|
|
- alert: EpsonInkCritical
|
|
expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 5 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0
|
|
for: 0m
|
|
labels:
|
|
severity: critical
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "Epson ink CRITICAL: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
|
|
|
|
# for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min
|
|
# of idle and SNMP times out, so 5m for: would page nightly. A
|
|
# genuine printer outage (jam, disconnected) lasts well over 30m.
|
|
- alert: EpsonPrinterDown
|
|
expr: up{job="snmp-printer"} == 0
|
|
for: 30m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)"
|
|
|
|
- alert: SynologyDiskLow
|
|
expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "Synology NAS storage high ({{ $value | printf \"%.1f\" }}%)"
|
|
|
|
- alert: SynologyDown
|
|
expr: up{job="snmp-nas"} == 0
|
|
for: 3m
|
|
labels:
|
|
severity: critical
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "Synology NAS SNMP unreachable"
|
|
|
|
- name: infrastructure
|
|
rules:
|
|
- alert: NodeDown
|
|
expr: up{job=~"node-exporter|rke2-nodes|edge-nodes|k8s-node-exporter"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Node {{ $labels.instance }} is down"
|
|
|
|
- alert: HighCPU
|
|
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High CPU on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
|
|
|
- alert: HighMemory
|
|
expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High memory usage on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
|
|
|
- alert: DiskSpaceLow
|
|
expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 85
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
|
|
|
# Puppet agent + service alerts.
|
|
# Mirror of FlowerCore.Notes/scripts/monitoring/alerts.yml `puppet` group
|
|
# so a future migration to in-cluster Prometheus inherits the ruleset.
|
|
# Source-of-truth for the live Podman Prometheus on noc1 is the Notes file.
|
|
# See feedback_monitoring_k8s_target_vs_live_podman.
|
|
- name: puppet
|
|
rules:
|
|
- alert: PuppetAgentReportStale
|
|
expr: puppet_last_run_age_seconds > 7200
|
|
for: 30m
|
|
labels:
|
|
severity: warning
|
|
alert_channel: irc
|
|
annotations:
|
|
summary: "Puppet agent {{ $labels.instance }} hasn't reported in over 2h"
|
|
description: "Last run age: {{ $value | humanizeDuration }}. The puppet agent on {{ $labels.instance }} may be stopped, the node may be powered off, or noc1 may be unreachable from this node."
|
|
runbook: "1. SSH to node (via noc1 jumpbox if needed) 2. sudo systemctl status puppet 3. sudo puppet agent -t --noop to force a run 4. Check r10k: ssh fcadmin@10.0.56.10 'sudo podman logs openvoxserver --tail 50' 5. Verify noc1 reachability: ping puppet.iamworkin.lan"
|
|
|
|
- alert: PuppetAgentReportCritical
|
|
expr: puppet_last_run_age_seconds > 86400
|
|
for: 1h
|
|
labels:
|
|
severity: critical
|
|
alert_channel: irc
|
|
annotations:
|
|
summary: "Puppet agent {{ $labels.instance }} silent for over 24h — node is unmanaged"
|
|
description: "Last run age: {{ $value | humanizeDuration }}. Node {{ $labels.instance }} has not submitted a Puppet report in over 24 hours. Config drift is accumulating — investigate immediately. If intentional (maintenance), add to the exclusion filter or silence in Grafana."
|
|
runbook: "URGENT: 1. Check node power state 2. SSH via noc1 jumpbox: ssh fcadmin@10.0.56.10 then ssh <node> 3. sudo systemctl status puppet 4. sudo systemctl start puppet + sudo puppet agent -t 5. Check for network partitions (VLAN connectivity to 10.0.56.10) 6. If node was recently reimaged: sudo puppet agent -t to re-register with new SSL cert"
|
|
|
|
# Sprint 33 Cx-7 Phase B (2026-05-25 postmortem follow-up):
|
|
# Detects puppet.service in failed state — distinct from PuppetAgentReportStale
|
|
# which catches "agent hasn't run." This catches "systemd gave up restarting it"
|
|
# (CA-verify loop or other fatal exit). Requires node-exporter systemd collector
|
|
# enabled with --collector.systemd. If `node_systemd_unit_state` has no series
|
|
# for a node, the collector is disabled there — flag in postmortem follow-up.
|
|
- alert: PuppetServiceFailed
|
|
expr: node_systemd_unit_state{name="puppet.service",state="failed"} == 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
alert_channel: irc
|
|
annotations:
|
|
summary: "Puppet service failed on {{ $labels.instance }}"
|
|
description: "puppet.service on {{ $labels.instance }} has been in failed state for 5+ minutes. systemd has stopped auto-restarting (CA-verify-loop or other exit). Manual `systemctl status puppet` confirms. Run `sudo systemctl start puppet` to recover; investigate journal for root cause."
|
|
runbook_url: "https://github.com/astoltz/FlowerCore.Notes/blob/master/memory/feedback_puppet_service_dead_after_ca_loop_alert_misreads.md"
|
|
|
|
# K8s pod-state alerts. Require kube-state-metrics scrape (added
|
|
# 2026-04-26 — see scrape_configs above). Would have surfaced the
|
|
# agent-zero ollama-proxy 172x crash-loop instead of letting it
|
|
# silently churn for ~3 days.
|
|
- name: kubernetes-state
|
|
rules:
|
|
- alert: KubeContainerRestartingFrequently
|
|
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "{{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} restarting >5x/hr"
|
|
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
|
|
|
|
- alert: KubeContainerCrashLooping
|
|
expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "{{ $labels.namespace }}/{{ $labels.pod }} crashlooping ({{ $value | printf \"%.0f\" }} restarts/15m)"
|
|
description: "Container {{ $labels.container }} restarted {{ $value | printf \"%.0f\" }} times in 15 minutes — actively crashlooping."
|
|
|
|
- alert: KubePodNotReady
|
|
expr: sum by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Failed|Unknown"}) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "{{ $labels.namespace }}/{{ $labels.pod }} not Ready for >15m"
|
|
description: "Pod is in a non-Running, non-Succeeded phase for over 15 minutes. Common causes: ImagePullBackOff (registry/Nexus down, wrong image tag), pending PVC, scheduling failure (taint/resources)."
|
|
|
|
- alert: KubePodImagePullBackOff
|
|
expr: sum by(namespace, pod) (kube_pod_container_status_waiting_reason{reason=~"ImagePullBackOff|ErrImagePull"}) > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "{{ $labels.namespace }}/{{ $labels.pod }} ImagePullBackOff for >10m"
|
|
description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
|
|
|
|
# Desired and available replica counts disagree for 15m.
# A vector comparison keeps the LEFT operand's sample, so {{ $value }} is the
# desired (spec) replica count; there is no `spec_replicas` label to template.
- alert: KubeDeploymentReplicasMismatch
  expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
    description: "Spec wants {{ $value }} replicas but the available replica count has differed for 15m. Likely a rollout stuck on probe failure, scheduling, or PVC."
|
|
|
|
# Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
|
|
# cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
|
|
# outage (21h) hit because no alert fired on the rising multus working
|
|
# set — only downstream blackbox / Traefik / service alerts. With
|
|
# 1Gi limit (bluejay-infra@eb8693e), 80% = ~800MiB; steady-state
|
|
# runs ~150-250MiB so this only fires when an avalanche starts.
|
|
- alert: MultusMemoryPressure
|
|
expr: |
|
|
container_memory_working_set_bytes{container="kube-multus"}
|
|
/ container_spec_memory_limit_bytes{container="kube-multus"} > 0.8
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m"
|
|
description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)."
|
|
|
|
# Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the
|
|
# operator-leak avalanche pattern BEFORE it cascades into a multus
|
|
# CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder)
|
|
# emitting pods without ownerReferences will accumulate them when
|
|
# the operator crashes. >25 pending pods in any namespace for 30m
|
|
# is the signal to investigate the reconciler.
|
|
- alert: NamespacePendingPodBacklog
|
|
expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25
|
|
for: 30m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m"
|
|
description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade."
|
|
|
|
# Longhorn storage health alerts. Required: longhorn scrape job
|
|
# (added 2026-04-26 — see scrape_configs above). The K8s events
|
|
# for "snapshot becomes not ready to use" are transient lifecycle
|
|
# noise, not actionable — these alerts use the actual Longhorn
|
|
# gauges that reflect persistent state.
|
|
- name: longhorn-storage
|
|
rules:
|
|
# Volume robustness is encoded in the sample VALUE: 0=unknown, 1=healthy,
# 2=degraded, 3=faulted (per the Longhorn metrics reference the series has
# no `robustness` label, so a label selector would match nothing).
# Detached volumes report 0 — normal for unattached PVCs — and therefore
# never satisfy `== 2`.
- alert: LonghornVolumeDegraded
  expr: longhorn_volume_robustness == 2
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "Longhorn volume {{ $labels.volume }} degraded for >15m"
    description: "Volume {{ $labels.volume }} on node {{ $labels.node }} has been degraded (one or more replicas unhealthy) for 15+ minutes. Auto-rebuild may need help — check 'kubectl describe volume.longhorn.io {{ $labels.volume }} -n longhorn-system'."
|
|
|
|
# Robustness is carried in the sample value (3 = faulted), not in a label,
# so the faulted state is selected by comparing the value.
- alert: LonghornVolumeFaulted
  expr: longhorn_volume_robustness == 3
  for: 5m
  labels:
    severity: critical
    alert_channel: thermal_print
  annotations:
    summary: "Longhorn volume {{ $labels.volume }} FAULTED"
    description: "Volume {{ $labels.volume }} on node {{ $labels.node }} is faulted — all replicas unavailable. Data inaccessible. Manual intervention required."
|
|
|
|
# No backup in 36h indicates the daily-backup recurringJob is
|
|
# silently failing. Allows for one missed run + slack.
|
|
# NOTE(review): the expression subtracts
# `longhorn_backup_state * longhorn_backup_actual_size_bytes` from time().
# Going by its name the second factor is a size in bytes, not a Unix
# timestamp, so the difference is presumably not a backup age in seconds.
# Confirm which Longhorn metric carries the backup completion time (and
# whether `longhorn_backup_state` really exposes a `state` label) before
# relying on the 36h threshold.
- alert: LonghornBackupStale
|
|
expr: |
|
|
(time() - max by(volume) (longhorn_backup_state{state="Completed"} * on(backup) group_left() longhorn_backup_actual_size_bytes)) > 36 * 3600
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Longhorn volume {{ $labels.volume }} has no completed backup in >36h"
|
|
description: "Daily backup recurringJob (cron 0 2 * * *) appears to have skipped this volume. Check 'kubectl get backups.longhorn.io -n longhorn-system' and the daily-backup CronJob logs."
|
|
|
|
- alert: LonghornNodeUnhealthy
|
|
expr: longhorn_node_status{condition="ready",condition_reason!=""} == 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Longhorn node {{ $labels.node }} not Ready"
|
|
description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."
|
|
|
|
# ============================================================
|
|
# FC Signage Marquee Performance — Track 3 + 8 (2026-05-06)
|
|
# Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
|
|
# Source-of-truth for the live Podman Prometheus on noc1 is the
|
|
# Notes file; this K8s ConfigMap exists so a future migration to
|
|
# in-cluster Prometheus inherits the ruleset automatically.
|
|
# See feedback_monitoring_k8s_target_vs_live_podman.
|
|
# ============================================================
|
|
- name: fc-signage-marquee
|
|
rules:
|
|
- alert: MarqueeDroppedFramesHigh
|
|
expr: |
|
|
(
|
|
sum by (renderer, phase, node_id) (rate(marquee_dropped_frames_total[5m]))
|
|
/
|
|
sum by (renderer, phase, node_id) (rate(marquee_render_latency_ms_count[5m]))
|
|
) > 0.05
|
|
unless on()
|
|
absent_over_time(marquee_dropped_frames_total[7d])
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
service: signage
|
|
alert_channel: irc
|
|
annotations:
|
|
summary: "Marquee dropped-frame rate >5% on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
|
|
description: "Renderer {{ $labels.renderer }} on {{ $labels.node_id }} drops >5% of frames during {{ $labels.phase }}. Animation visibly stuttery."
|
|
|
|
- alert: MarqueeRenderLatencyP99High
|
|
expr: |
|
|
histogram_quantile(
|
|
0.99,
|
|
sum by (renderer, phase, node_id, le) (rate(marquee_render_latency_ms_bucket[5m]))
|
|
) > 16
|
|
unless on()
|
|
absent_over_time(marquee_render_latency_ms_bucket[7d])
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
service: signage
|
|
alert_channel: irc
|
|
annotations:
|
|
summary: "Marquee render latency p99 > 16ms on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
|
|
description: "Per-frame render latency p99 has exceeded the Pi-class 16ms budget for 10 minutes."
|
|
|
|
- alert: MarqueeAnimationDurationDrift
|
|
expr: |
|
|
abs(
|
|
histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m])))
|
|
-
|
|
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
|
|
)
|
|
/
|
|
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
|
|
> 0.10
|
|
unless on()
|
|
absent_over_time(marquee_animation_duration_ms_bucket[7d])
|
|
for: 15m
|
|
labels:
|
|
severity: info
|
|
service: signage
|
|
alert_channel: irc
|
|
annotations:
|
|
summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
|
|
description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Blackbox Exporter Configuration
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: blackbox-config
|
|
namespace: monitoring
|
|
data:
|
|
blackbox.yml: |
|
|
modules:
|
|
http_2xx:
|
|
prober: http
|
|
timeout: 5s
|
|
http:
|
|
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
|
|
valid_status_codes: [200]
|
|
method: GET
|
|
fail_if_body_not_matches_regexp: []
|
|
preferred_ip_protocol: ip4
|
|
http_ollama:
|
|
prober: http
|
|
timeout: 5s
|
|
http:
|
|
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
|
|
valid_status_codes: [200]
|
|
method: GET
|
|
fail_if_body_not_matches_regexp:
|
|
- '"models"'
|
|
preferred_ip_protocol: ip4
|
|
# https_internal — for Traefik-fronted services with step-ca leaf
|
|
# certs. blackbox does not trust the step-ca root CA, so http_2xx
|
|
# against any *.iamworkin.lan host fails with x509 unknown authority.
|
|
# Redirects + multiple status codes are accepted because some hosts
|
|
# 302 to /login or /scalar.
|
|
https_internal:
|
|
prober: http
|
|
timeout: 10s
|
|
http:
|
|
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
|
|
valid_status_codes: [200, 301, 302, 303, 307, 308]
|
|
method: GET
|
|
follow_redirects: true
|
|
preferred_ip_protocol: ip4
|
|
tls_config:
|
|
insecure_skip_verify: true
|
|
|
|
# =============================================================================
|
|
# ConfigMap: IRC Notify Script
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: irc-notify-script
|
|
namespace: monitoring
|
|
data:
|
|
notify.py: |
|
|
#!/usr/bin/env python3
|
|
"""HTTP->IRC alert relay with thermal-printer DIGEST forwarding.
|
|
|
|
Listens on :9119, posts to #alerts on UnrealIRCd, forwards to Print.Web
|
|
/api/print/alert. Thermal printing is BATCHED into hourly digests by
|
|
default so the printer no longer spam-fires per Grafana webhook.
|
|
|
|
Routing (per Grafana webhook alert):
|
|
- IRC: always per-event (operator likes the stream)
|
|
- Thermal printer:
|
|
* label alert_channel=thermal_print_immediate -> enqueue, and flush
|
|
the digest right away on first arrival of a new fingerprint
|
|
* severity in {critical,disaster,page} OR label
|
|
alert_channel=thermal_print -> enqueue into hourly digest
|
|
* everything else -> IRC only
|
|
- RESOLVED webhooks remove the alert from the digest buffer
|
|
|
|
Env vars (defaults preserve old behavior on first deploy):
|
|
THERMAL_PRINT_ENABLED default "true" - master kill switch
|
|
BATCH_INTERVAL_MIN default "60" - minutes between digest prints
|
|
BATCH_MAX_PENDING default "50" - force-flush threshold
|
|
|
|
HTTP surface:
|
|
POST / - Grafana webhook entry
|
|
POST /flush - manual digest flush (idempotent)
|
|
GET / - status + config + buffer depth + stats
|
|
"""
|
|
import json, os, socket, sys, threading, time
|
|
from collections import defaultdict
|
|
from datetime import datetime, timezone
|
|
from http.server import HTTPServer, BaseHTTPRequestHandler
|
|
from urllib.request import Request, urlopen
|
|
|
|
# --- Thermal-printer batching configuration (read once at import) ---------
# Master switch: enabled only when the variable lowercases to exactly "true".
THERMAL_PRINT_ENABLED = os.environ.get("THERMAL_PRINT_ENABLED", "true").lower() == "true"
|
|
# Minutes between scheduled digest prints; int() raises ValueError at startup
# if the variable is not numeric.
BATCH_INTERVAL_MIN = int(os.environ.get("BATCH_INTERVAL_MIN", "60"))
|
|
# Buffer depth at which digest_loop flushes without waiting for the interval.
BATCH_MAX_PENDING = int(os.environ.get("BATCH_MAX_PENDING", "50"))
|
|
|
|
# --- IRC and Print.Web endpoints -------------------------------------------
IRC_HOST = os.environ.get("IRC_HOST", "unrealircd.irc.svc")
|
|
IRC_PORT = int(os.environ.get("IRC_PORT", "6667"))
|
|
IRC_NICK = os.environ.get("IRC_NICK", "grafana-bot")
|
|
IRC_CHANNEL = os.environ.get("IRC_CHANNEL", "#alerts")
|
|
PRINT_WEB_URL = os.environ.get("PRINT_WEB_URL", "http://10.0.57.16:5200/api/print/alert")
|
|
|
|
# --- Shared mutable state ----------------------------------------------------
# Lock taken around reads/writes of _buffer by add_to_digest,
# build_digest_payload, flush_digest and the GET status handler.
_buffer_lock = threading.Lock()
|
|
_buffer = {} # fingerprint -> {"alert": dict, "first_seen": float, "last_seen": float}
|
|
# Initialised to process start, so the first interval-based flush happens
# BATCH_INTERVAL_MIN minutes after startup.
_last_flush_time = time.time()
|
|
# Plain counters exposed verbatim under "stats" by the GET status handler.
_stats = {"webhooks_received": 0, "irc_sent": 0, "print_immediate": 0,
|
|
"digest_flushed": 0, "buffer_dedup": 0, "buffer_added": 0,
|
|
"buffer_resolved": 0, "started_at": time.time()}
|
|
|
|
def send_irc(message):
    """Post ``message`` to IRC_CHANNEL over a fresh, short-lived IRC connection.

    Registers as IRC_NICK, waits up to 10 seconds for the server's 001
    welcome (answering PINGs meanwhile), joins the channel, sends every
    non-blank line of ``message`` as its own PRIVMSG, then QUITs.

    Args:
        message: text to deliver; may span several lines.

    Returns:
        True once all lines were written to the socket; False when
        registration did not complete or any error occurred (the error is
        logged to stderr). The socket is closed on every path.
    """
    sock = None
    try:
        sock = socket.create_connection((IRC_HOST, IRC_PORT), timeout=15)
        sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
        sock.sendall(f"USER {IRC_NICK} 0 * :Grafana Alert Bot\r\n".encode())

        registered = False
        deadline = time.time() + 10
        pending = ""
        while not registered and time.time() < deadline:
            try:
                data = sock.recv(4096).decode("utf-8", errors="replace")
            except socket.timeout:
                break
            if not data:
                break
            pending += data
            # Consume only complete lines and keep the trailing fragment, so a
            # PING is answered exactly once instead of on every later recv.
            *complete, pending = pending.split("\r\n")
            for line in complete:
                if line.startswith("PING"):
                    # A bare "PING" without a token must not raise IndexError.
                    token = line.split(" ", 1)[1] if " " in line else ""
                    sock.sendall(f"PONG {token}\r\n".encode())
                elif " 001 " in line:
                    registered = True
        if not registered:
            return False

        sock.sendall(f"JOIN {IRC_CHANNEL}\r\n".encode())
        time.sleep(0.5)
        try:
            sock.recv(4096)  # drain the JOIN burst; its content is irrelevant
        except socket.timeout:
            pass  # nothing to drain is not a reason to drop the alert

        # splitlines() also breaks on bare CR, so alert text cannot smuggle a
        # second IRC command into a PRIVMSG line.
        for line in message.splitlines():
            if line.strip():
                sock.sendall(f"PRIVMSG {IRC_CHANNEL} :{line}\r\n".encode())
                time.sleep(0.3)
        time.sleep(0.5)
        sock.sendall(b"QUIT :alert delivered\r\n")
        _stats["irc_sent"] += 1
        return True
    except Exception as e:
        print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
        return False
    finally:
        if sock is not None:
            sock.close()
|
|
|
|
def post_thermal(payload, kind):
    """POST ``payload`` as JSON to PRINT_WEB_URL.

    Args:
        payload: dict that is JSON-serialised as the request body; its
            "title" key is used for log lines only.
        kind: label for logs and stats; "immediate" increments the
            print_immediate counter.

    Returns:
        True when the request completed without raising, False when thermal
        printing is disabled or the request failed (logged to stderr).
    """
    if not THERMAL_PRINT_ENABLED:
        print(f"[irc-notify] thermal disabled; skip {kind} ({payload.get('title','?')[:40]})", file=sys.stderr)
        return False
    try:
        req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
                      headers={"Content-Type": "application/json"}, method="POST")
        # The response body is not needed; the context manager closes the
        # connection instead of leaving it to the garbage collector.
        with urlopen(req, timeout=10):
            pass
        if kind == "immediate":
            _stats["print_immediate"] += 1
        print(f"[irc-notify] thermal {kind} sent: {payload.get('title','?')[:50]}", file=sys.stderr)
        return True
    except Exception as e:
        print(f"[irc-notify] thermal {kind} failed: {e}", file=sys.stderr)
        return False
|
|
|
|
def fingerprint_of(alert):
    """Return a stable identity string for ``alert``.

    The alert's own "fingerprint" is used when it is non-empty. Otherwise the
    key is "<alertname>/<namespace>/<target>", where target is the first
    non-empty label among pod, instance, deployment, statefulset, namespace.
    """
    supplied = alert.get("fingerprint", "")
    if supplied:
        return supplied

    labels = alert.get("labels", {})
    target = ""
    for key in ("pod", "instance", "deployment", "statefulset", "namespace"):
        value = labels.get(key)
        if value:
            target = value
            break

    alertname = labels.get('alertname', '?')
    namespace = labels.get('namespace', '')
    return f"{alertname}/{namespace}/{target}"
|
|
|
|
def is_critical(alert):
    """True when the severity label is critical, disaster or page (any case)."""
    severity = alert.get("labels", {}).get("severity", "")
    return severity.lower() in {"critical", "disaster", "page"}
|
|
|
|
def is_immediate_label(alert):
    """True when the alert is labelled alert_channel=thermal_print_immediate."""
    channel = alert.get("labels", {}).get("alert_channel")
    return channel == "thermal_print_immediate"
|
|
|
|
def is_batched_label(alert):
    """True when the alert is labelled alert_channel=thermal_print."""
    channel = alert.get("labels", {}).get("alert_channel")
    return channel == "thermal_print"
|
|
|
|
def add_to_digest(alert):
    """Record ``alert`` in the digest buffer, keyed by its fingerprint.

    A "resolved" alert removes its entry; a fingerprint that is already
    buffered only has its stored alert and last_seen refreshed.

    Returns:
        True only when a new fingerprint was inserted. False for a
        resolution, a dedup refresh, or when thermal printing is disabled.
    """
    if not THERMAL_PRINT_ENABLED:
        return False

    key = fingerprint_of(alert)
    resolved = alert.get("status", "firing").lower() == "resolved"

    with _buffer_lock:
        existing = _buffer.get(key)

        if resolved:
            if existing is not None:
                del _buffer[key]
                _stats["buffer_resolved"] += 1
            return False

        if existing is not None:
            # Same alert seen again: keep first_seen, refresh the rest.
            existing["last_seen"] = time.time()
            existing["alert"] = alert
            _stats["buffer_dedup"] += 1
            return False

        _buffer[key] = {"alert": alert, "first_seen": time.time(), "last_seen": time.time()}
        _stats["buffer_added"] += 1
        return True
|
|
|
|
def build_digest_payload():
    """Render the current buffer as one Print.Web alert payload.

    Buffered alerts are grouped by alertname (sorted). Each group becomes one
    line showing its distinct severities, its size and up to five targets,
    with "(+N)" appended when more exist.

    Returns:
        The payload dict, or None when the buffer is empty.
    """
    with _buffer_lock:
        entries = list(_buffer.values())
    if not entries:
        return None

    def target_of(entry):
        # First non-empty identifying label, "?" when none is present.
        labels = entry["alert"].get("labels", {})
        for key in ("pod", "instance", "deployment", "statefulset", "namespace"):
            if labels.get(key):
                return labels[key]
        return "?"

    grouped = defaultdict(list)
    for entry in entries:
        alertname = entry["alert"].get("labels", {}).get("alertname", "Unknown")
        grouped[alertname].append(entry)

    lines = []
    for name in sorted(grouped):
        group = grouped[name]
        targets = [target_of(entry) for entry in group[:5]]
        more = f" (+{len(group)-5})" if len(group) > 5 else ""
        sevs = sorted({entry["alert"].get("labels", {}).get("severity", "warning") for entry in group})
        lines.append(f"[{'/'.join(sevs)}] {name} x{len(group)}: {', '.join(targets)}{more}")

    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    title = f"Alert digest: {len(entries)} firing"
    sections = [f"=== {title} ===", f"as of {now}", ""]
    sections.extend(lines)
    sections.extend([
        "",
        "Stream: #alerts (IRC) | Triage: grafana-noc1.iamworkin.lan",
        "Force-flush: POST irc-notify.monitoring.svc:9119/flush",
    ])
    body = "\n".join(sections)

    return {"title": title, "severity": "Warning", "host": "monitoring",
            "message": body, "eventId": f"digest-{int(time.time())}",
            "source": "Grafana digest", "status": "PROBLEM", "acknowledged": False}
|
|
|
|
def flush_digest():
    """Print the buffered alerts as one digest and drop them from the buffer.

    Only entries that existed before the digest was built are removed.
    The POST to Print.Web can take up to its 10 s timeout, and alerts that
    arrive during it belong in the NEXT digest; clearing the whole buffer
    afterwards would discard them without ever printing them.

    As before, the flushed entries are dropped even when the POST fails, so a
    dead printer cannot make the buffer grow without bound.

    Returns:
        True when a digest was sent, False when the buffer was empty or the
        POST failed.
    """
    # Keep the entry objects themselves: add_to_digest refreshes a buffered
    # entry in place, whereas a resolve followed by a re-fire creates a new
    # dict, which must survive this flush.
    with _buffer_lock:
        snapshot = dict(_buffer)

    payload = build_digest_payload()
    if payload is None:
        print("[irc-notify] flush: buffer empty, no digest sent", file=sys.stderr)
        return False

    sent = post_thermal(payload, "digest")

    with _buffer_lock:
        for fp, entry in snapshot.items():
            if _buffer.get(fp) is entry:
                del _buffer[fp]

    if sent:
        _stats["digest_flushed"] += 1
    return sent
|
|
|
|
def digest_loop():
    """Background loop that decides every 15 seconds whether to flush.

    A flush happens when BATCH_INTERVAL_MIN minutes have passed since the last
    one, or otherwise when the buffer holds at least BATCH_MAX_PENDING entries.
    Any exception is logged and followed by a 60 second pause; the loop never
    returns.
    """
    global _last_flush_time
    while True:
        try:
            now = time.time()
            interval_due = (now - _last_flush_time) >= BATCH_INTERVAL_MIN * 60
            if interval_due:
                notice = f"[irc-notify] digest tick: interval reached ({BATCH_INTERVAL_MIN}m); buffer={len(_buffer)}"
            elif len(_buffer) >= BATCH_MAX_PENDING:
                notice = f"[irc-notify] digest tick: buffer full ({len(_buffer)}); force flush"
            else:
                notice = None

            if notice is not None:
                print(notice, file=sys.stderr)
                flush_digest()
                # The clock restarts from the tick time, not from flush completion.
                _last_flush_time = now
            time.sleep(15)
        except Exception as e:
            print(f"[irc-notify] digest loop error: {e}", file=sys.stderr)
            time.sleep(60)
|
|
|
|
class Handler(BaseHTTPRequestHandler):
|
|
def do_POST(self):
|
|
if self.path == "/flush":
|
|
ok = flush_digest()
|
|
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
|
|
self.wfile.write(json.dumps({"flushed": ok, "buffer_after": len(_buffer)}).encode())
|
|
return
|
|
_stats["webhooks_received"] += 1
|
|
length = int(self.headers.get("Content-Length", 0))
|
|
body = json.loads(self.rfile.read(length)) if length else {}
|
|
for alert in body.get("alerts", []):
|
|
status = alert.get("status", "unknown").upper()
|
|
labels = alert.get("labels", {})
|
|
name = labels.get("alertname", "Unknown")
|
|
summary = alert.get("annotations", {}).get("summary", "")
|
|
desc = alert.get("annotations", {}).get("description", "")
|
|
severity = labels.get("severity", "")
|
|
icon = "\x0304[FIRING]\x03" if status == "FIRING" else "\x0303[RESOLVED]\x03"
|
|
sev_tag = f" [{severity}]" if severity else ""
|
|
msg = f"{icon}{sev_tag} {name}: {summary}"
|
|
if desc: msg += f"\n {desc}"
|
|
send_irc(msg)
|
|
# Thermal routing — EVERYTHING (including criticals) goes into
|
|
# the hourly digest. Only the explicit `alert_channel=thermal_print_immediate`
|
|
# label bypasses, and even that flushes-the-current-digest rather
|
|
# than printing a standalone job, so the same fingerprint can't
|
|
# spam the printer per webhook cycle.
|
|
if status == "RESOLVED":
|
|
add_to_digest(alert) # removes from buffer
|
|
continue
|
|
if is_immediate_label(alert):
|
|
# Explicit opt-in for "paper this NOW" — first arrival of a
|
|
# new fingerprint triggers an immediate digest flush; repeat
|
|
# webhooks for the same fingerprint dedupe in the buffer
|
|
# until the next interval or until the alert resolves.
|
|
new_in_buffer = add_to_digest(alert)
|
|
if new_in_buffer:
|
|
global _last_flush_time
|
|
flush_digest()
|
|
_last_flush_time = time.time()
|
|
elif is_critical(alert) or is_batched_label(alert):
|
|
add_to_digest(alert)
|
|
# else: IRC-only (warnings without thermal_print label)
|
|
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
|
|
self.wfile.write(b'{"status":"ok"}')
|
|
|
|
def do_GET(self):
|
|
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
|
|
with _buffer_lock:
|
|
alertnames = sorted({it["alert"].get("labels", {}).get("alertname", "?") for it in _buffer.values()})
|
|
depth = len(_buffer)
|
|
info = {
|
|
"service": "irc-notify",
|
|
"config": {"thermal_print_enabled": THERMAL_PRINT_ENABLED,
|
|
"batch_interval_min": BATCH_INTERVAL_MIN,
|
|
"batch_max_pending": BATCH_MAX_PENDING,
|
|
"irc_target": f"{IRC_HOST}:{IRC_PORT} {IRC_CHANNEL}",
|
|
"print_web_url": PRINT_WEB_URL},
|
|
"buffer": {"depth": depth, "alertnames": alertnames,
|
|
"seconds_since_last_flush": int(time.time() - _last_flush_time),
|
|
"seconds_until_next_flush": max(0, int(BATCH_INTERVAL_MIN*60 - (time.time() - _last_flush_time)))},
|
|
"stats": _stats,
|
|
}
|
|
self.wfile.write(json.dumps(info, indent=2).encode())
|
|
|
|
def log_message(self, format, *args):
    """Write one log line to stderr, prefixed with "[irc-notify]".

    Presumably overrides BaseHTTPRequestHandler.log_message (the class
    header is outside this view - confirm), which is called with a
    %-style template and its values.

    Args:
        format: %-style template string.
        *args: values interpolated into ``format``.

    The full formatted message is logged. Printing only ``args[0]`` lost
    the status code and size on access lines, reduced error lines to their
    first value, and raised IndexError when no args were supplied.
    """
    # With no args, use the template verbatim so a literal "%" in it
    # cannot trigger a formatting error.
    message = format % args if args else format
    print(f"[irc-notify] {message}", file=sys.stderr)
|
|
|
|
if __name__ == "__main__":
    # Run digest_loop in a daemon thread so it never blocks interpreter
    # exit (presumably this is the periodic digest flusher - confirm).
    flusher = threading.Thread(target=digest_loop, daemon=True)
    flusher.start()

    # Listen on every interface, port 9119.
    listen_addr = ("0.0.0.0", 9119)
    server = HTTPServer(listen_addr, Handler)

    # One-line startup banner on stderr summarising the effective config.
    banner = f"[irc-notify] :9119 -> IRC {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} | thermal={'ON' if THERMAL_PRINT_ENABLED else 'OFF'} | digest={BATCH_INTERVAL_MIN}m max={BATCH_MAX_PENDING}"
    print(banner, file=sys.stderr)

    # Blocks until the process is stopped.
    server.serve_forever()
|
|
|
|
# =============================================================================
|
|
# SNMP Exporter Auth Secret
|
|
# =============================================================================
|
|
# The full snmp.yml (~2MB, auto-generated) exceeds the 1MB ConfigMap limit.
|
|
# Strategy: store SNMP auth credentials in a Secret, and use an init container
|
|
# to download the full snmp.yml from Gitea, or split into multiple ConfigMaps.
|
|
# For now, we mount a minimal auth-only config and rely on the default modules
|
|
# bundled in the snmp-exporter image. To use custom modules, apply
|
|
# snmp-config.yaml separately (see comments in that file).
|
|
---
|
|
apiVersion: v1
kind: Secret
metadata:
  name: snmp-auth
  namespace: monitoring
type: Opaque
# NOTE(review): these credentials are stored in plaintext in this manifest.
# The Grafana credentials in this file are synced from 1Password through a
# OnePasswordItem; consider sourcing these values the same way.
stringData:
  # SNMP v2 community string used by prometheus scrape configs
  SNMP_COMMUNITY_BLUEJAY: "bluejay_monitor"
  # SNMP v3 user name and, judging by the key names, its authentication
  # and privacy passphrases (confirm against the exporter config).
  SNMP_V3_USER: "bluejay_snmpv3"
  SNMP_V3_AUTH_PASS: "BlueJay-SNMP-Auth-2026"
  SNMP_V3_PRIV_PASS: "BlueJay-SNMP-Priv-2026"
|
|
|
|
# =============================================================================
|
|
# Grafana Credentials — synced from 1Password via Operator
|
|
# =============================================================================
|
|
# 1Password vault: IAmWorkin > "Grafana"
|
|
# Creates K8s Secret "grafana-credentials" with fields: username, password
|
|
# Grafana Deployment uses secretKeyRef to map username→GF_SECURITY_ADMIN_USER, password→GF_SECURITY_ADMIN_PASSWORD
|
|
---
|
|
apiVersion: onepassword.com/v1
kind: OnePasswordItem
metadata:
  namespace: monitoring
  name: grafana-credentials
spec:
  # Vault "IAmWorkin", item "Grafana".
  itemPath: "vaults/IAmWorkin/items/Grafana"
|
|
|
|
# =============================================================================
|
|
# RBAC: ServiceAccount + ClusterRole for Prometheus K8s SD
|
|
# =============================================================================
|
|
---
|
|
# Identity the Prometheus pod runs as; bound to the "prometheus"
# ClusterRole by the ClusterRoleBinding of the same name.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: monitoring
  name: prometheus
|
|
---
|
|
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Read-only access to the core objects listed below.
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs:
      - get
      - list
      - watch
  # Read-only access to Ingress objects in both listed API groups.
  - apiGroups:
      - extensions
      - networking.k8s.io
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  # Permission to GET the non-resource /metrics URL.
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
|
|
---
|
|
# Grants the "prometheus" ClusterRole to the "prometheus" ServiceAccount
# in the monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
subjects:
  - kind: ServiceAccount
    namespace: monitoring
    name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
|
|
|
|
# =============================================================================
|
|
# PVC: Prometheus Data (10Gi, Longhorn)
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: monitoring
  name: prometheus-data
spec:
  storageClassName: longhorn
  # Single-node read/write access.
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
|
|
|
|
# =============================================================================
|
|
# PVC: Grafana Data (2Gi, Longhorn)
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: monitoring
  name: grafana-data
spec:
  storageClassName: longhorn
  # Single-node read/write access.
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 2Gi
|
|
|
|
# =============================================================================
|
|
# Deployment: Prometheus
|
|
# =============================================================================
|
|
---
|
|
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  replicas: 1
  # Recreate stops the old pod before starting the new one; presumably
  # chosen because the data PVC is ReadWriteOnce (confirm).
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      securityContext:
        fsGroup: 65534  # nobody
        runAsUser: 65534
        runAsGroup: 65534
      containers:
        - name: prometheus
          # NOTE(review): "latest" is a floating tag, so a pod restart can
          # silently change the Prometheus version; consider pinning.
          image: docker.io/prom/prometheus:latest
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            # NOTE(review): retention is time-based only while the data
            # PVC requests 10Gi; consider also setting
            # --storage.tsdb.retention.size so the volume cannot fill.
            - "--storage.tsdb.retention.time=90d"
            # Enables the HTTP reload/quit lifecycle endpoints.
            - "--web.enable-lifecycle"
          ports:
            - containerPort: 9090
              name: http
          volumeMounts:
            # Mounted as a directory rather than three subPath files:
            # subPath mounts never receive ConfigMap updates, so a
            # lifecycle reload could not pick up edited config or rules.
            - name: config
              mountPath: /etc/prometheus
              readOnly: true
            - name: data
              mountPath: /prometheus
          resources:
            requests:
              cpu: 200m
              memory: 512Mi
            limits:
              cpu: "1"
              memory: 2Gi
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 15
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: config
          configMap:
            name: prometheus-config
            # Project exactly the three files that were previously mounted
            # one by one, at the same paths under /etc/prometheus.
            items:
              - key: prometheus.yml
                path: prometheus.yml
              - key: alerts.yml
                path: alerts.yml
              - key: recording-rules.yml
                path: recording-rules.yml
        - name: data
          persistentVolumeClaim:
            claimName: prometheus-data
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Grafana Dashboard Provider
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: monitoring
  name: grafana-dashboard-provider
data:
  # Grafana dashboard provisioning file: a single file-based provider
  # reading from /var/lib/grafana/dashboards, rescanned every 30 seconds.
  default.yml: |
    apiVersion: 1
    providers:
      - name: 'default'
        orgId: 1
        folder: ''
        type: file
        disableDeletion: false
        updateIntervalSeconds: 30
        options:
          path: /var/lib/grafana/dashboards
          foldersFromFilesStructure: true
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Grafana Dashboards (AI Stack Health)
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboards
|
|
namespace: monitoring
|
|
data:
|
|
ai-stack-health.json: |
|
|
{
|
|
"id": null,
|
|
"panels": [
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"0": { "color": "red", "text": "DOWN" },
|
|
"1": { "color": "green", "text": "UP" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
],
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "red", "value": null },
|
|
{ "color": "green", "value": 1 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
|
|
"id": 1,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_success{job=\"probe-ollama-local\"}",
|
|
"legendFormat": "Status"
|
|
}
|
|
],
|
|
"title": "Ollama (Local)",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"0": { "color": "red", "text": "DOWN" },
|
|
"1": { "color": "green", "text": "UP" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
],
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "red", "value": null },
|
|
{ "color": "green", "value": 1 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
|
|
"id": 2,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_success{job=\"probe-ollama-edge1\"}",
|
|
"legendFormat": "Status"
|
|
}
|
|
],
|
|
"title": "Ollama (Edge1)",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"0": { "color": "red", "text": "DOWN" },
|
|
"1": { "color": "green", "text": "UP" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
],
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "red", "value": null },
|
|
{ "color": "green", "value": 1 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
|
|
"id": 3,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_success{job=\"probe-agentzero-local\"}",
|
|
"legendFormat": "Status"
|
|
}
|
|
],
|
|
"title": "Agent Zero (Local)",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"0": { "color": "red", "text": "DOWN" },
|
|
"1": { "color": "green", "text": "UP" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
],
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "red", "value": null },
|
|
{ "color": "green", "value": 1 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
|
|
"id": 4,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_success{job=\"probe-agentzero-nuc\"}",
|
|
"legendFormat": "Status"
|
|
}
|
|
],
|
|
"title": "Agent Zero (NUC)",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": { "fillOpacity": 15, "lineWidth": 2 },
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 1 },
|
|
{ "color": "red", "value": 3 }
|
|
]
|
|
},
|
|
"unit": "s"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
|
|
"id": 5,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_duration_seconds{service=\"ollama\"}",
|
|
"legendFormat": "{{ deployment }}"
|
|
}
|
|
],
|
|
"title": "Ollama Response Time",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": { "fillOpacity": 15, "lineWidth": 2 },
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 1 },
|
|
{ "color": "red", "value": 3 }
|
|
]
|
|
},
|
|
"unit": "s"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
|
|
"id": 6,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_duration_seconds{service=\"agent-zero\"}",
|
|
"legendFormat": "{{ deployment }}"
|
|
}
|
|
],
|
|
"title": "Agent Zero Response Time",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": { "fillOpacity": 30, "lineWidth": 1, "stacking": { "mode": "none" } },
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"0": { "text": "DOWN" },
|
|
"1": { "text": "UP" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
],
|
|
"max": 1,
|
|
"min": 0,
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "red", "value": null },
|
|
{ "color": "green", "value": 1 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 12 },
|
|
"id": 7,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_success{service=\"ollama\"}",
|
|
"legendFormat": "Ollama ({{ deployment }})"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_success{service=\"agent-zero\"}",
|
|
"legendFormat": "Agent Zero ({{ deployment }})"
|
|
}
|
|
],
|
|
"title": "Uptime History",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": { "fillOpacity": 10, "lineWidth": 2 },
|
|
"max": 100,
|
|
"min": 0,
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 75 },
|
|
{ "color": "red", "value": 90 }
|
|
]
|
|
},
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
|
|
"id": 8,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "100 - (avg(rate(node_cpu_seconds_total{instance=\"edge1\",mode=\"idle\"}[5m])) * 100)",
|
|
"legendFormat": "CPU %"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "(1 - node_memory_MemAvailable_bytes{instance=\"edge1\"} / node_memory_MemTotal_bytes{instance=\"edge1\"}) * 100",
|
|
"legendFormat": "Memory %"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "(1 - node_filesystem_avail_bytes{instance=\"edge1\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=\"edge1\",mountpoint=\"/\"}) * 100",
|
|
"legendFormat": "Disk %"
|
|
}
|
|
],
|
|
"title": "Edge1 Pi 5 \u2014 CPU / Memory / Disk",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": { "fillOpacity": 10, "lineWidth": 2 },
|
|
"unit": "s"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
|
|
"id": 9,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_dns_lookup_time_seconds",
|
|
"legendFormat": "{{ job }}"
|
|
}
|
|
],
|
|
"title": "Probe DNS Lookup Time",
|
|
"type": "timeseries"
|
|
}
|
|
],
|
|
"refresh": "30s",
|
|
"schemaVersion": 39,
|
|
"tags": ["ai", "ollama", "agent-zero", "blue-jay"],
|
|
"time": { "from": "now-1h", "to": "now" },
|
|
"timezone": "browser",
|
|
"title": "AI Stack Health",
|
|
"uid": "ai-stack-health",
|
|
"version": 1
|
|
}
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Grafana Dashboard — Edge Nodes
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboard-edge-nodes
|
|
namespace: monitoring
|
|
data:
|
|
bluejay-edge-nodes.json: |
|
|
{
|
|
"id": null,
|
|
"panels": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"0": {
|
|
"color": "red",
|
|
"text": "DOWN"
|
|
},
|
|
"1": {
|
|
"color": "green",
|
|
"text": "UP"
|
|
}
|
|
},
|
|
"type": "value"
|
|
}
|
|
]
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
|
|
"targets": [
|
|
{
|
|
"expr": "up{instance=~\"edge.*\"}",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Edge Node Status",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge1.*\"}[5m])) * 100)",
|
|
"legendFormat": "CPU %",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "node_load1{instance=~\"edge1.*\"}",
|
|
"legendFormat": "Load 1m",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"title": "edge1 (Pi5 + Hailo) CPU",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge2.*\"}[5m])) * 100)",
|
|
"legendFormat": "CPU %",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "node_load1{instance=~\"edge2.*\"}",
|
|
"legendFormat": "Load 1m",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"title": "edge2 (Pi4) CPU",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
|
|
"targets": [
|
|
{
|
|
"expr": "(1 - node_memory_MemAvailable_bytes{instance=~\"edge.*\"} / node_memory_MemTotal_bytes{instance=~\"edge.*\"}) * 100",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Edge Memory Usage",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (node_filesystem_avail_bytes{instance=~\"edge.*\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=~\"edge.*\",mountpoint=\"/\"} * 100)",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Edge Disk Usage",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "celsius"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
|
|
"targets": [
|
|
{
|
|
"expr": "node_hwmon_temp_celsius{instance=~\"edge.*\"}",
|
|
"legendFormat": "{{instance}} {{chip}} {{sensor}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Edge CPU Temperature",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "bps"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
|
|
"targets": [
|
|
{
|
|
"expr": "rate(node_network_receive_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8",
|
|
"legendFormat": "{{instance}} {{device}} RX",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "-rate(node_network_transmit_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8",
|
|
"legendFormat": "{{instance}} {{device}} TX",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"title": "Edge Network Traffic",
|
|
"type": "timeseries"
|
|
}
|
|
],
|
|
"refresh": "30s",
|
|
"schemaVersion": 40,
|
|
"tags": ["bluejay", "edge"],
|
|
"timezone": "browser",
|
|
"title": "BlueJay Edge Nodes",
|
|
"uid": "7e1603b9-e918-4b3f-a22b-163132fd5cee"
|
|
}
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Grafana Dashboard — Network Overview
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboard-network-overview
|
|
namespace: monitoring
|
|
data:
|
|
bluejay-network-overview.json: |
|
|
{
|
|
"id": null,
|
|
"panels": [
|
|
{
  "datasource": {
    "type": "prometheus",
    "uid": "prometheus"
  },
  "fieldConfig": {
    "defaults": {
      "noValue": "0",
      "thresholds": {
        "steps": [
          { "color": "green", "value": null }
        ]
      }
    }
  },
  "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
  "targets": [
    {
      "expr": "count(up == 1)",
      "legendFormat": "Up",
      "refId": "A"
    },
    {
      "expr": "count(up == 0)",
      "legendFormat": "Down",
      "refId": "B"
    }
  ],
  "title": "Target Health",
  "type": "stat"
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 4,
|
|
"min": 0,
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 2 },
|
|
{ "color": "red", "value": 3 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 4 },
|
|
"targets": [
|
|
{
|
|
"expr": "laLoad{instance=\"10.0.56.1\", laNames=\"Load-1\"}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "pfSense CPU Load (1m)",
|
|
"type": "gauge"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 70 },
|
|
{ "color": "red", "value": 90 }
|
|
]
|
|
},
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 6, "w": 6, "x": 6, "y": 4 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (memAvailReal{instance=\"10.0.56.1\"} / memTotalReal{instance=\"10.0.56.1\"} * 100)",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "pfSense Memory Used %",
|
|
"type": "gauge"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 4 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=\"noc1\"}[5m])) * 100)",
|
|
"legendFormat": "CPU %",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "noc1 CPU Usage",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 },
|
|
"targets": [
|
|
{
|
|
"expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Node Memory Usage",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 70 },
|
|
{ "color": "red", "value": 90 }
|
|
]
|
|
},
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Node Disk Usage %",
|
|
"type": "bargauge"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "bps"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 },
|
|
"targets": [
|
|
{
|
|
"expr": "rate(node_network_receive_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8",
|
|
"legendFormat": "{{instance}} {{device}} RX",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "-rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8",
|
|
"legendFormat": "{{instance}} {{device}} TX",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"title": "Network Traffic",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 26 },
|
|
"targets": [
|
|
{
|
|
"expr": "up",
|
|
"format": "table",
|
|
"instant": true,
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Prometheus Targets",
|
|
"type": "table"
|
|
}
|
|
],
|
|
"refresh": "30s",
|
|
"schemaVersion": 40,
|
|
"tags": ["bluejay", "network"],
|
|
"timezone": "browser",
|
|
"title": "BlueJay Network Overview",
|
|
"uid": "b83a122c-6eb5-4fae-a632-77cbf753ad05"
|
|
}
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Grafana Dashboard — Operations
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboard-operations
|
|
namespace: monitoring
|
|
data:
|
|
bluejay-operations.json: |
|
|
{
|
|
"annotations": {
|
|
"list": []
|
|
},
|
|
"id": null,
|
|
"panels": [
|
|
{
|
|
"collapsed": false,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
|
"title": "Infrastructure Overview",
|
|
"type": "row"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"noValue": "0",
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "green", "value": null }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 8, "x": 0, "y": 1 },
|
|
"targets": [
|
|
{
|
|
"expr": "count(up == 1)",
|
|
"legendFormat": "Up",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "count(up == 0)",
|
|
"legendFormat": "Down",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"title": "All Targets Up/Down",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "alexanderzobnin-zabbix-datasource",
|
|
"uid": "bffjila3zkdfka"
|
|
},
|
|
"gridPos": { "h": 8, "w": 16, "x": 8, "y": 1 },
|
|
"targets": [
|
|
{
|
|
"application": { "filter": "" },
|
|
"group": { "filter": "/.*/" },
|
|
"host": { "filter": "/.*/" },
|
|
"queryType": 5,
|
|
"refId": "A",
|
|
"trigger": { "filter": "/.*/" }
|
|
}
|
|
],
|
|
"title": "Zabbix Active Problems",
|
|
"type": "alexanderzobnin-zabbix-triggers-panel"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 },
|
|
"targets": [
|
|
{
|
|
"expr": "node_load1{instance=\"noc1\"}",
|
|
"legendFormat": "1m",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "node_load5{instance=\"noc1\"}",
|
|
"legendFormat": "5m",
|
|
"refId": "B"
|
|
},
|
|
{
|
|
"expr": "node_load15{instance=\"noc1\"}",
|
|
"legendFormat": "15m",
|
|
"refId": "C"
|
|
}
|
|
],
|
|
"title": "noc1 Load Average",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"collapsed": false,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 },
|
|
"title": "Kubernetes & Services",
|
|
"type": "row"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {},
|
|
"overrides": [
|
|
{
|
|
"matcher": { "id": "byName", "options": "Value" },
|
|
"properties": [
|
|
{
|
|
"id": "mappings",
|
|
"value": [
|
|
{
|
|
"options": {
|
|
"0": { "color": "red", "text": "DOWN" },
|
|
"1": { "color": "green", "text": "UP" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
]
|
|
},
|
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 },
|
|
"targets": [
|
|
{
|
|
"expr": "up",
|
|
"format": "table",
|
|
"instant": true,
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "K8s Services Uptime (Prometheus Targets)",
|
|
"type": "table"
|
|
},
|
|
{
|
|
"collapsed": false,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 },
|
|
"title": "Network & SNMP",
|
|
"type": "row"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "bps"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 },
|
|
"targets": [
|
|
{
|
|
"expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8",
|
|
"legendFormat": "WAN In",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8",
|
|
"legendFormat": "WAN Out",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"title": "pfSense WAN Traffic",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "bps"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 },
|
|
"targets": [
|
|
{
|
|
"expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8",
|
|
"legendFormat": "{{ifAlias}} In",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8",
|
|
"legendFormat": "{{ifAlias}} Out",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"title": "pfSense LAN Traffic",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 },
|
|
"targets": [
|
|
{
|
|
"expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "All Nodes Memory",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "All Nodes Disk",
|
|
"type": "timeseries"
|
|
}
|
|
],
|
|
"refresh": "1m",
|
|
"schemaVersion": 40,
|
|
"tags": ["bluejay", "operations", "zabbix"],
|
|
"timezone": "browser",
|
|
"title": "BlueJay Operations",
|
|
"uid": "a781c4e4-c3fe-4ac7-be93-21363a41b97d"
|
|
}
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Grafana Dashboard — Epson Printer
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboard-printer
|
|
namespace: monitoring
|
|
data:
|
|
epson-ecotank-printer.json: |
|
|
{
|
|
"id": null,
|
|
"panels": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "red", "value": null },
|
|
{ "color": "orange", "value": 10 },
|
|
{ "color": "yellow", "value": 20 },
|
|
{ "color": "green", "value": 40 }
|
|
]
|
|
},
|
|
"unit": "percent"
|
|
},
|
|
"overrides": [
|
|
{
|
|
"matcher": { "id": "byName", "options": "Black Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } }
|
|
]
|
|
},
|
|
{
|
|
"matcher": { "id": "byName", "options": "Cyan Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } }
|
|
]
|
|
},
|
|
{
|
|
"matcher": { "id": "byName", "options": "Magenta Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }
|
|
]
|
|
},
|
|
{
|
|
"matcher": { "id": "byName", "options": "Yellow Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }
|
|
]
|
|
}
|
|
]
|
|
},
|
|
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 0 },
|
|
"id": 1,
|
|
"options": {
|
|
"orientation": "horizontal",
|
|
"reduceOptions": {
|
|
"calcs": ["lastNotNull"]
|
|
},
|
|
"showThresholdLabels": false,
|
|
"showThresholdMarkers": true
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}",
|
|
"legendFormat": "{{prtMarkerSuppliesDescription}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Ink Levels",
|
|
"type": "gauge"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": {
|
|
"fillOpacity": 20,
|
|
"lineWidth": 2,
|
|
"spanNulls": true
|
|
},
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
},
|
|
"overrides": [
|
|
{
|
|
"matcher": { "id": "byName", "options": "Black Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } }
|
|
]
|
|
},
|
|
{
|
|
"matcher": { "id": "byName", "options": "Cyan Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } }
|
|
]
|
|
},
|
|
{
|
|
"matcher": { "id": "byName", "options": "Magenta Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }
|
|
]
|
|
},
|
|
{
|
|
"matcher": { "id": "byName", "options": "Yellow Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }
|
|
]
|
|
}
|
|
]
|
|
},
|
|
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 0 },
|
|
"id": 2,
|
|
"targets": [
|
|
{
|
|
"expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}",
|
|
"legendFormat": "{{prtMarkerSuppliesDescription}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Ink Level History",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 10000 },
|
|
{ "color": "red", "value": 50000 }
|
|
]
|
|
},
|
|
"unit": "short"
|
|
}
|
|
},
|
|
"gridPos": { "h": 5, "w": 6, "x": 0, "y": 10 },
|
|
"id": 3,
|
|
"options": {
|
|
"colorMode": "background",
|
|
"reduceOptions": {
|
|
"calcs": ["lastNotNull"]
|
|
},
|
|
"textMode": "value_and_name"
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "prtMarkerLifeCount{job=\"snmp-printer\"}",
|
|
"legendFormat": "Pages",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Lifetime Page Count",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"1": { "text": "Online" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
],
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "blue", "value": null }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 5, "w": 6, "x": 6, "y": 10 },
|
|
"id": 4,
|
|
"options": {
|
|
"colorMode": "background",
|
|
"reduceOptions": {
|
|
"calcs": ["lastNotNull"]
|
|
},
|
|
"textMode": "name"
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "prtGeneralPrinterName{job=\"snmp-printer\"}",
|
|
"legendFormat": "{{prtGeneralPrinterName}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Printer Model",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "red", "value": 1 }
|
|
]
|
|
},
|
|
"unit": "short"
|
|
}
|
|
},
|
|
"gridPos": { "h": 5, "w": 6, "x": 12, "y": 10 },
|
|
"id": 5,
|
|
"options": {
|
|
"colorMode": "background",
|
|
"reduceOptions": {
|
|
"calcs": ["lastNotNull"]
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "prtAlertCriticalEvents{job=\"snmp-printer\"}",
|
|
"legendFormat": "Critical Alerts",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Critical Events",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "blue", "value": null }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 5, "w": 6, "x": 18, "y": 10 },
|
|
"id": 6,
|
|
"options": {
|
|
"colorMode": "background",
|
|
"reduceOptions": {
|
|
"calcs": ["lastNotNull"]
|
|
},
|
|
"textMode": "name"
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "prtGeneralSerialNumber{job=\"snmp-printer\"}",
|
|
"legendFormat": "{{prtGeneralSerialNumber}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Serial Number",
|
|
"type": "stat"
|
|
}
|
|
],
|
|
"refresh": "5m",
|
|
"schemaVersion": 39,
|
|
"tags": ["printer", "snmp", "bluejay"],
|
|
"time": { "from": "now-24h", "to": "now" },
|
|
"timezone": "browser",
|
|
"title": "Epson ET-3750 EcoTank Printer",
|
|
"uid": "epson-ecotank"
|
|
}
|
|
|
|
# =============================================================================
# ConfigMap: Grafana Dashboard — Infrastructure Overview
# =============================================================================
# One-page status board. Rows, top to bottom: AI Stack, K8s Cluster, Network,
# Services, Alerting. Every query targets the Prometheus datasource by
# uid "prometheus"; the Services row is a static markdown panel, not a query.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-infra-overview
  namespace: monitoring
data:
  infra-overview.json: |
    {
      "uid": "infra-overview",
      "title": "Infrastructure Overview",
      "id": null,
      "version": 1,
      "schemaVersion": 39,
      "tags": ["infrastructure", "blue-jay", "overview"],
      "timezone": "browser",
      "time": { "from": "now-1h", "to": "now" },
      "refresh": "30s",
      "panels": [
        {
          "id": 100, "type": "row", "title": "AI Stack", "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
        },
        {
          "id": 1, "type": "stat", "title": "Ollama (Local)",
          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-ollama-local\"}",
              "legendFormat": "Status"
            }
          ]
        },
        {
          "id": 2, "type": "stat", "title": "Ollama (Edge1)",
          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-ollama-edge1\"}",
              "legendFormat": "Status"
            }
          ]
        },
        {
          "id": 3, "type": "stat", "title": "Agent Zero (Local)",
          "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-agentzero-local\"}",
              "legendFormat": "Status"
            }
          ]
        },
        {
          "id": 4, "type": "stat", "title": "Agent Zero (NUC)",
          "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            }
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-agentzero-nuc\"}",
              "legendFormat": "Status"
            }
          ]
        },
        {
          "id": 101, "type": "row", "title": "K8s Cluster", "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }
        },
        {
          "id": 5, "type": "stat", "title": "Nodes Up (node-exporter)",
          "gridPos": { "h": 6, "w": 8, "x": 0, "y": 6 },
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 30 },
                  { "color": "red", "value": 50 }
                ]
              }
            }
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "count(up{job=\"node-exporter\"} == 1)",
              "legendFormat": "Nodes Up"
            }
          ]
        },
        {
          "id": 6, "type": "timeseries", "title": "Node CPU Usage %",
          "gridPos": { "h": 6, "w": 8, "x": 8, "y": 6 },
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "min": 0,
              "max": 100,
              "custom": { "fillOpacity": 15, "lineWidth": 2 },
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              }
            }
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) by (instance) * 100)",
              "legendFormat": "{{ instance }}"
            }
          ]
        },
        {
          "id": 7, "type": "timeseries", "title": "Node Memory Usage %",
          "gridPos": { "h": 6, "w": 8, "x": 16, "y": 6 },
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "min": 0,
              "max": 100,
              "custom": { "fillOpacity": 15, "lineWidth": 2 },
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              }
            }
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
              "legendFormat": "{{ instance }}"
            }
          ]
        },
        {
          "id": 102, "type": "row", "title": "Network", "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 }
        },
        {
          "id": 8, "type": "timeseries", "title": "pfSense WAN Bandwidth",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 },
          "fieldConfig": {
            "defaults": {
              "unit": "Bps",
              "custom": { "fillOpacity": 10, "lineWidth": 2 }
            }
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "rate(ifHCInOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])",
              "legendFormat": "WAN In"
            },
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "rate(ifHCOutOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])",
              "legendFormat": "WAN Out"
            }
          ]
        },
        {
          "id": 9, "type": "table", "title": "Target Health (up)",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 },
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }]
              }
            },
            "overrides": []
          },
          "options": {
            "showHeader": true,
            "sortBy": [{ "displayName": "Value", "desc": false }]
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "up",
              "format": "table",
              "instant": true,
              "legendFormat": ""
            }
          ],
          "transformations": [
            {
              "id": "organize",
              "options": {
                "excludeByName": { "Time": true, "__name__": true },
                "renameByName": { "job": "Job", "instance": "Instance", "Value": "Up" }
              }
            }
          ]
        },
        {
          "id": 103, "type": "row", "title": "Services", "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }
        },
        {
          "id": 10, "type": "text", "title": "ArgoCD App Status",
          "gridPos": { "h": 4, "w": 24, "x": 0, "y": 22 },
          "options": {
            "mode": "markdown",
            "content": "### ArgoCD Applications\n\nView sync status and health of all 11 ArgoCD-managed applications:\n\n[Open ArgoCD Dashboard](https://argocd.iamworkin.lan)\n\n| App | Expected Status |\n|-----|----------------|\n| gitea | Healthy / Synced |\n| zabbix | Healthy / Synced |\n| guacamole | Healthy / Synced |\n| irc | Healthy / Synced |\n| mail | Healthy / Synced |\n| matrix | Healthy / Synced |\n| teamspeak | Healthy / Synced |\n| pki-web | Healthy / Synced |\n| intranet | Healthy / Synced |\n| telephony | Healthy / Synced |\n| infra-noc-services | Healthy / Synced |"
          }
        },
        {
          "id": 104, "type": "row", "title": "Alerting", "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }
        },
        {
          "id": 11, "type": "stat", "title": "Firing Alerts",
          "gridPos": { "h": 6, "w": 24, "x": 0, "y": 27 },
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "orange", "value": 1 },
                  { "color": "red", "value": 3 }
                ]
              }
            }
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "count(ALERTS{alertstate=\"firing\"}) or vector(0)",
              "legendFormat": "Firing Alerts"
            }
          ]
        }
      ]
    }
|
|
|
|
# =============================================================================
# ConfigMap: Grafana Datasource Provisioning
# =============================================================================
# Registers the in-cluster Prometheus as Grafana's default datasource.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasource-provisioning
  namespace: monitoring
data:
  datasource.yml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        # Pin the UID. The provisioned dashboards ("uid": "prometheus") and the
        # alert rules (datasourceUid: prometheus) reference this exact value;
        # without it Grafana generates a UID on a fresh database and those
        # references stop resolving.
        uid: prometheus
        access: proxy
        url: http://prometheus.monitoring.svc:9090
        isDefault: true
        editable: true
|
|
|
|
# =============================================================================
# ConfigMap: Grafana Alerting Provisioning (contact points + policies + rules)
# =============================================================================
# Makes alert rules declarative — survives pod rebuilds without API recreation
#
# Every rule below follows the same three-step pipeline:
#   A = Prometheus instant query, B = reduce(last) of A, C = threshold on B,
# and `condition: C` selects the threshold result as the alert condition.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting-provisioning
  namespace: monitoring
data:
  alerting.yml: |
    apiVersion: 1

    # Both contact points POST to the same irc-notify webhook URL.
    # NOTE(review): presumably the receiver fans out by alert labels — confirm.
    contactPoints:
      - orgId: 1
        # Quoted: unquoted, YAML treats " #alerts" as a comment and the name
        # silently becomes "IRC".
        name: 'IRC #alerts'
        receivers:
          - uid: irc-alerts-webhook
            type: webhook
            settings:
              url: http://irc-notify.monitoring.svc:9119
              httpMethod: POST
            # NOTE(review): placed at receiver level, where Grafana's
            # provisioning schema defines it — confirm against the live file.
            disableResolveMessage: false
      - orgId: 1
        name: Thermal Printer
        receivers:
          - uid: thermal-print-001
            type: webhook
            settings:
              url: http://irc-notify.monitoring.svc:9119
              httpMethod: POST
            # No resolve notifications are sent to the printer.
            disableResolveMessage: true

    policies:
      - orgId: 1
        # Default receiver; must match the contact point name above exactly.
        receiver: 'IRC #alerts'
        group_by: ['alertname']
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 1h
        routes:
          # Alerts labelled alert_channel=thermal_print go to the printer.
          # NOTE(review): this is the only child route, so `continue: true`
          # has no later sibling to fall through to — confirm intent.
          - receiver: Thermal Printer
            matchers: ['alert_channel = thermal_print']
            group_wait: 1m
            group_interval: 10m
            repeat_interval: 4h
            continue: true

    groups:
      # ---------------------------------------------------------------------
      # AI Stack: blackbox probes for Ollama / Agent Zero + Print.Web runner
      # ---------------------------------------------------------------------
      - orgId: 1
        name: AI Stack
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: ollama-down-local
            title: Ollama DOWN (Local)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Ollama DOWN on workstation (R9700)
              description: Agent Zero LOCAL cannot reach Ollama. FAISS memory will fail.
              runbook: "1. Check PC power/wake 2. ssh stoltz@10.0.58.100 3. systemctl status ollama 4. journalctl -u ollama --since 5min"
            labels:
              severity: warning
              service: ollama
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-ollama-local"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}

          - uid: ollama-down-edge1
            title: Ollama DOWN (Edge1)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Ollama DOWN on edge1 Pi 5
              description: Agent Zero NUC cannot reach Ollama.
              runbook: "1. ssh stoltz@10.0.57.17 2. systemctl status ollama 3. journalctl -u ollama --since 5min 4. cat /sys/class/thermal/thermal_zone0/temp"
            labels:
              severity: warning
              service: ollama
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-ollama-edge1"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}

          - uid: a0-down-local
            title: Agent Zero DOWN (Local)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero LOCAL DOWN
              description: K3s web UI unreachable.
              runbook: "1. ssh stoltz@10.0.58.100 2. docker ps | grep agent-zero 3. docker logs agent-zero --tail 20 4. Check Ollama first (A0 crashes if Ollama down)"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-agentzero-local"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}

          - uid: a0-down-nuc
            title: Agent Zero DOWN (NUC)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero NUC DOWN
              description: RKE2 web UI unreachable.
              runbook: "1. ssh root@10.0.56.200 via noc1 2. kubectl get pods -n agent-zero 3. kubectl logs -n agent-zero deploy/agent-zero --tail 20"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-agentzero-nuc"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}

          # Fires per model when remaining keep-alive exceeds 600 s.
          - uid: print-ollama-runner-long-keepalive
            title: Print.Web Ollama runner keep-alive >10m
            condition: C
            for: 2m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: Print.Web Ollama runner held too long
              description: "A Print.Web Ollama runner has more than 10 minutes of keep-alive remaining. Check active AI requests before the Pi 5 Ollama lane thrashes."
              runbook: "1. Open https://print.iamworkin.lan/admin 2. Check Ollama Fleet model + remaining keep-alive 3. Query Prometheus: ai_ollama_runner_keepalive_remaining_seconds{job=\"printweb-otel\"} 4. Stop duplicate model callers before restarting Ollama."
            labels:
              severity: warning
              service: print-web
              alert_channel: thermal_print
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max by(model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}

      # ---------------------------------------------------------------------
      # CI Runners: in-cluster GitHub runner Deployments
      # ---------------------------------------------------------------------
      - orgId: 1
        name: CI Runners
        folder: CI Alerts
        interval: 1m
        rules:
          - uid: linux-runner-offline
            title: LinuxRunnerOffline
            condition: C
            for: 5m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Linux CI runner offline: {{ $labels.deployment }}"
              description: "A github-runner namespace Deployment has 0 ready replicas for more than 5 minutes. CI jobs targeting that repo will queue until the runner pod restarts and re-registers."
              runbook: "1. kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }} 2. kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50 3. Verify PAT repo access if registration returns 404 4. Verify no RWO PVC is shared by scaled runners"
            labels:
              severity: warning
              service: github-runner
              alert_channel: irc
              team: ci
            data:
              # `== bool 0` yields 1 for a Deployment with zero ready replicas
              # and 0 otherwise. A plain `== 0` only filters and keeps the
              # sample value 0, which can never satisfy the `gt 0` threshold.
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == bool 0', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}

      # ---------------------------------------------------------------------
      # Infrastructure: node availability, Mac mini runners, CPU/memory/disk
      # ---------------------------------------------------------------------
      - orgId: 1
        name: Infrastructure
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: node-down
            title: Node DOWN
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Node down
              description: Node exporter unreachable for 2 minutes. Host may be down or network issue.
              runbook: "1. Ping host IP 2. SSH via noc1 jumpbox (root@10.0.56.10) 3. systemctl status node_exporter 4. Check network/VLAN 5. Physical power check if unreachable"
            labels:
              severity: critical
              service: infrastructure
              alert_channel: thermal_print
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'up{job=~"node-exporter|rke2-nodes|edge-nodes"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}

          - uid: macmini-runner-offline
            title: MacMiniRunnerOffline
            condition: C
            for: 10m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Mac mini GitHub runner offline
              description: "One or more macmini-* GitHub Actions runners have not reported online for more than 10 minutes. LaunchDaemons survive reboot and do not require the bluejay GUI session."
              runbook: "1. ssh fcadmin@macmini.iamworkin.lan 2. launchctl print system/io.flowercore.github-runner-<slug> 3. Check /Users/fcadmin/Library/Logs/github-runners/<slug>/stderr.log 4. Re-register the repo runner if .runner is missing"
            labels:
              severity: warning
              service: github-runner
            data:
              # The `or vector(0)` fallback must sit OUTSIDE min(). Inside it,
              # the label-less vector(0) never matches a labelled runner series,
              # is always appended, and forces the minimum to 0.
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'min(flowercore_github_runner_online{runner=~"macmini-.*"}) or vector(0)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}

          - uid: high-cpu
            title: High CPU (>85%)
            condition: C
            for: 10m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: High CPU
              description: CPU above 85% for 10 minutes. Performance degradation likely.
              runbook: "1. SSH to host 2. top -b -n1 | head -20 3. Check for runaway process 4. kubectl top pods (if K8s node)"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_cpu_usage:avg5m', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}

          - uid: high-memory
            title: High Memory (>90%)
            condition: C
            for: 5m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: High memory usage
              description: Memory above 90% for 5 minutes. OOM kills imminent.
              runbook: "1. SSH to host 2. free -h 3. ps aux --sort=-%mem | head -n 10 4. Check for memory leak (growing RSS)"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_memory_usage:percent', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [90], type: gt}}], refId: C}

          - uid: disk-low
            title: Disk Space Low (>85%)
            condition: C
            for: 10m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: Disk usage high
              description: Root disk above 85% for 10 minutes. Service disruption if full.
              runbook: "1. SSH to host 2. df -h / 3. du -sh /* | sort -rh | head 4. journalctl --vacuum-size=100M 5. podman system prune"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_disk_usage:percent', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}

      # ---------------------------------------------------------------------
      # RemoteDesktop: web probe, metrics freshness, warm pools, TLS expiry
      # ---------------------------------------------------------------------
      - orgId: 1
        name: RemoteDesktop
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: remotedesktop-web-down
            title: RemoteDesktop Web DOWN
            condition: C
            for: 3m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: FlowerCore RemoteDesktop /health probe failing
              description: "https://desktop.iamworkin.lan/health has failed for 3 minutes. Catalog + session launch surface offline."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remotedesktop-web 2. kubectl -n fc-desktop logs deploy/remotedesktop-web --tail=50 3. Check Traefik IngressRoute + step-ca cert 4. Rollout restart if pod is stuck"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-remotedesktop"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}

          # `or vector(0)` makes the query return 0 instead of no data when the
          # metric is absent, so the `lt 1` threshold can fire.
          - uid: remotedesktop-metrics-stale
            title: RemoteDesktop metrics stale
            condition: C
            for: 10m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: RemoteDesktop /metrics returning no series
              description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape is misconfigured or the web deployment stopped exporting metrics. Cross-checked by Zabbix template's identical 10m no-data trigger."
              runbook: "1. curl -sk https://desktop.iamworkin.lan/metrics | head 2. kubectl -n monitoring exec deploy/prometheus -- wget -qO- localhost:9090/api/v1/targets?scrapePool=fc-remotedesktop 3. Check monitoring-netpol egress allows to fc-desktop:8080"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'count(fc_desktop_session_events_total) or vector(0)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}

          - uid: remotedesktop-pool-depleted
            title: RemoteDesktop pool depleted
            condition: C
            for: 5m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop warm pool depleted for 5m
              description: "A RemoteDesktop warm pool has fc_desktop_pool_depleted=1 for 5 minutes. New launches will cold-start. Check pod scheduling, image pull, node capacity."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop --sort-by=.status.startTime 2. kubectl -n fc-desktop describe desktoppool <name> 3. Verify localhost/fc-desktop:* images imported on all 3 RKE2 nodes"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max(fc_desktop_pool_depleted)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.5], type: gt}}], refId: C}

          - uid: remotedesktop-pool-deficit-sustained
            title: RemoteDesktop pool below desired
            condition: C
            for: 10m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop pool sustained deficit
              description: "A pool has fc_desktop_pool_deficit>0 for 10 minutes. Operator is reconciling but can't reach desired size — likely image pull, NFS affinity, or claim-init issue."
              runbook: "1. kubectl -n fc-desktop get pods -l flowercore.io/pool=<pool> 2. kubectl logs -n fc-desktop deploy/remotedesktop-operator 3. Check claim-init hook env on template"
            labels:
              severity: info
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max(fc_desktop_pool_deficit)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}

          # rate() is per second; `* 60` converts it to launches per minute.
          - uid: remotedesktop-session-churn-spike
            title: RemoteDesktop launch rate spike
            condition: C
            for: 5m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop launch rate exceeds 20/min
              description: "Launch events >20/min for 5 minutes. Could be a user-facing feature launch, pooled template thrashing, or runaway automation loop."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop -o wide | wc -l 2. curl -sk https://desktop.iamworkin.lan/api/sessions/active 3. Check operator logs for reconcile loops"
            labels:
              severity: info
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model: {expr: 'sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [20], type: gt}}], refId: C}

          # Query result is days until expiry (seconds / 86400); fires below 2.
          - uid: remotedesktop-tls-expiry
            title: RemoteDesktop TLS cert expiring
            condition: C
            for: 6h
            noDataState: OK
            execErrState: OK
            annotations:
              summary: desktop.iamworkin.lan cert <2d to expiry
              description: "The desktop.iamworkin.lan certificate is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, FlowerCore.DNS preflight for dnsNames."
              runbook: "1. kubectl -n fc-desktop get certificate remotedesktop-web-tls 2. kubectl -n cert-manager logs deploy/cert-manager --tail=50 3. Verify pfSense DNS override for desktop.iamworkin.lan"
            labels:
              severity: critical
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: prometheus
                model: {expr: '(probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time()) / 86400', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [2], type: lt}}], refId: C}
|
|
|
|
# =============================================================================
|
|
# Deployment: Grafana
|
|
# =============================================================================
|
|
---
# Grafana: one replica, state on the grafana-data PVC, with dashboards,
# datasources and alert rules provisioned read-only from ConfigMaps.
kind: Deployment
apiVersion: apps/v1
metadata:
  namespace: monitoring
  name: grafana
  labels: {app: grafana}
spec:
  replicas: 1
  # Recreate stops the old pod before the new one starts.
  # NOTE(review): presumably chosen because of the grafana-data PVC — confirm.
  strategy: {type: Recreate}
  selector:
    matchLabels: {app: grafana}
  template:
    metadata:
      labels: {app: grafana}
    spec:
      securityContext:
        runAsUser: 472
        runAsGroup: 472
        fsGroup: 472  # grafana group
      volumes:
        - name: data
          persistentVolumeClaim: {claimName: grafana-data}
        - name: dashboard-provider
          configMap: {name: grafana-dashboard-provider}
        - name: dashboards-ai-stack
          configMap: {name: grafana-dashboards}
        - name: dashboards-edge-nodes
          configMap: {name: grafana-dashboard-edge-nodes}
        - name: dashboards-network
          configMap: {name: grafana-dashboard-network-overview}
        - name: dashboards-operations
          configMap: {name: grafana-dashboard-operations}
        - name: dashboards-printer
          configMap: {name: grafana-dashboard-printer}
        - name: dashboards-infra-overview
          configMap: {name: grafana-dashboard-infra-overview}
        - name: dashboards-remotedesktop
          configMap: {name: grafana-dashboard-remotedesktop}
        - name: datasource-provisioning
          configMap: {name: grafana-datasource-provisioning}
        - name: alerting-provisioning
          configMap: {name: grafana-alerting-provisioning}
      containers:
        - name: grafana
          # NOTE(review): `latest` is a floating tag; consider pinning a version.
          image: docker.io/grafana/grafana:latest
          ports:
            - name: http
              containerPort: 3000
          env:
            # Admin login comes from the "grafana-credentials" Secret, which
            # the 1Password Operator materialises from a OnePasswordItem.
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef: {name: grafana-credentials, key: username}
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef: {name: grafana-credentials, key: password}
            - name: GF_SERVER_ROOT_URL
              value: "https://grafana.iamworkin.lan"
            - name: GF_SERVER_SERVE_FROM_SUB_PATH
              value: "false"
            # Zabbix plugin is installed by hand after first boot when needed;
            # GF_INSTALL_PLUGINS needs internet at startup, which breaks under
            # a restrictive NetworkPolicy:
            #   kubectl exec -n monitoring deploy/grafana -- grafana cli plugins install alexanderzobnin-zabbix-app
          volumeMounts:
            # Writable Grafana state.
            - {name: data, mountPath: /var/lib/grafana}
            # Everything below is provisioning content, mounted read-only.
            - name: dashboard-provider
              mountPath: /etc/grafana/provisioning/dashboards
              readOnly: true
            - name: dashboards-ai-stack
              mountPath: /var/lib/grafana/dashboards/ai-stack
              readOnly: true
            - name: dashboards-edge-nodes
              mountPath: /var/lib/grafana/dashboards/edge-nodes
              readOnly: true
            - name: dashboards-network
              mountPath: /var/lib/grafana/dashboards/network
              readOnly: true
            - name: dashboards-operations
              mountPath: /var/lib/grafana/dashboards/operations
              readOnly: true
            - name: dashboards-printer
              mountPath: /var/lib/grafana/dashboards/printer
              readOnly: true
            - name: dashboards-infra-overview
              mountPath: /var/lib/grafana/dashboards/infra-overview
              readOnly: true
            - name: dashboards-remotedesktop
              mountPath: /var/lib/grafana/dashboards/remotedesktop
              readOnly: true
            - name: datasource-provisioning
              mountPath: /etc/grafana/provisioning/datasources
              readOnly: true
            - name: alerting-provisioning
              mountPath: /etc/grafana/provisioning/alerting
              readOnly: true
          resources:
            requests: {cpu: 100m, memory: 128Mi}
            limits: {cpu: 500m, memory: 512Mi}
          # Both probes hit Grafana's health endpoint on the HTTP port.
          readinessProbe:
            initialDelaySeconds: 10
            periodSeconds: 10
            httpGet:
              port: 3000
              path: /api/health
          livenessProbe:
            initialDelaySeconds: 30
            periodSeconds: 30
            httpGet:
              port: 3000
              path: /api/health
|
|
|
|
# =============================================================================
|
|
# Deployment: Blackbox Exporter
|
|
# =============================================================================
|
|
---
# Blackbox Exporter: single replica, configuration supplied by the
# blackbox-config ConfigMap and mounted as one file.
kind: Deployment
apiVersion: apps/v1
metadata:
  namespace: monitoring
  name: blackbox-exporter
  labels: {app: blackbox-exporter}
spec:
  replicas: 1
  selector:
    matchLabels: {app: blackbox-exporter}
  template:
    metadata:
      labels: {app: blackbox-exporter}
    spec:
      volumes:
        - name: config
          configMap: {name: blackbox-config}
      containers:
        - name: blackbox-exporter
          # NOTE(review): `latest` is a floating tag; consider pinning a version.
          image: quay.io/prometheus/blackbox-exporter:latest
          args:
            - "--config.file=/config/blackbox.yml"
          ports:
            - name: http
              containerPort: 9115
          volumeMounts:
            # subPath mounts only the blackbox.yml key, not the whole ConfigMap.
            - name: config
              subPath: blackbox.yml
              mountPath: /config/blackbox.yml
              readOnly: true
          resources:
            requests: {cpu: 50m, memory: 32Mi}
            limits: {cpu: 200m, memory: 128Mi}
          # Liveness and readiness both request the exporter's root page.
          readinessProbe:
            initialDelaySeconds: 3
            periodSeconds: 10
            httpGet:
              port: 9115
              path: /
          livenessProbe:
            initialDelaySeconds: 5
            periodSeconds: 30
            httpGet:
              port: 9115
              path: /
|
|
|
|
# =============================================================================
|
|
# PVC: SNMP Exporter Config (100Mi, Longhorn)
|
|
# =============================================================================
|
|
# The custom snmp.yml (~2MB) exceeds the 1MB ConfigMap limit.
|
|
# This PVC stores the config file. To load a custom config:
|
|
# kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
|
|
# Then restart the pod to pick up the new config.
|
|
---
# Longhorn-backed claim that holds /config/snmp.yml for the SNMP exporter.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  namespace: monitoring
  name: snmp-config
spec:
  storageClassName: longhorn
  accessModes: [ReadWriteOnce]
  resources:
    requests: {storage: 100Mi}
|
|
|
|
# =============================================================================
|
|
# Deployment: SNMP Exporter
|
|
# =============================================================================
|
|
# Uses a PVC-mounted config at /config/snmp.yml. An init container copies the
|
|
# default config from the image if the PVC is empty (first deploy).
|
|
# To load the custom noc1 snmp.yml (~2MB):
|
|
# kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
|
|
# Then restart: kubectl rollout restart deploy/snmp-exporter -n monitoring
|
|
---
# SNMP Exporter: single replica reading /config/snmp.yml from the
# snmp-config PVC. An init container seeds the PVC with the image's default
# config when no usable config is present.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: snmp-exporter
  namespace: monitoring
  labels:
    app: snmp-exporter
spec:
  replicas: 1
  # The snmp-config PVC is ReadWriteOnce. With the default RollingUpdate
  # strategy a replacement pod scheduled onto a different node cannot attach
  # the volume while the old pod still holds it, and the rollout stalls.
  # Recreate tears the old pod down first (same choice as the Grafana
  # Deployment, which also owns a PVC).
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: snmp-exporter
  template:
    metadata:
      labels:
        app: snmp-exporter
    spec:
      initContainers:
        # Seed the PVC with the default snmp.yml from the image when the PVC
        # has no config yet (first deploy) or only a zero-byte file.
        - name: init-config
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              # -s (exists AND non-empty) rather than -f: an empty file left
              # behind by an interrupted copy must not count as a config.
              if [ ! -s /config/snmp.yml ]; then
                echo "No custom config found, copying default from image..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default snmp.yml copied to PVC."
              else
                echo "Custom snmp.yml already exists on PVC, skipping copy."
              fi
          volumeMounts:
            # Writable here so the default config can be copied in.
            - name: snmp-config
              mountPath: /config
      containers:
        - name: snmp-exporter
          image: docker.io/prom/snmp-exporter:latest
          args:
            - "--config.file=/config/snmp.yml"
          ports:
            - containerPort: 9116
              name: http
          volumeMounts:
            # Read-only in the serving container: anything that writes through
            # this container (including `kubectl cp` into it) cannot modify
            # /config. Update the config via a pod that mounts the PVC
            # writable, e.g. the snmp-config-loader Job.
            - name: snmp-config
              mountPath: /config
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          livenessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 5
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 3
            periodSeconds: 10
      volumes:
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config
|
|
|
|
# =============================================================================
|
|
# Deployment: IRC Notify (alert relay)
|
|
# =============================================================================
|
|
---
# IRC Notify: runs notify.py (from the irc-notify-script ConfigMap) on a
# stock Python image and listens on port 9119.
kind: Deployment
apiVersion: apps/v1
metadata:
  namespace: monitoring
  name: irc-notify
  labels: {app: irc-notify}
spec:
  replicas: 1
  selector:
    matchLabels: {app: irc-notify}
  template:
    metadata:
      labels: {app: irc-notify}
    spec:
      volumes:
        - name: script
          configMap: {name: irc-notify-script}
      containers:
        - name: irc-notify
          image: docker.io/library/python:3.12-slim
          command:
            - "python3"
            - "/app/notify.py"
          ports:
            - name: http
              containerPort: 9119
          volumeMounts:
            # Only the notify.py key is projected, as a single read-only file.
            - name: script
              subPath: notify.py
              mountPath: /app/notify.py
              readOnly: true
          resources:
            requests: {cpu: 25m, memory: 32Mi}
            limits: {cpu: 100m, memory: 64Mi}
          # TCP-level probes: they only check that the port accepts connections.
          readinessProbe:
            initialDelaySeconds: 3
            periodSeconds: 10
            tcpSocket: {port: 9119}
          livenessProbe:
            initialDelaySeconds: 5
            periodSeconds: 30
            tcpSocket: {port: 9119}
|
|
|
|
# =============================================================================
|
|
# DaemonSet: Node Exporter (runs on every RKE2 node)
|
|
# =============================================================================
|
|
# Port 9101 avoids conflict with host-level node-exporters already on :9100.
|
|
# The rke2-nodes Prometheus job scrapes the host instances on :9100; this
|
|
# DaemonSet provides K8s service-discovery-based scraping on :9101.
|
|
---
# Node Exporter on every node: tolerates all taints, shares the host PID and
# network namespaces, and reads host /, /proc and /sys through read-only
# hostPath mounts. Listens on :9101.
kind: DaemonSet
apiVersion: apps/v1
metadata:
  namespace: monitoring
  name: node-exporter
  labels: {app: node-exporter}
spec:
  selector:
    matchLabels: {app: node-exporter}
  updateStrategy:
    type: RollingUpdate
    rollingUpdate: {maxUnavailable: 1}
  template:
    metadata:
      labels: {app: node-exporter}
    spec:
      hostNetwork: true
      hostPID: true
      # A toleration with only `operator: Exists` matches every taint.
      tolerations:
        - operator: Exists
      securityContext:
        runAsUser: 0
        runAsNonRoot: false
      volumes:
        - name: rootfs
          hostPath: {path: /}
        - name: proc
          hostPath: {path: /proc}
        - name: sys
          hostPath: {path: /sys}
      containers:
        - name: node-exporter
          # NOTE(review): `latest` is a floating tag; consider pinning a version.
          image: docker.io/prom/node-exporter:latest
          args:
            # Point the collectors at the host filesystem views mounted below.
            - "--path.rootfs=/host"
            - "--path.sysfs=/host/sys"
            - "--path.procfs=/host/proc"
            # Skip pseudo, container-runtime and storage-engine mount points.
            - "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/containers|run/containers|var/lib/rancher|var/lib/longhorn)($|/)"
            # Skip virtual/CNI network interfaces.
            - "--collector.netclass.ignored-devices=^(veth|cali|flannel|cni).*$"
            - "--no-collector.btrfs"
            - "--web.listen-address=:9101"
          ports:
            - name: metrics
              hostPort: 9101
              containerPort: 9101
          securityContext:
            readOnlyRootFilesystem: true
            privileged: true
          volumeMounts:
            - name: rootfs
              mountPath: /host
              mountPropagation: HostToContainer
              readOnly: true
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
          resources:
            requests: {cpu: 50m, memory: 32Mi}
            limits: {cpu: 200m, memory: 128Mi}
|
|
|
|
# =============================================================================
|
|
# Service: Prometheus (ClusterIP :9090)
|
|
# =============================================================================
|
|
---
# In-cluster endpoint for pods labelled app=prometheus, port 9090.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: prometheus
  labels: {app: prometheus}
spec:
  type: ClusterIP
  selector: {app: prometheus}
  ports:
    - name: http
      protocol: TCP
      port: 9090
      targetPort: 9090
|
|
|
|
# =============================================================================
|
|
# Service: Grafana (ClusterIP :3000)
|
|
# =============================================================================
|
|
---
# In-cluster endpoint for pods labelled app=grafana, port 3000.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: grafana
  labels: {app: grafana}
spec:
  type: ClusterIP
  selector: {app: grafana}
  ports:
    - name: http
      protocol: TCP
      port: 3000
      targetPort: 3000
|
|
|
|
# =============================================================================
|
|
# Service: Blackbox Exporter (ClusterIP :9115)
|
|
# =============================================================================
|
|
---
# In-cluster endpoint for pods labelled app=blackbox-exporter, port 9115.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: blackbox-exporter
  labels: {app: blackbox-exporter}
spec:
  type: ClusterIP
  selector: {app: blackbox-exporter}
  ports:
    - name: http
      protocol: TCP
      port: 9115
      targetPort: 9115
|
|
|
|
# =============================================================================
|
|
# Service: SNMP Exporter (ClusterIP :9116)
|
|
# =============================================================================
|
|
---
# In-cluster endpoint for pods labelled app=snmp-exporter, port 9116.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: snmp-exporter
  labels: {app: snmp-exporter}
spec:
  type: ClusterIP
  selector: {app: snmp-exporter}
  ports:
    - name: http
      protocol: TCP
      port: 9116
      targetPort: 9116
|
|
|
|
# =============================================================================
|
|
# Service: Node Exporter (headless for Prometheus SD)
|
|
# =============================================================================
|
|
---
# Headless Service (clusterIP: None) over the node-exporter pods, exposing
# the metrics port 9101.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: node-exporter
  labels: {app: node-exporter}
spec:
  type: ClusterIP
  clusterIP: None
  selector: {app: node-exporter}
  ports:
    - name: metrics
      protocol: TCP
      port: 9101
      targetPort: 9101
|
|
|
|
# =============================================================================
|
|
# Service: IRC Notify (ClusterIP :9119)
|
|
# =============================================================================
|
|
---
# In-cluster endpoint for pods labelled app=irc-notify, port 9119.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: irc-notify
  labels: {app: irc-notify}
spec:
  type: ClusterIP
  selector: {app: irc-notify}
  ports:
    - name: http
      protocol: TCP
      port: 9119
      targetPort: 9119
|
|
|
|
# =============================================================================
|
|
# TLS Certificates (cert-manager + step-ca ACME)
|
|
# =============================================================================
|
|
---
# Certificate for grafana.iamworkin.lan, issued by the step-ca-acme
# ClusterIssuer and stored in Secret "grafana-tls".
kind: Certificate
apiVersion: cert-manager.io/v1
metadata:
  namespace: monitoring
  name: grafana-tls
spec:
  dnsNames:
    - grafana.iamworkin.lan
  issuerRef:
    kind: ClusterIssuer
    name: step-ca-acme
  secretName: grafana-tls
|
|
---
# Certificate for prometheus.iamworkin.lan, issued by the step-ca-acme
# ClusterIssuer and stored in Secret "prometheus-tls".
kind: Certificate
apiVersion: cert-manager.io/v1
metadata:
  namespace: monitoring
  name: prometheus-tls
spec:
  dnsNames:
    - prometheus.iamworkin.lan
  issuerRef:
    kind: ClusterIssuer
    name: step-ca-acme
  secretName: prometheus-tls
|
|
|
|
# =============================================================================
|
|
# Traefik IngressRoute: Grafana
|
|
# =============================================================================
|
|
---
# Routes grafana.iamworkin.lan on the websecure entry point to the grafana
# Service, terminating TLS with the grafana-tls Secret.
kind: IngressRoute
apiVersion: traefik.io/v1alpha1
metadata:
  namespace: monitoring
  name: grafana
spec:
  entryPoints: [websecure]
  tls:
    secretName: grafana-tls
  routes:
    - match: Host(`grafana.iamworkin.lan`)
      kind: Rule
      services:
        - {name: grafana, port: 3000}
|
|
|
|
# =============================================================================
|
|
# Traefik IngressRoute: Prometheus
|
|
# =============================================================================
|
|
---
# Routes prometheus.iamworkin.lan on the websecure entry point to the
# prometheus Service, terminating TLS with the prometheus-tls Secret.
kind: IngressRoute
apiVersion: traefik.io/v1alpha1
metadata:
  namespace: monitoring
  name: prometheus
spec:
  entryPoints: [websecure]
  tls:
    secretName: prometheus-tls
  routes:
    - match: Host(`prometheus.iamworkin.lan`)
      kind: Rule
      services:
        - {name: prometheus, port: 9090}
|
|
|
|
# =============================================================================
|
|
# NetworkPolicy: monitoring namespace
|
|
# =============================================================================
|
|
---
# Namespace-wide policy: the empty podSelector selects every pod in
# `monitoring`, and both directions are restricted to the rules listed here.
kind: NetworkPolicy
apiVersion: networking.k8s.io/v1
metadata:
  namespace: monitoring
  name: monitoring-netpol
spec:
  podSelector: {}
  policyTypes: [Ingress, Egress]

  ingress:
    # Traefik namespace: IngressRoute traffic and ACME solver pods.
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
    # Same-namespace traffic (prometheus→exporters, grafana→prometheus,
    # grafana→irc-notify).
    - from:
        - podSelector: {}
    # cert-manager namespace: ACME HTTP-01 self-check.
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: cert-manager

  egress:
    # DNS over UDP and TCP, to any namespace.
    - ports:
        - {port: 53, protocol: UDP}
        - {port: 53, protocol: TCP}
      to:
        - namespaceSelector: {}
    # MGMT VLAN (noc1, pfSense, switches, SNMP, node-exporter) — all ports.
    - to:
        - ipBlock: {cidr: 10.0.56.0/24}
    # PROD VLAN (edge nodes) — all ports.
    - to:
        - ipBlock: {cidr: 10.0.57.0/24}
    # HOME VLAN (workstation, printer, NAS) — all ports.
    - to:
        - ipBlock: {cidr: 10.0.58.0/24}
    # Same-namespace traffic.
    - to:
        - podSelector: {}
    # Blackbox probes into other namespaces (agent-zero, etc).
    - ports:
        - {port: 80, protocol: TCP}
      to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: agent-zero
    # FlowerCore.RemoteDesktop /metrics scrape through the fc-desktop
    # ClusterIP Service (remotedesktop-web:8080). The same rule covers the
    # Traefik VIP hairpin path: after kube-proxy DNAT the egress destination
    # is the backend pod IP on the service port (see
    # feedback_netpol_dnat_backend_port).
    - ports:
        - {port: 8080, protocol: TCP}
      to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: fc-desktop
    # Traefik backend ports, for in-cluster egress to public iamworkin.lan
    # hostnames that the CoreDNS wildcard resolves to the LoadBalancer VIP.
    # Post-DNAT the destination is a Traefik pod on 8080/8443.
    # namespaceSelector and podSelector sit in ONE peer entry, so both must
    # match (Traefik pods inside traefik-system only).
    - ports:
        - {port: 8080, protocol: TCP}
        - {port: 8443, protocol: TCP}
      to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
          podSelector:
            matchLabels:
              app.kubernetes.io/name: traefik
    # Traefik /metrics endpoint (port 9100), distinct from the data-path
    # ports above; needed by the in-cluster `traefik` scrape job.
    - ports:
        - {port: 9100, protocol: TCP}
      to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
    # kube-state-metrics, needed by the kubernetes-state alert group.
    - ports:
        - {port: 8080, protocol: TCP}
      to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: kube-system
    # cert-manager metrics, needed by the CertManagerCertificate* alerts.
    - ports:
        - {port: 9402, protocol: TCP}
      to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: cert-manager
    # Longhorn manager metrics, needed by the Longhorn* alerts.
    - ports:
        - {port: 9500, protocol: TCP}
      to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: longhorn-system
    # IRC (irc-notify → UnrealIRCd in the irc namespace via K8s DNS).
    - ports:
        - {port: 6667, protocol: TCP}
        - {port: 6697, protocol: TCP}
      to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: irc
    # Step-CA ACME (cert renewal).
    - ports:
        - {port: 9443, protocol: TCP}
      to:
        - ipBlock: {cidr: 10.0.56.10/32}
    # Internet (optional: Grafana plugin install, ACME); private ranges are
    # carved out of the catch-all.
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.0.0.0/8
              - 172.16.0.0/12
              - 192.168.0.0/16
|
|
|
|
# =============================================================================
|
|
# Job: SNMP Config Loader (ArgoCD PostSync hook)
|
|
# =============================================================================
|
|
# Runs once after the main deployment to populate the SNMP config PVC.
|
|
# Attempts to download custom snmp.yml from noc1; falls back to the default
|
|
# config bundled in the snmp-exporter image.
|
|
---
# ArgoCD PostSync hook that refreshes /config/snmp.yml on the snmp-config PVC.
#   1. download-config (init): tries to fetch the custom config from noc1.
#   2. fallback-default: copies the image's default config only when the PVC
#      still has no non-empty snmp.yml.
# A failed download never removes a config that is already on the PVC.
# NOTE(review): the snmp-config PVC is ReadWriteOnce and is also mounted by
# the snmp-exporter Deployment, so this pod presumably has to land on the same
# node as the exporter to attach it — confirm scheduling in the cluster.
apiVersion: batch/v1
kind: Job
metadata:
  name: snmp-config-loader
  namespace: monitoring
  annotations:
    argocd.argoproj.io/hook: PostSync
    argocd.argoproj.io/hook-delete-policy: HookSucceeded
spec:
  # No retries: a failed run is surfaced immediately.
  backoffLimit: 0
  template:
    metadata:
      labels:
        app: snmp-config-loader
    spec:
      restartPolicy: Never
      initContainers:
        # Try to download custom snmp.yml from noc1
        - name: download-config
          image: docker.io/curlimages/curl:latest
          command:
            - sh
            - -c
            - |
              # Download to a scratch file and rename into place only on
              # success, so an unreachable noc1 or a truncated transfer can
              # never destroy a working config already stored on the PVC.
              tmp=/config/snmp.yml.download
              echo "Attempting to download custom snmp.yml from noc1..."
              if curl -sf --connect-timeout 10 --max-time 30 \
                  http://10.0.56.10:9116/config -o "$tmp" 2>/dev/null \
                  && [ -s "$tmp" ]; then
                mv -f "$tmp" /config/snmp.yml
                echo "Custom snmp.yml downloaded from noc1 successfully."
              else
                rm -f "$tmp"
                if [ -s /config/snmp.yml ]; then
                  echo "Download failed or empty, keeping existing snmp.yml on PVC."
                else
                  echo "Download failed or empty, will use default from image."
                fi
              fi
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      containers:
        # If no usable config is on the PVC, copy the default from the image
        - name: fallback-default
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              if [ -f /config/snmp.yml ] && [ -s /config/snmp.yml ]; then
                echo "Custom config already present, nothing to do."
              else
                echo "Copying default snmp.yml from image to PVC..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default config copied."
              fi
              echo "SNMP config loader complete."
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      volumes:
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config
|