4866 lines
178 KiB
YAML
4866 lines
178 KiB
YAML
# =============================================================================
|
|
# NOC Monitoring Stack — K8s Migration Target
|
|
# =============================================================================
|
|
# Migrates the noc1 Podman monitoring pod to RKE2 K8s.
|
|
# Source: noc1 (10.0.56.10) /opt/monitoring/
|
|
#
|
|
# Components:
|
|
# - Prometheus (metrics, alerting)
|
|
# - Grafana (dashboards)
|
|
# - Blackbox Exporter (HTTP probes)
|
|
# - SNMP Exporter (network device metrics)
|
|
# - Node Exporter (host metrics, DaemonSet)
|
|
# - IRC Notify (alert relay to UnrealIRCd)
|
|
#
|
|
# Note: SNMP exporter config (snmp.yml) is ~2MB, exceeding the 1MB ConfigMap
|
|
# limit. It is stored in a separate file (snmp-config.yaml) and must be
|
|
# applied as a standalone ConfigMap or mounted via an init container that
|
|
# downloads it from Gitea.
|
|
# =============================================================================
|
|
|
|
---
# Namespace that holds every object of the monitoring stack defined below.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Prometheus Configuration
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: prometheus-config
|
|
namespace: monitoring
|
|
data:
|
|
prometheus.yml: |
|
|
# Defaults applied to every job that does not set its own interval:
# scrape targets and evaluate rules every 30 seconds.
global:
  scrape_interval: 30s
  evaluation_interval: 30s

# Alerting and recording rules loaded from these paths.
rule_files:
  - /etc/prometheus/alerts.yml
  - /etc/prometheus/recording-rules.yml
|
|
|
|
scrape_configs:
|
|
# Host metrics for noc1, which lives outside the cluster.
- job_name: 'node-exporter'
  static_configs:
    - targets:
        - '10.0.56.10:9100'
      labels:
        instance: 'noc1'
        vlan: 'mgmt'
|
|
|
|
# Bare-metal RKE2 cluster nodes (openSUSE Leap 16 on NUCs): one server and
# two agents, each scraped on port 9100 with a 15s timeout.
- job_name: 'rke2-nodes'
  scrape_timeout: 15s
  static_configs:
    - targets:
        - '10.0.56.11:9100'
      labels:
        instance: 'rke2-server'
        vlan: 'mgmt'
        cluster: 'rke2'
        role: 'server'
    - targets:
        - '10.0.56.12:9100'
      labels:
        instance: 'rke2-agent1'
        vlan: 'mgmt'
        cluster: 'rke2'
        role: 'agent'
    - targets:
        - '10.0.56.13:9100'
      labels:
        instance: 'rke2-agent2'
        vlan: 'mgmt'
        cluster: 'rke2'
        role: 'agent'
|
|
|
|
# Mac mini macOS runner node (labelled vlan=infra), Puppet-managed.
# puppet_managed is quoted so it stays the string "true", not a boolean.
- job_name: 'macmini-node'
  scrape_timeout: 15s
  static_configs:
    - targets:
        - '10.0.56.115:9100'
      labels:
        instance: 'macmini'
        host: 'macmini.iamworkin.lan'
        vlan: 'infra'
        arch: 'arm64'
        role: 'macos-runner'
        puppet_managed: 'true'
        puppet_server: 'puppet.iamworkin.lan'
|
|
|
|
# In-cluster node-exporter DaemonSet, discovered through the Kubernetes
# endpoints API restricted to the monitoring namespace.
- job_name: 'k8s-node-exporter'
  kubernetes_sd_configs:
    - role: endpoints
      namespaces:
        names:
          - 'monitoring'
  relabel_configs:
    # Keep only targets that belong to the endpoints object "node-exporter".
    - action: keep
      source_labels: [__meta_kubernetes_endpoints_name]
      regex: node-exporter
    # Report the Kubernetes node name as the instance label.
    - source_labels: [__meta_kubernetes_endpoint_node_name]
      target_label: instance
|
|
|
|
# pfSense, polled over SNMP through snmp-exporter (if_mib module).
- job_name: 'snmp-pfsense'
  metrics_path: /snmp
  params:
    module:
      - if_mib
    auth:
      - bluejay_v2
  static_configs:
    - targets:
        - '10.0.56.1'
  relabel_configs:
    # The device address becomes the exporter's ?target= parameter ...
    - source_labels: [__address__]
      target_label: __param_target
    # ... and the instance label ...
    - source_labels: [__param_target]
      target_label: instance
    # ... while the scrape itself is sent to the snmp-exporter service.
    - target_label: __address__
      replacement: snmp-exporter.monitoring.svc:9116
|
|
|
|
# UniFi Cloud Key SNMP — DISABLED 2026-04-26
|
|
# The Cloud Key Gen2+ runs unifi-core (controller) only — not a network
|
|
# device — and does NOT run an SNMP agent on UDP/161. Scrapes were
|
|
# silently failing with "connection refused" from 10.42.x.x:161 every
|
|
# 30s, polluting up{} = 0 and lastError on the Targets page. Hardware
|
|
# health (CPU/mem/disk) for the Cloud Key host should come from
|
|
# node_exporter via SSH — not SNMP.
|
|
# - job_name: "snmp-cloudkey"
|
|
# static_configs:
|
|
# - targets: ["10.0.56.3"]
|
|
# metrics_path: /snmp
|
|
# params:
|
|
# module: [if_mib]
|
|
# auth: [bluejay_v2]
|
|
# relabel_configs:
|
|
# - source_labels: [__address__]
|
|
# target_label: __param_target
|
|
# - source_labels: [__param_target]
|
|
# target_label: instance
|
|
# - target_label: __address__
|
|
# replacement: snmp-exporter.monitoring.svc:9116
|
|
|
|
# UniFi switch, polled over SNMP through snmp-exporter (if_mib module).
- job_name: 'snmp-switch'
  metrics_path: /snmp
  params:
    module:
      - if_mib
    auth:
      - bluejay_v2
  static_configs:
    - targets:
        - '10.0.56.2'
  relabel_configs:
    # Device address -> ?target= parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # Device address -> instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Actual scrape goes to the snmp-exporter service.
    - target_label: __address__
      replacement: snmp-exporter.monitoring.svc:9116
|
|
|
|
# Synology NAS, polled over SNMP through snmp-exporter (synology module).
- job_name: 'snmp-nas'
  metrics_path: /snmp
  params:
    module:
      - synology
    auth:
      - bluejay_v2
  static_configs:
    - targets:
        - '10.0.58.3'
  relabel_configs:
    # Device address -> ?target= parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # Device address -> instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Actual scrape goes to the snmp-exporter service.
    - target_label: __address__
      replacement: snmp-exporter.monitoring.svc:9116
|
|
|
|
# Prometheus scraping its own metrics endpoint.
- job_name: 'prometheus'
  static_configs:
    - targets:
        - 'localhost:9090'
|
|
|
|
# node_exporter on the edge fleet. edge1 and edge2 are labelled vlan=prod
# and are Puppet-managed; piez and pirelay are labelled vlan=home.
- job_name: 'edge-nodes'
  static_configs:
    - targets:
        - '10.0.57.17:9100'
      labels:
        instance: 'edge1'
        vlan: 'prod'
        arch: 'arm64'
        role: 'ai-inference'
        puppet_managed: 'true'
        puppet_server: 'puppet.iamworkin.lan'
    - targets:
        - '10.0.57.16:9100'
      labels:
        instance: 'edge2'
        vlan: 'prod'
        arch: 'arm64'
        role: 'ci-runner'
        puppet_managed: 'true'
        puppet_server: 'puppet.iamworkin.lan'
    - targets:
        - '10.0.58.25:9100'
      labels:
        instance: 'piez'
        vlan: 'home'
        arch: 'arm64'
        role: 'prototyping'
    - targets:
        - '10.0.58.113:9100'
      labels:
        instance: 'pirelay'
        vlan: 'home'
        arch: 'arm64'
        role: 'relay-controller'
|
|
|
|
# =======================================================================
|
|
# PiManager Application Metrics (relay states, temps, automation)
|
|
# =======================================================================
|
|
|
|
# PiManager application metrics, scraped from /metrics on both Pis every
# 15s (faster than the 30s global default).
- job_name: 'pimanager-app'
  metrics_path: /metrics
  scrape_interval: 15s
  static_configs:
    - targets:
        - '10.0.58.25:5000'
      labels:
        instance: 'piez'
        service: 'pimanager'
        vlan: 'home'
        device: 'pi4-ezconnect'
    - targets:
        - '10.0.58.113:5100'
      labels:
        instance: 'pirelay'
        service: 'pimanager'
        vlan: 'home'
        device: 'pi3-ks0212'
|
|
|
|
# Epson ET-3750 EcoTank printer, polled over SNMP through snmp-exporter
# (printer_mib module) every 5 minutes with a 30s timeout.
# NOTE(review): the static `instance: epson-ecotank` label is overwritten by
# the second relabel rule, so series carry instance="10.0.58.107". Drop that
# rule or the static label, whichever naming is actually wanted.
- job_name: 'snmp-printer'
  metrics_path: /snmp
  params:
    module:
      - printer_mib
    auth:
      - public_v2
  scrape_interval: 5m
  scrape_timeout: 30s
  static_configs:
    - targets:
        - '10.0.58.107'
      labels:
        instance: 'epson-ecotank'
        vlan: 'home'
        device_type: 'printer'
  relabel_configs:
    # Device address -> ?target= parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # Device address -> instance label (replaces the static value above).
    - source_labels: [__param_target]
      target_label: instance
    # Actual scrape goes to the snmp-exporter service.
    - target_label: __address__
      replacement: snmp-exporter.monitoring.svc:9116
|
|
|
|
# =============================================================================
|
|
# Print Services (CUPS + Print.Web on edge2)
|
|
# =============================================================================
|
|
|
|
# cups_exporter running on edge2, port 9628.
- job_name: 'cups'
  scrape_interval: 30s
  static_configs:
    - targets:
        - '10.0.57.16:9628'
      labels:
        instance: 'edge2'
        service: 'cups'
        device_type: 'printer'
        printer_model: 'NuPrint 210'
|
|
|
|
# Print.Web OTEL metrics (print counters/histograms plus Ollama runner
# gauges), served at /metrics/prometheus on port 5200.
- job_name: 'printweb-otel'
  metrics_path: /metrics/prometheus
  scrape_interval: 30s
  static_configs:
    - targets:
        - '10.0.57.16:5200'
      labels:
        instance: 'print-web'
        service: 'print-web'
        device_type: 'printer'
        printer_model: 'NuPrint 210'
|
|
|
|
# Blackbox HTTP probe of the Print.Web Blazor app on edge2:5200.
# NOTE(review): the static `instance: print-web` label is overwritten by the
# second relabel rule, so probe series carry the probed URL as instance.
- job_name: 'probe-printweb'
  metrics_path: /probe
  params:
    module:
      - http_2xx
  scrape_interval: 30s
  static_configs:
    - targets:
        - 'http://10.0.57.16:5200/'
      labels:
        instance: 'print-web'
        service: 'print-web'
  relabel_configs:
    # Probed URL -> ?target= parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # Probed URL -> instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Actual scrape goes to the blackbox-exporter service.
    - target_label: __address__
      replacement: blackbox-exporter.monitoring.svc:9115
|
|
|
|
# Blackbox probe of the FlowerCore.RemoteDesktop /health endpoint (public
# cluster VIP). The https_internal module is required: desktop.iamworkin.lan
# serves a step-ca leaf certificate and blackbox does not trust the step-ca
# root, so http_2xx fails with "x509 unknown authority" and reports
# probe_success=0 even when /health returns 200.
# There is deliberately no instance relabel rule here, so the static
# instance label (the full URL) is what ends up on the series.
- job_name: 'probe-remotedesktop'
  metrics_path: /probe
  params:
    module:
      - https_internal
  scrape_interval: 30s
  static_configs:
    - targets:
        - 'https://desktop.iamworkin.lan/health'
      labels:
        instance: 'https://desktop.iamworkin.lan/health'
        service: 'remotedesktop-web'
  relabel_configs:
    # Probed URL -> ?target= parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # Actual scrape goes to the blackbox-exporter service.
    - target_label: __address__
      replacement: blackbox-exporter.monitoring.svc:9115
|
|
|
|
# Direct HTTPS scrape of FlowerCore.RemoteDesktop /metrics (counters).
# NOTE(review): insecure_skip_verify disables certificate verification for
# this scrape. If the step-ca root can be mounted into the Prometheus pod,
# a tls_config.ca_file would be the safer choice — confirm and replace.
- job_name: 'fc-remotedesktop'
  scheme: https
  metrics_path: /metrics
  scrape_interval: 30s
  tls_config:
    insecure_skip_verify: true
  static_configs:
    - targets:
        - 'desktop.iamworkin.lan'
      labels:
        service: 'remotedesktop-web'
|
|
|
|
# Blackbox HTTP probe of the CUPS web UI (port 631), once a minute.
# NOTE(review): the static `instance: cups-edge2` label is overwritten by the
# second relabel rule, so probe series carry the probed URL as instance.
- job_name: 'probe-cups'
  metrics_path: /probe
  params:
    module:
      - http_2xx
  scrape_interval: 60s
  static_configs:
    - targets:
        - 'http://10.0.57.16:631/'
      labels:
        instance: 'cups-edge2'
        service: 'cups'
  relabel_configs:
    # Probed URL -> ?target= parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # Probed URL -> instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Actual scrape goes to the blackbox-exporter service.
    - target_label: __address__
      replacement: blackbox-exporter.monitoring.svc:9115
|
|
|
|
# =============================================================================
|
|
# AI Stack Health Probes (Blackbox Exporter)
|
|
# =============================================================================
|
|
|
|
# NOTE: probe-ollama-local and probe-agentzero-local were REMOVED
|
|
# 2026-04-26. They pointed at 10.0.58.100 (HOME VLAN) which is not
|
|
# reachable from cluster pods (firewalled). They had been firing as
|
|
# OllamaDown / AgentZeroDown since 2026-04-24. Workstation/AI-laptop
|
|
# Ollama and Agent Zero should be monitored via host-side Puppet
|
|
# (node_exporter on the box) once the AI laptop is running 24/7.
|
|
|
|
# Blackbox probe of the Ollama API on the edge1 Pi 5 (NUC Agent Zero),
# using the http_ollama module against /api/tags.
# NOTE(review): the static `instance: ollama-edge1` label is overwritten by
# the second relabel rule, so probe series carry the probed URL as instance.
- job_name: 'probe-ollama-edge1'
  metrics_path: /probe
  params:
    module:
      - http_ollama
  scrape_interval: 30s
  static_configs:
    - targets:
        - 'http://10.0.57.17:11434/api/tags'
      labels:
        instance: 'ollama-edge1'
        service: 'ollama'
        deployment: 'nuc'
        gpu: 'cpu'
  relabel_configs:
    # Probed URL -> ?target= parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # Probed URL -> instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Actual scrape goes to the blackbox-exporter service.
    - target_label: __address__
      replacement: blackbox-exporter.monitoring.svc:9115
|
|
|
|
# Blackbox probe of the in-cluster (RKE2) Agent Zero web UI.
# The target uses the short service form (agent-zero.agent-zero.svc) and NOT
# the cluster.local FQDN: the *.cluster.local form gets rewritten to
# 10.0.56.200 (Traefik VIP) by the CoreDNS iamworkin.lan template combined
# with ndots:5 search-suffix expansion.
# Memory: feedback_coredns_ndots_template_collision.
# NOTE(review): the static `instance: agent-zero-nuc` label is overwritten by
# the second relabel rule, so probe series carry the probed URL as instance.
- job_name: 'probe-agentzero-nuc'
  metrics_path: /probe
  params:
    module:
      - http_2xx
  scrape_interval: 30s
  static_configs:
    - targets:
        - 'http://agent-zero.agent-zero.svc:80/'
      labels:
        instance: 'agent-zero-nuc'
        service: 'agent-zero'
        deployment: 'nuc'
  relabel_configs:
    # Probed URL -> ?target= parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # Probed URL -> instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Actual scrape goes to the blackbox-exporter service.
    - target_label: __address__
      replacement: blackbox-exporter.monitoring.svc:9115
|
|
|
|
# =============================================================================
|
|
# K8s Cluster State (kube-state-metrics, cert-manager, traefik)
|
|
# =============================================================================
|
|
# Use in-cluster ClusterIP service DNS — NOT NodePorts — so a same-node
|
|
# NodePort hairpin doesn't break the scrape (hit on rke2-agent1 hosting
|
|
# both prometheus and traefik on 2026-04-26: 10.0.56.12:30900 timed out
|
|
# from prometheus while .11/.13 worked). NodePorts at 30900-30902 are
|
|
# still useful for noc1-Podman-style external scrapers, but in-cluster
|
|
# we should always use the svc DNS form.
|
|
|
|
# kube-state-metrics: Kubernetes object state (pods, deployments, nodes).
# The KubeContainerRestartingFrequently / KubePodNotReady alerts need it.
- job_name: 'kube-state-metrics'
  scrape_interval: 30s
  static_configs:
    - targets:
        - 'kube-state-metrics.kube-system.svc:8080'
      labels:
        cluster: 'rke2'
|
|
|
|
# cert-manager metrics such as certmanager_certificate_ready_status and
# certmanager_certificate_expiration_timestamp_seconds. They feed the
# CertManagerCertificateNotReady / CertManagerCertificateRenewalFailed
# alerts. Memory: project_cert_manager_prometheus_scrape.
- job_name: 'cert-manager'
  scrape_interval: 30s
  static_configs:
    - targets:
        - 'cert-manager-metrics.cert-manager.svc:9402'
      labels:
        cluster: 'rke2'
|
|
|
|
# Traefik: request rates, latency, TLS cert metadata, router state.
# The ClusterIP service routes each scrape to one of the traefik pods.
# Scraping every pod through the headless `traefik-metrics` selector would
# give better failover visibility, but a single-replica scrape is enough
# for steady state.
- job_name: 'traefik'
  scrape_interval: 15s
  static_configs:
    - targets:
        - 'traefik-metrics.traefik-system.svc:9100'
      labels:
        service: 'traefik'
        cluster: 'rke2'
|
|
|
|
# Longhorn: longhorn_volume_robustness, longhorn_backup_*,
# longhorn_node_status_*. Enables the LonghornVolumeUnhealthy and
# LonghornBackupFailed alerts. Before this job the only Longhorn health
# signal was K8s events, which are noisy transient lifecycle messages
# rather than actionable signals.
- job_name: 'longhorn'
  scrape_interval: 30s
  static_configs:
    - targets:
        - 'longhorn-backend.longhorn-system.svc:9500'
      labels:
        service: 'longhorn'
        cluster: 'rke2'
|
|
|
|
# FC web services behind Traefik: one probe surface that reveals any
# iamworkin.lan host returning a non-200. The https_internal module is used
# because every cert is a step-ca leaf; http_2xx would fail x509
# verification. Some services need an explicit healthcheck path because
# their root returns 404 (acme, guac) or 401 (grafana, prometheus). Either
# drop those or point at the right endpoint — do not widen
# valid_status_codes globally, because a 401 from a healthy pod and a 401
# during an outage look identical.
- job_name: 'probe-traefik-services'
  metrics_path: /probe
  params:
    module:
      - https_internal
  scrape_interval: 60s
  static_configs:
    - targets:
        # Services whose root path is reachable (200 or 3xx).
        - 'https://gitea.iamworkin.lan/'
        - 'https://argocd.iamworkin.lan/'
        - 'https://intranet.iamworkin.lan/'
        - 'https://signage.iamworkin.lan/'
        - 'https://kiosk.iamworkin.lan/'
        - 'https://media.iamworkin.lan/'
        - 'https://mysql.iamworkin.lan/'
        - 'https://php.iamworkin.lan/'
        - 'https://zabbix.iamworkin.lan/'
        - 'https://desktop.iamworkin.lan/'
        - 'https://print.iamworkin.lan/'
        - 'https://dns.iamworkin.lan/'
        - 'https://chat.iamworkin.lan/'
        - 'https://dist.iamworkin.lan/'
        - 'https://dms.iamworkin.lan/'
        - 'https://menuboard.iamworkin.lan/'
        - 'https://messageboard.iamworkin.lan/'
        - 'https://presentations.iamworkin.lan/'
        - 'https://retail.iamworkin.lan/'
        - 'https://ttsreader.iamworkin.lan/'
        # Services probed on an explicit healthcheck path.
        - 'https://fc-llm-bridge.iamworkin.lan/healthz'
        - 'https://acme.iamworkin.lan/health'
        # Services intentionally left OUT of this probe surface:
        #   - grafana.iamworkin.lan: every endpoint (including /api/health
        #     and /login) returns 401 behind Traefik basic-auth. Health is
        #     covered by the in-cluster monitoring-grafana scrape.
        #   - prometheus.iamworkin.lan: same auth pattern. Health is covered
        #     by the prometheus self-scrape job.
        #   - guac.iamworkin.lan: deprecated — Guacamole moved to
        #     desktop.iamworkin.lan/guacamole/ (memory:
        #     feedback_traefik_cross_namespace_refs_disabled).
      labels:
        probe_type: 'traefik-service'
  relabel_configs:
    # Probed URL -> ?target= parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # Only the hostname part of the URL becomes the instance label.
    - source_labels: [__param_target]
      regex: 'https?://([^/:]+).*'
      target_label: instance
    # Actual scrape goes to the blackbox-exporter service.
    - target_label: __address__
      replacement: blackbox-exporter.monitoring.svc:9115
|
|
|
|
# =============================================================================
|
|
# Self-monitoring (K8s monitoring namespace)
|
|
# =============================================================================
|
|
|
|
# Grafana's own /metrics endpoint, via its in-cluster service.
- job_name: 'monitoring-grafana'
  metrics_path: /metrics
  static_configs:
    - targets:
        - 'grafana.monitoring.svc:3000'
      labels:
        instance: 'grafana-k8s'
        service: 'grafana'
|
|
|
|
# The blackbox exporter's own metrics, via its in-cluster service.
- job_name: 'monitoring-blackbox'
  static_configs:
    - targets:
        - 'blackbox-exporter.monitoring.svc:9115'
      labels:
        instance: 'blackbox-k8s'
        service: 'blackbox'
|
|
|
|
recording-rules.yml: |
|
|
groups:
|
|
# Pre-computed node_exporter aggregates, evaluated every 30s.
- name: node-aggregations
  interval: 30s
  rules:
    # CPU busy percentage per instance: 100 minus the 5m average idle share.
    - record: instance:node_cpu_usage:avg5m
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
    # Memory in use as a percentage of total, based on MemAvailable.
    - record: instance:node_memory_usage:percent
      expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
    # Root filesystem usage percentage.
    - record: instance:node_disk_usage:percent
      expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100
    # Receive / transmit rates multiplied by 8 (bytes -> bits), skipping
    # loopback, veth, cali and flannel interfaces.
    - record: instance:node_network_receive:rate5m
      expr: rate(node_network_receive_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8
    - record: instance:node_network_transmit:rate5m
      expr: rate(node_network_transmit_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8
|
|
# Per-service roll-ups of blackbox probe results, evaluated every 30s.
- name: probe-aggregations
  interval: 30s
  rules:
    # 0 as soon as any probe of the service fails.
    - record: service:probe_success:min
      expr: min by(service) (probe_success)
    # Mean probe duration across the service's probes.
    - record: service:probe_duration:avg
      expr: avg by(service) (probe_duration_seconds)
|
|
# Print pipeline rates and ratios, evaluated every 30s.
- name: print-rates
  interval: 30s
  rules:
    # Enqueued jobs per minute (5m rate scaled by 60).
    - record: print:jobs_per_minute:rate5m
      expr: rate(print_jobs_enqueued_total[5m]) * 60
    # Completed-to-enqueued ratio over 5m.
    - record: print:success_rate:ratio5m
      expr: rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m])
    # 95th percentile of the job duration histogram over 5m.
    - record: print:job_duration_p95:5m
      expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m]))
    # Largest remaining Ollama keep-alive per instance and model.
    - record: print:ollama_runner_keepalive_remaining_seconds:max
      expr: max by(instance, model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})
|
|
# Relay and printer usage counters, evaluated every 15s.
# NOTE(review): the Epson rule is a 24h increase() sitting in this 15s
# group; consider a slower group if evaluation cost matters.
- name: relay-rates
  interval: 15s
  rules:
    # Number of relay state changes during the last hour.
    - record: relay:state_changes:1h
      expr: changes(pimanager_relay_state[1h])
    # Growth of the printer's marker life counter over the last 24h.
    - record: epson:pages_per_day:rate24h
      expr: increase(prtMarkerLifeCount{job="snmp-printer"}[24h])
|
|
|
|
alerts.yml: |
|
|
groups:
|
|
- name: ai-stack
|
|
rules:
|
|
# Any probe labelled service="ollama" failing for 2 minutes.
- alert: OllamaDown
  expr: probe_success{service="ollama"} == 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Ollama is down on {{ $labels.deployment }}'
    description: >-
      Ollama API at {{ $labels.instance }} has been unreachable for 2 minutes.
      Agent Zero FAISS memory will fail.
|
|
|
|
# Any probe labelled service="agent-zero" failing for 2 minutes.
- alert: AgentZeroDown
  expr: probe_success{service="agent-zero"} == 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Agent Zero is down on {{ $labels.deployment }}'
    description: >-
      Agent Zero web UI at {{ $labels.instance }} has been unreachable for 2
      minutes.
|
|
|
|
# Ollama probe taking longer than 3 seconds for 5 minutes (informational).
- alert: OllamaSlowResponse
  expr: probe_duration_seconds{service="ollama"} > 3
  for: 5m
  labels:
    severity: info
  annotations:
    summary: 'Ollama responding slowly on {{ $labels.deployment }}'
    description: >-
      Ollama API response time exceeds 3s for 5 minutes. GPU may be
      overloaded.
|
|
|
|
- name: print-services
|
|
rules:
|
|
# The cups scrape job reporting up == 0 for 2 minutes.
- alert: CUPSExporterDown
  expr: up{job="cups"} == 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'CUPS exporter unreachable on edge2'
    description: >-
      cups_exporter at edge2:9628 has been down for 2 minutes. CUPS monitoring
      offline.
|
|
|
|
# The probe-cups blackbox probe failing for 3 minutes.
- alert: CUPSWebUIDown
  expr: probe_success{job="probe-cups"} == 0
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: 'CUPS web UI down on edge2'
    description: >-
      CUPS port 631 unreachable for 3 minutes. Network printing unavailable.
|
|
|
|
# The probe-printweb blackbox probe failing for 2 minutes.
- alert: PrintWebDown
  expr: probe_success{job="probe-printweb"} == 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Print.Web is down on edge2'
    description: >-
      FlowerCore Print.Web at edge2:5200 unreachable. API/MCP/Blazor printing
      unavailable.
|
|
|
|
# At least one CUPS printer reported in state "stopped" for 5 minutes.
- alert: CUPSPrinterStopped
  expr: cups_printer_state_total{state="stopped"} > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'CUPS printer stopped on edge2'
    description: >-
      A CUPS printer has been in stopped state for 5 minutes. Check USB
      connection or paper.
|
|
|
|
# More than 10 active CUPS jobs for 2 minutes.
- alert: CUPSJobBacklog
  expr: cups_job_active_total > 10
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Print queue backlog on edge2 ({{ $value }} active jobs)'
    description: >-
      CUPS has {{ $value }} active jobs queued. Possible printer jam, USB
      disconnect, or paper out.
|
|
|
|
# Paper roll lifecycle alerts (XL Track I, 2026-04-26).
# The source-of-truth gauge is print_paper_remaining_percent (Print.Web
# OTEL, hydrated on startup from the active PaperRoll row).
# alert_channel=thermal_print routes through irc-notify to Print.Web
# /api/print/alert, so the printer announces its own paper-out warning on
# whatever paper it has left: self-referential humor plus operator nudge.
#
# This rule covers the band strictly between 5% and 10% remaining.
- alert: PrintPaperRollLow
  expr: print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5
  for: 5m
  labels:
    severity: warning
    alert_channel: thermal_print
  annotations:
    summary: 'Print roll low on edge2 ({{ $value | printf "%.1f" }}% remaining)'
    description: >-
      NuPrint 210 paper roll has {{ $value | printf "%.1f" }}% remaining.
      Operator should load a fresh roll soon. Run /api/paper/status for the
      precise mm + estimated jobs left.
|
|
|
|
# Paper roll at or below 5% remaining for 2 minutes.
- alert: PrintPaperRollCritical
  expr: print_paper_remaining_percent{job="printweb-otel"} <= 5
  for: 2m
  labels:
    severity: critical
    alert_channel: thermal_print
  annotations:
    summary: 'Print roll critical on edge2 ({{ $value | printf "%.1f" }}% remaining)'
    description: >-
      NuPrint 210 paper roll at {{ $value | printf "%.1f" }}% — load a new
      roll NOW. The 50ft roll has a ~12% red-stripe zone; once paper passes
      that, the printer can run dry mid-job.
|
|
|
|
# The dead-letter counter grew during the last 15 minutes.
- alert: PrintJobDeadLetter
  expr: increase(print_jobs_dead_letter_total[15m]) > 0
  for: 1m
  labels:
    severity: warning
    alert_channel: thermal_print
  annotations:
    summary: 'Print job(s) entered dead-letter on edge2 ({{ $value | printf "%.0f" }} in last 15m)'
    description: >-
      {{ $value | printf "%.0f" }} print job(s) exhausted MaxRetries and need
      operator action. Open /print-log, filter Status=DeadLetter, click 'Retry
      From Start' after fixing the underlying cause (paper jam, USB
      disconnect, printer power-cycle).
|
|
|
|
# More than 30 CUPS jobs per minute, sustained for 5 minutes.
- alert: CUPSHighJobRate
  expr: rate(cups_job_total[5m]) * 60 > 30
  for: 5m
  labels:
    severity: info
  annotations:
    summary: 'High print volume on edge2 ({{ $value | printf "%.0f" }} jobs/min)'
    description: >-
      Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print
      loop.
|
|
|
|
# Print.Web reports an Ollama runner with more than 600s of keep-alive left.
- alert: PrintOllamaRunnerLongKeepAlive
  expr: ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"} > 600
  for: 2m
  labels:
    severity: warning
    alert_channel: thermal_print
  annotations:
    summary: 'Print.Web Ollama runner held for >10m ({{ $labels.model }})'
    description: >-
      Print.Web reports model {{ $labels.model }} with
      {{ $value | printf "%.0f" }}s of keep-alive remaining. Check concurrent
      requests before the Pi 5 Ollama lane thrashes.
|
|
|
|
- name: macmini-runners
|
|
rules:
|
|
# A macmini-* runner reporting online == 0, or no such series at all, for
# 10 minutes. In the absent() branch there is no runner label, so the
# summary's {{ $labels.runner }} renders empty.
- alert: MacMiniRunnerOffline
  expr: (flowercore_github_runner_online{runner=~"macmini-.*"} == 0) or absent(flowercore_github_runner_online{runner=~"macmini-.*"})
  for: 10m
  labels:
    severity: warning
    service: github-runner
  annotations:
    summary: 'Mac mini GitHub runner offline ({{ $labels.runner }})'
    description: >-
      A macmini-* GitHub Actions runner has not reported online for more than
      10 minutes. Puppet manages its LaunchDaemon under
      /Library/LaunchDaemons/io.flowercore.github-runner-<slug>.plist; runners
      survive reboot and do not require a GUI session.
|
|
|
|
- name: linux-runners
|
|
rules:
|
|
# One of the listed github-runner deployments has had zero ready replicas
# for 5 minutes. The regex matches the bare "github-runner" deployment as
# well as each "github-runner-<repo>" variant named in the alternation.
- alert: LinuxRunnerOffline
  expr: |
    kube_deployment_status_replicas_ready{
      namespace="github-runner",
      deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"
    } == 0
  for: 5m
  labels:
    severity: warning
    alert_channel: irc
    service: github-runner
    team: ci
  annotations:
    summary: 'Linux CI runner offline: {{ $labels.deployment }}'
    description: >-
      Deployment {{ $labels.deployment }} in namespace github-runner has 0
      ready replicas for more than 5 minutes. CI jobs targeting this repo will
      queue until the runner pod restarts and re-registers with GitHub. Check
      pods with: kubectl -n github-runner get pods -l
      app.kubernetes.io/name={{ $labels.deployment }}. Check logs with:
      kubectl -n github-runner logs -l
      app.kubernetes.io/name={{ $labels.deployment }} --tail=50. Common
      causes: PAT missing repo access, runner CrashLoopBackOff, or
      node/resource pressure.
|
|
|
|
- name: remote-desktop
|
|
rules:
|
|
# The /health blackbox probe failing for 3 minutes. The instance matcher is
# the full URL because that is the instance label the probe job assigns.
- alert: RemoteDesktopWebDown
  expr: probe_success{job="probe-remotedesktop",instance="https://desktop.iamworkin.lan/health"} == 0
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: 'FlowerCore RemoteDesktop web is down'
    description: >-
      https://desktop.iamworkin.lan/health probe has failed for 3 minutes.
      Catalog + session launch surface offline.
|
|
|
|
# No fc_desktop_session_events_total series present for 10 minutes.
- alert: RemoteDesktopMetricsStale
  expr: absent(fc_desktop_session_events_total)
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: 'RemoteDesktop /metrics scrape returning no data'
    description: >-
      No fc_desktop_session_events_total series for 10 minutes. Either the
      Prometheus scrape target is misconfigured or the web deployment stopped
      exporting metrics. Zabbix template carries the same 10m no-data trigger
      for cross-monitor parity.
|
|
|
|
# PUBLISHER QUIRK: fc_desktop_pool_depleted / _deficit emit one series per
# template per status (Ready/Warming/BelowDesiredSize/Disabled), and the
# series for statuses that are no longer current keep their last value.
# A plain `_depleted > 0` would therefore fire forever for any template
# that ever entered a bad state.
#
# SAFE PATTERN: alert only when the canonical "Ready" status gauge does NOT
# report ready=1 for the enabled template. That gauge is the publisher's
# own canary: _ready{status="Ready"} == 1 is always the current
# "everything is fine" signal.
- alert: RemoteDesktopPoolDepleted
  expr: |
    group by(template) (fc_desktop_pool_ready{enabled="true"})
    unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1)
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'RemoteDesktop pool depleted ({{ $labels.template }})'
    description: >-
      Pool for template {{ $labels.template }} has no Ready warm pod for 5
      minutes. New launches will cold-start. Check pod-scheduling failures,
      image pull issues, or exhausted node capacity.
|
|
|
|
# Uses the same "unless Ready == 1" guard as the pool-depleted rule, but
# starts from deficit series carrying alert_level="Warning", so it only
# fires for templates that explicitly report a Warning-level deficit.
- alert: RemoteDesktopPoolDeficitSustained
  expr: |
    fc_desktop_pool_deficit{enabled="true",alert_level="Warning"} > 0
    unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1)
  for: 10m
  labels:
    severity: info
  annotations:
    summary: 'RemoteDesktop pool {{ $labels.template }} below desired for 10m'
    description: >-
      Pool {{ $labels.template }} has a persistent deficit of {{ $value }}
      warm pods AND no Ready series. Likely image pull, NFS affinity, or
      claim-init issue.
|
|
|
|
# Total launch events above 20 per minute, sustained for 5 minutes.
- alert: RemoteDesktopSessionChurnSpike
  expr: sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20
  for: 5m
  labels:
    severity: info
  annotations:
    summary: 'RemoteDesktop launch rate high ({{ $value | printf "%.0f" }}/min)'
    description: >-
      Launch events exceed 20/min for 5 minutes. Could be a user-facing
      feature launch, a pooled template thrashing, or a runaway automation
      loop.
|
|
|
|
# Fires when no recording-event series has existed for 30 minutes while
# launches were actually counted inside that same 30-minute window.
# The launch gate uses increase() over 30m rather than the raw counter sum:
# a counter total stays > 0 forever after the first launch, so it cannot
# tell "launches are happening" apart from "a launch happened once".
# NOTE(review): absent_over_time() only matches when the recording series
# has no samples at all; if the publisher keeps exporting a flat counter,
# this rule stays silent — confirm against the exporter's behavior.
- alert: RemoteDesktopRecordingEventsDropped
  expr: absent_over_time(fc_desktop_session_events_total{event="recording"}[30m]) and on() (sum(increase(fc_desktop_session_events_total{event="launch"}[30m])) > 0)
  for: 15m
  labels:
    severity: info
  annotations:
    summary: 'RemoteDesktop recording events silent for 30m despite active launches'
    description: >-
      No recording events in 30 minutes while launches are happening.
      Recording may be silently disabled on all templates
      (SessionRecordingEnabled=false), the guacd NFS mount may be unhealthy,
      or the retention sweep isn't emitting events. Not an error by itself —
      worth checking.
|
|
|
|
# Selects by job rather than by hostname: the instance label on this probe
# is the full URL including /health, so a hostname-only instance matcher
# would never match. Fires when the earliest cert expiry is under 2 days
# away (2 * 86400 seconds) for 6 hours.
- alert: RemoteDesktopTlsExpiry
  expr: probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time() < 2 * 86400
  for: 6h
  labels:
    severity: critical
  annotations:
    summary: 'desktop.iamworkin.lan TLS cert expires within 2 days'
    description: >-
      The desktop.iamworkin.lan cert is inside the 2-day renewal window and
      cert-manager has not renewed. Check cert-manager logs, step-ca
      reachability, and pfSense DNS overrides per the ACME DNS-01 gate.
|
|
|
|
# Two conditions joined by `or`, each restricted to RemoteDesktop PVCs:
#   1. the volume's actual size grew by more than 20% versus one hour ago;
#   2. the volume's actual size exceeds 80% of its capacity.
# In both halves the Longhorn series is joined on `volume` to PVC info
# (volumename copied into a `volume` label via label_replace) so the alert
# carries namespace and persistentvolumeclaim. A PVC qualifies either by
# the kube_persistentvolumeclaim_labels managed-by=remotedesktop label or,
# as a fallback, by name inside the fc-desktop namespace.
# clamp_min(..., 1) guards both divisions against a zero denominator.
- alert: LonghornPVCGrowthRapid
  expr: |
    (
      (
        (
          longhorn_volume_actual_size_bytes
          - (longhorn_volume_actual_size_bytes offset 1h)
        )
        / clamp_min(longhorn_volume_actual_size_bytes offset 1h, 1)
      )
      * on(volume) group_left(namespace, persistentvolumeclaim) (
        (
          label_replace(kube_persistentvolumeclaim_info{storageclass="longhorn"}, "volume", "$1", "volumename", "(.+)")
          * on(namespace, persistentvolumeclaim) group_left()
          kube_persistentvolumeclaim_labels{label_flowercore_io_managed_by="remotedesktop"}
        )
        or
        label_replace(kube_persistentvolumeclaim_info{namespace="fc-desktop", storageclass="longhorn", persistentvolumeclaim=~"fc-profile-.*|remotedesktop-data"}, "volume", "$1", "volumename", "(.+)")
      )
    ) > 0.20
    or
    (
      (
        longhorn_volume_actual_size_bytes
        / on(volume) clamp_min(longhorn_volume_capacity_bytes, 1)
      )
      * on(volume) group_left(namespace, persistentvolumeclaim) (
        (
          label_replace(kube_persistentvolumeclaim_info{storageclass="longhorn"}, "volume", "$1", "volumename", "(.+)")
          * on(namespace, persistentvolumeclaim) group_left()
          kube_persistentvolumeclaim_labels{label_flowercore_io_managed_by="remotedesktop"}
        )
        or
        label_replace(kube_persistentvolumeclaim_info{namespace="fc-desktop", storageclass="longhorn", persistentvolumeclaim=~"fc-profile-.*|remotedesktop-data"}, "volume", "$1", "volumename", "(.+)")
      )
    ) > 0.80
  for: 5m
  labels:
    severity: warning
    alert_channel: thermal_print
    service: remotedesktop
  annotations:
    summary: 'RemoteDesktop Longhorn PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} growing rapidly'
    description: >-
      Longhorn volume {{ $labels.volume }} backing RemoteDesktop PVC
      {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} grew more
      than 20% in 1h or is over 80% capacity. Check for runaway
      SQLite/user-profile growth; this alert was added after the 2026-05-16
      RemoteDesktop web SQLite Error 13 incident.
    runbook: >-
      1. kubectl -n {{ $labels.namespace }} describe pvc
      {{ $labels.persistentvolumeclaim }} 2. Open Longhorn UI volume
      {{ $labels.volume }} 3. Check RemoteDesktop web/user-volume SQLite files
      for permission or runaway growth 4. Expand PVC only after confirming the
      writer is healthy
    todo: >-
      2026-05-19 metric gate: live noc1 Prometheus currently exposes
      kube_persistentvolumeclaim_info and
      kube_persistentvolumeclaim_resource_requests_storage_bytes, but not
      longhorn_volume_actual_size_bytes, longhorn_volume_capacity_bytes,
      kube_persistentvolumeclaim_labels, or kubelet_volume_stats_used_bytes.
      Keep the fc-desktop PVC fallback until kube-state-metrics label
      allowlist exposes flowercore.io/managed-by=remotedesktop.
|
|
|
|
- name: pi-fleet
|
|
rules:
|
|
- alert: PiManagerDown
|
|
expr: up{job="pimanager-app"} == 0
|
|
for: 3m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "PiManager down on {{ $labels.instance }}"
|
|
description: "PiManager app on {{ $labels.instance }} ({{ $labels.device }}) unreachable for 3 minutes."
|
|
|
|
- alert: PiCpuTempHigh
|
|
expr: pimanager_cpu_temperature_celsius > 75
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Pi CPU temperature high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"
|
|
|
|
- alert: PiCpuTempCritical
|
|
expr: pimanager_cpu_temperature_celsius > 82
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Pi CPU temperature CRITICAL on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"
|
|
|
|
- alert: PiMemoryHigh
|
|
expr: pimanager_memory_usage_percent > 90
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Pi memory usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
|
|
|
- alert: PiDiskHigh
|
|
expr: pimanager_disk_usage_percent > 85
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Pi disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
|
|
|
# Fires when every relay channel on a Pi reports state 0 while the Pi
# still advertises at least one channel.
# `sum by (instance)` leaves only the `instance` label, so the right-hand
# side has to be matched `on (instance)`: a bare `and` compares complete
# label sets (job, device, ...) and would never pair the two vectors.
- alert: RelayAllOff
  expr: (sum by (instance) (pimanager_relay_state) == 0) and on (instance) (pimanager_relay_channel_count > 0)
  for: 0m
  labels:
    severity: info
  annotations:
    summary: "All relay channels OFF on {{ $labels.instance }}"
|
|
|
|
- alert: PiWifiWeak
|
|
expr: pimanager_wifi_signal_dbm < -75 and pimanager_wifi_signal_dbm != 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Weak WiFi signal on {{ $labels.instance }} ({{ $value }}dBm)"
|
|
|
|
- name: snmp-devices
|
|
rules:
|
|
- alert: EpsonInkLow
|
|
expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 15 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0
|
|
for: 0m
|
|
labels:
|
|
severity: warning
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "Epson ink low: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
|
|
|
|
- alert: EpsonInkCritical
|
|
expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 5 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0
|
|
for: 0m
|
|
labels:
|
|
severity: critical
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "Epson ink CRITICAL: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
|
|
|
|
# for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min
|
|
# of idle and SNMP times out, so 5m for: would page nightly. A
|
|
# genuine printer outage (jam, disconnected) lasts well over 30m.
|
|
- alert: EpsonPrinterDown
|
|
expr: up{job="snmp-printer"} == 0
|
|
for: 30m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)"
|
|
|
|
- alert: SynologyDiskLow
|
|
expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "Synology NAS storage high ({{ $value | printf \"%.1f\" }}%)"
|
|
|
|
- alert: SynologyDown
|
|
expr: up{job="snmp-nas"} == 0
|
|
for: 3m
|
|
labels:
|
|
severity: critical
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "Synology NAS SNMP unreachable"
|
|
|
|
- name: infrastructure
|
|
rules:
|
|
- alert: NodeDown
|
|
expr: up{job=~"node-exporter|rke2-nodes|edge-nodes|k8s-node-exporter"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Node {{ $labels.instance }} is down"
|
|
|
|
- alert: HighCPU
|
|
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High CPU on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
|
|
|
- alert: HighMemory
|
|
expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High memory usage on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
|
|
|
- alert: DiskSpaceLow
|
|
expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 85
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
|
|
|
# Puppet agent + service alerts.
|
|
# Mirror of FlowerCore.Notes/scripts/monitoring/alerts.yml `puppet` group
|
|
# so a future migration to in-cluster Prometheus inherits the ruleset.
|
|
# Source-of-truth for the live Podman Prometheus on noc1 is the Notes file.
|
|
# See feedback_monitoring_k8s_target_vs_live_podman.
|
|
- name: puppet
|
|
rules:
|
|
- alert: PuppetAgentReportStale
|
|
expr: puppet_last_run_age_seconds > 7200
|
|
for: 30m
|
|
labels:
|
|
severity: warning
|
|
alert_channel: irc
|
|
annotations:
|
|
summary: "Puppet agent {{ $labels.instance }} hasn't reported in over 2h"
|
|
description: "Last run age: {{ $value | humanizeDuration }}. The puppet agent on {{ $labels.instance }} may be stopped, the node may be powered off, or noc1 may be unreachable from this node."
|
|
runbook: "1. SSH to node (via noc1 jumpbox if needed) 2. sudo systemctl status puppet 3. sudo puppet agent -t --noop to force a run 4. Check r10k: ssh fcadmin@10.0.56.10 'sudo podman logs openvoxserver --tail 50' 5. Verify noc1 reachability: ping puppet.iamworkin.lan"
|
|
|
|
- alert: PuppetAgentReportCritical
|
|
expr: puppet_last_run_age_seconds > 86400
|
|
for: 1h
|
|
labels:
|
|
severity: critical
|
|
alert_channel: irc
|
|
annotations:
|
|
summary: "Puppet agent {{ $labels.instance }} silent for over 24h — node is unmanaged"
|
|
description: "Last run age: {{ $value | humanizeDuration }}. Node {{ $labels.instance }} has not submitted a Puppet report in over 24 hours. Config drift is accumulating — investigate immediately. If intentional (maintenance), add to the exclusion filter or silence in Grafana."
|
|
runbook: "URGENT: 1. Check node power state 2. SSH via noc1 jumpbox: ssh fcadmin@10.0.56.10 then ssh <node> 3. sudo systemctl status puppet 4. sudo systemctl start puppet + sudo puppet agent -t 5. Check for network partitions (VLAN connectivity to 10.0.56.10) 6. If node was recently reimaged: sudo puppet agent -t to re-register with new SSL cert"
|
|
|
|
# Sprint 33 Cx-7 Phase B (2026-05-25 postmortem follow-up):
|
|
# Detects puppet.service in failed state — distinct from PuppetAgentReportStale
|
|
# which catches "agent hasn't run." This catches "systemd gave up restarting it"
|
|
# (CA-verify loop or other fatal exit). Requires node-exporter systemd collector
|
|
# enabled with --collector.systemd. If `node_systemd_unit_state` has no series
|
|
# for a node, the collector is disabled there — flag in postmortem follow-up.
|
|
- alert: PuppetServiceFailed
|
|
expr: node_systemd_unit_state{name="puppet.service",state="failed"} == 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
alert_channel: irc
|
|
annotations:
|
|
summary: "Puppet service failed on {{ $labels.instance }}"
|
|
description: "puppet.service on {{ $labels.instance }} has been in failed state for 5+ minutes. systemd has stopped auto-restarting (CA-verify-loop or other exit). Manual `systemctl status puppet` confirms. Run `sudo systemctl start puppet` to recover; investigate journal for root cause."
|
|
runbook_url: "https://github.com/astoltz/FlowerCore.Notes/blob/master/memory/feedback_puppet_service_dead_after_ca_loop_alert_misreads.md"
|
|
|
|
# K8s pod-state alerts. Require kube-state-metrics scrape (added
|
|
# 2026-04-26 — see scrape_configs above). Would have surfaced the
|
|
# agent-zero ollama-proxy 172x crash-loop instead of letting it
|
|
# silently churn for ~3 days.
|
|
- name: kubernetes-state
|
|
rules:
|
|
- alert: KubeContainerRestartingFrequently
|
|
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "{{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} restarting >5x/hr"
|
|
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
|
|
|
|
- alert: KubeContainerCrashLooping
|
|
expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "{{ $labels.namespace }}/{{ $labels.pod }} crashlooping ({{ $value | printf \"%.0f\" }} restarts/15m)"
|
|
description: "Container {{ $labels.container }} restarted {{ $value | printf \"%.0f\" }} times in 15 minutes — actively crashlooping."
|
|
|
|
- alert: KubePodNotReady
|
|
expr: sum by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Failed|Unknown"}) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "{{ $labels.namespace }}/{{ $labels.pod }} not Ready for >15m"
|
|
description: "Pod is in a non-Running, non-Succeeded phase for over 15 minutes. Common causes: ImagePullBackOff (registry/Nexus down, wrong image tag), pending PVC, scheduling failure (taint/resources)."
|
|
|
|
- alert: KubePodImagePullBackOff
|
|
expr: sum by(namespace, pod) (kube_pod_container_status_waiting_reason{reason=~"ImagePullBackOff|ErrImagePull"}) > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "{{ $labels.namespace }}/{{ $labels.pod }} ImagePullBackOff for >10m"
|
|
description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
|
|
|
|
# Available replicas differ from the desired count for 15 minutes.
# Operand order matters: a PromQL comparison filter keeps the LEFT-hand
# sample, so putting the available count first makes {{ $value }} the
# number of replicas that are actually available.
- alert: KubeDeploymentReplicasMismatch
  expr: kube_deployment_status_replicas_available != kube_deployment_spec_replicas
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
    description: "{{ $value }} replicas available, which does not match spec.replicas. Likely a rollout stuck on probe failure, scheduling, or PVC."
|
|
|
|
# Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
|
|
# cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
|
|
# outage (21h) hit because no alert fired on the rising multus working
|
|
# set — only downstream blackbox / Traefik / service alerts. With
|
|
# 1Gi limit (bluejay-infra@eb8693e), 80% = ~800MiB; steady-state
|
|
# runs ~150-250MiB so this only fires when an avalanche starts.
|
|
- alert: MultusMemoryPressure
|
|
expr: |
|
|
container_memory_working_set_bytes{container="kube-multus"}
|
|
/ container_spec_memory_limit_bytes{container="kube-multus"} > 0.8
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
alert_channel: thermal_print
|
|
annotations:
|
|
summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m"
|
|
description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)."
|
|
|
|
# Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the
|
|
# operator-leak avalanche pattern BEFORE it cascades into a multus
|
|
# CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder)
|
|
# emitting pods without ownerReferences will accumulate them when
|
|
# the operator crashes. >25 pending pods in any namespace for 30m
|
|
# is the signal to investigate the reconciler.
|
|
- alert: NamespacePendingPodBacklog
|
|
expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25
|
|
for: 30m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m"
|
|
description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade."
|
|
|
|
# Longhorn storage health alerts. Required: longhorn scrape job
|
|
# (added 2026-04-26 — see scrape_configs above). The K8s events
|
|
# for "snapshot becomes not ready to use" are transient lifecycle
|
|
# noise, not actionable — these alerts use the actual Longhorn
|
|
# gauges that reflect persistent state.
|
|
- name: longhorn-storage
|
|
rules:
|
|
# longhorn_volume_robustness is a numeric gauge:
# 0=unknown, 1=healthy, 2=degraded, 3=faulted.
# Detached volumes report 0 — that's normal for unattached PVCs — so
# the rule compares against the exact state value (2) rather than
# "anything that is not healthy".
- alert: LonghornVolumeDegraded
  expr: longhorn_volume_robustness == 2
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "Longhorn volume {{ $labels.volume }} degraded for >15m"
    description: "Volume {{ $labels.volume }} on node {{ $labels.node }} has been degraded (one or more replicas unhealthy) for 15+ minutes. Auto-rebuild may need help — check 'kubectl describe volume.longhorn.io {{ $labels.volume }} -n longhorn-system'."
|
|
|
|
# longhorn_volume_robustness is a numeric gauge (0=unknown, 1=healthy,
# 2=degraded, 3=faulted); 3 means every replica is unavailable.
- alert: LonghornVolumeFaulted
  expr: longhorn_volume_robustness == 3
  for: 5m
  labels:
    severity: critical
    alert_channel: thermal_print
  annotations:
    summary: "Longhorn volume {{ $labels.volume }} FAULTED"
    description: "Volume {{ $labels.volume }} on node {{ $labels.node }} is faulted — all replicas unavailable. Data inaccessible. Manual intervention required."
|
|
|
|
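# No backup in 36h indicates the daily-backup recurringJob is
# silently failing. Allows for one missed run + slack.
# NOTE(review): the expression below subtracts
# (longhorn_backup_state * longhorn_backup_actual_size_bytes) from time(),
# i.e. a byte count rather than a completion timestamp. Confirm which
# Longhorn metric (if any) carries the backup completion time, and
# whether longhorn_backup_state really has a `state` label, before
# relying on this alert.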
|
|
- alert: LonghornBackupStale
|
|
expr: |
|
|
(time() - max by(volume) (longhorn_backup_state{state="Completed"} * on(backup) group_left() longhorn_backup_actual_size_bytes)) > 36 * 3600
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Longhorn volume {{ $labels.volume }} has no completed backup in >36h"
|
|
description: "Daily backup recurringJob (cron 0 2 * * *) appears to have skipped this volume. Check 'kubectl get backups.longhorn.io -n longhorn-system' and the daily-backup CronJob logs."
|
|
|
|
- alert: LonghornNodeUnhealthy
|
|
expr: longhorn_node_status{condition="ready",condition_reason!=""} == 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Longhorn node {{ $labels.node }} not Ready"
|
|
description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."
|
|
|
|
# ============================================================
|
|
# FC Signage Marquee Performance — Track 3 + 8 (2026-05-06)
|
|
# Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
|
|
# Source-of-truth for the live Podman Prometheus on noc1 is the
|
|
# Notes file; this K8s ConfigMap exists so a future migration to
|
|
# in-cluster Prometheus inherits the ruleset automatically.
|
|
# See feedback_monitoring_k8s_target_vs_live_podman.
|
|
# ============================================================
|
|
- name: fc-signage-marquee
|
|
rules:
|
|
- alert: MarqueeDroppedFramesHigh
|
|
expr: |
|
|
(
|
|
sum by (renderer, phase, node_id) (rate(marquee_dropped_frames_total[5m]))
|
|
/
|
|
sum by (renderer, phase, node_id) (rate(marquee_render_latency_ms_count[5m]))
|
|
) > 0.05
|
|
unless on()
|
|
absent_over_time(marquee_dropped_frames_total[7d])
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
service: signage
|
|
alert_channel: irc
|
|
annotations:
|
|
summary: "Marquee dropped-frame rate >5% on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
|
|
description: "Renderer {{ $labels.renderer }} on {{ $labels.node_id }} drops >5% of frames during {{ $labels.phase }}. Animation visibly stuttery."
|
|
|
|
- alert: MarqueeRenderLatencyP99High
|
|
expr: |
|
|
histogram_quantile(
|
|
0.99,
|
|
sum by (renderer, phase, node_id, le) (rate(marquee_render_latency_ms_bucket[5m]))
|
|
) > 16
|
|
unless on()
|
|
absent_over_time(marquee_render_latency_ms_bucket[7d])
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
service: signage
|
|
alert_channel: irc
|
|
annotations:
|
|
summary: "Marquee render latency p99 > 16ms on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
|
|
description: "Per-frame render latency p99 has exceeded the Pi-class 16ms budget for 10 minutes."
|
|
|
|
- alert: MarqueeAnimationDurationDrift
|
|
expr: |
|
|
abs(
|
|
histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m])))
|
|
-
|
|
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
|
|
)
|
|
/
|
|
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
|
|
> 0.10
|
|
unless on()
|
|
absent_over_time(marquee_animation_duration_ms_bucket[7d])
|
|
for: 15m
|
|
labels:
|
|
severity: info
|
|
service: signage
|
|
alert_channel: irc
|
|
annotations:
|
|
summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
|
|
description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Blackbox Exporter Configuration
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: blackbox-config
|
|
namespace: monitoring
|
|
data:
|
|
blackbox.yml: |
|
|
modules:
|
|
http_2xx:
|
|
prober: http
|
|
timeout: 5s
|
|
http:
|
|
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
|
|
valid_status_codes: [200]
|
|
method: GET
|
|
fail_if_body_not_matches_regexp: []
|
|
preferred_ip_protocol: ip4
|
|
http_ollama:
|
|
prober: http
|
|
timeout: 5s
|
|
http:
|
|
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
|
|
valid_status_codes: [200]
|
|
method: GET
|
|
fail_if_body_not_matches_regexp:
|
|
- '"models"'
|
|
preferred_ip_protocol: ip4
|
|
# https_internal — for Traefik-fronted services with step-ca leaf
|
|
# certs. blackbox does not trust the step-ca root CA, so http_2xx
|
|
# against any *.iamworkin.lan host fails with x509 unknown authority.
|
|
# Redirects + multiple status codes are accepted because some hosts
|
|
# 302 to /login or /scalar.
|
|
https_internal:
|
|
prober: http
|
|
timeout: 10s
|
|
http:
|
|
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
|
|
valid_status_codes: [200, 301, 302, 303, 307, 308]
|
|
method: GET
|
|
follow_redirects: true
|
|
preferred_ip_protocol: ip4
|
|
tls_config:
|
|
insecure_skip_verify: true
|
|
|
|
# =============================================================================
|
|
# ConfigMap: IRC Notify Script
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: irc-notify-script
|
|
namespace: monitoring
|
|
data:
|
|
notify.py: |
|
|
#!/usr/bin/env python3
|
|
"""HTTP->IRC alert relay with thermal-printer DIGEST forwarding.
|
|
|
|
Listens on :9119, posts to #alerts on UnrealIRCd, forwards to Print.Web
|
|
/api/print/alert. Thermal printing is BATCHED into hourly digests by
|
|
default so the printer no longer spam-fires per Grafana webhook.
|
|
|
|
Routing (per Grafana webhook alert):
  - IRC: always per-event (operator likes the stream)
  - Thermal printer (always as a digest, never one job per alert):
      * label alert_channel=thermal_print_immediate -> enqueue, and flush
        the digest NOW the first time that fingerprint is seen
      * severity in {critical,disaster,page} OR
        label alert_channel=thermal_print -> enqueue into hourly digest
      * everything else -> IRC only
  - RESOLVED webhooks remove the alert from the digest buffer
|
|
|
|
Env vars (defaults preserve old behavior on first deploy):
|
|
THERMAL_PRINT_ENABLED default "true" - master kill switch
|
|
BATCH_INTERVAL_MIN default "60" - minutes between digest prints
|
|
BATCH_MAX_PENDING default "50" - force-flush threshold
|
|
|
|
HTTP surface:
|
|
POST / - Grafana webhook entry
|
|
POST /flush - manual digest flush (idempotent)
|
|
GET / - status + config + buffer depth + stats
|
|
"""
|
|
import json, os, socket, sys, threading, time
|
|
from collections import defaultdict
|
|
from datetime import datetime, timezone
|
|
from http.server import HTTPServer, BaseHTTPRequestHandler
|
|
from urllib.request import Request, urlopen
|
|
|
|
THERMAL_PRINT_ENABLED = os.environ.get("THERMAL_PRINT_ENABLED", "true").lower() == "true"
|
|
BATCH_INTERVAL_MIN = int(os.environ.get("BATCH_INTERVAL_MIN", "60"))
|
|
BATCH_MAX_PENDING = int(os.environ.get("BATCH_MAX_PENDING", "50"))
|
|
|
|
IRC_HOST = os.environ.get("IRC_HOST", "unrealircd.irc.svc")
|
|
IRC_PORT = int(os.environ.get("IRC_PORT", "6667"))
|
|
IRC_NICK = os.environ.get("IRC_NICK", "grafana-bot")
|
|
IRC_CHANNEL = os.environ.get("IRC_CHANNEL", "#alerts")
|
|
PRINT_WEB_URL = os.environ.get("PRINT_WEB_URL", "http://10.0.57.16:5200/api/print/alert")
|
|
|
|
_buffer_lock = threading.Lock()
|
|
_buffer = {} # fingerprint -> {"alert": dict, "first_seen": float, "last_seen": float}
|
|
_last_flush_time = time.time()
|
|
_stats = {"webhooks_received": 0, "irc_sent": 0, "print_immediate": 0,
|
|
"digest_flushed": 0, "buffer_dedup": 0, "buffer_added": 0,
|
|
"buffer_resolved": 0, "started_at": time.time()}
|
|
|
|
def send_irc(message):
    """Deliver ``message`` to IRC_CHANNEL over a short-lived IRC connection.

    Connects to IRC_HOST:IRC_PORT, registers as IRC_NICK, waits up to 10
    seconds for the 001 welcome numeric (answering server PINGs meanwhile),
    joins the channel, sends every non-blank line of ``message`` as its own
    PRIVMSG and quits.

    Returns True when the lines were sent, False when registration did not
    complete or any error occurred. Never raises; failures are logged to
    stderr. The socket is closed on every path.
    """
    try:
        # The context manager guarantees the socket is closed even when a
        # send/recv below raises (the original leaked it on errors).
        with socket.create_connection((IRC_HOST, IRC_PORT), timeout=15) as sock:
            sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
            sock.sendall(f"USER {IRC_NICK} 0 * :Grafana Alert Bot\r\n".encode())

            registered = False
            deadline = time.time() + 10
            received = ""   # everything seen so far, used for the 001 check
            partial = ""    # unterminated tail carried over between recv() calls
            while time.time() < deadline:
                try:
                    data = sock.recv(4096).decode("utf-8", errors="replace")
                except socket.timeout:
                    break
                if not data:
                    break
                received += data
                # Handle each complete line exactly once so a PING is not
                # answered again on every later recv().
                *complete, partial = (partial + data).split("\r\n")
                for line in complete:
                    if line.startswith("PING"):
                        # partition() tolerates a PING without a token.
                        token = line.partition(" ")[2]
                        sock.sendall(f"PONG {token}\r\n".encode())
                if " 001 " in received:
                    registered = True
                    break
            if not registered:
                return False

            sock.sendall(f"JOIN {IRC_CHANNEL}\r\n".encode())
            time.sleep(0.5)
            sock.recv(4096)  # drain the JOIN reply
            for line in message.split("\n"):
                # Alert text comes from the webhook: strip CR/NUL so it
                # cannot terminate the PRIVMSG early and inject a command.
                text = line.replace("\r", "").replace("\0", "")
                if text.strip():
                    sock.sendall(f"PRIVMSG {IRC_CHANNEL} :{text}\r\n".encode())
                    time.sleep(0.3)  # pacing between lines
            time.sleep(0.5)
            sock.sendall(b"QUIT :alert delivered\r\n")
        _stats["irc_sent"] += 1
        return True
    except Exception as e:
        print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
        return False
|
|
|
|
def post_thermal(payload, kind):
    """POST ``payload`` as JSON to PRINT_WEB_URL.

    ``kind`` is a label used in log lines; the value "immediate" also bumps
    the ``print_immediate`` counter. Returns True on a successful request,
    False when thermal printing is disabled or the request failed. Never
    raises; failures are logged to stderr.
    """
    if not THERMAL_PRINT_ENABLED:
        print(f"[irc-notify] thermal disabled; skip {kind} ({payload.get('title','?')[:40]})", file=sys.stderr)
        return False
    try:
        req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
                      headers={"Content-Type": "application/json"}, method="POST")
        # Close the response explicitly; the body is not needed, but leaving
        # it open keeps the connection alive until garbage collection.
        with urlopen(req, timeout=10):
            pass
        if kind == "immediate":
            _stats["print_immediate"] += 1
        print(f"[irc-notify] thermal {kind} sent: {payload.get('title','?')[:50]}", file=sys.stderr)
        return True
    except Exception as e:
        print(f"[irc-notify] thermal {kind} failed: {e}", file=sys.stderr)
        return False
|
|
|
|
def fingerprint_of(alert):
    """Return the key that identifies ``alert`` in the digest buffer.

    A non-empty ``fingerprint`` field wins. Otherwise the key is
    ``alertname/namespace/target`` where target is the first truthy label
    among pod, instance, deployment, statefulset, namespace (empty string
    when none is set).
    """
    supplied = alert.get("fingerprint", "")
    if supplied:
        return supplied

    labels = alert.get("labels", {})
    target = ""
    for key in ("pod", "instance", "deployment", "statefulset", "namespace"):
        candidate = labels.get(key)
        if candidate:
            target = candidate
            break
    return f"{labels.get('alertname','?')}/{labels.get('namespace','')}/{target}"
|
|
|
|
def is_critical(alert):
    """True when the alert's ``severity`` label is critical, disaster or page (case-insensitive)."""
    severity = alert.get("labels", {}).get("severity", "")
    return severity.lower() in ("critical", "disaster", "page")
|
|
|
|
def is_immediate_label(alert):
    """True when the alert carries ``alert_channel=thermal_print_immediate``."""
    channel = alert.get("labels", {}).get("alert_channel")
    return channel == "thermal_print_immediate"
|
|
|
|
def is_batched_label(alert):
    """True when the alert carries ``alert_channel=thermal_print``."""
    channel = alert.get("labels", {}).get("alert_channel")
    return channel == "thermal_print"
|
|
|
|
def add_to_digest(alert):
    """Record ``alert`` in the digest buffer, keyed by its fingerprint.

    Returns True only when a new fingerprint was inserted. Returns False
    when thermal printing is disabled, when the alert is a resolution (the
    matching entry, if any, is removed) or when the fingerprint was already
    buffered (its stored alert and ``last_seen`` are refreshed).
    """
    if not THERMAL_PRINT_ENABLED:
        return False

    key = fingerprint_of(alert)
    resolved = alert.get("status", "firing").lower() == "resolved"

    with _buffer_lock:
        existing = _buffer.get(key)

        if resolved:
            if existing is not None:
                del _buffer[key]
                _stats["buffer_resolved"] += 1
            return False

        if existing is not None:
            # Repeat webhook for an alert that is already queued.
            existing["last_seen"] = time.time()
            existing["alert"] = alert
            _stats["buffer_dedup"] += 1
            return False

        _buffer[key] = {"alert": alert, "first_seen": time.time(), "last_seen": time.time()}
        _stats["buffer_added"] += 1
        return True
|
|
|
|
def build_digest_payload():
    """Render the buffered alerts into one Print.Web payload.

    Returns None when the buffer is empty. Otherwise returns a dict whose
    ``message`` has one line per alertname (sorted), listing the severities
    seen, the count and up to five targets. The buffer is only read here,
    never modified.
    """
    with _buffer_lock:
        entries = list(_buffer.values())
    if not entries:
        return None

    def _target_of(labels):
        # First truthy identifying label, "?" when none is present.
        for key in ("pod", "instance", "deployment", "statefulset", "namespace"):
            if labels.get(key):
                return labels[key]
        return "?"

    grouped = defaultdict(list)
    for entry in entries:
        alertname = entry["alert"].get("labels", {}).get("alertname", "Unknown")
        grouped[alertname].append(entry)

    lines = []
    for name in sorted(grouped):
        group = grouped[name]
        targets = [_target_of(member["alert"].get("labels", {})) for member in group[:5]]
        more = f" (+{len(group)-5})" if len(group) > 5 else ""
        sevs = sorted({member["alert"].get("labels", {}).get("severity", "warning") for member in group})
        lines.append(f"[{'/'.join(sevs)}] {name} x{len(group)}: {', '.join(targets)}{more}")

    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    title = f"Alert digest: {len(entries)} firing"
    header = [f"=== {title} ===", f"as of {now}", ""]
    footer = [
        "",
        "Stream: #alerts (IRC) | Triage: grafana-noc1.iamworkin.lan",
        "Force-flush: POST irc-notify.monitoring.svc:9119/flush",
    ]
    body = "\n".join(header + lines + footer)

    return {
        "title": title,
        "severity": "Warning",
        "host": "monitoring",
        "message": body,
        "eventId": f"digest-{int(time.time())}",
        "source": "Grafana digest",
        "status": "PROBLEM",
        "acknowledged": False,
    }
|
|
|
|
def flush_digest():
    """Print the current digest and drop the alerts it covered.

    Returns True when a digest was sent, False when the buffer was empty or
    the print request failed. As before, buffered alerts are dropped whether
    or not the print succeeded; a failed print is now logged explicitly.

    Only fingerprints that existed before the digest was rendered are
    removed. Clearing the whole buffer afterwards would also wipe alerts
    that arrived while the (up to 10 s) print request was in flight, and
    those would never reach paper.
    """
    # Snapshot taken BEFORE rendering: an alert that slips in between this
    # line and the render is at worst printed twice, never lost.
    with _buffer_lock:
        covered = set(_buffer)

    payload = build_digest_payload()
    if payload is None:
        print("[irc-notify] flush: buffer empty, no digest sent", file=sys.stderr)
        return False

    sent = post_thermal(payload, "digest")

    with _buffer_lock:
        for fp in covered:
            _buffer.pop(fp, None)  # may already be gone via a RESOLVED webhook

    if sent:
        _stats["digest_flushed"] += 1
    else:
        print(f"[irc-notify] flush: digest not printed; dropped {len(covered)} buffered alert(s)", file=sys.stderr)
    return sent
|
|
|
|
def digest_loop():
    """Background loop that flushes the digest on schedule or when full.

    Every 15 seconds: flush when BATCH_INTERVAL_MIN minutes have passed
    since the last flush, or when the buffer holds BATCH_MAX_PENDING or more
    entries. Any exception is logged and followed by a 60 second pause; the
    loop never exits.
    """
    global _last_flush_time
    while True:
        try:
            tick = time.time()
            notice = None
            if tick - _last_flush_time >= BATCH_INTERVAL_MIN * 60:
                notice = f"[irc-notify] digest tick: interval reached ({BATCH_INTERVAL_MIN}m); buffer={len(_buffer)}"
            elif len(_buffer) >= BATCH_MAX_PENDING:
                notice = f"[irc-notify] digest tick: buffer full ({len(_buffer)}); force flush"

            if notice is not None:
                print(notice, file=sys.stderr)
                flush_digest()
                # Stamp with the time sampled before the flush started.
                _last_flush_time = tick
            time.sleep(15)
        except Exception as e:
            print(f"[irc-notify] digest loop error: {e}", file=sys.stderr)
            time.sleep(60)
|
|
|
|
class Handler(BaseHTTPRequestHandler):
|
|
def do_POST(self):
|
|
if self.path == "/flush":
|
|
ok = flush_digest()
|
|
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
|
|
self.wfile.write(json.dumps({"flushed": ok, "buffer_after": len(_buffer)}).encode())
|
|
return
|
|
_stats["webhooks_received"] += 1
|
|
length = int(self.headers.get("Content-Length", 0))
|
|
body = json.loads(self.rfile.read(length)) if length else {}
|
|
for alert in body.get("alerts", []):
|
|
status = alert.get("status", "unknown").upper()
|
|
labels = alert.get("labels", {})
|
|
name = labels.get("alertname", "Unknown")
|
|
summary = alert.get("annotations", {}).get("summary", "")
|
|
desc = alert.get("annotations", {}).get("description", "")
|
|
severity = labels.get("severity", "")
|
|
icon = "\x0304[FIRING]\x03" if status == "FIRING" else "\x0303[RESOLVED]\x03"
|
|
sev_tag = f" [{severity}]" if severity else ""
|
|
msg = f"{icon}{sev_tag} {name}: {summary}"
|
|
if desc: msg += f"\n {desc}"
|
|
send_irc(msg)
|
|
# Thermal routing — EVERYTHING (including criticals) goes into
|
|
# the hourly digest. Only the explicit `alert_channel=thermal_print_immediate`
|
|
# label bypasses, and even that flushes-the-current-digest rather
|
|
# than printing a standalone job, so the same fingerprint can't
|
|
# spam the printer per webhook cycle.
|
|
if status == "RESOLVED":
|
|
add_to_digest(alert) # removes from buffer
|
|
continue
|
|
if is_immediate_label(alert):
|
|
# Explicit opt-in for "paper this NOW" — first arrival of a
|
|
# new fingerprint triggers an immediate digest flush; repeat
|
|
# webhooks for the same fingerprint dedupe in the buffer
|
|
# until the next interval or until the alert resolves.
|
|
new_in_buffer = add_to_digest(alert)
|
|
if new_in_buffer:
|
|
global _last_flush_time
|
|
flush_digest()
|
|
_last_flush_time = time.time()
|
|
elif is_critical(alert) or is_batched_label(alert):
|
|
add_to_digest(alert)
|
|
# else: IRC-only (warnings without thermal_print label)
|
|
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
|
|
self.wfile.write(b'{"status":"ok"}')
|
|
|
|
def do_GET(self):
|
|
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
|
|
with _buffer_lock:
|
|
alertnames = sorted({it["alert"].get("labels", {}).get("alertname", "?") for it in _buffer.values()})
|
|
depth = len(_buffer)
|
|
info = {
|
|
"service": "irc-notify",
|
|
"config": {"thermal_print_enabled": THERMAL_PRINT_ENABLED,
|
|
"batch_interval_min": BATCH_INTERVAL_MIN,
|
|
"batch_max_pending": BATCH_MAX_PENDING,
|
|
"irc_target": f"{IRC_HOST}:{IRC_PORT} {IRC_CHANNEL}",
|
|
"print_web_url": PRINT_WEB_URL},
|
|
"buffer": {"depth": depth, "alertnames": alertnames,
|
|
"seconds_since_last_flush": int(time.time() - _last_flush_time),
|
|
"seconds_until_next_flush": max(0, int(BATCH_INTERVAL_MIN*60 - (time.time() - _last_flush_time)))},
|
|
"stats": _stats,
|
|
}
|
|
self.wfile.write(json.dumps(info, indent=2).encode())
|
|
|
|
def log_message(self, format, *args):
    """Write a compact one-line log entry to stderr.

    Only the first positional argument is printed, prefixed with
    ``[irc-notify]``; the format string itself is not interpolated.
    When the method is called with a format string and no arguments,
    the format string is printed instead of raising ``IndexError``.

    Args:
        format: printf-style format string supplied by the caller.
        *args: values belonging to ``format``; only ``args[0]`` is used.
    """
    # NOTE(review): for http.server access-log calls args[0] is presumably
    # the request line - confirm against the base handler in use.
    detail = args[0] if args else format
    print(f"[irc-notify] {detail}", file=sys.stderr)
|
|
|
|
if __name__ == "__main__":
    # Background digest worker; daemon=True so it never blocks interpreter exit.
    digest_worker = threading.Thread(target=digest_loop, daemon=True)
    digest_worker.start()

    # Bind the webhook/status listener on all interfaces, port 9119.
    httpd = HTTPServer(("0.0.0.0", 9119), Handler)

    # Startup banner summarising the effective configuration.
    banner = f"[irc-notify] :9119 -> IRC {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} | thermal={'ON' if THERMAL_PRINT_ENABLED else 'OFF'} | digest={BATCH_INTERVAL_MIN}m max={BATCH_MAX_PENDING}"
    print(banner, file=sys.stderr)

    # Blocks here handling requests until the process is stopped.
    httpd.serve_forever()
|
|
|
|
# =============================================================================
|
|
# SNMP Exporter Auth Secret
|
|
# =============================================================================
|
|
# The full snmp.yml (~2MB, auto-generated) exceeds the 1MB ConfigMap limit.
|
|
# Strategy: store SNMP auth credentials in a Secret, and use an init container
|
|
# to download the full snmp.yml from Gitea, or split into multiple ConfigMaps.
|
|
# For now, we mount a minimal auth-only config and rely on the default modules
|
|
# bundled in the snmp-exporter image. To use custom modules, apply
|
|
# snmp-config.yaml separately (see comments in that file).
|
|
---
# Static SNMP credentials for the monitoring namespace.
# NOTE(review): these values are committed in plaintext; grafana-credentials
# in this same file is synced from 1Password instead - consider doing the
# same here and rotating the credentials.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  namespace: monitoring
  name: snmp-auth
stringData:
  # SNMP v2 community string used by prometheus scrape configs
  SNMP_COMMUNITY_BLUEJAY: 'bluejay_monitor'
  # SNMP v3 user name plus its auth and priv passphrases (per the key names)
  SNMP_V3_USER: 'bluejay_snmpv3'
  SNMP_V3_AUTH_PASS: 'BlueJay-SNMP-Auth-2026'
  SNMP_V3_PRIV_PASS: 'BlueJay-SNMP-Priv-2026'
|
|
|
|
# =============================================================================
|
|
# Grafana Credentials — synced from 1Password via Operator
|
|
# =============================================================================
|
|
# 1Password vault: IAmWorkin > "Grafana"
|
|
# Creates K8s Secret "grafana-credentials" with fields: username, password
|
|
# Grafana Deployment uses secretKeyRef to map username→GF_SECURITY_ADMIN_USER, password→GF_SECURITY_ADMIN_PASSWORD
|
|
---
# Custom resource for the 1Password operator: points at the "Grafana" item
# in the IAmWorkin vault. The resource name doubles as the name of the
# resulting Secret (grafana-credentials).
apiVersion: onepassword.com/v1
kind: OnePasswordItem
metadata:
  namespace: monitoring
  name: grafana-credentials
spec:
  itemPath: 'vaults/IAmWorkin/items/Grafana'
|
|
|
|
# =============================================================================
|
|
# RBAC: ServiceAccount + ClusterRole for Prometheus K8s SD
|
|
# =============================================================================
|
|
---
# Identity the Prometheus pod runs as; bound to the cluster-wide read role
# by the ClusterRoleBinding named "prometheus".
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: monitoring
  name: prometheus
|
|
---
# Read-only (get/list/watch) access for Prometheus Kubernetes service
# discovery, plus GET on the /metrics non-resource URL.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core API group: discovery targets.
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs:
      - get
      - list
      - watch
  # Ingress objects, under both API groups listed here.
  - apiGroups:
      - extensions
      - networking.k8s.io
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  # Non-resource endpoint.
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
|
|
---
# Grants the "prometheus" ClusterRole to the "prometheus" ServiceAccount
# in the monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
subjects:
  - kind: ServiceAccount
    namespace: monitoring
    name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
|
|
|
|
# =============================================================================
|
|
# PVC: Prometheus Data (10Gi, Longhorn)
|
|
# =============================================================================
|
|
---
# 10Gi ReadWriteOnce volume from the longhorn storage class; claimed by the
# Prometheus Deployment as its "data" volume.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: monitoring
  name: prometheus-data
spec:
  storageClassName: longhorn
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
|
|
|
|
# =============================================================================
|
|
# PVC: Grafana Data (2Gi, Longhorn)
|
|
# =============================================================================
|
|
---
# 2Gi ReadWriteOnce volume from the longhorn storage class for Grafana data.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: monitoring
  name: grafana-data
spec:
  storageClassName: longhorn
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 2Gi
|
|
|
|
# =============================================================================
|
|
# Deployment: Prometheus
|
|
# =============================================================================
|
|
---
# Single-replica Prometheus server.
# - strategy Recreate: the old pod is stopped before the new one starts,
#   which the ReadWriteOnce data volume requires.
# - Runs as UID/GID 65534 with a matching fsGroup on the data volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      securityContext:
        fsGroup: 65534  # nobody
        runAsUser: 65534
        runAsGroup: 65534
      containers:
        - name: prometheus
          # NOTE(review): ":latest" is a moving tag; pin a specific version
          # or digest once the desired release has been confirmed.
          image: docker.io/prom/prometheus:latest
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention.time=90d"
            # Enables the HTTP reload endpoint (POST /-/reload).
            - "--web.enable-lifecycle"
          ports:
            - containerPort: 9090
              name: http
          volumeMounts:
            # The whole ConfigMap is mounted as a directory rather than as
            # per-file subPath mounts: subPath mounts never receive
            # ConfigMap updates, so a reload could not pick up new config
            # or rules. prometheus.yml, alerts.yml and recording-rules.yml
            # keep the same /etc/prometheus/<name> paths.
            - name: config
              mountPath: /etc/prometheus
              readOnly: true
            - name: data
              mountPath: /prometheus
          resources:
            requests:
              cpu: 200m
              memory: 512Mi
            limits:
              cpu: "1"
              memory: 2Gi
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 15
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: config
          configMap:
            name: prometheus-config
        - name: data
          persistentVolumeClaim:
            claimName: prometheus-data
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Grafana Dashboard Provider
|
|
# =============================================================================
|
|
---
# Grafana dashboard provisioning: a single file-based provider that loads
# dashboards from /var/lib/grafana/dashboards, rescans every 30 seconds and
# derives folders from the directory structure.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: monitoring
  name: grafana-dashboard-provider
data:
  default.yml: |
    apiVersion: 1
    providers:
      - name: 'default'
        orgId: 1
        folder: ''
        type: file
        disableDeletion: false
        updateIntervalSeconds: 30
        options:
          path: /var/lib/grafana/dashboards
          foldersFromFilesStructure: true
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Grafana Dashboards (AI Stack Health)
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboards
|
|
namespace: monitoring
|
|
data:
|
|
ai-stack-health.json: |
|
|
{
|
|
"id": null,
|
|
"panels": [
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"0": { "color": "red", "text": "DOWN" },
|
|
"1": { "color": "green", "text": "UP" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
],
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "red", "value": null },
|
|
{ "color": "green", "value": 1 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
|
|
"id": 1,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_success{job=\"probe-ollama-local\"}",
|
|
"legendFormat": "Status"
|
|
}
|
|
],
|
|
"title": "Ollama (Local)",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"0": { "color": "red", "text": "DOWN" },
|
|
"1": { "color": "green", "text": "UP" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
],
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "red", "value": null },
|
|
{ "color": "green", "value": 1 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
|
|
"id": 2,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_success{job=\"probe-ollama-edge1\"}",
|
|
"legendFormat": "Status"
|
|
}
|
|
],
|
|
"title": "Ollama (Edge1)",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"0": { "color": "red", "text": "DOWN" },
|
|
"1": { "color": "green", "text": "UP" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
],
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "red", "value": null },
|
|
{ "color": "green", "value": 1 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
|
|
"id": 3,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_success{job=\"probe-agentzero-local\"}",
|
|
"legendFormat": "Status"
|
|
}
|
|
],
|
|
"title": "Agent Zero (Local)",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"0": { "color": "red", "text": "DOWN" },
|
|
"1": { "color": "green", "text": "UP" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
],
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "red", "value": null },
|
|
{ "color": "green", "value": 1 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
|
|
"id": 4,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_success{job=\"probe-agentzero-nuc\"}",
|
|
"legendFormat": "Status"
|
|
}
|
|
],
|
|
"title": "Agent Zero (NUC)",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": { "fillOpacity": 15, "lineWidth": 2 },
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 1 },
|
|
{ "color": "red", "value": 3 }
|
|
]
|
|
},
|
|
"unit": "s"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
|
|
"id": 5,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_duration_seconds{service=\"ollama\"}",
|
|
"legendFormat": "{{ deployment }}"
|
|
}
|
|
],
|
|
"title": "Ollama Response Time",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": { "fillOpacity": 15, "lineWidth": 2 },
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 1 },
|
|
{ "color": "red", "value": 3 }
|
|
]
|
|
},
|
|
"unit": "s"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
|
|
"id": 6,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_duration_seconds{service=\"agent-zero\"}",
|
|
"legendFormat": "{{ deployment }}"
|
|
}
|
|
],
|
|
"title": "Agent Zero Response Time",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": { "fillOpacity": 30, "lineWidth": 1, "stacking": { "mode": "none" } },
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"0": { "text": "DOWN" },
|
|
"1": { "text": "UP" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
],
|
|
"max": 1,
|
|
"min": 0,
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "red", "value": null },
|
|
{ "color": "green", "value": 1 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 12 },
|
|
"id": 7,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_success{service=\"ollama\"}",
|
|
"legendFormat": "Ollama ({{ deployment }})"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_success{service=\"agent-zero\"}",
|
|
"legendFormat": "Agent Zero ({{ deployment }})"
|
|
}
|
|
],
|
|
"title": "Uptime History",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": { "fillOpacity": 10, "lineWidth": 2 },
|
|
"max": 100,
|
|
"min": 0,
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 75 },
|
|
{ "color": "red", "value": 90 }
|
|
]
|
|
},
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
|
|
"id": 8,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "100 - (avg(rate(node_cpu_seconds_total{instance=\"edge1\",mode=\"idle\"}[5m])) * 100)",
|
|
"legendFormat": "CPU %"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "(1 - node_memory_MemAvailable_bytes{instance=\"edge1\"} / node_memory_MemTotal_bytes{instance=\"edge1\"}) * 100",
|
|
"legendFormat": "Memory %"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "(1 - node_filesystem_avail_bytes{instance=\"edge1\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=\"edge1\",mountpoint=\"/\"}) * 100",
|
|
"legendFormat": "Disk %"
|
|
}
|
|
],
|
|
"title": "Edge1 Pi 5 \u2014 CPU / Memory / Disk",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": { "fillOpacity": 10, "lineWidth": 2 },
|
|
"unit": "s"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
|
|
"id": 9,
|
|
"targets": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"expr": "probe_dns_lookup_time_seconds",
|
|
"legendFormat": "{{ job }}"
|
|
}
|
|
],
|
|
"title": "Probe DNS Lookup Time",
|
|
"type": "timeseries"
|
|
}
|
|
],
|
|
"refresh": "30s",
|
|
"schemaVersion": 39,
|
|
"tags": ["ai", "ollama", "agent-zero", "blue-jay"],
|
|
"time": { "from": "now-1h", "to": "now" },
|
|
"timezone": "browser",
|
|
"title": "AI Stack Health",
|
|
"uid": "ai-stack-health",
|
|
"version": 1
|
|
}
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Grafana Dashboard — Edge Nodes
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboard-edge-nodes
|
|
namespace: monitoring
|
|
data:
|
|
bluejay-edge-nodes.json: |
|
|
{
|
|
"id": null,
|
|
"panels": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"0": {
|
|
"color": "red",
|
|
"text": "DOWN"
|
|
},
|
|
"1": {
|
|
"color": "green",
|
|
"text": "UP"
|
|
}
|
|
},
|
|
"type": "value"
|
|
}
|
|
]
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
|
|
"targets": [
|
|
{
|
|
"expr": "up{instance=~\"edge.*\"}",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Edge Node Status",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge1.*\"}[5m])) * 100)",
|
|
"legendFormat": "CPU %",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "node_load1{instance=~\"edge1.*\"}",
|
|
"legendFormat": "Load 1m",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"title": "edge1 (Pi5 + Hailo) CPU",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge2.*\"}[5m])) * 100)",
|
|
"legendFormat": "CPU %",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "node_load1{instance=~\"edge2.*\"}",
|
|
"legendFormat": "Load 1m",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"title": "edge2 (Pi4) CPU",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
|
|
"targets": [
|
|
{
|
|
"expr": "(1 - node_memory_MemAvailable_bytes{instance=~\"edge.*\"} / node_memory_MemTotal_bytes{instance=~\"edge.*\"}) * 100",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Edge Memory Usage",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (node_filesystem_avail_bytes{instance=~\"edge.*\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=~\"edge.*\",mountpoint=\"/\"} * 100)",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Edge Disk Usage",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "celsius"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
|
|
"targets": [
|
|
{
|
|
"expr": "node_hwmon_temp_celsius{instance=~\"edge.*\"}",
|
|
"legendFormat": "{{instance}} {{chip}} {{sensor}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Edge CPU Temperature",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "bps"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
|
|
"targets": [
|
|
{
|
|
"expr": "rate(node_network_receive_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8",
|
|
"legendFormat": "{{instance}} {{device}} RX",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "-rate(node_network_transmit_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8",
|
|
"legendFormat": "{{instance}} {{device}} TX",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"title": "Edge Network Traffic",
|
|
"type": "timeseries"
|
|
}
|
|
],
|
|
"refresh": "30s",
|
|
"schemaVersion": 40,
|
|
"tags": ["bluejay", "edge"],
|
|
"timezone": "browser",
|
|
"title": "BlueJay Edge Nodes",
|
|
"uid": "7e1603b9-e918-4b3f-a22b-163132fd5cee"
|
|
}
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Grafana Dashboard — Network Overview
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboard-network-overview
|
|
namespace: monitoring
|
|
data:
|
|
bluejay-network-overview.json: |
|
|
{
|
|
"id": null,
|
|
"panels": [
|
|
{
  "datasource": {
    "type": "prometheus",
    "uid": "prometheus"
  },
  "fieldConfig": {
    "defaults": {
      "noValue": "0",
      "thresholds": {
        "steps": [
          { "color": "green", "value": null }
        ]
      }
    }
  },
  "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
  "targets": [
    {
      "expr": "count(up == 1)",
      "legendFormat": "Up",
      "refId": "A"
    },
    {
      "expr": "count(up == 0) or vector(0)",
      "legendFormat": "Down",
      "refId": "B"
    }
  ],
  "title": "Target Health",
  "type": "stat"
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 4,
|
|
"min": 0,
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 2 },
|
|
{ "color": "red", "value": 3 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 4 },
|
|
"targets": [
|
|
{
|
|
"expr": "laLoad{instance=\"10.0.56.1\", laNames=\"Load-1\"}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "pfSense CPU Load (1m)",
|
|
"type": "gauge"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 70 },
|
|
{ "color": "red", "value": 90 }
|
|
]
|
|
},
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 6, "w": 6, "x": 6, "y": 4 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (memAvailReal{instance=\"10.0.56.1\"} / memTotalReal{instance=\"10.0.56.1\"} * 100)",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "pfSense Memory Used %",
|
|
"type": "gauge"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 4 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=\"noc1\"}[5m])) * 100)",
|
|
"legendFormat": "CPU %",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "noc1 CPU Usage",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 },
|
|
"targets": [
|
|
{
|
|
"expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Node Memory Usage",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 70 },
|
|
{ "color": "red", "value": 90 }
|
|
]
|
|
},
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Node Disk Usage %",
|
|
"type": "bargauge"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "bps"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 },
|
|
"targets": [
|
|
{
|
|
"expr": "rate(node_network_receive_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8",
|
|
"legendFormat": "{{instance}} {{device}} RX",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "-rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8",
|
|
"legendFormat": "{{instance}} {{device}} TX",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"title": "Network Traffic",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 26 },
|
|
"targets": [
|
|
{
|
|
"expr": "up",
|
|
"format": "table",
|
|
"instant": true,
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Prometheus Targets",
|
|
"type": "table"
|
|
}
|
|
],
|
|
"refresh": "30s",
|
|
"schemaVersion": 40,
|
|
"tags": ["bluejay", "network"],
|
|
"timezone": "browser",
|
|
"title": "BlueJay Network Overview",
|
|
"uid": "b83a122c-6eb5-4fae-a632-77cbf753ad05"
|
|
}
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Grafana Dashboard — Operations
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboard-operations
|
|
namespace: monitoring
|
|
data:
|
|
bluejay-operations.json: |
|
|
{
|
|
"annotations": {
|
|
"list": []
|
|
},
|
|
"id": null,
|
|
"panels": [
|
|
{
|
|
"collapsed": false,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
|
"title": "Infrastructure Overview",
|
|
"type": "row"
|
|
},
|
|
{
  "datasource": {
    "type": "prometheus",
    "uid": "prometheus"
  },
  "fieldConfig": {
    "defaults": {
      "noValue": "0",
      "thresholds": {
        "steps": [
          { "color": "green", "value": null }
        ]
      }
    }
  },
  "gridPos": { "h": 4, "w": 8, "x": 0, "y": 1 },
  "targets": [
    {
      "expr": "count(up == 1)",
      "legendFormat": "Up",
      "refId": "A"
    },
    {
      "expr": "count(up == 0) or vector(0)",
      "legendFormat": "Down",
      "refId": "B"
    }
  ],
  "title": "All Targets Up/Down",
  "type": "stat"
},
|
|
{
|
|
"datasource": {
|
|
"type": "alexanderzobnin-zabbix-datasource",
|
|
"uid": "bffjila3zkdfka"
|
|
},
|
|
"gridPos": { "h": 8, "w": 16, "x": 8, "y": 1 },
|
|
"targets": [
|
|
{
|
|
"application": { "filter": "" },
|
|
"group": { "filter": "/.*/" },
|
|
"host": { "filter": "/.*/" },
|
|
"queryType": 5,
|
|
"refId": "A",
|
|
"trigger": { "filter": "/.*/" }
|
|
}
|
|
],
|
|
"title": "Zabbix Active Problems",
|
|
"type": "alexanderzobnin-zabbix-triggers-panel"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 },
|
|
"targets": [
|
|
{
|
|
"expr": "node_load1{instance=\"noc1\"}",
|
|
"legendFormat": "1m",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "node_load5{instance=\"noc1\"}",
|
|
"legendFormat": "5m",
|
|
"refId": "B"
|
|
},
|
|
{
|
|
"expr": "node_load15{instance=\"noc1\"}",
|
|
"legendFormat": "15m",
|
|
"refId": "C"
|
|
}
|
|
],
|
|
"title": "noc1 Load Average",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"collapsed": false,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 },
|
|
"title": "Kubernetes & Services",
|
|
"type": "row"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {},
|
|
"overrides": [
|
|
{
|
|
"matcher": { "id": "byName", "options": "Value" },
|
|
"properties": [
|
|
{
|
|
"id": "mappings",
|
|
"value": [
|
|
{
|
|
"options": {
|
|
"0": { "color": "red", "text": "DOWN" },
|
|
"1": { "color": "green", "text": "UP" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
]
|
|
},
|
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 },
|
|
"targets": [
|
|
{
|
|
"expr": "up",
|
|
"format": "table",
|
|
"instant": true,
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "K8s Services Uptime (Prometheus Targets)",
|
|
"type": "table"
|
|
},
|
|
{
|
|
"collapsed": false,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 },
|
|
"title": "Network & SNMP",
|
|
"type": "row"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "bps"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 },
|
|
"targets": [
|
|
{
|
|
"expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8",
|
|
"legendFormat": "WAN In",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8",
|
|
"legendFormat": "WAN Out",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"title": "pfSense WAN Traffic",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "bps"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 },
|
|
"targets": [
|
|
{
|
|
"expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8",
|
|
"legendFormat": "{{ifAlias}} In",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8",
|
|
"legendFormat": "{{ifAlias}} Out",
|
|
"refId": "B"
|
|
}
|
|
],
|
|
"title": "pfSense LAN Traffic",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 },
|
|
"targets": [
|
|
{
|
|
"expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "All Nodes Memory",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 },
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
|
"legendFormat": "{{instance}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "All Nodes Disk",
|
|
"type": "timeseries"
|
|
}
|
|
],
|
|
"refresh": "1m",
|
|
"schemaVersion": 40,
|
|
"tags": ["bluejay", "operations", "zabbix"],
|
|
"timezone": "browser",
|
|
"title": "BlueJay Operations",
|
|
"uid": "a781c4e4-c3fe-4ac7-be93-21363a41b97d"
|
|
}
|
|
|
|
# =============================================================================
|
|
# ConfigMap: Grafana Dashboard — Epson Printer
|
|
# =============================================================================
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboard-printer
|
|
namespace: monitoring
|
|
data:
|
|
epson-ecotank-printer.json: |
|
|
{
|
|
"id": null,
|
|
"panels": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"max": 100,
|
|
"min": 0,
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "red", "value": null },
|
|
{ "color": "orange", "value": 10 },
|
|
{ "color": "yellow", "value": 20 },
|
|
{ "color": "green", "value": 40 }
|
|
]
|
|
},
|
|
"unit": "percent"
|
|
},
|
|
"overrides": [
|
|
{
|
|
"matcher": { "id": "byName", "options": "Black Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } }
|
|
]
|
|
},
|
|
{
|
|
"matcher": { "id": "byName", "options": "Cyan Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } }
|
|
]
|
|
},
|
|
{
|
|
"matcher": { "id": "byName", "options": "Magenta Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }
|
|
]
|
|
},
|
|
{
|
|
"matcher": { "id": "byName", "options": "Yellow Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }
|
|
]
|
|
}
|
|
]
|
|
},
|
|
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 0 },
|
|
"id": 1,
|
|
"options": {
|
|
"orientation": "horizontal",
|
|
"reduceOptions": {
|
|
"calcs": ["lastNotNull"]
|
|
},
|
|
"showThresholdLabels": false,
|
|
"showThresholdMarkers": true
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}",
|
|
"legendFormat": "{{prtMarkerSuppliesDescription}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Ink Levels",
|
|
"type": "gauge"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": {
|
|
"fillOpacity": 20,
|
|
"lineWidth": 2,
|
|
"spanNulls": true
|
|
},
|
|
"max": 100,
|
|
"min": 0,
|
|
"unit": "percent"
|
|
},
|
|
"overrides": [
|
|
{
|
|
"matcher": { "id": "byName", "options": "Black Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } }
|
|
]
|
|
},
|
|
{
|
|
"matcher": { "id": "byName", "options": "Cyan Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } }
|
|
]
|
|
},
|
|
{
|
|
"matcher": { "id": "byName", "options": "Magenta Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }
|
|
]
|
|
},
|
|
{
|
|
"matcher": { "id": "byName", "options": "Yellow Ink Bottle" },
|
|
"properties": [
|
|
{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }
|
|
]
|
|
}
|
|
]
|
|
},
|
|
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 0 },
|
|
"id": 2,
|
|
"targets": [
|
|
{
|
|
"expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}",
|
|
"legendFormat": "{{prtMarkerSuppliesDescription}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Ink Level History",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 10000 },
|
|
{ "color": "red", "value": 50000 }
|
|
]
|
|
},
|
|
"unit": "short"
|
|
}
|
|
},
|
|
"gridPos": { "h": 5, "w": 6, "x": 0, "y": 10 },
|
|
"id": 3,
|
|
"options": {
|
|
"colorMode": "background",
|
|
"reduceOptions": {
|
|
"calcs": ["lastNotNull"]
|
|
},
|
|
"textMode": "value_and_name"
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "prtMarkerLifeCount{job=\"snmp-printer\"}",
|
|
"legendFormat": "Pages",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Lifetime Page Count",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"1": { "text": "Online" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
],
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "blue", "value": null }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 5, "w": 6, "x": 6, "y": 10 },
|
|
"id": 4,
|
|
"options": {
|
|
"colorMode": "background",
|
|
"reduceOptions": {
|
|
"calcs": ["lastNotNull"]
|
|
},
|
|
"textMode": "name"
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "prtGeneralPrinterName{job=\"snmp-printer\"}",
|
|
"legendFormat": "{{prtGeneralPrinterName}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Printer Model",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "red", "value": 1 }
|
|
]
|
|
},
|
|
"unit": "short"
|
|
}
|
|
},
|
|
"gridPos": { "h": 5, "w": 6, "x": 12, "y": 10 },
|
|
"id": 5,
|
|
"options": {
|
|
"colorMode": "background",
|
|
"reduceOptions": {
|
|
"calcs": ["lastNotNull"]
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "prtAlertCriticalEvents{job=\"snmp-printer\"}",
|
|
"legendFormat": "Critical Alerts",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Critical Events",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "blue", "value": null }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 5, "w": 6, "x": 18, "y": 10 },
|
|
"id": 6,
|
|
"options": {
|
|
"colorMode": "background",
|
|
"reduceOptions": {
|
|
"calcs": ["lastNotNull"]
|
|
},
|
|
"textMode": "name"
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "prtGeneralSerialNumber{job=\"snmp-printer\"}",
|
|
"legendFormat": "{{prtGeneralSerialNumber}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"title": "Serial Number",
|
|
"type": "stat"
|
|
}
|
|
],
|
|
"refresh": "5m",
|
|
"schemaVersion": 39,
|
|
"tags": ["printer", "snmp", "bluejay"],
|
|
"time": { "from": "now-24h", "to": "now" },
|
|
"timezone": "browser",
|
|
"title": "Epson ET-3750 EcoTank Printer",
|
|
"uid": "epson-ecotank"
|
|
}
|
|
|
|
# =============================================================================
# ConfigMap: Grafana Dashboard — Infrastructure Overview
# =============================================================================
# One-page status board, organised in rows:
#   AI Stack    - blackbox probe_success for Ollama / Agent Zero endpoints
#   K8s Cluster - node count, per-instance CPU and memory usage
#   Network     - pfSense WAN bandwidth (SNMP) and Prometheus target health
#   Services    - static markdown list of ArgoCD-managed applications
#   Alerting    - number of currently firing Prometheus alerts
#
# JSON cannot carry comments, so notes on non-obvious choices live here:
#   * "Nodes Up" counts every node-exporter scrape job (the same job set the
#     "Node DOWN" alert rule watches) and falls back to 0 via "or vector(0)",
#     so a total outage renders as a red 0 instead of "No data".
#   * The target-health table sorts on "Up" because the organize
#     transformation renames the "Value" field to "Up"; ascending order puts
#     down targets (0) first.
#   * All targets reference the Prometheus datasource by uid "prometheus".
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-infra-overview
  namespace: monitoring
data:
  infra-overview.json: |
    {
      "id": null,
      "panels": [
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
          "id": 100,
          "title": "AI Stack",
          "type": "row"
        },
        {
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  },
                  "type": "value"
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
          "id": 1,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-ollama-local\"}",
              "legendFormat": "Status"
            }
          ],
          "title": "Ollama (Local)",
          "type": "stat"
        },
        {
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  },
                  "type": "value"
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
          "id": 2,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-ollama-edge1\"}",
              "legendFormat": "Status"
            }
          ],
          "title": "Ollama (Edge1)",
          "type": "stat"
        },
        {
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  },
                  "type": "value"
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
          "id": 3,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-agentzero-local\"}",
              "legendFormat": "Status"
            }
          ],
          "title": "Agent Zero (Local)",
          "type": "stat"
        },
        {
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  },
                  "type": "value"
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
          "id": 4,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-agentzero-nuc\"}",
              "legendFormat": "Status"
            }
          ],
          "title": "Agent Zero (NUC)",
          "type": "stat"
        },
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
          "id": 101,
          "title": "K8s Cluster",
          "type": "row"
        },
        {
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 6, "w": 8, "x": 0, "y": 6 },
          "id": 5,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "count(up{job=~\"node-exporter|rke2-nodes|edge-nodes\"} == 1) or vector(0)",
              "legendFormat": "Nodes Up"
            }
          ],
          "title": "Nodes Up (node-exporter)",
          "type": "stat"
        },
        {
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 15, "lineWidth": 2 },
              "max": 100,
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            }
          },
          "gridPos": { "h": 6, "w": 8, "x": 8, "y": 6 },
          "id": 6,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) by (instance) * 100)",
              "legendFormat": "{{ instance }}"
            }
          ],
          "title": "Node CPU Usage %",
          "type": "timeseries"
        },
        {
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 15, "lineWidth": 2 },
              "max": 100,
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            }
          },
          "gridPos": { "h": 6, "w": 8, "x": 16, "y": 6 },
          "id": 7,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
              "legendFormat": "{{ instance }}"
            }
          ],
          "title": "Node Memory Usage %",
          "type": "timeseries"
        },
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 },
          "id": 102,
          "title": "Network",
          "type": "row"
        },
        {
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 10, "lineWidth": 2 },
              "unit": "Bps"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 },
          "id": 8,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "rate(ifHCInOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])",
              "legendFormat": "WAN In"
            },
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "rate(ifHCOutOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])",
              "legendFormat": "WAN Out"
            }
          ],
          "title": "pfSense WAN Bandwidth",
          "type": "timeseries"
        },
        {
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            },
            "overrides": []
          },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 },
          "id": 9,
          "options": {
            "showHeader": true,
            "sortBy": [{ "displayName": "Up", "desc": false }]
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "up",
              "format": "table",
              "instant": true,
              "legendFormat": ""
            }
          ],
          "title": "Target Health (up)",
          "transformations": [
            {
              "id": "organize",
              "options": {
                "excludeByName": { "Time": true, "__name__": true },
                "renameByName": { "job": "Job", "instance": "Instance", "Value": "Up" }
              }
            }
          ],
          "type": "table"
        },
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
          "id": 103,
          "title": "Services",
          "type": "row"
        },
        {
          "gridPos": { "h": 4, "w": 24, "x": 0, "y": 22 },
          "id": 10,
          "options": {
            "content": "### ArgoCD Applications\n\nView sync status and health of all 11 ArgoCD-managed applications:\n\n[Open ArgoCD Dashboard](https://argocd.iamworkin.lan)\n\n| App | Expected Status |\n|-----|----------------|\n| gitea | Healthy / Synced |\n| zabbix | Healthy / Synced |\n| guacamole | Healthy / Synced |\n| irc | Healthy / Synced |\n| mail | Healthy / Synced |\n| matrix | Healthy / Synced |\n| teamspeak | Healthy / Synced |\n| pki-web | Healthy / Synced |\n| intranet | Healthy / Synced |\n| telephony | Healthy / Synced |\n| infra-noc-services | Healthy / Synced |",
            "mode": "markdown"
          },
          "title": "ArgoCD App Status",
          "type": "text"
        },
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 },
          "id": 104,
          "title": "Alerting",
          "type": "row"
        },
        {
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "orange", "value": 1 },
                  { "color": "red", "value": 3 }
                ]
              }
            }
          },
          "gridPos": { "h": 6, "w": 24, "x": 0, "y": 27 },
          "id": 11,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "count(ALERTS{alertstate=\"firing\"}) or vector(0)",
              "legendFormat": "Firing Alerts"
            }
          ],
          "title": "Firing Alerts",
          "type": "stat"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "tags": ["infrastructure", "blue-jay", "overview"],
      "time": { "from": "now-1h", "to": "now" },
      "timezone": "browser",
      "title": "Infrastructure Overview",
      "uid": "infra-overview",
      "version": 1
    }
|
|
|
|
# =============================================================================
# ConfigMap: Grafana Datasource Provisioning
# =============================================================================
# Declares the in-cluster Prometheus as Grafana's default datasource.
# The file is meant to be mounted under /etc/grafana/provisioning/datasources.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasource-provisioning
  namespace: monitoring
data:
  datasource.yml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        # Pin the UID: every provisioned dashboard target ("uid": "prometheus")
        # and alert rule (datasourceUid: prometheus) looks the datasource up
        # by this exact value. Without it Grafana assigns a generated UID and
        # those references fail to resolve.
        uid: prometheus
        access: proxy
        url: http://prometheus.monitoring.svc:9090
        isDefault: true
        editable: true
|
|
|
|
# =============================================================================
# ConfigMap: Grafana Alerting Provisioning (contact points + policies + rules)
# =============================================================================
# Makes alert rules declarative — survives pod rebuilds without API recreation
#
# Layout of alerting.yml:
#   contactPoints - webhook receivers ("IRC #alerts", "Thermal Printer")
#   policies      - default route to IRC; alerts labelled
#                   alert_channel=thermal_print go to the Thermal Printer route
#   groups        - rule groups: AI Stack, CI Runners, Infrastructure,
#                   RemoteDesktop
#
# Every rule uses the same three-step pipeline:
#   A = instant Prometheus query, B = reduce(last) of A, C = threshold on B,
# and the rule's condition is C.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting-provisioning
  namespace: monitoring
data:
  alerting.yml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        # Quoted on purpose: in a plain scalar " #alerts" starts a YAML comment
        # and the name would silently be truncated to "IRC".
        name: "IRC #alerts"
        receivers:
          - uid: irc-alerts-webhook
            type: webhook
            settings:
              url: http://irc-notify.monitoring.svc:9119
              httpMethod: POST
            disableResolveMessage: false
      - orgId: 1
        name: Thermal Printer
        receivers:
          - uid: thermal-print-001
            type: webhook
            settings:
              # NOTE(review): same endpoint as the IRC webhook — confirm that
              # irc-notify is really the intended relay for thermal printing.
              url: http://irc-notify.monitoring.svc:9119
              httpMethod: POST
            disableResolveMessage: true
    policies:
      - orgId: 1
        # Must match the contact point name above byte-for-byte (hence quoted).
        receiver: "IRC #alerts"
        group_by: ['alertname']
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 1h
        routes:
          - receiver: Thermal Printer
            matchers: ['alert_channel = thermal_print']
            group_wait: 1m
            group_interval: 10m
            repeat_interval: 4h
            # NOTE(review): "continue" only lets later sibling routes match;
            # there are none, so verify whether these alerts are also expected
            # on the default IRC receiver.
            continue: true
    groups:
      - orgId: 1
        name: AI Stack
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: ollama-down-local
            title: Ollama DOWN (Local)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Ollama DOWN on workstation (R9700)
              description: Agent Zero LOCAL cannot reach Ollama. FAISS memory will fail.
              runbook: "1. Check PC power/wake 2. ssh stoltz@10.0.58.100 3. systemctl status ollama 4. journalctl -u ollama --since 5min"
            labels:
              severity: warning
              service: ollama
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-ollama-local"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: ollama-down-edge1
            title: Ollama DOWN (Edge1)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Ollama DOWN on edge1 Pi 5
              description: Agent Zero NUC cannot reach Ollama.
              runbook: "1. ssh stoltz@10.0.57.17 2. systemctl status ollama 3. journalctl -u ollama --since 5min 4. cat /sys/class/thermal/thermal_zone0/temp"
            labels:
              severity: warning
              service: ollama
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-ollama-edge1"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: a0-down-local
            title: Agent Zero DOWN (Local)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero LOCAL DOWN
              description: K3s web UI unreachable.
              runbook: "1. ssh stoltz@10.0.58.100 2. docker ps | grep agent-zero 3. docker logs agent-zero --tail 20 4. Check Ollama first (A0 crashes if Ollama down)"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-agentzero-local"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: a0-down-nuc
            title: Agent Zero DOWN (NUC)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero NUC DOWN
              description: RKE2 web UI unreachable.
              runbook: "1. ssh root@10.0.56.200 via noc1 2. kubectl get pods -n agent-zero 3. kubectl logs -n agent-zero deploy/agent-zero --tail 20"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-agentzero-nuc"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: print-ollama-runner-long-keepalive
            title: Print.Web Ollama runner keep-alive >10m
            condition: C
            for: 2m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: Print.Web Ollama runner held too long
              description: "A Print.Web Ollama runner has more than 10 minutes of keep-alive remaining. Check active AI requests before the Pi 5 Ollama lane thrashes."
              runbook: "1. Open https://print.iamworkin.lan/admin 2. Check Ollama Fleet model + remaining keep-alive 3. Query Prometheus: ai_ollama_runner_keepalive_remaining_seconds{job=\"printweb-otel\"} 4. Stop duplicate model callers before restarting Ollama."
            labels:
              severity: warning
              service: print-web
              alert_channel: thermal_print
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max by(model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                # 600 seconds = the 10 minute limit named in the title.
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
      - orgId: 1
        name: CI Runners
        folder: CI Alerts
        interval: 1m
        rules:
          - uid: linux-runner-offline
            title: LinuxRunnerOffline
            condition: C
            for: 5m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Linux CI runner offline: {{ $labels.deployment }}"
              description: "A github-runner namespace Deployment has 0 ready replicas for more than 5 minutes. CI jobs targeting that repo will queue until the runner pod restarts and re-registers."
              runbook: "1. kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }} 2. kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50 3. Verify PAT repo access if registration returns 404 4. Verify no RWO PVC is shared by scaled runners"
            labels:
              severity: warning
              service: github-runner
              alert_channel: irc
              team: ci
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                # "== bool 0" yields 1 for a deployment with zero ready replicas
                # and 0 otherwise. A plain "== 0" only filters and keeps the
                # sample value 0, which can never exceed the "gt 0" threshold
                # in step C.
                model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == bool 0', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
      - orgId: 1
        name: Infrastructure
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: node-down
            title: Node DOWN
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Node down
              description: Node exporter unreachable for 2 minutes. Host may be down or network issue.
              runbook: "1. Ping host IP 2. SSH via noc1 jumpbox (root@10.0.56.10) 3. systemctl status node_exporter 4. Check network/VLAN 5. Physical power check if unreachable"
            labels:
              severity: critical
              service: infrastructure
              alert_channel: thermal_print
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'up{job=~"node-exporter|rke2-nodes|edge-nodes"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: macmini-runner-offline
            title: MacMiniRunnerOffline
            condition: C
            for: 10m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Mac mini GitHub runner offline
              description: "One or more macmini-* GitHub Actions runners have not reported online for more than 10 minutes. LaunchDaemons survive reboot and do not require the bluejay GUI session."
              runbook: "1. ssh fcadmin@macmini.iamworkin.lan 2. launchctl print system/io.flowercore.github-runner-<slug> 3. Check /Users/fcadmin/Library/Logs/github-runners/<slug>/stderr.log 4. Re-register the repo runner if .runner is missing"
            labels:
              severity: warning
              service: github-runner
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                # The "or vector(0)" fallback sits OUTSIDE min(): inside it,
                # the label-less vector(0) never matches a labelled series, is
                # always added, and forces the minimum to 0 (alert always on).
                model: {expr: 'min(flowercore_github_runner_online{runner=~"macmini-.*"}) or vector(0)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: high-cpu
            title: High CPU (>85%)
            condition: C
            for: 10m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: High CPU
              description: CPU above 85% for 10 minutes. Performance degradation likely.
              runbook: "1. SSH to host 2. top -b -n1 | head -20 3. Check for runaway process 4. kubectl top pods (if K8s node)"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_cpu_usage:avg5m', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
          - uid: high-memory
            title: High Memory (>90%)
            condition: C
            for: 5m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: High memory usage
              description: Memory above 90% for 5 minutes. OOM kills imminent.
              runbook: "1. SSH to host 2. free -h 3. ps aux --sort=-%mem | head 10 4. Check for memory leak (growing RSS)"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_memory_usage:percent', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [90], type: gt}}], refId: C}
          - uid: disk-low
            title: Disk Space Low (>85%)
            condition: C
            for: 10m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: Disk usage high
              description: Root disk above 85% for 10 minutes. Service disruption if full.
              runbook: "1. SSH to host 2. df -h / 3. du -sh /* | sort -rh | head 4. journalctl --vacuum-size=100M 5. podman system prune"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_disk_usage:percent', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
      - orgId: 1
        name: RemoteDesktop
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: remotedesktop-web-down
            title: RemoteDesktop Web DOWN
            condition: C
            for: 3m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: FlowerCore RemoteDesktop /health probe failing
              description: "https://desktop.iamworkin.lan/health has failed for 3 minutes. Catalog + session launch surface offline."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remotedesktop-web 2. kubectl -n fc-desktop logs deploy/remotedesktop-web --tail=50 3. Check Traefik IngressRoute + step-ca cert 4. Rollout restart if pod is stuck"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-remotedesktop"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}

          - uid: remotedesktop-metrics-stale
            title: RemoteDesktop metrics stale
            condition: C
            for: 10m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: RemoteDesktop /metrics returning no series
              description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape is misconfigured or the web deployment stopped exporting metrics. Cross-checked by Zabbix template's identical 10m no-data trigger."
              runbook: "1. curl -sk https://desktop.iamworkin.lan/metrics | head 2. kubectl -n monitoring exec deploy/prometheus -- wget -qO- localhost:9090/api/v1/targets?scrapePool=fc-remotedesktop 3. Check monitoring-netpol egress allows to fc-desktop:8080"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'count(fc_desktop_session_events_total) or vector(0)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}

          - uid: remotedesktop-pool-depleted
            title: RemoteDesktop pool depleted
            condition: C
            for: 5m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop warm pool depleted for 5m
              description: "A RemoteDesktop warm pool has fc_desktop_pool_depleted=1 for 5 minutes. New launches will cold-start. Check pod scheduling, image pull, node capacity."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop --sort-by=.status.startTime 2. kubectl -n fc-desktop describe desktoppool <name> 3. Verify localhost/fc-desktop:* images imported on all 3 RKE2 nodes"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max(fc_desktop_pool_depleted)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.5], type: gt}}], refId: C}

          - uid: remotedesktop-pool-deficit-sustained
            title: RemoteDesktop pool below desired
            condition: C
            for: 10m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop pool sustained deficit
              description: "A pool has fc_desktop_pool_deficit>0 for 10 minutes. Operator is reconciling but can't reach desired size — likely image pull, NFS affinity, or claim-init issue."
              runbook: "1. kubectl -n fc-desktop get pods -l flowercore.io/pool=<pool> 2. kubectl logs -n fc-desktop deploy/remotedesktop-operator 3. Check claim-init hook env on template"
            labels:
              severity: info
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max(fc_desktop_pool_deficit)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}

          - uid: remotedesktop-session-churn-spike
            title: RemoteDesktop launch rate spike
            condition: C
            for: 5m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop launch rate exceeds 20/min
              description: "Launch events >20/min for 5 minutes. Could be a user-facing feature launch, pooled template thrashing, or runaway automation loop."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop -o wide | wc -l 2. curl -sk https://desktop.iamworkin.lan/api/sessions/active 3. Check operator logs for reconcile loops"
            labels:
              severity: info
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                # rate() is per second; "* 60" converts it to launches/minute.
                model: {expr: 'sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [20], type: gt}}], refId: C}

          - uid: remotedesktop-tls-expiry
            title: RemoteDesktop TLS cert expiring
            condition: C
            for: 6h
            noDataState: OK
            execErrState: OK
            annotations:
              summary: desktop.iamworkin.lan cert <2d to expiry
              description: "The desktop.iamworkin.lan certificate is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, FlowerCore.DNS preflight for dnsNames."
              runbook: "1. kubectl -n fc-desktop get certificate remotedesktop-web-tls 2. kubectl -n cert-manager logs deploy/cert-manager --tail=50 3. Verify pfSense DNS override for desktop.iamworkin.lan"
            labels:
              severity: critical
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: prometheus
                # Seconds until expiry divided by 86400 = days remaining.
                model: {expr: '(probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time()) / 86400', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [2], type: lt}}], refId: C}
|
|
|
|
# =============================================================================
# Deployment: Grafana
# =============================================================================
# One Grafana pod. State lives on the grafana-data claim; dashboards,
# datasources and alert rules are provisioned from ConfigMaps.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  # Recreate stops the old pod before starting its replacement
  # (presumably because the data claim cannot be shared — confirm).
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        runAsUser: 472
        runAsGroup: 472
        fsGroup: 472  # grafana group
      containers:
        - name: grafana
          image: docker.io/grafana/grafana:latest
          ports:
            - name: http
              containerPort: 3000
          env:
            # Admin credentials come from the Secret "grafana-credentials"
            # (1Password Operator: OnePasswordItem → Secret).
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: username
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: password
            - name: GF_SERVER_ROOT_URL
              value: "https://grafana.iamworkin.lan"
            - name: GF_SERVER_SERVE_FROM_SUB_PATH
              value: "false"
            # Zabbix plugin is not auto-installed: GF_INSTALL_PLUGINS needs
            # internet access at startup, which a restrictive NetworkPolicy
            # breaks. Install it by hand after first boot if required:
            #   kubectl exec -n monitoring deploy/grafana -- grafana cli plugins install alexanderzobnin-zabbix-app
          resources:
            requests:
              cpu: 100m
              memory: 128Mi
            limits:
              cpu: 500m
              memory: 512Mi
          readinessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 10
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 30
          volumeMounts:
            # Persistent Grafana state.
            - name: data
              mountPath: /var/lib/grafana
            # Dashboard provider definition.
            - name: dashboard-provider
              mountPath: /etc/grafana/provisioning/dashboards
              readOnly: true
            # One read-only directory per dashboard ConfigMap.
            - name: dashboards-ai-stack
              mountPath: /var/lib/grafana/dashboards/ai-stack
              readOnly: true
            - name: dashboards-edge-nodes
              mountPath: /var/lib/grafana/dashboards/edge-nodes
              readOnly: true
            - name: dashboards-network
              mountPath: /var/lib/grafana/dashboards/network
              readOnly: true
            - name: dashboards-operations
              mountPath: /var/lib/grafana/dashboards/operations
              readOnly: true
            - name: dashboards-printer
              mountPath: /var/lib/grafana/dashboards/printer
              readOnly: true
            - name: dashboards-infra-overview
              mountPath: /var/lib/grafana/dashboards/infra-overview
              readOnly: true
            - name: dashboards-remotedesktop
              mountPath: /var/lib/grafana/dashboards/remotedesktop
              readOnly: true
            # Datasource and alerting provisioning.
            - name: datasource-provisioning
              mountPath: /etc/grafana/provisioning/datasources
              readOnly: true
            - name: alerting-provisioning
              mountPath: /etc/grafana/provisioning/alerting
              readOnly: true
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: grafana-data
        - name: dashboard-provider
          configMap:
            name: grafana-dashboard-provider
        - name: dashboards-ai-stack
          configMap:
            name: grafana-dashboards
        - name: dashboards-edge-nodes
          configMap:
            name: grafana-dashboard-edge-nodes
        - name: dashboards-network
          configMap:
            name: grafana-dashboard-network-overview
        - name: dashboards-operations
          configMap:
            name: grafana-dashboard-operations
        - name: dashboards-printer
          configMap:
            name: grafana-dashboard-printer
        - name: dashboards-infra-overview
          configMap:
            name: grafana-dashboard-infra-overview
        - name: dashboards-remotedesktop
          configMap:
            name: grafana-dashboard-remotedesktop
        - name: datasource-provisioning
          configMap:
            name: grafana-datasource-provisioning
        - name: alerting-provisioning
          configMap:
            name: grafana-alerting-provisioning
# =============================================================================
# Deployment: Blackbox Exporter
# =============================================================================
# Single replica; its configuration file is projected from the
# blackbox-config ConfigMap.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: blackbox-exporter
  namespace: monitoring
  labels:
    app: blackbox-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      containers:
        - name: blackbox-exporter
          image: quay.io/prometheus/blackbox-exporter:latest
          args:
            - "--config.file=/config/blackbox.yml"
          ports:
            - name: http
              containerPort: 9115
          resources:
            requests:
              cpu: 50m
              memory: 32Mi
            limits:
              cpu: 200m
              memory: 128Mi
          readinessProbe:
            httpGet:
              path: /
              port: 9115
            initialDelaySeconds: 3
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /
              port: 9115
            initialDelaySeconds: 5
            periodSeconds: 30
          volumeMounts:
            # Only the blackbox.yml key is mounted, as a single file.
            - name: config
              mountPath: /config/blackbox.yml
              subPath: blackbox.yml
              readOnly: true
      volumes:
        - name: config
          configMap:
            name: blackbox-config
# =============================================================================
# PVC: SNMP Exporter Config (100Mi, Longhorn)
# =============================================================================
# Holds snmp.yml on a volume because the custom file (~2MB) is larger than
# the 1MB ConfigMap limit. Loading a custom config:
#   kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
# followed by a pod restart so the exporter reads the new file.
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: snmp-config
  namespace: monitoring
spec:
  storageClassName: longhorn
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Mi
# =============================================================================
# Deployment: SNMP Exporter
# =============================================================================
# Reads its config from /config/snmp.yml on the snmp-config PVC. An init
# container seeds that file with the image's default config when the PVC
# holds no usable snmp.yml (first deploy).
# To load the custom noc1 snmp.yml (~2MB):
#   kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
# Then restart: kubectl rollout restart deploy/snmp-exporter -n monitoring
# NOTE(review): the exporter container mounts /config read-only, so a
# `kubectl cp` into that container is expected to be rejected. Load the file
# through a pod that mounts the claim read-write (e.g. the snmp-config-loader
# Job) or relax readOnly for the copy — confirm the intended procedure.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: snmp-exporter
  namespace: monitoring
  labels:
    app: snmp-exporter
spec:
  replicas: 1
  # The snmp-config claim is ReadWriteOnce. With the default RollingUpdate
  # strategy the replacement pod is created while the old one still holds the
  # volume; if it is scheduled to another node it cannot attach and the
  # rollout stalls. Recreate releases the volume first (same approach as the
  # Grafana Deployment).
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: snmp-exporter
  template:
    metadata:
      labels:
        app: snmp-exporter
    spec:
      # Make the volume group-writable for the containers so the init
      # container can seed the config even when the image runs as a non-root
      # user (a freshly provisioned volume is presumably root-owned — confirm).
      securityContext:
        fsGroup: 65534
      initContainers:
        # Seed snmp.yml from the image when the PVC has no non-empty copy.
        - name: init-config
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              # -s (exists AND non-empty) matches the check used by the
              # snmp-config-loader Job; a zero-byte file counts as missing.
              if [ ! -s /config/snmp.yml ]; then
                echo "No custom config found, copying default from image..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default snmp.yml copied to PVC."
              else
                echo "Custom snmp.yml already exists on PVC, skipping copy."
              fi
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      containers:
        - name: snmp-exporter
          image: docker.io/prom/snmp-exporter:latest
          args:
            - "--config.file=/config/snmp.yml"
          ports:
            - containerPort: 9116
              name: http
          volumeMounts:
            - name: snmp-config
              mountPath: /config
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          livenessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 5
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 3
            periodSeconds: 10
      volumes:
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config
# =============================================================================
# Deployment: IRC Notify (alert relay)
# =============================================================================
# Runs notify.py (projected from the irc-notify-script ConfigMap) on a stock
# Python image and listens on port 9119.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: irc-notify
  namespace: monitoring
  labels:
    app: irc-notify
spec:
  replicas: 1
  selector:
    matchLabels:
      app: irc-notify
  template:
    metadata:
      labels:
        app: irc-notify
    spec:
      containers:
        - name: irc-notify
          image: docker.io/library/python:3.12-slim
          command:
            - "python3"
            - "/app/notify.py"
          ports:
            - name: http
              containerPort: 9119
          resources:
            requests:
              cpu: 25m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
          # Health is judged by a plain TCP connect to the listener port.
          readinessProbe:
            tcpSocket:
              port: 9119
            initialDelaySeconds: 3
            periodSeconds: 10
          livenessProbe:
            tcpSocket:
              port: 9119
            initialDelaySeconds: 5
            periodSeconds: 30
          volumeMounts:
            # Only the notify.py key is mounted, as a single file.
            - name: script
              mountPath: /app/notify.py
              subPath: notify.py
              readOnly: true
      volumes:
        - name: script
          configMap:
            name: irc-notify-script
# =============================================================================
# DaemonSet: Node Exporter (runs on every RKE2 node)
# =============================================================================
# Listens on :9101 so it does not collide with the host-level node-exporters
# already bound to :9100. Those host instances are scraped by the rke2-nodes
# Prometheus job; this DaemonSet is the target for K8s service-discovery-based
# scraping on :9101.
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      hostNetwork: true
      hostPID: true
      # Tolerate every taint so a pod is placed on all nodes.
      tolerations:
        - operator: Exists
      securityContext:
        runAsUser: 0
        runAsNonRoot: false
      containers:
        - name: node-exporter
          image: docker.io/prom/node-exporter:latest
          args:
            # Host filesystem, sysfs and procfs are read from the /host mounts.
            - "--path.rootfs=/host"
            - "--path.sysfs=/host/sys"
            - "--path.procfs=/host/proc"
            - "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/containers|run/containers|var/lib/rancher|var/lib/longhorn)($|/)"
            - "--collector.netclass.ignored-devices=^(veth|cali|flannel|cni).*$"
            - "--no-collector.btrfs"
            - "--web.listen-address=:9101"
          ports:
            - name: metrics
              containerPort: 9101
              hostPort: 9101
          securityContext:
            privileged: true
            readOnlyRootFilesystem: true
          resources:
            requests:
              cpu: 50m
              memory: 32Mi
            limits:
              cpu: 200m
              memory: 128Mi
          volumeMounts:
            - name: rootfs
              mountPath: /host
              readOnly: true
              mountPropagation: HostToContainer
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
      volumes:
        - name: rootfs
          hostPath:
            path: /
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
# =============================================================================
# Service: Prometheus (ClusterIP :9090)
# =============================================================================
# Cluster-internal endpoint for pods labelled app=prometheus.
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  type: ClusterIP
  selector:
    app: prometheus
  ports:
    - name: http
      protocol: TCP
      port: 9090
      targetPort: 9090
# =============================================================================
# Service: Grafana (ClusterIP :3000)
# =============================================================================
# Cluster-internal endpoint for pods labelled app=grafana.
---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  type: ClusterIP
  selector:
    app: grafana
  ports:
    - name: http
      protocol: TCP
      port: 3000
      targetPort: 3000
# =============================================================================
# Service: Blackbox Exporter (ClusterIP :9115)
# =============================================================================
# Cluster-internal endpoint for pods labelled app=blackbox-exporter.
---
apiVersion: v1
kind: Service
metadata:
  name: blackbox-exporter
  namespace: monitoring
  labels:
    app: blackbox-exporter
spec:
  type: ClusterIP
  selector:
    app: blackbox-exporter
  ports:
    - name: http
      protocol: TCP
      port: 9115
      targetPort: 9115
# =============================================================================
# Service: SNMP Exporter (ClusterIP :9116)
# =============================================================================
# Cluster-internal endpoint for pods labelled app=snmp-exporter.
---
apiVersion: v1
kind: Service
metadata:
  name: snmp-exporter
  namespace: monitoring
  labels:
    app: snmp-exporter
spec:
  type: ClusterIP
  selector:
    app: snmp-exporter
  ports:
    - name: http
      protocol: TCP
      port: 9116
      targetPort: 9116
# =============================================================================
# Service: Node Exporter (headless for Prometheus SD)
# =============================================================================
# clusterIP is None, so no virtual IP is allocated for this Service; it
# selects pods labelled app=node-exporter on the :9101 metrics port.
---
apiVersion: v1
kind: Service
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  type: ClusterIP
  clusterIP: None
  selector:
    app: node-exporter
  ports:
    - name: metrics
      protocol: TCP
      port: 9101
      targetPort: 9101
# =============================================================================
# Service: IRC Notify (ClusterIP :9119)
# =============================================================================
# Cluster-internal endpoint for pods labelled app=irc-notify.
---
apiVersion: v1
kind: Service
metadata:
  name: irc-notify
  namespace: monitoring
  labels:
    app: irc-notify
spec:
  type: ClusterIP
  selector:
    app: irc-notify
  ports:
    - name: http
      protocol: TCP
      port: 9119
      targetPort: 9119
# =============================================================================
# TLS Certificates (cert-manager + step-ca ACME)
# =============================================================================
# Certificate for grafana.iamworkin.lan, issued by the step-ca-acme
# ClusterIssuer and stored in the Secret "grafana-tls".
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: grafana-tls
  namespace: monitoring
spec:
  secretName: grafana-tls
  dnsNames:
    - grafana.iamworkin.lan
  issuerRef:
    kind: ClusterIssuer
    name: step-ca-acme
# Certificate for prometheus.iamworkin.lan, issued by the step-ca-acme
# ClusterIssuer and stored in the Secret "prometheus-tls".
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: prometheus-tls
  namespace: monitoring
spec:
  secretName: prometheus-tls
  dnsNames:
    - prometheus.iamworkin.lan
  issuerRef:
    kind: ClusterIssuer
    name: step-ca-acme
# =============================================================================
# Traefik IngressRoute: Grafana
# =============================================================================
# Routes Host(grafana.iamworkin.lan) on the websecure entry point to the
# grafana Service on port 3000; TLS is served from the grafana-tls Secret.
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: grafana
  namespace: monitoring
spec:
  entryPoints:
    - websecure
  tls:
    secretName: grafana-tls
  routes:
    - kind: Rule
      match: Host(`grafana.iamworkin.lan`)
      services:
        - name: grafana
          port: 3000
# =============================================================================
# Traefik IngressRoute: Prometheus
# =============================================================================
# Routes Host(prometheus.iamworkin.lan) on the websecure entry point to the
# prometheus Service on port 9090; TLS is served from the prometheus-tls
# Secret.
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: prometheus
  namespace: monitoring
spec:
  entryPoints:
    - websecure
  tls:
    secretName: prometheus-tls
  routes:
    - kind: Rule
      match: Host(`prometheus.iamworkin.lan`)
      services:
        - name: prometheus
          port: 9090
# =============================================================================
# NetworkPolicy: monitoring namespace
# =============================================================================
# Applies to every pod in the namespace (empty podSelector) and restricts
# both directions; traffic is allowed only where a rule below matches.
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: monitoring-netpol
  namespace: monitoring
spec:
  podSelector: {}
  policyTypes:
    - Ingress
    - Egress

  ingress:
    # Traefik namespace: IngressRoutes and ACME solver pods.
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
    # Same namespace: prometheus→exporters, grafana→prometheus,
    # grafana→irc-notify.
    - from:
        - podSelector: {}
    # cert-manager namespace: ACME HTTP-01 self-check.
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: cert-manager

  egress:
    # DNS over UDP and TCP, to any namespace.
    - to:
        - namespaceSelector: {}
      ports:
        - protocol: UDP
          port: 53
        - protocol: TCP
          port: 53
    # MGMT VLAN: noc1, pfSense, switches, SNMP, node-exporter.
    - to:
        - ipBlock:
            cidr: 10.0.56.0/24
    # PROD VLAN: edge nodes.
    - to:
        - ipBlock:
            cidr: 10.0.57.0/24
    # HOME VLAN: workstation, printer, NAS.
    - to:
        - ipBlock:
            cidr: 10.0.58.0/24
    # Same namespace.
    - to:
        - podSelector: {}
    # Blackbox probes into other namespaces (agent-zero, etc).
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: agent-zero
      ports:
        - protocol: TCP
          port: 80
    # FlowerCore.RemoteDesktop /metrics scrape through the fc-desktop
    # ClusterIP Service (remotedesktop-web:8080). The same rule covers the
    # Traefik VIP hairpin path: after kube-proxy DNAT the egress destination
    # is the backend pod IP on the service port (see
    # feedback_netpol_dnat_backend_port).
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: fc-desktop
      ports:
        - protocol: TCP
          port: 8080
    # Traefik backend ports, for in-cluster egress to public iamworkin.lan
    # hostnames that the CoreDNS wildcard resolves to the LoadBalancer VIP.
    # The post-DNAT destination is a Traefik pod on 8080/8443.
    # Namespace and pod selectors sit in ONE peer, so both must match.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
          podSelector:
            matchLabels:
              app.kubernetes.io/name: traefik
      ports:
        - protocol: TCP
          port: 8080
        - protocol: TCP
          port: 8443
    # Traefik /metrics endpoint (port 9100), distinct from the data-path
    # ports above; needed by the in-cluster `traefik` scrape job.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
      ports:
        - protocol: TCP
          port: 9100
    # kube-state-metrics, needed by the kubernetes-state alert group.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: kube-system
      ports:
        - protocol: TCP
          port: 8080
    # cert-manager metrics, needed by the CertManagerCertificate* alerts.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: cert-manager
      ports:
        - protocol: TCP
          port: 9402
    # Longhorn manager metrics, needed by the Longhorn* alerts.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: longhorn-system
      ports:
        - protocol: TCP
          port: 9500
    # IRC: irc-notify → UnrealIRCd in the irc namespace via K8s DNS.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: irc
      ports:
        - protocol: TCP
          port: 6667
        - protocol: TCP
          port: 6697
    # Step-CA ACME (cert renewal).
    - to:
        - ipBlock:
            cidr: 10.0.56.10/32
      ports:
        - protocol: TCP
          port: 9443
    # Internet (optional: Grafana plugin install, ACME). Everything except
    # the RFC 1918 private ranges.
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.0.0.0/8
              - 172.16.0.0/12
              - 192.168.0.0/16
# =============================================================================
# Job: SNMP Config Loader (ArgoCD PostSync hook)
# =============================================================================
# Runs after each ArgoCD sync (PostSync hook) to populate the SNMP config PVC.
# Step 1 (init container) tries to download the custom snmp.yml from noc1.
# Step 2 (main container) copies the default config bundled in the
# snmp-exporter image, but only when the PVC still has no non-empty snmp.yml.
# A config that is already on the PVC is never deleted: a failed download
# leaves it exactly as it was.
# NOTE(review): the download uses the exporter's /config endpoint; confirm
# that what it returns is usable as a config file as-is (e.g. that secrets
# are not redacted in the output).
---
apiVersion: batch/v1
kind: Job
metadata:
  name: snmp-config-loader
  namespace: monitoring
  annotations:
    argocd.argoproj.io/hook: PostSync
    argocd.argoproj.io/hook-delete-policy: HookSucceeded
spec:
  backoffLimit: 0
  template:
    metadata:
      labels:
        app: snmp-config-loader
    spec:
      restartPolicy: Never
      # The curl image runs as a non-root user; fsGroup makes the volume
      # group-writable for both containers so they can create files on it.
      securityContext:
        fsGroup: 65534
      # The snmp-config claim is ReadWriteOnce and is normally held by the
      # snmp-exporter pod. Prefer that pod's node so the volume can be
      # mounted here as well; "preferred" (not "required") keeps the Job
      # schedulable when no exporter pod exists.
      affinity:
        podAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app: snmp-exporter
                topologyKey: kubernetes.io/hostname
      initContainers:
        # Best-effort download of the custom snmp.yml from noc1.
        - name: download-config
          image: docker.io/curlimages/curl:latest
          command:
            - sh
            - -c
            - |
              target=/config/snmp.yml
              # Download beside the target and rename only on success, so a
              # failed or truncated transfer never touches the live config.
              tmp="${target}.download"
              echo "Attempting to download custom snmp.yml from noc1..."
              if curl -sf --connect-timeout 10 --max-time 30 \
                   http://10.0.56.10:9116/config -o "$tmp" 2>/dev/null \
                 && [ -s "$tmp" ] \
                 && mv -f "$tmp" "$target"; then
                echo "Custom snmp.yml downloaded from noc1 successfully."
              else
                echo "Download failed or empty, leaving any existing snmp.yml untouched."
                rm -f "$tmp"
              fi
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      containers:
        # Seed the default config only when nothing usable is on the PVC.
        - name: fallback-default
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              # -s is true only for an existing, non-empty file.
              if [ -s /config/snmp.yml ]; then
                echo "Custom config already present, nothing to do."
              else
                echo "Copying default snmp.yml from image to PVC..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default config copied."
              fi
              echo "SNMP config loader complete."
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      volumes:
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config