# ============================================================================= # NOC Monitoring Stack — K8s Migration Target # ============================================================================= # Migrates the noc1 Podman monitoring pod to RKE2 K8s. # Source: noc1 (10.0.56.10) /opt/monitoring/ # # Components: # - Prometheus (metrics, alerting) # - Grafana (dashboards) # - Blackbox Exporter (HTTP probes) # - SNMP Exporter (network device metrics) # - Node Exporter (host metrics, DaemonSet) # - IRC Notify (alert relay to UnrealIRCd) # # Note: SNMP exporter config (snmp.yml) is ~2MB, exceeding the 1MB ConfigMap # limit. It is stored in a separate file (snmp-config.yaml) and must be # applied as a standalone ConfigMap or mounted via an init container that # downloads it from Gitea. # ============================================================================= --- apiVersion: v1 kind: Namespace metadata: name: monitoring # ============================================================================= # ConfigMap: Prometheus Configuration # ============================================================================= --- apiVersion: v1 kind: ConfigMap metadata: name: prometheus-config namespace: monitoring data: prometheus.yml: | global: scrape_interval: 30s evaluation_interval: 30s rule_files: - /etc/prometheus/alerts.yml - /etc/prometheus/recording-rules.yml scrape_configs: # noc1 host metrics (external to cluster) - job_name: "node-exporter" static_configs: - targets: ["10.0.56.10:9100"] labels: instance: "noc1" vlan: "mgmt" # RKE2 bare-metal cluster (openSUSE Leap 16 on NUCs) - job_name: "rke2-nodes" scrape_timeout: 15s static_configs: - targets: ["10.0.56.11:9100"] labels: instance: "rke2-server" vlan: "mgmt" cluster: "rke2" role: "server" - targets: ["10.0.56.12:9100"] labels: instance: "rke2-agent1" vlan: "mgmt" cluster: "rke2" role: "agent" - targets: ["10.0.56.13:9100"] labels: instance: "rke2-agent2" vlan: "mgmt" cluster: "rke2" role: "agent" # Mac 
mini macOS runner node (INFRA VLAN) - job_name: "macmini-node" scrape_timeout: 15s static_configs: - targets: ["10.0.56.115:9100"] labels: instance: "macmini" host: "macmini.iamworkin.lan" vlan: "infra" arch: "arm64" role: "macos-runner" puppet_managed: "true" puppet_server: "puppet.iamworkin.lan" # In-cluster node-exporter DaemonSet - job_name: "k8s-node-exporter" kubernetes_sd_configs: - role: endpoints namespaces: names: ["monitoring"] relabel_configs: - source_labels: [__meta_kubernetes_endpoints_name] action: keep regex: node-exporter - source_labels: [__meta_kubernetes_endpoint_node_name] target_label: instance # pfSense SNMP via snmp-exporter - job_name: "snmp-pfsense" static_configs: - targets: ["10.0.56.1"] metrics_path: /snmp params: module: [if_mib] auth: [bluejay_v2] relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ replacement: snmp-exporter.monitoring.svc:9116 # UniFi Cloud Key SNMP — DISABLED 2026-04-26 # The Cloud Key Gen2+ runs unifi-core (controller) only — not a network # device — and does NOT run an SNMP agent on UDP/161. Scrapes were # silently failing with "connection refused" from 10.42.x.x:161 every # 30s, polluting up{} = 0 and lastError on the Targets page. Hardware # health (CPU/mem/disk) for the Cloud Key host should come from # node_exporter via SSH — not SNMP. 
# - job_name: "snmp-cloudkey" # static_configs: # - targets: ["10.0.56.3"] # metrics_path: /snmp # params: # module: [if_mib] # auth: [bluejay_v2] # relabel_configs: # - source_labels: [__address__] # target_label: __param_target # - source_labels: [__param_target] # target_label: instance # - target_label: __address__ # replacement: snmp-exporter.monitoring.svc:9116 # UniFi Switch SNMP - job_name: "snmp-switch" static_configs: - targets: ["10.0.56.2"] metrics_path: /snmp params: module: [if_mib] auth: [bluejay_v2] relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ replacement: snmp-exporter.monitoring.svc:9116 # Synology NAS SNMP - job_name: "snmp-nas" static_configs: - targets: ["10.0.58.3"] metrics_path: /snmp params: module: [synology] auth: [bluejay_v2] relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ replacement: snmp-exporter.monitoring.svc:9116 # Prometheus self-monitoring - job_name: "prometheus" static_configs: - targets: ["localhost:9090"] # Edge nodes (PROD VLAN) - job_name: "edge-nodes" static_configs: - targets: ["10.0.57.17:9100"] labels: instance: "edge1" vlan: "prod" arch: "arm64" role: "ai-inference" puppet_managed: "true" puppet_server: "puppet.iamworkin.lan" - targets: ["10.0.57.16:9100"] labels: instance: "edge2" vlan: "prod" arch: "arm64" role: "ci-runner" puppet_managed: "true" puppet_server: "puppet.iamworkin.lan" - targets: ["10.0.58.25:9100"] labels: instance: "piez" vlan: "home" arch: "arm64" role: "prototyping" - targets: ["10.0.58.113:9100"] labels: instance: "pirelay" vlan: "home" arch: "arm64" role: "relay-controller" # ======================================================================= # PiManager Application Metrics (relay states, temps, automation) # 
======================================================================= - job_name: "pimanager-app" scrape_interval: 15s metrics_path: /metrics static_configs: - targets: ["10.0.58.25:5000"] labels: instance: "piez" service: "pimanager" vlan: "home" device: "pi4-ezconnect" - targets: ["10.0.58.113:5100"] labels: instance: "pirelay" service: "pimanager" vlan: "home" device: "pi3-ks0212" # Epson ET-3750 EcoTank Printer SNMP - job_name: "snmp-printer" scrape_interval: 5m scrape_timeout: 30s static_configs: - targets: ["10.0.58.107"] labels: instance: "epson-ecotank" vlan: "home" device_type: "printer" metrics_path: /snmp params: module: [printer_mib] auth: [public_v2] relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ replacement: snmp-exporter.monitoring.svc:9116 # ============================================================================= # Print Services (CUPS + Print.Web on edge2) # ============================================================================= # CUPS Prometheus exporter (cups_exporter on edge2:9628) - job_name: "cups" scrape_interval: 30s static_configs: - targets: ["10.0.57.16:9628"] labels: instance: "edge2" service: "cups" device_type: "printer" printer_model: "NuPrint 210" # Print.Web OTEL metrics (print counters/histograms plus Ollama runner gauges) - job_name: "printweb-otel" scrape_interval: 30s metrics_path: /metrics/prometheus static_configs: - targets: ["10.0.57.16:5200"] labels: instance: "print-web" service: "print-web" device_type: "printer" printer_model: "NuPrint 210" # Print.Web health (Blazor app on edge2:5200) # Target `/health` (anonymous) — root path requires API key auth and returns 401. 
- job_name: "probe-printweb" metrics_path: /probe params: module: [http_2xx] scrape_interval: 30s static_configs: - targets: ["http://10.0.57.16:5200/health"] labels: instance: "print-web" service: "print-web" relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ replacement: blackbox-exporter.monitoring.svc:9115 # FlowerCore.RemoteDesktop web health (public cluster VIP) # Module is https_internal — desktop.iamworkin.lan uses a step-ca leaf # cert; blackbox does NOT trust step-ca root, so http_2xx fails with # x509 unknown authority and probe_success=0 even when /health 200s. - job_name: "probe-remotedesktop" metrics_path: /probe params: module: [https_internal] scrape_interval: 30s static_configs: - targets: ["https://desktop.iamworkin.lan/health"] labels: instance: "https://desktop.iamworkin.lan/health" service: "remotedesktop-web" relabel_configs: - source_labels: [__address__] target_label: __param_target - target_label: __address__ replacement: blackbox-exporter.monitoring.svc:9115 # FlowerCore.RemoteDesktop /metrics (direct scrape for counters) - job_name: "fc-remotedesktop" metrics_path: /metrics scheme: https scrape_interval: 30s tls_config: insecure_skip_verify: true static_configs: - targets: ["desktop.iamworkin.lan"] labels: service: "remotedesktop-web" # CUPS web UI health (port 631) - job_name: "probe-cups" metrics_path: /probe params: module: [http_2xx] scrape_interval: 60s static_configs: - targets: ["http://10.0.57.16:631/"] labels: instance: "cups-edge2" service: "cups" relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ replacement: blackbox-exporter.monitoring.svc:9115 # ============================================================================= # AI Stack Health Probes (Blackbox Exporter) # 
============================================================================= # NOTE: probe-ollama-local and probe-agentzero-local were REMOVED # 2026-04-26. They pointed at 10.0.58.100 (HOME VLAN) which is not # reachable from cluster pods (firewalled). They had been firing as # OllamaDown / AgentZeroDown since 2026-04-24. Workstation/AI-laptop # Ollama and Agent Zero should be monitored via host-side Puppet # (node_exporter on the box) once the AI laptop is running 24/7. # Ollama API — edge1 Pi 5 (NUC Agent Zero) - job_name: "probe-ollama-edge1" metrics_path: /probe params: module: [http_ollama] scrape_interval: 30s static_configs: - targets: ["http://10.0.57.17:11434/api/tags"] labels: instance: "ollama-edge1" service: "ollama" deployment: "nuc" gpu: "cpu" relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ replacement: blackbox-exporter.monitoring.svc:9115 # Agent Zero Web UI — in-cluster (RKE2) # Target uses short svc form (agent-zero.agent-zero.svc) NOT # cluster.local FQDN — the *.cluster.local form gets rewritten to # 10.0.56.200 (Traefik VIP) by the CoreDNS iamworkin.lan template + # ndots:5 search-suffix expansion. Memory: feedback_coredns_ndots_template_collision. 
- job_name: "probe-agentzero-nuc" metrics_path: /probe params: module: [http_2xx] scrape_interval: 30s static_configs: - targets: ["http://agent-zero.agent-zero.svc:80/"] labels: instance: "agent-zero-nuc" service: "agent-zero" deployment: "nuc" relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ replacement: blackbox-exporter.monitoring.svc:9115 # ============================================================================= # K8s Cluster State (kube-state-metrics, cert-manager, traefik) # ============================================================================= # Use in-cluster ClusterIP service DNS — NOT NodePorts — so a same-node # NodePort hairpin doesn't break the scrape (hit on rke2-agent1 hosting # both prometheus and traefik on 2026-04-26: 10.0.56.12:30900 timed out # from prometheus while .11/.13 worked). NodePorts at 30900-30902 are # still useful for noc1-Podman-style external scrapers, but in-cluster # we should always use the svc DNS form. # kube-state-metrics — exposes K8s object state (pods, deployments, nodes) # Required for KubeContainerRestartingFrequently / KubePodNotReady alerts. - job_name: "kube-state-metrics" scrape_interval: 30s static_configs: - targets: ["kube-state-metrics.kube-system.svc:8080"] labels: cluster: "rke2" # cert-manager — exposes certmanager_certificate_ready_status, # certmanager_certificate_expiration_timestamp_seconds, etc. Drives the # CertManagerCertificateNotReady / CertManagerCertificateRenewalFailed # alerts. Memory: project_cert_manager_prometheus_scrape. - job_name: "cert-manager" scrape_interval: 30s static_configs: - targets: ["cert-manager-metrics.cert-manager.svc:9402"] labels: cluster: "rke2" # Traefik — request rates, latency, TLS cert metadata, router state. 
# ClusterIP svc routes to one of the traefik pods; per-pod scrape via # the headless `traefik-metrics` selector would be nicer for failover # visibility but the single-replica scrape is enough for steady-state. - job_name: "traefik" scrape_interval: 15s static_configs: - targets: ["traefik-metrics.traefik-system.svc:9100"] labels: service: "traefik" cluster: "rke2" # Longhorn — exposes longhorn_volume_robustness, longhorn_backup_*, # longhorn_node_status_*. Enables LonghornVolumeUnhealthy + # LonghornBackupFailed alerts (no real visibility into Longhorn # health before this — was relying on K8s events which are noisy # transient lifecycle messages, not actionable signals). - job_name: "longhorn" scrape_interval: 30s static_configs: - targets: ["longhorn-backend.longhorn-system.svc:9500"] labels: service: "longhorn" cluster: "rke2" # FC web services through Traefik — single probe surface to spot any # iamworkin.lan host returning non-200. Uses https_internal because all # certs are step-ca leaves; blackbox would x509-fail with http_2xx. # Some services need explicit healthcheck paths because root returns # 404 (acme, guac) or 401 (grafana, prometheus). Drop them or point at # the right endpoint — don't lower valid_status_codes globally because # 401 from a healthy pod and 401 from an outage look identical. 
- job_name: "probe-traefik-services" metrics_path: /probe params: module: [https_internal] scrape_interval: 60s static_configs: - targets: # Root-reachable services (200 or 3xx) - "https://gitea.iamworkin.lan/" - "https://argocd.iamworkin.lan/" - "https://intranet.iamworkin.lan/" - "https://signage.iamworkin.lan/" - "https://kiosk.iamworkin.lan/" - "https://media.iamworkin.lan/" - "https://mysql.iamworkin.lan/" - "https://php.iamworkin.lan/" - "https://zabbix.iamworkin.lan/" - "https://desktop.iamworkin.lan/" - "https://print.iamworkin.lan/" - "https://dns.iamworkin.lan/" - "https://chat.iamworkin.lan/" - "https://dist.iamworkin.lan/" - "https://dms.iamworkin.lan/" - "https://menuboard.iamworkin.lan/" - "https://messageboard.iamworkin.lan/" - "https://presentations.iamworkin.lan/" - "https://retail.iamworkin.lan/" - "https://ttsreader.iamworkin.lan/" # Explicit healthcheck paths - "https://fc-llm-bridge.iamworkin.lan/healthz" - "https://acme.iamworkin.lan/health" # NOTE: services intentionally NOT in this probe surface # - grafana.iamworkin.lan: every endpoint (incl. /api/health # and /login) returns 401 behind Traefik basic-auth. # Health covered by in-cluster monitoring-grafana scrape. # - prometheus.iamworkin.lan: same auth pattern. Health covered # by the prometheus self-scrape job. # - guac.iamworkin.lan: deprecated — Guacamole moved to # desktop.iamworkin.lan/guacamole/ (memory: # feedback_traefik_cross_namespace_refs_disabled). 
labels: probe_type: "traefik-service" relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] regex: "https?://([^/:]+).*" target_label: instance - target_label: __address__ replacement: blackbox-exporter.monitoring.svc:9115 # ============================================================================= # Self-monitoring (K8s monitoring namespace) # ============================================================================= - job_name: "monitoring-grafana" metrics_path: /metrics static_configs: - targets: ["grafana.monitoring.svc:3000"] labels: instance: "grafana-k8s" service: "grafana" - job_name: "monitoring-blackbox" static_configs: - targets: ["blackbox-exporter.monitoring.svc:9115"] labels: instance: "blackbox-k8s" service: "blackbox" recording-rules.yml: | groups: - name: node-aggregations interval: 30s rules: - record: instance:node_cpu_usage:avg5m expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) - record: instance:node_memory_usage:percent expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 - record: instance:node_disk_usage:percent expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 - record: instance:node_network_receive:rate5m expr: rate(node_network_receive_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8 - record: instance:node_network_transmit:rate5m expr: rate(node_network_transmit_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8 - name: probe-aggregations interval: 30s rules: - record: service:probe_success:min expr: min by(service) (probe_success) - record: service:probe_duration:avg expr: avg by(service) (probe_duration_seconds) - name: print-rates interval: 30s rules: - record: print:jobs_per_minute:rate5m expr: rate(print_jobs_enqueued_total[5m]) * 60 - record: print:success_rate:ratio5m expr: rate(print_jobs_completed_total[5m]) / 
rate(print_jobs_enqueued_total[5m]) - record: print:job_duration_p95:5m expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m])) - record: print:ollama_runner_keepalive_remaining_seconds:max expr: max by(instance, model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"}) - name: relay-rates interval: 15s rules: - record: relay:state_changes:1h expr: changes(pimanager_relay_state[1h]) - record: epson:pages_per_day:rate24h expr: increase(prtMarkerLifeCount{job="snmp-printer"}[24h]) alerts.yml: | groups: - name: ai-stack rules: - alert: OllamaDown expr: probe_success{service="ollama"} == 0 for: 2m labels: severity: warning annotations: summary: "Ollama is down on {{ $labels.deployment }}" description: "Ollama API at {{ $labels.instance }} has been unreachable for 2 minutes. Agent Zero FAISS memory will fail." - alert: AgentZeroDown expr: probe_success{service="agent-zero"} == 0 for: 2m labels: severity: warning annotations: summary: "Agent Zero is down on {{ $labels.deployment }}" description: "Agent Zero web UI at {{ $labels.instance }} has been unreachable for 2 minutes." - alert: OllamaSlowResponse expr: probe_duration_seconds{service="ollama"} > 3 for: 5m labels: severity: info annotations: summary: "Ollama responding slowly on {{ $labels.deployment }}" description: "Ollama API response time exceeds 3s for 5 minutes. GPU may be overloaded." - name: print-services rules: - alert: CUPSExporterDown expr: up{job="cups"} == 0 for: 2m labels: severity: warning annotations: summary: "CUPS exporter unreachable on edge2" description: "cups_exporter at edge2:9628 has been down for 2 minutes. CUPS monitoring offline." - alert: CUPSWebUIDown expr: probe_success{job="probe-cups"} == 0 for: 3m labels: severity: warning annotations: summary: "CUPS web UI down on edge2" description: "CUPS port 631 unreachable for 3 minutes. Network printing unavailable." 
- alert: PrintWebDown expr: probe_success{job="probe-printweb"} == 0 for: 2m labels: severity: warning annotations: summary: "Print.Web is down on edge2" description: "FlowerCore Print.Web at edge2:5200 unreachable. API/MCP/Blazor printing unavailable." - alert: CUPSPrinterStopped expr: cups_printer_state_total{state="stopped"} > 0 for: 5m labels: severity: warning annotations: summary: "CUPS printer stopped on edge2" description: "A CUPS printer has been in stopped state for 5 minutes. Check USB connection or paper." - alert: CUPSJobBacklog expr: cups_job_active_total > 10 for: 2m labels: severity: warning annotations: summary: "Print queue backlog on edge2 ({{ $value }} active jobs)" description: "CUPS has {{ $value }} active jobs queued. Possible printer jam, USB disconnect, or paper out." # Paper roll lifecycle alerts (XL Track I, 2026-04-26). # Source-of-truth gauge: print_paper_remaining_percent (Print.Web OTEL, # hydrated on startup from the active PaperRoll row). # alert_channel=thermal_print routes through irc-notify -> Print.Web # /api/print/alert so the printer announces its own paper-out warning # on its remaining paper. Self-referential humor + operator nudge. - alert: PrintPaperRollLow expr: print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5 for: 5m labels: severity: warning alert_channel: thermal_print annotations: summary: "Print roll low on edge2 ({{ $value | printf \"%.1f\" }}% remaining)" description: "NuPrint 210 paper roll has {{ $value | printf \"%.1f\" }}% remaining. Operator should load a fresh roll soon. Run /api/paper/status for the precise mm + estimated jobs left." 
- alert: PrintPaperRollCritical expr: print_paper_remaining_percent{job="printweb-otel"} <= 5 for: 2m labels: severity: critical alert_channel: thermal_print annotations: summary: "Print roll critical on edge2 ({{ $value | printf \"%.1f\" }}% remaining)" description: "NuPrint 210 paper roll at {{ $value | printf \"%.1f\" }}% — load a new roll NOW. The 50ft roll has a ~12% red-stripe zone; once paper passes that, the printer can run dry mid-job." - alert: PrintJobDeadLetter expr: increase(print_jobs_dead_letter_total[15m]) > 0 for: 1m labels: severity: warning alert_channel: thermal_print annotations: summary: "Print job(s) entered dead-letter on edge2 ({{ $value | printf \"%.0f\" }} in last 15m)" description: "{{ $value | printf \"%.0f\" }} print job(s) exhausted MaxRetries and need operator action. Open /print-log, filter Status=DeadLetter, click 'Retry From Start' after fixing the underlying cause (paper jam, USB disconnect, printer power-cycle)." - alert: CUPSHighJobRate expr: rate(cups_job_total[5m]) * 60 > 30 for: 5m labels: severity: info annotations: summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)" description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop." - alert: PrintOllamaRunnerLongKeepAlive expr: ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"} > 600 for: 2m labels: severity: warning alert_channel: thermal_print annotations: summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})" description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes." 
- name: macmini-runners rules: - alert: MacMiniRunnerOffline expr: (flowercore_github_runner_online{runner=~"macmini-.*"} == 0) or absent(flowercore_github_runner_online{runner=~"macmini-.*"}) for: 10m labels: severity: warning service: github-runner annotations: summary: "Mac mini GitHub runner offline ({{ $labels.runner }})" description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-.plist; runners survive reboot and do not require a GUI session." - name: linux-runners rules: - alert: LinuxRunnerOffline expr: | kube_deployment_status_replicas_ready{ namespace="github-runner", deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))" } == 0 for: 5m labels: severity: warning alert_channel: irc service: github-runner team: ci annotations: summary: "Linux CI runner offline: {{ $labels.deployment }}" description: "Deployment {{ $labels.deployment }} in namespace github-runner has 0 ready replicas for more than 5 minutes. CI jobs targeting this repo will queue until the runner pod restarts and re-registers with GitHub. Check pods with: kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }}. Check logs with: kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50. Common causes: PAT missing repo access, runner CrashLoopBackOff, or node/resource pressure." - name: remote-desktop rules: - alert: RemoteDesktopWebDown expr: probe_success{job="probe-remotedesktop",instance="https://desktop.iamworkin.lan/health"} == 0 for: 3m labels: severity: warning annotations: summary: "FlowerCore RemoteDesktop web is down" description: "https://desktop.iamworkin.lan/health probe has failed for 3 minutes. Catalog + session launch surface offline." 
- alert: RemoteDesktopMetricsStale expr: absent(fc_desktop_session_events_total) for: 10m labels: severity: warning annotations: summary: "RemoteDesktop /metrics scrape returning no data" description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape target is misconfigured or the web deployment stopped exporting metrics. Zabbix template carries the same 10m no-data trigger for cross-monitor parity." # PUBLISHER QUIRK: fc_desktop_pool_depleted / _deficit emit one # series per template per status (Ready/Warming/BelowDesiredSize/ # Disabled), and the historical series for non-current statuses # stay at their last value. So just `_depleted > 0` fires forever # on any template that ever entered a bad state. # # SAFE PATTERN: alert only when the canonical "Ready" status # gauge does NOT report ready=1 for the enabled template. This # is the publisher's own canary — _ready{status="Ready"}==1 is # always the current "everything is fine" signal. - alert: RemoteDesktopPoolDepleted expr: | group by(template) (fc_desktop_pool_ready{enabled="true"}) unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1) for: 5m labels: severity: warning annotations: summary: "RemoteDesktop pool depleted ({{ $labels.template }})" description: "Pool for template {{ $labels.template }} has no Ready warm pod for 5 minutes. New launches will cold-start. Check pod-scheduling failures, image pull issues, or exhausted node capacity." # Same pattern, but only fires when template explicitly reports # a sustained Warning-level alert state (current-status series). 
- alert: RemoteDesktopPoolDeficitSustained expr: | fc_desktop_pool_deficit{enabled="true",alert_level="Warning"} > 0 unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1) for: 10m labels: severity: info annotations: summary: "RemoteDesktop pool {{ $labels.template }} below desired for 10m" description: "Pool {{ $labels.template }} has a persistent deficit of {{ $value }} warm pods AND no Ready series. Likely image pull, NFS affinity, or claim-init issue." - alert: RemoteDesktopSessionChurnSpike expr: sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20 for: 5m labels: severity: info annotations: summary: "RemoteDesktop launch rate high ({{ $value | printf \"%.0f\" }}/min)" description: "Launch events exceed 20/min for 5 minutes. Could be a user-facing feature launch, a pooled template thrashing, or a runaway automation loop." - alert: RemoteDesktopRecordingEventsDropped expr: absent_over_time(fc_desktop_session_events_total{event="recording"}[30m]) and on() (sum(fc_desktop_session_events_total{event="launch"}) > 0) for: 15m labels: severity: info annotations: summary: "RemoteDesktop recording events silent for 30m despite active launches" description: "No recording events in 30 minutes while launches are happening. Recording may be silently disabled on all templates (SessionRecordingEnabled=false), the guacd NFS mount may be unhealthy, or the retention sweep isn't emitting events. Not an error by itself — worth checking." # Match by job — instance label carries full URL incl. /health, # not just hostname, so a hostname-only match never fires. - alert: RemoteDesktopTlsExpiry expr: probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time() < 2 * 86400 for: 6h labels: severity: critical annotations: summary: "desktop.iamworkin.lan TLS cert expires within 2 days" description: "The desktop.iamworkin.lan cert is inside the 2-day renewal window and cert-manager has not renewed. 
Check cert-manager logs, step-ca reachability, and pfSense DNS overrides per the ACME DNS-01 gate." - name: pi-fleet rules: - alert: PiManagerDown expr: up{job="pimanager-app"} == 0 for: 3m labels: severity: warning annotations: summary: "PiManager down on {{ $labels.instance }}" description: "PiManager app on {{ $labels.instance }} ({{ $labels.device }}) unreachable for 3 minutes." - alert: PiCpuTempHigh expr: pimanager_cpu_temperature_celsius > 75 for: 5m labels: severity: warning annotations: summary: "Pi CPU temperature high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)" - alert: PiCpuTempCritical expr: pimanager_cpu_temperature_celsius > 82 for: 2m labels: severity: critical annotations: summary: "Pi CPU temperature CRITICAL on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)" - alert: PiMemoryHigh expr: pimanager_memory_usage_percent > 90 for: 5m labels: severity: warning annotations: summary: "Pi memory usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)" - alert: PiDiskHigh expr: pimanager_disk_usage_percent > 85 for: 10m labels: severity: warning annotations: summary: "Pi disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)" - alert: RelayAllOff expr: sum by (instance) (pimanager_relay_state) == 0 and pimanager_relay_channel_count > 0 for: 0m labels: severity: info annotations: summary: "All relay channels OFF on {{ $labels.instance }}" - alert: PiWifiWeak expr: pimanager_wifi_signal_dbm < -75 and pimanager_wifi_signal_dbm != 0 for: 10m labels: severity: warning annotations: summary: "Weak WiFi signal on {{ $labels.instance }} ({{ $value }}dBm)" - name: snmp-devices rules: - alert: EpsonInkLow expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 15 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0 for: 0m labels: severity: warning alert_channel: thermal_print annotations: summary: "Epson ink low: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%" - alert: EpsonInkCritical expr: 
prtMarkerSuppliesLevel{job="snmp-printer"} < 5 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0 for: 0m labels: severity: critical alert_channel: thermal_print annotations: summary: "Epson ink CRITICAL: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%" # for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min # of idle and SNMP times out, so 5m for: would page nightly. A # genuine printer outage (jam, disconnected) lasts well over 30m. - alert: EpsonPrinterDown expr: up{job="snmp-printer"} == 0 for: 30m labels: severity: warning annotations: summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)" - alert: SynologyDiskLow expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85 for: 10m labels: severity: warning alert_channel: thermal_print annotations: summary: "Synology NAS storage high ({{ $value | printf \"%.1f\" }}%)" - alert: SynologyDown expr: up{job="snmp-nas"} == 0 for: 3m labels: severity: critical alert_channel: thermal_print annotations: summary: "Synology NAS SNMP unreachable" - name: infrastructure rules: - alert: NodeDown expr: up{job=~"node-exporter|rke2-nodes|edge-nodes|k8s-node-exporter"} == 0 for: 2m labels: severity: critical annotations: summary: "Node {{ $labels.instance }} is down" - alert: HighCPU expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85 for: 10m labels: severity: warning annotations: summary: "High CPU on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)" - alert: HighMemory expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90 for: 5m labels: severity: warning annotations: summary: "High memory usage on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)" - alert: DiskSpaceLow expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 85 for: 10m labels: severity: warning annotations: summary: "Disk usage high on {{ $labels.instance 
}} ({{ $value | printf \"%.1f\" }}%)" # Puppet agent + service alerts. # Mirror of FlowerCore.Notes/scripts/monitoring/alerts.yml `puppet` group # so a future migration to in-cluster Prometheus inherits the ruleset. # Source-of-truth for the live Podman Prometheus on noc1 is the Notes file. # See feedback_monitoring_k8s_target_vs_live_podman. - name: puppet rules: - alert: PuppetAgentReportStale expr: puppet_last_run_age_seconds > 7200 for: 30m labels: severity: warning alert_channel: irc annotations: summary: "Puppet agent {{ $labels.instance }} hasn't reported in over 2h" description: "Last run age: {{ $value | humanizeDuration }}. The puppet agent on {{ $labels.instance }} may be stopped, the node may be powered off, or noc1 may be unreachable from this node." runbook: "1. SSH to node (via noc1 jumpbox if needed) 2. sudo systemctl status puppet 3. sudo puppet agent -t --noop to force a run 4. Check r10k: ssh fcadmin@10.0.56.10 'sudo podman logs openvoxserver --tail 50' 5. Verify noc1 reachability: ping puppet.iamworkin.lan" - alert: PuppetAgentReportCritical expr: puppet_last_run_age_seconds > 86400 for: 1h labels: severity: critical alert_channel: irc annotations: summary: "Puppet agent {{ $labels.instance }} silent for over 24h — node is unmanaged" description: "Last run age: {{ $value | humanizeDuration }}. Node {{ $labels.instance }} has not submitted a Puppet report in over 24 hours. Config drift is accumulating — investigate immediately. If intentional (maintenance), add to the exclusion filter or silence in Grafana." runbook: "URGENT: 1. Check node power state 2. SSH via noc1 jumpbox: ssh fcadmin@10.0.56.10 then ssh 3. sudo systemctl status puppet 4. sudo systemctl start puppet + sudo puppet agent -t 5. Check for network partitions (VLAN connectivity to 10.0.56.10) 6. 
If node was recently reimaged: sudo puppet agent -t to re-register with new SSL cert" # Sprint 33 Cx-7 Phase B (2026-05-25 postmortem follow-up): # Detects puppet.service in failed state — distinct from PuppetAgentReportStale # which catches "agent hasn't run." This catches "systemd gave up restarting it" # (CA-verify loop or other fatal exit). Requires node-exporter systemd collector # enabled with --collector.systemd. If `node_systemd_unit_state` has no series # for a node, the collector is disabled there — flag in postmortem follow-up. - alert: PuppetServiceFailed expr: node_systemd_unit_state{name="puppet.service",state="failed"} == 1 for: 5m labels: severity: warning alert_channel: irc annotations: summary: "Puppet service failed on {{ $labels.instance }}" description: "puppet.service on {{ $labels.instance }} has been in failed state for 5+ minutes. systemd has stopped auto-restarting (CA-verify-loop or other exit). Manual `systemctl status puppet` confirms. Run `sudo systemctl start puppet` to recover; investigate journal for root cause." runbook_url: "https://github.com/astoltz/FlowerCore.Notes/blob/master/memory/feedback_puppet_service_dead_after_ca_loop_alert_misreads.md" # K8s pod-state alerts. Require kube-state-metrics scrape (added # 2026-04-26 — see scrape_configs above). Would have surfaced the # agent-zero ollama-proxy 172x crash-loop instead of letting it # silently churn for ~3 days. - name: kubernetes-state rules: - alert: KubeContainerRestartingFrequently expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 for: 15m labels: severity: warning annotations: summary: "{{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} restarting >5x/hr" description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason." 
# Critical tier of the restart alerts: more than 3 restarts inside a 15m
# window, sustained for 5m. Routed to the thermal-print channel.
- alert: KubeContainerCrashLooping
  expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
  for: 5m
  labels:
    severity: critical
    alert_channel: thermal_print
  annotations:
    summary: "{{ $labels.namespace }}/{{ $labels.pod }} crashlooping ({{ $value | printf \"%.0f\" }} restarts/15m)"
    description: "Container {{ $labels.container }} restarted {{ $value | printf \"%.0f\" }} times in 15 minutes — actively crashlooping."

# Any pod that stays in Pending, Failed or Unknown for 15m. The sum
# collapses the per-phase series to one result per namespace/pod.
- alert: KubePodNotReady
  expr: sum by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Failed|Unknown"}) > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "{{ $labels.namespace }}/{{ $labels.pod }} not Ready for >15m"
    description: "Pod is in a non-Running, non-Succeeded phase for over 15 minutes. Common causes: ImagePullBackOff (registry/Nexus down, wrong image tag), pending PVC, scheduling failure (taint/resources)."

# A container waiting on ImagePullBackOff / ErrImagePull for 10m.
- alert: KubePodImagePullBackOff
  expr: sum by(namespace, pod) (kube_pod_container_status_waiting_reason{reason=~"ImagePullBackOff|ErrImagePull"}) > 0
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "{{ $labels.namespace }}/{{ $labels.pod }} ImagePullBackOff for >10m"
    description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."

# Desired vs. available replicas disagree for 15m.
# A filtering comparison (`a != b`) returns the LEFT-hand sample, so the
# available-replica metric is placed on the left: {{ $value }} in the
# annotation is then the available count. The desired count is not a label
# on either series, so the description no longer tries to read one.
- alert: KubeDeploymentReplicasMismatch
  expr: kube_deployment_status_replicas_available != kube_deployment_spec_replicas
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
    description: "Only {{ $value }} replica(s) available, which does not match the replica count in the Deployment spec. Likely a rollout stuck on probe failure, scheduling, or PVC."

# Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
# cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
# outage (21h) hit because no alert fired on the rising multus working
# set — only downstream blackbox / Traefik / service alerts.
With # 1Gi limit (bluejay-infra@eb8693e), 80% = ~800MiB; steady-state # runs ~150-250MiB so this only fires when an avalanche starts. - alert: MultusMemoryPressure expr: | container_memory_working_set_bytes{container="kube-multus"} / container_spec_memory_limit_bytes{container="kube-multus"} > 0.8 for: 5m labels: severity: critical alert_channel: thermal_print annotations: summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m" description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)." # Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the # operator-leak avalanche pattern BEFORE it cascades into a multus # CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder) # emitting pods without ownerReferences will accumulate them when # the operator crashes. >25 pending pods in any namespace for 30m # is the signal to investigate the reconciler. - alert: NamespacePendingPodBacklog expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25 for: 30m labels: severity: warning annotations: summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m" description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade." # Longhorn storage health alerts. Required: longhorn scrape job # (added 2026-04-26 — see scrape_configs above). The K8s events # for "snapshot becomes not ready to use" are transient lifecycle # noise, not actionable — these alerts use the actual Longhorn # gauges that reflect persistent state. - name: longhorn-storage rules: # Volume robustness: 0=unknown, 1=healthy, 2=degraded, 3=faulted. 
# Detached volumes report 0 — that's normal for unattached PVCs,
# so filter to only attached.
#
# The robustness state is carried in the sample VALUE (2 = degraded,
# 3 = faulted), not in a `robustness` label, so the rules compare the value
# directly. Matching 2 / 3 also leaves out detached volumes, which report 0.
- alert: LonghornVolumeDegraded
  expr: longhorn_volume_robustness == 2
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "Longhorn volume {{ $labels.volume }} degraded for >15m"
    description: "Volume {{ $labels.volume }} on node {{ $labels.node }} has been degraded (one or more replicas unhealthy) for 15+ minutes. Auto-rebuild may need help — check 'kubectl describe volume.longhorn.io {{ $labels.volume }} -n longhorn-system'."

- alert: LonghornVolumeFaulted
  expr: longhorn_volume_robustness == 3
  for: 5m
  labels:
    severity: critical
    alert_channel: thermal_print
  annotations:
    summary: "Longhorn volume {{ $labels.volume }} FAULTED"
    description: "Volume {{ $labels.volume }} on node {{ $labels.node }} is faulted — all replicas unavailable. Data inaccessible. Manual intervention required."

# No backup in 36h indicates the daily-backup recurringJob is
# silently failing. Allows for one missed run + slack.
#
# NOTE(review): this expression subtracts a byte count
# (longhorn_backup_actual_size_bytes) from a Unix timestamp (time()), so
# the result is not a backup age and the 36h threshold is not meaningful.
# It also selects on a `state` label of longhorn_backup_state; presumably
# that metric, like volume robustness, encodes its state in the value —
# TODO confirm against the Longhorn metrics reference. Left unchanged
# because no backup-completion timestamp metric is visible in this file.
- alert: LonghornBackupStale
  expr: |
    (time() - max by(volume) (longhorn_backup_state{state="Completed"} * on(backup) group_left() longhorn_backup_actual_size_bytes)) > 36 * 3600
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: "Longhorn volume {{ $labels.volume }} has no completed backup in >36h"
    description: "Daily backup recurringJob (cron 0 2 * * *) appears to have skipped this volume. Check 'kubectl get backups.longhorn.io -n longhorn-system' and the daily-backup CronJob logs."

# Fires when the `ready` condition series is 0 and carries a non-empty
# reason, sustained for 5m.
- alert: LonghornNodeUnhealthy
  expr: longhorn_node_status{condition="ready",condition_reason!=""} == 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Longhorn node {{ $labels.node }} not Ready"
    description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."
# ============================================================
# FC Signage Marquee Performance — Track 3 + 8 (2026-05-06)
# Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
# Source-of-truth for the live Podman Prometheus on noc1 is the
# Notes file; this K8s ConfigMap exists so a future migration to
# in-cluster Prometheus inherits the ruleset automatically.
# See feedback_monitoring_k8s_target_vs_live_podman.
# ============================================================
# All rules in this group carry service=signage and alert_channel=irc.
# Each expression ends with `unless on() absent_over_time(<metric>[7d])`,
# which suppresses the result when the metric had no samples at all in the
# last 7 days.
- name: fc-signage-marquee
  rules:
    # Ratio of dropped frames to rendered frames (5m rates), per
    # renderer/phase/node_id, above 5% for 5 minutes.
    - alert: MarqueeDroppedFramesHigh
      expr: |
        (
          sum by (renderer, phase, node_id) (rate(marquee_dropped_frames_total[5m]))
          /
          sum by (renderer, phase, node_id) (rate(marquee_render_latency_ms_count[5m]))
        ) > 0.05
        unless on() absent_over_time(marquee_dropped_frames_total[7d])
      for: 5m
      labels:
        severity: warning
        service: signage
        alert_channel: irc
      annotations:
        summary: "Marquee dropped-frame rate >5% on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
        description: "Renderer {{ $labels.renderer }} on {{ $labels.node_id }} drops >5% of frames during {{ $labels.phase }}. Animation visibly stuttery."

    # 99th-percentile render latency from the histogram buckets (5m rate),
    # per renderer/phase/node_id, above 16 (ms) for 10 minutes.
    - alert: MarqueeRenderLatencyP99High
      expr: |
        histogram_quantile(
          0.99,
          sum by (renderer, phase, node_id, le) (rate(marquee_render_latency_ms_bucket[5m]))
        ) > 16
        unless on() absent_over_time(marquee_render_latency_ms_bucket[7d])
      for: 10m
      labels:
        severity: warning
        service: signage
        alert_channel: irc
      annotations:
        summary: "Marquee render latency p99 > 16ms on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
        description: "Per-frame render latency p99 has exceeded the Pi-class 16ms budget for 10 minutes."
# Relative deviation of the median observed animation duration (15m rate
# over the histogram buckets) from the per-phase target gauge; fires at
# severity=info when the deviation stays above 10% for 15 minutes.
- alert: MarqueeAnimationDurationDrift
  expr: |
    abs(
      histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m])))
      - on (phase) group_left()
      avg by (phase) (marquee_animation_duration_target_ms)
    )
    / on (phase) group_left()
    avg by (phase) (marquee_animation_duration_target_ms)
    > 0.10
    unless on() absent_over_time(marquee_animation_duration_ms_bucket[7d])
  for: 15m
  labels:
    severity: info
    service: signage
    alert_channel: irc
  annotations:
    summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
    description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."

# =============================================================================
# ConfigMap: Blackbox Exporter Configuration
# =============================================================================
# Probe modules defined here:
#   http_2xx    - plain GET over IPv4, 5s timeout, succeeds only on HTTP 200.
#   http_ollama - same as http_2xx, and the body must also match "models".
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: blackbox-config
  namespace: monitoring
data:
  blackbox.yml: |
    modules:
      http_2xx:
        prober: http
        timeout: 5s
        http:
          valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
          valid_status_codes: [200]
          method: GET
          fail_if_body_not_matches_regexp: []
          preferred_ip_protocol: ip4
      http_ollama:
        prober: http
        timeout: 5s
        http:
          valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
          valid_status_codes: [200]
          method: GET
          fail_if_body_not_matches_regexp:
            - '"models"'
          preferred_ip_protocol: ip4
      # https_internal — for Traefik-fronted services with step-ca leaf
      # certs. blackbox does not trust the step-ca root CA, so http_2xx
      # against any *.iamworkin.lan host fails with x509 unknown authority.
      # Redirects + multiple status codes are accepted because some hosts
      # 302 to /login or /scalar.
https_internal:
  prober: http
  timeout: 10s
  http:
    valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
    valid_status_codes: [200, 301, 302, 303, 307, 308]
    method: GET
    follow_redirects: true
    preferred_ip_protocol: ip4
    tls_config:
      # Certificate verification is disabled for this module only.
      insecure_skip_verify: true

# =============================================================================
# ConfigMap: IRC Notify Script
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: irc-notify-script
  namespace: monitoring
data:
  notify.py: |
    #!/usr/bin/env python3
    """HTTP->IRC alert relay with thermal-printer DIGEST forwarding.

    Listens on :9119, posts to #alerts on UnrealIRCd, forwards to Print.Web
    /api/print/alert. Thermal printing is BATCHED into hourly digests by
    default so the printer no longer spam-fires per Grafana webhook.

    Routing (per Grafana webhook alert):
      - IRC: always per-event (operator likes the stream)
      - Thermal printer (nothing is printed as a standalone job):
          * label alert_channel=thermal_print_immediate -> enqueue, and if
            the fingerprint is new to the buffer flush the whole digest NOW
          * severity in {critical,disaster,page} OR label
            alert_channel=thermal_print -> enqueue into hourly digest
          * everything else -> IRC only
      - RESOLVED webhooks remove the alert from the digest buffer

    Env vars (defaults preserve old behavior on first deploy):
      THERMAL_PRINT_ENABLED   default "true"  - master kill switch
      BATCH_INTERVAL_MIN      default "60"    - minutes between digest prints
      BATCH_MAX_PENDING       default "50"    - force-flush threshold

    HTTP surface:
      POST /         - Grafana webhook entry
      POST /flush    - manual digest flush (idempotent)
      GET  /         - status + config + buffer depth + stats
    """
    import json, os, socket, sys, threading, time
    from collections import defaultdict
    from datetime import datetime, timezone
    from http.server import HTTPServer, BaseHTTPRequestHandler
    from urllib.request import Request, urlopen

    THERMAL_PRINT_ENABLED = os.environ.get("THERMAL_PRINT_ENABLED", "true").lower() == "true"
    BATCH_INTERVAL_MIN = int(os.environ.get("BATCH_INTERVAL_MIN", "60"))
    BATCH_MAX_PENDING =
int(os.environ.get("BATCH_MAX_PENDING", "50"))

IRC_HOST = os.environ.get("IRC_HOST", "unrealircd.irc.svc")
IRC_PORT = int(os.environ.get("IRC_PORT", "6667"))
IRC_NICK = os.environ.get("IRC_NICK", "grafana-bot")
IRC_CHANNEL = os.environ.get("IRC_CHANNEL", "#alerts")
PRINT_WEB_URL = os.environ.get("PRINT_WEB_URL", "http://10.0.57.16:5200/api/print/alert")

# Digest buffer shared between the HTTP handler and the digest thread.
_buffer_lock = threading.Lock()
_buffer = {}  # fingerprint -> {"alert": dict, "first_seen": float, "last_seen": float}
_last_flush_time = time.time()
_stats = {"webhooks_received": 0, "irc_sent": 0, "print_immediate": 0,
          "digest_flushed": 0, "buffer_dedup": 0, "buffer_added": 0,
          "buffer_resolved": 0, "started_at": time.time()}


def send_irc(message):
    """Deliver `message` to IRC_CHANNEL over a short-lived IRC connection.

    Registers as IRC_NICK, joins the channel, sends every non-blank line of
    `message` as its own PRIVMSG, then quits.

    Returns True when the lines were sent, False when registration did not
    complete within 10 seconds or any error occurred (the error is logged to
    stderr, never raised).
    """
    try:
        # The context manager closes the socket on every path, including
        # exceptions raised halfway through the exchange.
        with socket.create_connection((IRC_HOST, IRC_PORT), timeout=15) as sock:
            sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
            sock.sendall(f"USER {IRC_NICK} 0 * :Grafana Alert Bot\r\n".encode())
            registered = False
            deadline = time.time() + 10
            pending = ""
            while not registered and time.time() < deadline:
                try:
                    data = sock.recv(4096).decode("utf-8", errors="replace")
                except socket.timeout:
                    break
                if not data:
                    break
                pending += data
                # Only act on complete lines; keep a trailing partial line for
                # the next read so a PING is answered exactly once.
                *complete, pending = pending.split("\r\n")
                for line in complete:
                    if line.startswith("PING"):
                        token = line.split(" ", 1)[1] if " " in line else ""
                        sock.sendall(("PONG " + token + "\r\n").encode())
                    if " 001 " in line:  # numeric 001 = registration accepted
                        registered = True
            if not registered:
                return False
            sock.sendall(f"JOIN {IRC_CHANNEL}\r\n".encode())
            time.sleep(0.5)
            try:
                sock.recv(4096)  # drain the JOIN replies
            except socket.timeout:
                pass  # nothing to drain is not a reason to drop the alert
            # Alert text comes from webhook payloads: normalise CR so a stray
            # "\r" cannot terminate the PRIVMSG early and smuggle in a second
            # IRC command, and drop NUL bytes.
            normalised = message.replace("\r\n", "\n").replace("\r", "\n").replace("\x00", "")
            for line in normalised.split("\n"):
                if line.strip():
                    sock.sendall(f"PRIVMSG {IRC_CHANNEL} :{line}\r\n".encode())
                    time.sleep(0.3)
            time.sleep(0.5)
            sock.sendall(b"QUIT :alert delivered\r\n")
        _stats["irc_sent"] += 1
        return True
    except Exception as e:
        print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
        return False


def post_thermal(payload, kind):
    """POST `payload` as JSON to PRINT_WEB_URL.

    `kind` is a tag used in log lines; the value "immediate" additionally
    bumps the print_immediate counter. Returns True on success, False when
    thermal printing is disabled or the request failed (logged, not raised).
    """
    if not THERMAL_PRINT_ENABLED:
        print(f"[irc-notify] thermal disabled; skip {kind} ({payload.get('title','?')[:40]})", file=sys.stderr)
        return False
    try:
        req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
                      headers={"Content-Type": "application/json"}, method="POST")
        with urlopen(req, timeout=10):
            pass  # body is not needed; the context manager closes the response
        if kind == "immediate":
            _stats["print_immediate"] += 1
        print(f"[irc-notify] thermal {kind} sent: {payload.get('title','?')[:50]}", file=sys.stderr)
        return True
    except Exception as e:
        print(f"[irc-notify] thermal {kind} failed: {e}", file=sys.stderr)
        return False


def fingerprint_of(alert):
    """Return the alert's `fingerprint`, or a synthetic
    "alertname/namespace/target" key when the field is missing or empty."""
    fp = alert.get("fingerprint", "")
    if fp:
        return fp
    labels = alert.get("labels", {})
    target = (labels.get("pod") or labels.get("instance") or labels.get("deployment")
              or labels.get("statefulset") or labels.get("namespace") or "")
    return f"{labels.get('alertname','?')}/{labels.get('namespace','')}/{target}"


def is_critical(alert):
    """True when the severity label is critical, disaster or page (any case)."""
    return alert.get("labels", {}).get("severity", "").lower() in ("critical", "disaster", "page")


def is_immediate_label(alert):
    """True when the alert carries alert_channel=thermal_print_immediate."""
    return alert.get("labels", {}).get("alert_channel") == "thermal_print_immediate"


def is_batched_label(alert):
    """True when the alert carries alert_channel=thermal_print."""
    return alert.get("labels", {}).get("alert_channel") == "thermal_print"


def add_to_digest(alert):
    """Add an alert to the digest buffer.

    Returns True if the buffer GREW (new fingerprint), False if it was a
    dedup, resolution, or no-op (thermal printing disabled).
    """
    if not THERMAL_PRINT_ENABLED:
        return False
    fp = fingerprint_of(alert)
    status = alert.get("status", "firing").lower()
    with _buffer_lock:
        if status == "resolved":
            if fp in _buffer:
                del _buffer[fp]
                _stats["buffer_resolved"] += 1
            return False
        if fp in _buffer:
            _buffer[fp]["last_seen"] = time.time()
            _buffer[fp]["alert"] = alert
            _stats["buffer_dedup"] += 1
            return False
        _buffer[fp] = {"alert": alert, "first_seen": time.time(), "last_seen": time.time()}
        _stats["buffer_added"] += 1
        return True


def build_digest_payload(entries=None):
    """Build the Print.Web payload for one digest.

    `entries` is an optional list of buffer entries to render; when omitted,
    the current buffer is snapshotted under the lock. Returns None when
    there is nothing to print.
    """
    if entries is None:
        with _buffer_lock:
            entries = list(_buffer.values())
    items = entries
    if not items:
        return None
    by_name = defaultdict(list)
    for item in items:
        labels = item["alert"].get("labels", {})
        by_name[labels.get("alertname", "Unknown")].append(item)
    lines = []
    for name, group in sorted(by_name.items()):
        targets = []
        for it in group[:5]:  # list at most five targets per alert name
            labels = it["alert"].get("labels", {})
            t = (labels.get("pod") or labels.get("instance") or labels.get("deployment")
                 or labels.get("statefulset") or labels.get("namespace") or "?")
            targets.append(t)
        more = f" (+{len(group)-5})" if len(group) > 5 else ""
        sevs = sorted({it["alert"].get("labels", {}).get("severity", "warning") for it in group})
        lines.append(f"[{'/'.join(sevs)}] {name} x{len(group)}: {', '.join(targets)}{more}")
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    title = f"Alert digest: {len(items)} firing"
    body = "\n".join([
        f"=== {title} ===",
        f"as of {now}",
        "",
        *lines,
        "",
        "Stream: #alerts (IRC) | Triage: grafana-noc1.iamworkin.lan",
        "Force-flush: POST irc-notify.monitoring.svc:9119/flush",
    ])
    return {"title": title, "severity": "Warning", "host": "monitoring", "message": body,
            "eventId": f"digest-{int(time.time())}", "source": "Grafana digest",
            "status": "PROBLEM", "acknowledged": False}


def flush_digest():
    """Print the buffered alerts as one digest and drop them from the buffer.

    Returns True when the digest was delivered, False when the buffer was
    empty or delivery failed. As before, the flushed entries are dropped even
    when delivery fails.
    """
    with _buffer_lock:
        snapshot = dict(_buffer)
    payload = build_digest_payload(list(snapshot.values()))
    if payload is None:
        print("[irc-notify] flush: buffer empty, no digest sent", file=sys.stderr)
        return False
    sent = post_thermal(payload, "digest")
    with _buffer_lock:
        # Remove only what was in this digest. The POST can take seconds and
        # webhooks keep arriving on another thread; clearing the whole buffer
        # here would silently discard alerts that were never printed.
        for fp, entry in snapshot.items():
            if _buffer.get(fp) is entry:
                del _buffer[fp]
    if sent:
        _stats["digest_flushed"] += 1
    return sent


def digest_loop():
    """Background loop: every 15s, flush when the batch interval has elapsed
    or the buffer has reached BATCH_MAX_PENDING entries."""
    global _last_flush_time
    while True:
        try:
            now = time.time()
            elapsed = now - _last_flush_time
            if elapsed >= BATCH_INTERVAL_MIN * 60:
                print(f"[irc-notify] digest tick: interval reached ({BATCH_INTERVAL_MIN}m); buffer={len(_buffer)}", file=sys.stderr)
                flush_digest()
                _last_flush_time = now
            elif len(_buffer) >= BATCH_MAX_PENDING:
                print(f"[irc-notify] digest tick: buffer full ({len(_buffer)}); force flush", file=sys.stderr)
                flush_digest()
                _last_flush_time = now
            time.sleep(15)
        except Exception as e:
            print(f"[irc-notify] digest loop error: {e}", file=sys.stderr)
            time.sleep(60)


class Handler(BaseHTTPRequestHandler):
    """HTTP surface: POST / (webhook), POST /flush (manual flush), GET (status)."""

    def _send_json(self, code, payload):
        """Send a complete JSON response; `payload` is the encoded body."""
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(payload)

    def do_POST(self):
        global _last_flush_time
        if self.path == "/flush":
            ok = flush_digest()
            self._send_json(200, json.dumps({"flushed": ok, "buffer_after": len(_buffer)}).encode())
            return
        _stats["webhooks_received"] += 1
        # A malformed request gets a 400 instead of an unhandled exception
        # that leaves the caller without any response.
        try:
            length = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(length)) if length else {}
        except ValueError:  # bad Content-Length, invalid JSON, or bad encoding
            self._send_json(400, b'{"status":"error","error":"invalid JSON body"}')
            return
        if not isinstance(body, dict):
            self._send_json(400, b'{"status":"error","error":"expected a JSON object"}')
            return
        for alert in body.get("alerts") or []:
            if not isinstance(alert, dict):
                continue
            status = alert.get("status", "unknown").upper()
            labels = alert.get("labels", {})
            name = labels.get("alertname", "Unknown")
            summary = alert.get("annotations", {}).get("summary", "")
            desc = alert.get("annotations", {}).get("description", "")
            severity = labels.get("severity", "")
            icon = "\x0304[FIRING]\x03" if status == "FIRING" else "\x0303[RESOLVED]\x03"
            sev_tag = f" [{severity}]" if severity else ""
            msg = f"{icon}{sev_tag} {name}: {summary}"
            if desc:
                msg += f"\n {desc}"
            send_irc(msg)
            # Thermal routing — EVERYTHING (including criticals) goes into
            # the hourly digest. Only the explicit `alert_channel=thermal_print_immediate`
            # label bypasses, and even that flushes-the-current-digest rather
            # than printing a standalone job, so the same fingerprint can't
            # spam the printer per webhook cycle.
            if status == "RESOLVED":
                add_to_digest(alert)  # removes from buffer
                continue
            if is_immediate_label(alert):
                # Explicit opt-in for "paper this NOW" — first arrival of a
                # new fingerprint triggers an immediate digest flush; repeat
                # webhooks for the same fingerprint dedupe in the buffer
                # until the next interval or until the alert resolves.
                new_in_buffer = add_to_digest(alert)
                if new_in_buffer:
                    flush_digest()
                    _last_flush_time = time.time()
            elif is_critical(alert) or is_batched_label(alert):
                add_to_digest(alert)
            # else: IRC-only (warnings without thermal_print label)
        self._send_json(200, b'{"status":"ok"}')

    def do_GET(self):
        with _buffer_lock:
            alertnames = sorted({it["alert"].get("labels", {}).get("alertname", "?") for it in _buffer.values()})
            depth = len(_buffer)
        info = {
            "service": "irc-notify",
            "config": {"thermal_print_enabled": THERMAL_PRINT_ENABLED,
                       "batch_interval_min": BATCH_INTERVAL_MIN,
                       "batch_max_pending": BATCH_MAX_PENDING,
                       "irc_target": f"{IRC_HOST}:{IRC_PORT} {IRC_CHANNEL}",
                       "print_web_url": PRINT_WEB_URL},
            "buffer": {"depth": depth, "alertnames": alertnames,
                       "seconds_since_last_flush": int(time.time() - _last_flush_time),
                       "seconds_until_next_flush": max(0, int(BATCH_INTERVAL_MIN*60 - (time.time() - _last_flush_time)))},
            "stats": _stats,
        }
        self._send_json(200, json.dumps(info, indent=2).encode())

    def log_message(self, format, *args):
        # Log only the first argument (the request line for access-log calls);
        # fall back to the format string when no arguments were supplied.
        print(f"[irc-notify] {args[0] if args else format}", file=sys.stderr)


if __name__ == "__main__":
    threading.Thread(target=digest_loop, daemon=True).start()
    server = HTTPServer(("0.0.0.0", 9119), Handler)
    print(f"[irc-notify] :9119 -> IRC {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} | thermal={'ON' if THERMAL_PRINT_ENABLED else 'OFF'} | digest={BATCH_INTERVAL_MIN}m max={BATCH_MAX_PENDING}", file=sys.stderr)
    server.serve_forever()
#
============================================================================= # SNMP Exporter Auth Secret # ============================================================================= # The full snmp.yml (~2MB, auto-generated) exceeds the 1MB ConfigMap limit. # Strategy: store SNMP auth credentials in a Secret, and use an init container # to download the full snmp.yml from Gitea, or split into multiple ConfigMaps. # For now, we mount a minimal auth-only config and rely on the default modules # bundled in the snmp-exporter image. To use custom modules, apply # snmp-config.yaml separately (see comments in that file). --- apiVersion: v1 kind: Secret metadata: name: snmp-auth namespace: monitoring type: Opaque stringData: # SNMP v2 community string used by prometheus scrape configs SNMP_COMMUNITY_BLUEJAY: bluejay_monitor SNMP_V3_USER: bluejay_snmpv3 SNMP_V3_AUTH_PASS: BlueJay-SNMP-Auth-2026 SNMP_V3_PRIV_PASS: BlueJay-SNMP-Priv-2026 # ============================================================================= # Grafana Credentials — synced from 1Password via Operator # ============================================================================= # 1Password vault: IAmWorkin > "Grafana" # Creates K8s Secret "grafana-credentials" with fields: username, password # Grafana Deployment uses secretKeyRef to map username→GF_SECURITY_ADMIN_USER, password→GF_SECURITY_ADMIN_PASSWORD --- apiVersion: onepassword.com/v1 kind: OnePasswordItem metadata: name: grafana-credentials namespace: monitoring spec: itemPath: vaults/IAmWorkin/items/Grafana # ============================================================================= # RBAC: ServiceAccount + ClusterRole for Prometheus K8s SD # ============================================================================= --- apiVersion: v1 kind: ServiceAccount metadata: name: prometheus namespace: monitoring --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: prometheus rules: - apiGroups: [""] resources: ["nodes", 
"nodes/proxy", "services", "endpoints", "pods"] verbs: ["get", "list", "watch"] - apiGroups: ["extensions", "networking.k8s.io"] resources: ["ingresses"] verbs: ["get", "list", "watch"] - nonResourceURLs: ["/metrics"] verbs: ["get"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: prometheus roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: prometheus subjects: - kind: ServiceAccount name: prometheus namespace: monitoring # ============================================================================= # PVC: Prometheus Data (10Gi, Longhorn) # ============================================================================= --- apiVersion: v1 kind: PersistentVolumeClaim metadata: name: prometheus-data namespace: monitoring spec: accessModes: - ReadWriteOnce storageClassName: longhorn resources: requests: storage: 10Gi # ============================================================================= # PVC: Grafana Data (2Gi, Longhorn) # ============================================================================= --- apiVersion: v1 kind: PersistentVolumeClaim metadata: name: grafana-data namespace: monitoring spec: accessModes: - ReadWriteOnce storageClassName: longhorn resources: requests: storage: 2Gi # ============================================================================= # Deployment: Prometheus # ============================================================================= --- apiVersion: apps/v1 kind: Deployment metadata: name: prometheus namespace: monitoring labels: app: prometheus spec: replicas: 1 strategy: type: Recreate selector: matchLabels: app: prometheus template: metadata: labels: app: prometheus spec: serviceAccountName: prometheus securityContext: fsGroup: 65534 # nobody runAsUser: 65534 runAsGroup: 65534 containers: - name: prometheus image: docker.io/prom/prometheus:latest args: - "--config.file=/etc/prometheus/prometheus.yml" - "--storage.tsdb.path=/prometheus" - 
"--storage.tsdb.retention.time=90d" - "--web.enable-lifecycle" ports: - containerPort: 9090 name: http volumeMounts: - name: config mountPath: /etc/prometheus/prometheus.yml subPath: prometheus.yml readOnly: true - name: config mountPath: /etc/prometheus/alerts.yml subPath: alerts.yml readOnly: true - name: config mountPath: /etc/prometheus/recording-rules.yml subPath: recording-rules.yml readOnly: true - name: data mountPath: /prometheus resources: requests: cpu: 200m memory: 512Mi limits: cpu: "1" memory: 2Gi livenessProbe: httpGet: path: /-/healthy port: 9090 initialDelaySeconds: 15 periodSeconds: 30 readinessProbe: httpGet: path: /-/ready port: 9090 initialDelaySeconds: 5 periodSeconds: 10 volumes: - name: config configMap: name: prometheus-config - name: data persistentVolumeClaim: claimName: prometheus-data # ============================================================================= # ConfigMap: Grafana Dashboard Provider # ============================================================================= --- apiVersion: v1 kind: ConfigMap metadata: name: grafana-dashboard-provider namespace: monitoring data: default.yml: | apiVersion: 1 providers: - name: 'default' orgId: 1 folder: '' type: file disableDeletion: false updateIntervalSeconds: 30 options: path: /var/lib/grafana/dashboards foldersFromFilesStructure: true # ============================================================================= # ConfigMap: Grafana Dashboards (AI Stack Health) # ============================================================================= --- apiVersion: v1 kind: ConfigMap metadata: name: grafana-dashboards namespace: monitoring data: ai-stack-health.json: | { "id": null, "panels": [ { "fieldConfig": { "defaults": { "mappings": [ { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } } }, 
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, "id": 1, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"probe-ollama-local\"}", "legendFormat": "Status" } ], "title": "Ollama (Local)", "type": "stat" }, { "fieldConfig": { "defaults": { "mappings": [ { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } } }, "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, "id": 2, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"probe-ollama-edge1\"}", "legendFormat": "Status" } ], "title": "Ollama (Edge1)", "type": "stat" }, { "fieldConfig": { "defaults": { "mappings": [ { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } } }, "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, "id": 3, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"probe-agentzero-local\"}", "legendFormat": "Status" } ], "title": "Agent Zero (Local)", "type": "stat" }, { "fieldConfig": { "defaults": { "mappings": [ { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } } }, "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, "id": 4, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"probe-agentzero-nuc\"}", "legendFormat": "Status" } ], "title": "Agent Zero (NUC)", "type": "stat" }, { "fieldConfig": { "defaults": { "custom": { "fillOpacity": 15, 
"lineWidth": 2 }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 3 } ] }, "unit": "s" } }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, "id": 5, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{service=\"ollama\"}", "legendFormat": "{{ deployment }}" } ], "title": "Ollama Response Time", "type": "timeseries" }, { "fieldConfig": { "defaults": { "custom": { "fillOpacity": 15, "lineWidth": 2 }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 3 } ] }, "unit": "s" } }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, "id": 6, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{service=\"agent-zero\"}", "legendFormat": "{{ deployment }}" } ], "title": "Agent Zero Response Time", "type": "timeseries" }, { "fieldConfig": { "defaults": { "custom": { "fillOpacity": 30, "lineWidth": 1, "stacking": { "mode": "none" } }, "mappings": [ { "options": { "0": { "text": "DOWN" }, "1": { "text": "UP" } }, "type": "value" } ], "max": 1, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } } }, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 12 }, "id": 7, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{service=\"ollama\"}", "legendFormat": "Ollama ({{ deployment }})" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{service=\"agent-zero\"}", "legendFormat": "Agent Zero ({{ deployment }})" } ], "title": "Uptime History", "type": "timeseries" }, { "fieldConfig": { "defaults": { "custom": { "fillOpacity": 10, "lineWidth": 2 }, "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", 
"value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 } ] }, "unit": "percent" } }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }, "id": 8, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "100 - (avg(rate(node_cpu_seconds_total{instance=\"edge1\",mode=\"idle\"}[5m])) * 100)", "legendFormat": "CPU %" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "(1 - node_memory_MemAvailable_bytes{instance=\"edge1\"} / node_memory_MemTotal_bytes{instance=\"edge1\"}) * 100", "legendFormat": "Memory %" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "(1 - node_filesystem_avail_bytes{instance=\"edge1\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=\"edge1\",mountpoint=\"/\"}) * 100", "legendFormat": "Disk %" } ], "title": "Edge1 Pi 5 \u2014 CPU / Memory / Disk", "type": "timeseries" }, { "fieldConfig": { "defaults": { "custom": { "fillOpacity": 10, "lineWidth": 2 }, "unit": "s" } }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }, "id": 9, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_dns_lookup_time_seconds", "legendFormat": "{{ job }}" } ], "title": "Probe DNS Lookup Time", "type": "timeseries" } ], "refresh": "30s", "schemaVersion": 39, "tags": ["ai", "ollama", "agent-zero", "blue-jay"], "time": { "from": "now-1h", "to": "now" }, "timezone": "browser", "title": "AI Stack Health", "uid": "ai-stack-health", "version": 1 } # ============================================================================= # ConfigMap: Grafana Dashboard — Edge Nodes # ============================================================================= --- apiVersion: v1 kind: ConfigMap metadata: name: grafana-dashboard-edge-nodes namespace: monitoring data: bluejay-edge-nodes.json: | { "id": null, "panels": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "mappings": [ { "options": { "0": { 
"color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } ] } }, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 }, "targets": [ { "expr": "up{instance=~\"edge.*\"}", "legendFormat": "{{instance}}", "refId": "A" } ], "title": "Edge Node Status", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "max": 100, "min": 0, "unit": "percent" } }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, "targets": [ { "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge1.*\"}[5m])) * 100)", "legendFormat": "CPU %", "refId": "A" }, { "expr": "node_load1{instance=~\"edge1.*\"}", "legendFormat": "Load 1m", "refId": "B" } ], "title": "edge1 (Pi5 + Hailo) CPU", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "max": 100, "min": 0, "unit": "percent" } }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, "targets": [ { "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge2.*\"}[5m])) * 100)", "legendFormat": "CPU %", "refId": "A" }, { "expr": "node_load1{instance=~\"edge2.*\"}", "legendFormat": "Load 1m", "refId": "B" } ], "title": "edge2 (Pi4) CPU", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "max": 100, "min": 0, "unit": "percent" } }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, "targets": [ { "expr": "(1 - node_memory_MemAvailable_bytes{instance=~\"edge.*\"} / node_memory_MemTotal_bytes{instance=~\"edge.*\"}) * 100", "legendFormat": "{{instance}}", "refId": "A" } ], "title": "Edge Memory Usage", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "max": 100, "min": 0, "unit": "percent" } }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, "targets": [ { "expr": "100 - 
(node_filesystem_avail_bytes{instance=~\"edge.*\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=~\"edge.*\",mountpoint=\"/\"} * 100)", "legendFormat": "{{instance}}", "refId": "A" } ], "title": "Edge Disk Usage", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "unit": "celsius" } }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 }, "targets": [ { "expr": "node_hwmon_temp_celsius{instance=~\"edge.*\"}", "legendFormat": "{{instance}} {{chip}} {{sensor}}", "refId": "A" } ], "title": "Edge CPU Temperature", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "unit": "bps" } }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }, "targets": [ { "expr": "rate(node_network_receive_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8", "legendFormat": "{{instance}} {{device}} RX", "refId": "A" }, { "expr": "-rate(node_network_transmit_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8", "legendFormat": "{{instance}} {{device}} TX", "refId": "B" } ], "title": "Edge Network Traffic", "type": "timeseries" } ], "refresh": "30s", "schemaVersion": 40, "tags": ["bluejay", "edge"], "timezone": "browser", "title": "BlueJay Edge Nodes", "uid": "7e1603b9-e918-4b3f-a22b-163132fd5cee" } # ============================================================================= # ConfigMap: Grafana Dashboard — Network Overview # ============================================================================= --- apiVersion: v1 kind: ConfigMap metadata: name: grafana-dashboard-network-overview namespace: monitoring data: bluejay-network-overview.json: | { "id": null, "panels": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "thresholds": { "steps": [ { "color": "green", "value": null } ] } } }, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 }, "targets": [ { "expr": 
"count(up == 1)", "legendFormat": "Up", "refId": "A" }, { "expr": "count(up == 0)", "legendFormat": "Down", "refId": "B" } ], "title": "Target Health", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "max": 4, "min": 0, "thresholds": { "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 2 }, { "color": "red", "value": 3 } ] } } }, "gridPos": { "h": 6, "w": 6, "x": 0, "y": 4 }, "targets": [ { "expr": "laLoad{instance=\"10.0.56.1\", laNames=\"Load-1\"}", "refId": "A" } ], "title": "pfSense CPU Load (1m)", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "max": 100, "min": 0, "thresholds": { "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percent" } }, "gridPos": { "h": 6, "w": 6, "x": 6, "y": 4 }, "targets": [ { "expr": "100 - (memAvailReal{instance=\"10.0.56.1\"} / memTotalReal{instance=\"10.0.56.1\"} * 100)", "refId": "A" } ], "title": "pfSense Memory Used %", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "max": 100, "min": 0, "unit": "percent" } }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 4 }, "targets": [ { "expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=\"noc1\"}[5m])) * 100)", "legendFormat": "CPU %", "refId": "A" } ], "title": "noc1 CPU Usage", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "max": 100, "min": 0, "unit": "percent" } }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }, "targets": [ { "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100", "legendFormat": "{{instance}}", "refId": "A" } ], "title": "Node Memory Usage", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, 
"fieldConfig": { "defaults": { "max": 100, "min": 0, "thresholds": { "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percent" } }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }, "targets": [ { "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)", "legendFormat": "{{instance}}", "refId": "A" } ], "title": "Node Disk Usage %", "type": "bargauge" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "unit": "bps" } }, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 }, "targets": [ { "expr": "rate(node_network_receive_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8", "legendFormat": "{{instance}} {{device}} RX", "refId": "A" }, { "expr": "-rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8", "legendFormat": "{{instance}} {{device}} TX", "refId": "B" } ], "title": "Network Traffic", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 26 }, "targets": [ { "expr": "up", "format": "table", "instant": true, "refId": "A" } ], "title": "Prometheus Targets", "type": "table" } ], "refresh": "30s", "schemaVersion": 40, "tags": ["bluejay", "network"], "timezone": "browser", "title": "BlueJay Network Overview", "uid": "b83a122c-6eb5-4fae-a632-77cbf753ad05" } # ============================================================================= # ConfigMap: Grafana Dashboard — Operations # ============================================================================= --- apiVersion: v1 kind: ConfigMap metadata: name: grafana-dashboard-operations namespace: monitoring data: bluejay-operations.json: | { "annotations": { "list": [] }, "id": null, "panels": [ { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "title": 
"Infrastructure Overview", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "noValue": "0", "thresholds": { "steps": [ { "color": "green", "value": null } ] } } }, "gridPos": { "h": 4, "w": 8, "x": 0, "y": 1 }, "targets": [ { "expr": "count(up == 1)", "legendFormat": "Up", "refId": "A" }, { "expr": "count(up == 0)", "legendFormat": "Down", "refId": "B" } ], "title": "All Targets Up/Down", "type": "stat" }, { "datasource": { "type": "alexanderzobnin-zabbix-datasource", "uid": "bffjila3zkdfka" }, "gridPos": { "h": 8, "w": 16, "x": 8, "y": 1 }, "targets": [ { "application": { "filter": "" }, "group": { "filter": "/.*/" }, "host": { "filter": "/.*/" }, "queryType": 5, "refId": "A", "trigger": { "filter": "/.*/" } } ], "title": "Zabbix Active Problems", "type": "alexanderzobnin-zabbix-triggers-panel" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 }, "targets": [ { "expr": "node_load1{instance=\"noc1\"}", "legendFormat": "1m", "refId": "A" }, { "expr": "node_load5{instance=\"noc1\"}", "legendFormat": "5m", "refId": "B" }, { "expr": "node_load15{instance=\"noc1\"}", "legendFormat": "15m", "refId": "C" } ], "title": "noc1 Load Average", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }, "title": "Kubernetes & Services", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": {}, "overrides": [ { "matcher": { "id": "byName", "options": "Value" }, "properties": [ { "id": "mappings", "value": [ { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } ] } ] } ] }, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 }, "targets": [ { "expr": "up", "format": "table", "instant": true, "refId": "A" } ], "title": "K8s Services Uptime (Prometheus Targets)", "type": "table" }, { "collapsed": false, "gridPos": { 
"h": 1, "w": 24, "x": 0, "y": 20 }, "title": "Network & SNMP", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "unit": "bps" } }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 }, "targets": [ { "expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8", "legendFormat": "WAN In", "refId": "A" }, { "expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8", "legendFormat": "WAN Out", "refId": "B" } ], "title": "pfSense WAN Traffic", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "unit": "bps" } }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 }, "targets": [ { "expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8", "legendFormat": "{{ifAlias}} In", "refId": "A" }, { "expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8", "legendFormat": "{{ifAlias}} Out", "refId": "B" } ], "title": "pfSense LAN Traffic", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "max": 100, "min": 0, "unit": "percent" } }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }, "targets": [ { "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100", "legendFormat": "{{instance}}", "refId": "A" } ], "title": "All Nodes Memory", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "max": 100, "min": 0, "unit": "percent" } }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }, "targets": [ { "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)", "legendFormat": "{{instance}}", "refId": "A" } ], "title": "All Nodes Disk", "type": "timeseries" } ], "refresh": "1m", "schemaVersion": 40, "tags": ["bluejay", 
"operations", "zabbix"], "timezone": "browser", "title": "BlueJay Operations", "uid": "a781c4e4-c3fe-4ac7-be93-21363a41b97d" } # ============================================================================= # ConfigMap: Grafana Dashboard — Epson Printer # ============================================================================= --- apiVersion: v1 kind: ConfigMap metadata: name: grafana-dashboard-printer namespace: monitoring data: epson-ecotank-printer.json: | { "id": null, "panels": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 10 }, { "color": "yellow", "value": 20 }, { "color": "green", "value": 40 } ] }, "unit": "percent" }, "overrides": [ { "matcher": { "id": "byName", "options": "Black Ink Bottle" }, "properties": [ { "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Cyan Ink Bottle" }, "properties": [ { "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Magenta Ink Bottle" }, "properties": [ { "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Yellow Ink Bottle" }, "properties": [ { "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 10, "w": 12, "x": 0, "y": 0 }, "id": 1, "options": { "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"] }, "showThresholdLabels": false, "showThresholdMarkers": true }, "targets": [ { "expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}", "legendFormat": "{{prtMarkerSuppliesDescription}}", "refId": "A" } ], "title": "Ink Levels", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "custom": { "fillOpacity": 
20, "lineWidth": 2, "spanNulls": true }, "max": 100, "min": 0, "unit": "percent" }, "overrides": [ { "matcher": { "id": "byName", "options": "Black Ink Bottle" }, "properties": [ { "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Cyan Ink Bottle" }, "properties": [ { "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Magenta Ink Bottle" }, "properties": [ { "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Yellow Ink Bottle" }, "properties": [ { "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 10, "w": 12, "x": 12, "y": 0 }, "id": 2, "targets": [ { "expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}", "legendFormat": "{{prtMarkerSuppliesDescription}}", "refId": "A" } ], "title": "Ink Level History", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 10000 }, { "color": "red", "value": 50000 } ] }, "unit": "short" } }, "gridPos": { "h": 5, "w": 6, "x": 0, "y": 10 }, "id": 3, "options": { "colorMode": "background", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "value_and_name" }, "targets": [ { "expr": "prtMarkerLifeCount{job=\"snmp-printer\"}", "legendFormat": "Pages", "refId": "A" } ], "title": "Lifetime Page Count", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "mappings": [ { "options": { "1": { "text": "Online" } }, "type": "value" } ], "thresholds": { "steps": [ { "color": "blue", "value": null } ] } } }, "gridPos": { "h": 5, "w": 6, "x": 6, "y": 10 }, "id": 4, "options": { "colorMode": "background", "reduceOptions": { "calcs": ["lastNotNull"] 
}, "textMode": "name" }, "targets": [ { "expr": "prtGeneralPrinterName{job=\"snmp-printer\"}", "legendFormat": "{{prtGeneralPrinterName}}", "refId": "A" } ], "title": "Printer Model", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "thresholds": { "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "unit": "short" } }, "gridPos": { "h": 5, "w": 6, "x": 12, "y": 10 }, "id": 5, "options": { "colorMode": "background", "reduceOptions": { "calcs": ["lastNotNull"] } }, "targets": [ { "expr": "prtAlertCriticalEvents{job=\"snmp-printer\"}", "legendFormat": "Critical Alerts", "refId": "A" } ], "title": "Critical Events", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "thresholds": { "steps": [ { "color": "blue", "value": null } ] } } }, "gridPos": { "h": 5, "w": 6, "x": 18, "y": 10 }, "id": 6, "options": { "colorMode": "background", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "name" }, "targets": [ { "expr": "prtGeneralSerialNumber{job=\"snmp-printer\"}", "legendFormat": "{{prtGeneralSerialNumber}}", "refId": "A" } ], "title": "Serial Number", "type": "stat" } ], "refresh": "5m", "schemaVersion": 39, "tags": ["printer", "snmp", "bluejay"], "time": { "from": "now-24h", "to": "now" }, "timezone": "browser", "title": "Epson ET-3750 EcoTank Printer", "uid": "epson-ecotank" } # ============================================================================= # ConfigMap: Grafana Dashboard — Infrastructure Overview # ============================================================================= --- apiVersion: v1 kind: ConfigMap metadata: name: grafana-dashboard-infra-overview namespace: monitoring data: infra-overview.json: | { "id": null, "panels": [ { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 100, "title": "AI Stack", "type": "row" }, { "fieldConfig": { "defaults": { 
"mappings": [ { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } } }, "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, "id": 1, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"probe-ollama-local\"}", "legendFormat": "Status" } ], "title": "Ollama (Local)", "type": "stat" }, { "fieldConfig": { "defaults": { "mappings": [ { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } } }, "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, "id": 2, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"probe-ollama-edge1\"}", "legendFormat": "Status" } ], "title": "Ollama (Edge1)", "type": "stat" }, { "fieldConfig": { "defaults": { "mappings": [ { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } } }, "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, "id": 3, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"probe-agentzero-local\"}", "legendFormat": "Status" } ], "title": "Agent Zero (Local)", "type": "stat" }, { "fieldConfig": { "defaults": { "mappings": [ { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } } }, "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, "id": 4, "targets": [ { 
"datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"probe-agentzero-nuc\"}", "legendFormat": "Status" } ], "title": "Agent Zero (NUC)", "type": "stat" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, "id": 101, "title": "K8s Cluster", "type": "row" }, { "fieldConfig": { "defaults": { "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 30 }, { "color": "red", "value": 50 } ] } } }, "gridPos": { "h": 6, "w": 8, "x": 0, "y": 6 }, "id": 5, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(up{job=\"node-exporter\"} == 1)", "legendFormat": "Nodes Up" } ], "title": "Nodes Up (node-exporter)", "type": "stat" }, { "fieldConfig": { "defaults": { "custom": { "fillOpacity": 15, "lineWidth": 2 }, "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percent" } }, "gridPos": { "h": 6, "w": 8, "x": 8, "y": 6 }, "id": 6, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) by (instance) * 100)", "legendFormat": "{{ instance }}" } ], "title": "Node CPU Usage %", "type": "timeseries" }, { "fieldConfig": { "defaults": { "custom": { "fillOpacity": 15, "lineWidth": 2 }, "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percent" } }, "gridPos": { "h": 6, "w": 8, "x": 16, "y": 6 }, "id": 7, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100", "legendFormat": "{{ instance }}" } ], "title": "Node Memory Usage %", "type": "timeseries" }, { "collapsed": false, 
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 }, "id": 102, "title": "Network", "type": "row" }, { "fieldConfig": { "defaults": { "custom": { "fillOpacity": 10, "lineWidth": 2 }, "unit": "Bps" } }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 }, "id": 8, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "rate(ifHCInOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])", "legendFormat": "WAN In" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "rate(ifHCOutOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])", "legendFormat": "WAN Out" } ], "title": "pfSense WAN Bandwidth", "type": "timeseries" }, { "fieldConfig": { "defaults": { "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 }, "id": 9, "options": { "showHeader": true, "sortBy": [{ "displayName": "Value", "desc": false }] }, "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "up", "format": "table", "instant": true, "legendFormat": "" } ], "title": "Target Health (up)", "transformations": [ { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true }, "renameByName": { "job": "Job", "instance": "Instance", "Value": "Up" } } } ], "type": "table" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }, "id": 103, "title": "Services", "type": "row" }, { "gridPos": { "h": 4, "w": 24, "x": 0, "y": 22 }, "id": 10, "options": { "content": "### ArgoCD Applications\n\nView sync status and health of all 11 ArgoCD-managed applications:\n\n[Open ArgoCD Dashboard](https://argocd.iamworkin.lan)\n\n| App | Expected Status |\n|-----|----------------|\n| gitea | Healthy / Synced |\n| zabbix | Healthy / Synced |\n| guacamole | Healthy / Synced |\n| irc | Healthy / Synced |\n| mail | Healthy / Synced |\n| matrix | Healthy / Synced |\n| teamspeak | Healthy / Synced |\n| 
pki-web | Healthy / Synced |\n| intranet | Healthy / Synced |\n| telephony | Healthy / Synced |\n| infra-noc-services | Healthy / Synced |",
            "mode": "markdown"
          },
          "title": "ArgoCD App Status",
          "type": "text"
        },
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 },
          "id": 104,
          "title": "Alerting",
          "type": "row"
        },
        {
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "orange", "value": 1 },
                  { "color": "red", "value": 3 }
                ]
              }
            }
          },
          "gridPos": { "h": 6, "w": 24, "x": 0, "y": 27 },
          "id": 11,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "count(ALERTS{alertstate=\"firing\"}) or vector(0)",
              "legendFormat": "Firing Alerts"
            }
          ],
          "title": "Firing Alerts",
          "type": "stat"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "tags": ["infrastructure", "blue-jay", "overview"],
      "time": { "from": "now-1h", "to": "now" },
      "timezone": "browser",
      "title": "Infrastructure Overview",
      "uid": "infra-overview",
      "version": 1
    }
# =============================================================================
# ConfigMap: Grafana Datasource Provisioning
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasource-provisioning
  namespace: monitoring
data:
  datasource.yml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        # Pin the UID. The Prometheus panels in the provisioned dashboards
        # ("uid": "prometheus") and the provisioned alert queries
        # (datasourceUid: prometheus) reference the datasource by this exact
        # value; without it Grafana generates its own UID and those
        # references do not resolve.
        uid: prometheus
        access: proxy
        url: http://prometheus.monitoring.svc:9090
        isDefault: true
        editable: true
# =============================================================================
# ConfigMap: Grafana Alerting Provisioning (contact points + policies + rules)
# =============================================================================
# Makes alert rules declarative — survives pod rebuilds without API recreation
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting-provisioning
  namespace: monitoring
data:
  alerting.yml: |
    apiVersion: 1
    contactPoints:
      # Default contact point (webhook to the irc-notify service).
      # The name must stay quoted: unquoted, " #alerts" starts a YAML comment
      # and the name is silently truncated to "IRC".
      - orgId: 1
        name: "IRC #alerts"
        receivers:
          - uid: irc-alerts-webhook
            type: webhook
            settings:
              url: http://irc-notify.monitoring.svc:9119
              httpMethod: POST
            disableResolveMessage: false
      # Second contact point; same webhook URL, resolve messages disabled.
      - orgId: 1
        name: Thermal Printer
        receivers:
          - uid: thermal-print-001
            type: webhook
            settings:
              url: http://irc-notify.monitoring.svc:9119
              httpMethod: POST
            disableResolveMessage: true
    policies:
      # Root policy. `receiver` must equal the contact point name above
      # byte-for-byte, hence the same quoting.
      - orgId: 1
        receiver: "IRC #alerts"
        group_by: ['alertname']
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 1h
        routes:
          # Alerts labelled alert_channel=thermal_print are routed to the
          # Thermal Printer contact point; `continue: true` lets matching
          # proceed to any sibling routes defined after this one.
          - receiver: Thermal Printer
            matchers: ['alert_channel = thermal_print']
            group_wait: 1m
            group_interval: 10m
            repeat_interval: 4h
            continue: true
    groups:
      - orgId: 1
        name: AI Stack
        folder: AI Stack Alerts
        interval: 1m
        rules:
          # Each probe rule below uses the same three-step pipeline:
          #   A = instant Prometheus query, B = reduce(A, last),
          #   C = threshold on B (the alert condition).
          - uid: ollama-down-local
            title: Ollama DOWN (Local)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Ollama DOWN on workstation (R9700)
              description: Agent Zero LOCAL cannot reach Ollama. FAISS memory will fail.
              runbook: "1. Check PC power/wake 2. ssh stoltz@10.0.58.100 3. systemctl status ollama 4. journalctl -u ollama --since 5min"
            labels:
              severity: warning
              service: ollama
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model:
                  expr: 'probe_success{job="probe-ollama-local"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model:
                  type: reduce
                  expression: A
                  reducer: last
                  refId: B
              # Condition: true when the reduced probe value is below 1.
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model:
                  type: threshold
                  expression: B
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                  refId: C
          - uid: ollama-down-edge1
            title: Ollama DOWN (Edge1)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Ollama DOWN on edge1 Pi 5
              description: Agent Zero NUC cannot reach Ollama.
              runbook: "1. ssh stoltz@10.0.57.17 2. systemctl status ollama 3. journalctl -u ollama --since 5min 4.
cat /sys/class/thermal/thermal_zone0/temp"
            labels:
              severity: warning
              service: ollama
            data:
              # A: instant probe_success sample for the edge1 Ollama probe job.
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model:
                  expr: 'probe_success{job="probe-ollama-edge1"}'
                  instant: true
                  refId: A
              # B: collapse A to its last value.
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model:
                  type: reduce
                  expression: A
                  reducer: last
                  refId: B
              # C (condition): B < 1.
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model:
                  type: threshold
                  expression: B
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                  refId: C

          # Agent Zero (local): condition is probe_success < 1 for the
          # probe-agentzero-local job, held for 2m; missing data alerts too.
          - uid: a0-down-local
            title: Agent Zero DOWN (Local)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero LOCAL DOWN
              description: K3s web UI unreachable.
              runbook: "1. ssh stoltz@10.0.58.100 2. docker ps | grep agent-zero 3. docker logs agent-zero --tail 20 4. Check Ollama first (A0 crashes if Ollama down)"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model:
                  expr: 'probe_success{job="probe-agentzero-local"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model:
                  type: reduce
                  expression: A
                  reducer: last
                  refId: B
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model:
                  type: threshold
                  expression: B
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                  refId: C

          # Agent Zero (NUC): same pipeline against the probe-agentzero-nuc job.
          - uid: a0-down-nuc
            title: Agent Zero DOWN (NUC)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero NUC DOWN
              description: RKE2 web UI unreachable.
              runbook: "1. ssh root@10.0.56.200 via noc1 2. kubectl get pods -n agent-zero 3. kubectl logs -n agent-zero deploy/agent-zero --tail 20"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model:
                  expr: 'probe_success{job="probe-agentzero-nuc"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model:
                  type: reduce
                  expression: A
                  reducer: last
                  refId: B
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model:
                  type: threshold
                  expression: B
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                  refId: C

          # Print.Web Ollama runner: condition is the per-model maximum of the
          # remaining keep-alive exceeding 600 (seconds, per the metric name),
          # held for 2m. The alert_channel label matches the thermal_print
          # route in the notification policies.
          - uid: print-ollama-runner-long-keepalive
            title: Print.Web Ollama runner keep-alive >10m
            condition: C
            for: 2m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: Print.Web Ollama runner held too long
              description: "A Print.Web Ollama runner has more than 10 minutes of keep-alive remaining. Check active AI requests before the Pi 5 Ollama lane thrashes."
              runbook: "1. Open https://print.iamworkin.lan/admin 2. Check Ollama Fleet model + remaining keep-alive 3. Query Prometheus: ai_ollama_runner_keepalive_remaining_seconds{job=\"printweb-otel\"} 4. Stop duplicate model callers before restarting Ollama."
            labels:
              severity: warning
              service: print-web
              alert_channel: thermal_print
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model:
                  expr: 'max by(model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model:
                  type: reduce
                  expression: A
                  reducer: last
                  refId: B
              # C (condition): B > 600.
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model:
                  type: threshold
                  expression: B
                  conditions:
                    - evaluator:
                        params: [600]
                        type: gt
                  refId: C

      # Rule group for CI runner health, evaluated every minute.
      - orgId: 1
        name: CI Runners
        folder: CI Alerts
        interval: 1m
        rules:
          - uid: linux-runner-offline
            title: LinuxRunnerOffline
            condition: C
            for: 5m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Linux CI runner offline: {{ $labels.deployment }}"
              description: "A github-runner namespace Deployment has 0 ready replicas for more than 5 minutes. CI jobs targeting that repo will queue until the runner pod restarts and re-registers."
              runbook: "1. kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }} 2. kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50 3. Verify PAT repo access if registration returns 404 4.
Verify no RWO PVC is shared by scaled runners" labels: severity: warning service: github-runner alert_channel: irc team: ci data: - refId: A relativeTimeRange: {from: 300, to: 0} datasourceUid: prometheus model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == bool 0', instant: true, refId: A} - refId: B relativeTimeRange: {from: 300, to: 0} datasourceUid: __expr__ model: {type: reduce, expression: A, reducer: last, refId: B} - refId: C relativeTimeRange: {from: 300, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C} - orgId: 1 name: Infrastructure folder: AI Stack Alerts interval: 1m rules: - uid: node-down title: Node DOWN condition: C for: 2m noDataState: Alerting execErrState: OK annotations: summary: Node down description: Node exporter unreachable for 2 minutes. Host may be down or network issue. runbook: "1. Ping host IP 2. SSH via noc1 jumpbox (root@10.0.56.10) 3. systemctl status node_exporter 4. Check network/VLAN 5.
Physical power check if unreachable" labels: severity: critical service: infrastructure alert_channel: thermal_print data: - refId: A relativeTimeRange: {from: 120, to: 0} datasourceUid: prometheus model: {expr: 'up{job=~"node-exporter|rke2-nodes|edge-nodes"}', instant: true, refId: A} - refId: B relativeTimeRange: {from: 120, to: 0} datasourceUid: __expr__ model: {type: reduce, expression: A, reducer: last, refId: B} - refId: C relativeTimeRange: {from: 120, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C} - uid: macmini-runner-offline title: MacMiniRunnerOffline condition: C for: 10m noDataState: Alerting execErrState: OK annotations: summary: Mac mini GitHub runner offline description: "One or more macmini-* GitHub Actions runners have not reported online for more than 10 minutes. LaunchDaemons survive reboot and do not require the bluejay GUI session." runbook: "1. ssh fcadmin@macmini.iamworkin.lan 2. launchctl print system/io.flowercore.github-runner- 3. Check /Users/fcadmin/Library/Logs/github-runners//stderr.log 4. Re-register the repo runner if .runner is missing" labels: severity: warning service: github-runner data: - refId: A relativeTimeRange: {from: 600, to: 0} datasourceUid: prometheus model: {expr: 'min(flowercore_github_runner_online{runner=~"macmini-.*"} or vector(0))', instant: true, refId: A} - refId: B relativeTimeRange: {from: 600, to: 0} datasourceUid: __expr__ model: {type: reduce, expression: A, reducer: last, refId: B} - refId: C relativeTimeRange: {from: 600, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C} - uid: high-cpu title: High CPU (>85%) condition: C for: 10m noDataState: NoData execErrState: OK annotations: summary: High CPU description: CPU above 85% for 10 minutes. Performance degradation likely. runbook: "1. SSH to host 2. top -b -n1 | head -20 3. 
Check for runaway process 4. kubectl top pods (if K8s node)" labels: severity: warning service: infrastructure data: - refId: A relativeTimeRange: {from: 600, to: 0} datasourceUid: prometheus model: {expr: 'instance:node_cpu_usage:avg5m', instant: true, refId: A} - refId: B relativeTimeRange: {from: 600, to: 0} datasourceUid: __expr__ model: {type: reduce, expression: A, reducer: last, refId: B} - refId: C relativeTimeRange: {from: 600, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C} - uid: high-memory title: High Memory (>90%) condition: C for: 5m noDataState: NoData execErrState: OK annotations: summary: High memory usage description: Memory above 90% for 5 minutes. OOM kills imminent. runbook: "1. SSH to host 2. free -h 3. ps aux --sort=-%mem | head -n 10 4. Check for memory leak (growing RSS)" labels: severity: warning service: infrastructure data: - refId: A relativeTimeRange: {from: 600, to: 0} datasourceUid: prometheus model: {expr: 'instance:node_memory_usage:percent', instant: true, refId: A} - refId: B relativeTimeRange: {from: 600, to: 0} datasourceUid: __expr__ model: {type: reduce, expression: A, reducer: last, refId: B} - refId: C relativeTimeRange: {from: 600, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [90], type: gt}}], refId: C} - uid: disk-low title: Disk Space Low (>85%) condition: C for: 10m noDataState: NoData execErrState: OK annotations: summary: Disk usage high description: Root disk above 85% for 10 minutes. Service disruption if full. runbook: "1. SSH to host 2. df -h / 3. du -sh /* | sort -rh | head 4. journalctl --vacuum-size=100M 5.
podman system prune" labels: severity: warning service: infrastructure data: - refId: A relativeTimeRange: {from: 600, to: 0} datasourceUid: prometheus model: {expr: 'instance:node_disk_usage:percent', instant: true, refId: A} - refId: B relativeTimeRange: {from: 600, to: 0} datasourceUid: __expr__ model: {type: reduce, expression: A, reducer: last, refId: B} - refId: C relativeTimeRange: {from: 600, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C} - orgId: 1 name: RemoteDesktop folder: AI Stack Alerts interval: 1m rules: - uid: remotedesktop-web-down title: RemoteDesktop Web DOWN condition: C for: 3m noDataState: Alerting execErrState: OK annotations: summary: FlowerCore RemoteDesktop /health probe failing description: "https://desktop.iamworkin.lan/health has failed for 3 minutes. Catalog + session launch surface offline." runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remotedesktop-web 2. kubectl -n fc-desktop logs deploy/remotedesktop-web --tail=50 3. Check Traefik IngressRoute + step-ca cert 4. Rollout restart if pod is stuck" labels: severity: warning service: remotedesktop data: - refId: A relativeTimeRange: {from: 180, to: 0} datasourceUid: prometheus model: {expr: 'probe_success{job="probe-remotedesktop"}', instant: true, refId: A} - refId: B relativeTimeRange: {from: 180, to: 0} datasourceUid: __expr__ model: {type: reduce, expression: A, reducer: last, refId: B} - refId: C relativeTimeRange: {from: 180, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C} - uid: remotedesktop-metrics-stale title: RemoteDesktop metrics stale condition: C for: 10m noDataState: Alerting execErrState: OK annotations: summary: RemoteDesktop /metrics returning no series description: "No fc_desktop_session_events_total series for 10 minutes. 
Either the Prometheus scrape is misconfigured or the web deployment stopped exporting metrics. Cross-checked by Zabbix template's identical 10m no-data trigger." runbook: "1. curl -sk https://desktop.iamworkin.lan/metrics | head 2. kubectl -n monitoring exec deploy/prometheus -- wget -qO- localhost:9090/api/v1/targets?scrapePool=fc-remotedesktop 3. Check monitoring-netpol egress allows to fc-desktop:8080" labels: severity: warning service: remotedesktop data: - refId: A relativeTimeRange: {from: 600, to: 0} datasourceUid: prometheus model: {expr: 'count(fc_desktop_session_events_total) or vector(0)', instant: true, refId: A} - refId: B relativeTimeRange: {from: 600, to: 0} datasourceUid: __expr__ model: {type: reduce, expression: A, reducer: last, refId: B} - refId: C relativeTimeRange: {from: 600, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C} - uid: remotedesktop-pool-depleted title: RemoteDesktop pool depleted condition: C for: 5m noDataState: OK execErrState: OK annotations: summary: RemoteDesktop warm pool depleted for 5m description: "A RemoteDesktop warm pool has fc_desktop_pool_depleted=1 for 5 minutes. New launches will cold-start. Check pod scheduling, image pull, node capacity." runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop --sort-by=.status.startTime 2. kubectl -n fc-desktop describe desktoppool 3. 
Verify localhost/fc-desktop:* images imported on all 3 RKE2 nodes" labels: severity: warning service: remotedesktop data: - refId: A relativeTimeRange: {from: 300, to: 0} datasourceUid: prometheus model: {expr: 'max(fc_desktop_pool_depleted)', instant: true, refId: A} - refId: B relativeTimeRange: {from: 300, to: 0} datasourceUid: __expr__ model: {type: reduce, expression: A, reducer: last, refId: B} - refId: C relativeTimeRange: {from: 300, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.5], type: gt}}], refId: C} - uid: remotedesktop-pool-deficit-sustained title: RemoteDesktop pool below desired condition: C for: 10m noDataState: OK execErrState: OK annotations: summary: RemoteDesktop pool sustained deficit description: "A pool has fc_desktop_pool_deficit>0 for 10 minutes. Operator is reconciling but can't reach desired size — likely image pull, NFS affinity, or claim-init issue." runbook: "1. kubectl -n fc-desktop get pods -l flowercore.io/pool= 2. kubectl logs -n fc-desktop deploy/remotedesktop-operator 3. Check claim-init hook env on template" labels: severity: info service: remotedesktop data: - refId: A relativeTimeRange: {from: 600, to: 0} datasourceUid: prometheus model: {expr: 'max(fc_desktop_pool_deficit)', instant: true, refId: A} - refId: B relativeTimeRange: {from: 600, to: 0} datasourceUid: __expr__ model: {type: reduce, expression: A, reducer: last, refId: B} - refId: C relativeTimeRange: {from: 600, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C} - uid: remotedesktop-session-churn-spike title: RemoteDesktop launch rate spike condition: C for: 5m noDataState: OK execErrState: OK annotations: summary: RemoteDesktop launch rate exceeds 20/min description: "Launch events >20/min for 5 minutes. Could be a user-facing feature launch, pooled template thrashing, or runaway automation loop." runbook: "1. 
kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop -o wide | wc -l 2. curl -sk https://desktop.iamworkin.lan/api/sessions/active 3. Check operator logs for reconcile loops" labels: severity: info service: remotedesktop data: - refId: A relativeTimeRange: {from: 300, to: 0} datasourceUid: prometheus model: {expr: 'sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60', instant: true, refId: A} - refId: B relativeTimeRange: {from: 300, to: 0} datasourceUid: __expr__ model: {type: reduce, expression: A, reducer: last, refId: B} - refId: C relativeTimeRange: {from: 300, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [20], type: gt}}], refId: C} - uid: remotedesktop-tls-expiry title: RemoteDesktop TLS cert expiring condition: C for: 6h noDataState: OK execErrState: OK annotations: summary: desktop.iamworkin.lan cert <2d to expiry description: "The desktop.iamworkin.lan certificate is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, FlowerCore.DNS preflight for dnsNames." runbook: "1. kubectl -n fc-desktop get certificate remotedesktop-web-tls 2. kubectl -n cert-manager logs deploy/cert-manager --tail=50 3. 
Verify pfSense DNS override for desktop.iamworkin.lan" labels: severity: critical service: remotedesktop data: - refId: A relativeTimeRange: {from: 21600, to: 0} datasourceUid: prometheus model: {expr: '(probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time()) / 86400', instant: true, refId: A} - refId: B relativeTimeRange: {from: 21600, to: 0} datasourceUid: __expr__ model: {type: reduce, expression: A, reducer: last, refId: B} - refId: C relativeTimeRange: {from: 21600, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [2], type: lt}}], refId: C}
# =============================================================================
# Deployment: Grafana
# =============================================================================
# Single-replica Grafana. Dashboards, datasources and alerting rules are all
# provisioned read-only from ConfigMaps; only /var/lib/grafana is persistent.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        fsGroup: 472  # grafana group
        runAsUser: 472
        runAsGroup: 472
      containers:
        - name: grafana
          image: docker.io/grafana/grafana:latest
          env:
            # Admin credentials are read from the Secret "grafana-credentials"
            # (created by the 1Password Operator from a OnePasswordItem).
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: username
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: password
            - name: GF_SERVER_ROOT_URL
              value: "https://grafana.iamworkin.lan"
            - name: GF_SERVER_SERVE_FROM_SUB_PATH
              value: "false"
            # Zabbix plugin: install manually after first boot if needed.
            # GF_INSTALL_PLUGINS requires internet on startup — breaks with
            # restrictive NetworkPolicy.
            #   kubectl exec -n monitoring deploy/grafana -- grafana cli plugins install alexanderzobnin-zabbix-app
          ports:
            - containerPort: 3000
              name: http
          volumeMounts:
            - name: data
              mountPath: /var/lib/grafana
            - name: dashboard-provider
              mountPath: /etc/grafana/provisioning/dashboards
              readOnly: true
            - name: dashboards-ai-stack
              mountPath: /var/lib/grafana/dashboards/ai-stack
              readOnly: true
            - name: dashboards-edge-nodes
              mountPath: /var/lib/grafana/dashboards/edge-nodes
              readOnly: true
            - name: dashboards-network
              mountPath: /var/lib/grafana/dashboards/network
              readOnly: true
            - name: dashboards-operations
              mountPath: /var/lib/grafana/dashboards/operations
              readOnly: true
            - name: dashboards-printer
              mountPath: /var/lib/grafana/dashboards/printer
              readOnly: true
            - name: dashboards-infra-overview
              mountPath: /var/lib/grafana/dashboards/infra-overview
              readOnly: true
            - name: dashboards-remotedesktop
              mountPath: /var/lib/grafana/dashboards/remotedesktop
              readOnly: true
            - name: datasource-provisioning
              mountPath: /etc/grafana/provisioning/datasources
              readOnly: true
            - name: alerting-provisioning
              mountPath: /etc/grafana/provisioning/alerting
              readOnly: true
          resources:
            requests:
              cpu: 100m
              memory: 128Mi
            limits:
              cpu: 500m
              memory: 512Mi
          livenessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 10
            periodSeconds: 10
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: grafana-data
        - name: dashboard-provider
          configMap:
            name: grafana-dashboard-provider
        - name: dashboards-ai-stack
          configMap:
            name: grafana-dashboards
        - name: dashboards-edge-nodes
          configMap:
            name: grafana-dashboard-edge-nodes
        - name: dashboards-network
          configMap:
            name: grafana-dashboard-network-overview
        - name: dashboards-operations
          configMap:
            name: grafana-dashboard-operations
        - name: dashboards-printer
          configMap:
            name: grafana-dashboard-printer
        - name: dashboards-infra-overview
          configMap:
            name: grafana-dashboard-infra-overview
        - name: dashboards-remotedesktop
          configMap:
            name: grafana-dashboard-remotedesktop
        - name: datasource-provisioning
          configMap:
            name: grafana-datasource-provisioning
        - name: alerting-provisioning
          configMap:
            name: grafana-alerting-provisioning
# =============================================================================
# Deployment: Blackbox Exporter
# =============================================================================
# Probe exporter; its configuration file is mounted from the blackbox-config
# ConfigMap as a single file via subPath.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: blackbox-exporter
  namespace: monitoring
  labels:
    app: blackbox-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      containers:
        - name: blackbox-exporter
          image: quay.io/prometheus/blackbox-exporter:latest
          args:
            - "--config.file=/config/blackbox.yml"
          ports:
            - containerPort: 9115
              name: http
          volumeMounts:
            - name: config
              mountPath: /config/blackbox.yml
              subPath: blackbox.yml
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 32Mi
            limits:
              cpu: 200m
              memory: 128Mi
          livenessProbe:
            httpGet:
              path: /
              port: 9115
            initialDelaySeconds: 5
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /
              port: 9115
            initialDelaySeconds: 3
            periodSeconds: 10
      volumes:
        - name: config
          configMap:
            name: blackbox-config
# =============================================================================
# PVC: SNMP Exporter Config (100Mi, Longhorn)
# =============================================================================
# The custom snmp.yml (~2MB) exceeds the 1MB ConfigMap limit, so the config
# file lives on this PVC instead. To load a custom config:
#   kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
# Then restart the pod to pick up the new config.
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: snmp-config
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      storage: 100Mi
# =============================================================================
# Deployment: SNMP Exporter
# =============================================================================
# Uses a PVC-mounted config at /config/snmp.yml. An init container copies the
# default config from the image if the PVC is empty (first deploy).
# To load the custom noc1 snmp.yml (~2MB):
#   kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
# Then restart: kubectl rollout restart deploy/snmp-exporter -n monitoring
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: snmp-exporter
  namespace: monitoring
  labels:
    app: snmp-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: snmp-exporter
  template:
    metadata:
      labels:
        app: snmp-exporter
    spec:
      initContainers:
        # Seeds the PVC with the image's default snmp.yml, but only when no
        # /config/snmp.yml exists yet (first deploy).
        - name: init-config
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              if [ ! -f /config/snmp.yml ]; then
                echo "No custom config found, copying default from image..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default snmp.yml copied to PVC."
              else
                echo "Custom snmp.yml already exists on PVC, skipping copy."
              fi
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      containers:
        - name: snmp-exporter
          image: docker.io/prom/snmp-exporter:latest
          args:
            - "--config.file=/config/snmp.yml"
          ports:
            - containerPort: 9116
              name: http
          volumeMounts:
            # The exporter itself only reads the config.
            - name: snmp-config
              mountPath: /config
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          livenessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 5
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 3
            periodSeconds: 10
      volumes:
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config
# =============================================================================
# Deployment: IRC Notify (alert relay)
# =============================================================================
# Runs notify.py (mounted from the irc-notify-script ConfigMap) on a stock
# Python image; health is checked with a TCP probe on :9119.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: irc-notify
  namespace: monitoring
  labels:
    app: irc-notify
spec:
  replicas: 1
  selector:
    matchLabels:
      app: irc-notify
  template:
    metadata:
      labels:
        app: irc-notify
    spec:
      containers:
        - name: irc-notify
          image: docker.io/library/python:3.12-slim
          command: ["python3", "/app/notify.py"]
          ports:
            - containerPort: 9119
              name: http
          volumeMounts:
            - name: script
              mountPath: /app/notify.py
              subPath: notify.py
              readOnly: true
          resources:
            requests:
              cpu: 25m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
          livenessProbe:
            tcpSocket:
              port: 9119
            initialDelaySeconds: 5
            periodSeconds: 30
          readinessProbe:
            tcpSocket:
              port: 9119
            initialDelaySeconds: 3
            periodSeconds: 10
      volumes:
        - name: script
          configMap:
            name: irc-notify-script
# =============================================================================
# DaemonSet: Node Exporter (runs on every RKE2 node)
# =============================================================================
# Port 9101 avoids conflict with host-level node-exporters already on :9100.
# The rke2-nodes Prometheus job scrapes the host instances on :9100; this
# DaemonSet provides K8s service-discovery-based scraping on :9101.
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      hostPID: true
      hostNetwork: true
      # A bare "Exists" toleration matches every taint.
      tolerations:
        - operator: Exists
      securityContext:
        runAsNonRoot: false
        runAsUser: 0
      containers:
        - name: node-exporter
          image: docker.io/prom/node-exporter:latest
          args:
            - "--path.rootfs=/host"
            - "--path.sysfs=/host/sys"
            - "--path.procfs=/host/proc"
            - "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/containers|run/containers|var/lib/rancher|var/lib/longhorn)($|/)"
            - "--collector.netclass.ignored-devices=^(veth|cali|flannel|cni).*$"
            - "--no-collector.btrfs"
            - "--web.listen-address=:9101"
          ports:
            - containerPort: 9101
              hostPort: 9101
              name: metrics
          securityContext:
            privileged: true
            readOnlyRootFilesystem: true
          volumeMounts:
            # Host filesystem views, all mounted read-only under /host.
            - name: rootfs
              mountPath: /host
              readOnly: true
              mountPropagation: HostToContainer
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 32Mi
            limits:
              cpu: 200m
              memory: 128Mi
      volumes:
        - name: rootfs
          hostPath:
            path: /
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
# =============================================================================
# Service: Prometheus (ClusterIP :9090)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  type: ClusterIP
  ports:
    - port: 9090
      targetPort: 9090
      protocol: TCP
      name: http
  selector:
    app: prometheus
# =============================================================================
# Service: Grafana (ClusterIP :3000)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  type: ClusterIP
  ports:
    - port: 3000
      targetPort: 3000
      protocol: TCP
      name: http
  selector:
    app: grafana
# =============================================================================
# Service: Blackbox Exporter (ClusterIP :9115)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: blackbox-exporter
  namespace: monitoring
  labels:
    app: blackbox-exporter
spec:
  type: ClusterIP
  ports:
    - port: 9115
      targetPort: 9115
      protocol: TCP
      name: http
  selector:
    app: blackbox-exporter
# =============================================================================
# Service: SNMP Exporter (ClusterIP :9116)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: snmp-exporter
  namespace: monitoring
  labels:
    app: snmp-exporter
spec:
  type: ClusterIP
  ports:
    - port: 9116
      targetPort: 9116
      protocol: TCP
      name: http
  selector:
    app: snmp-exporter
# =============================================================================
# Service: Node Exporter (headless for Prometheus SD)
#
# =============================================================================
# Headless (clusterIP: None) Service in front of the node-exporter DaemonSet.
---
apiVersion: v1
kind: Service
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - port: 9101
      targetPort: 9101
      protocol: TCP
      name: metrics
  selector:
    app: node-exporter
# =============================================================================
# Service: IRC Notify (ClusterIP :9119)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: irc-notify
  namespace: monitoring
  labels:
    app: irc-notify
spec:
  type: ClusterIP
  ports:
    - port: 9119
      targetPort: 9119
      protocol: TCP
      name: http
  selector:
    app: irc-notify
# =============================================================================
# TLS Certificates (cert-manager + step-ca ACME)
# =============================================================================
# One Certificate per public hostname, both issued by the step-ca-acme
# ClusterIssuer and stored in a Secret named after the Certificate.
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: grafana-tls
  namespace: monitoring
spec:
  secretName: grafana-tls
  issuerRef:
    name: step-ca-acme
    kind: ClusterIssuer
  dnsNames:
    - grafana.iamworkin.lan
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: prometheus-tls
  namespace: monitoring
spec:
  secretName: prometheus-tls
  issuerRef:
    name: step-ca-acme
    kind: ClusterIssuer
  dnsNames:
    - prometheus.iamworkin.lan
# =============================================================================
# Traefik IngressRoute: Grafana
# =============================================================================
# Routes grafana.iamworkin.lan on the websecure entry point to the grafana
# Service, terminating TLS with the grafana-tls Secret.
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: grafana
  namespace: monitoring
spec:
  entryPoints:
    - websecure
  routes:
    - kind: Rule
      match: Host(`grafana.iamworkin.lan`)
      services:
        - name: grafana
          port: 3000
  tls:
    secretName: grafana-tls
# =============================================================================
# Traefik IngressRoute: Prometheus
#
# =============================================================================
# Routes prometheus.iamworkin.lan on the websecure entry point to the
# prometheus Service, terminating TLS with the prometheus-tls Secret.
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: prometheus
  namespace: monitoring
spec:
  entryPoints:
    - websecure
  routes:
    - kind: Rule
      match: Host(`prometheus.iamworkin.lan`)
      services:
        - name: prometheus
          port: 9090
  tls:
    secretName: prometheus-tls
# =============================================================================
# NetworkPolicy: monitoring namespace
# =============================================================================
# Applies to every pod in the namespace (empty podSelector) and restricts both
# directions; anything not listed below is denied.
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: monitoring-netpol
  namespace: monitoring
spec:
  podSelector: {}
  policyTypes:
    - Ingress
    - Egress
  ingress:
    # Allow from Traefik (IngressRoutes AND ACME solver pods)
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
    # Allow intra-namespace (prometheus→exporters, grafana→prometheus, grafana→irc-notify)
    - from:
        - podSelector: {}
    # Allow from cert-manager (ACME HTTP-01 self-check)
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: cert-manager
  egress:
    # DNS
    - to:
        - namespaceSelector: {}
      ports:
        - port: 53
          protocol: UDP
        - port: 53
          protocol: TCP
    # MGMT VLAN (noc1, pfSense, switches, SNMP, node-exporter)
    - to:
        - ipBlock:
            cidr: 10.0.56.0/24
    # PROD VLAN (edge nodes)
    - to:
        - ipBlock:
            cidr: 10.0.57.0/24
    # HOME VLAN (workstation, printer, NAS)
    - to:
        - ipBlock:
            cidr: 10.0.58.0/24
    # Intra-namespace
    - to:
        - podSelector: {}
    # Blackbox probes to other namespaces (agent-zero, etc)
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: agent-zero
      ports:
        - port: 80
          protocol: TCP
    # FlowerCore.RemoteDesktop /metrics scrape via the fc-desktop
    # ClusterIP Service (remotedesktop-web:8080). Also covers the
    # Traefik VIP hairpin path since after kube-proxy DNAT, the egress
    # destination is the backend pod IP on the service port (see
    # feedback_netpol_dnat_backend_port).
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: fc-desktop
      ports:
        - port: 8080
          protocol: TCP
    # Traefik backend ports — needed for in-cluster egress to public
    # iamworkin.lan hostnames that CoreDNS wildcard resolves to the
    # LoadBalancer VIP. Post-DNAT destination is a Traefik pod on 8080/8443.
    - to:
        # namespaceSelector and podSelector sit in the same peer entry, so
        # both must match (Traefik pods inside traefik-system only).
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
          podSelector:
            matchLabels:
              app.kubernetes.io/name: traefik
      ports:
        - port: 8080
          protocol: TCP
        - port: 8443
          protocol: TCP
    # Traefik /metrics endpoint (port 9100) — separate from the data-path
    # ports above. Required for the in-cluster `traefik` scrape job.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
      ports:
        - port: 9100
          protocol: TCP
    # kube-state-metrics — required for kubernetes-state alert group.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: kube-system
      ports:
        - port: 8080
          protocol: TCP
    # cert-manager metrics — required for CertManagerCertificate* alerts.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: cert-manager
      ports:
        - port: 9402
          protocol: TCP
    # Longhorn manager metrics — required for Longhorn* alerts.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: longhorn-system
      ports:
        - port: 9500
          protocol: TCP
    # IRC (irc-notify → UnrealIRCd in irc namespace via K8s DNS)
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: irc
      ports:
        - port: 6667
          protocol: TCP
        - port: 6697
          protocol: TCP
    # Step-CA ACME (cert renewal)
    - to:
        - ipBlock:
            cidr: 10.0.56.10/32
      ports:
        - port: 9443
          protocol: TCP
    # Internet (optional: Grafana plugin install, ACME)
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.0.0.0/8
              - 172.16.0.0/12
              - 192.168.0.0/16
# =============================================================================
# Job: SNMP Config Loader (ArgoCD PostSync hook)
# =============================================================================
# Runs as an ArgoCD PostSync hook to populate the SNMP config PVC.
# Attempts to download custom snmp.yml from noc1. The download is staged in a
# temporary file and only moved over /config/snmp.yml when curl succeeds and
# the result is non-empty, so a failed download never deletes or truncates a
# config that is already on the PVC (for example one loaded with kubectl cp).
# The default config bundled in the snmp-exporter image is copied only when
# the PVC holds no usable snmp.yml at all.
#
# NOTE(review): the snmp-config PVC is ReadWriteOnce and is also mounted by
# the snmp-exporter Deployment — confirm this Job's pod can attach it while
# that pod is running.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: snmp-config-loader
  namespace: monitoring
  annotations:
    argocd.argoproj.io/hook: PostSync
    argocd.argoproj.io/hook-delete-policy: HookSucceeded
spec:
  backoffLimit: 0
  template:
    metadata:
      labels:
        app: snmp-config-loader
    spec:
      restartPolicy: Never
      initContainers:
        # Try to download custom snmp.yml from noc1. Always exits 0: a failed
        # download is not an error, it just leaves the PVC as it was.
        - name: download-config
          image: docker.io/curlimages/curl:latest
          command:
            - sh
            - -c
            - |
              staged=/config/.snmp.yml.download
              echo "Attempting to download custom snmp.yml from noc1..."
              if curl -sf --connect-timeout 10 --max-time 30 \
                   http://10.0.56.10:9116/config -o "$staged" 2>/dev/null \
                 && [ -s "$staged" ] \
                 && mv -f "$staged" /config/snmp.yml; then
                echo "Custom snmp.yml downloaded from noc1 successfully."
              else
                echo "Download failed or empty, existing config (if any) left untouched."
                rm -f "$staged"
              fi
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      containers:
        # Copy the default config from the image only when the PVC has no
        # non-empty snmp.yml (neither downloaded now nor present from before).
        - name: fallback-default
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              if [ -f /config/snmp.yml ] && [ -s /config/snmp.yml ]; then
                echo "Custom config already present, nothing to do."
              else
                echo "Copying default snmp.yml from image to PVC..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default config copied."
              fi
              echo "SNMP config loader complete."
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      volumes:
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config