diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml
new file mode 100644
index 0000000..335f583
--- /dev/null
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -0,0 +1,3788 @@
+# =============================================================================
+# NOC Monitoring Stack — K8s Migration Target
+# =============================================================================
+# Migrates the noc1 Podman monitoring pod to RKE2 K8s.
+# Source: noc1 (10.0.56.10) /opt/monitoring/
+#
+# Components:
+#   - Prometheus (metrics, alerting)
+#   - Grafana (dashboards)
+#   - Blackbox Exporter (HTTP probes)
+#   - SNMP Exporter (network device metrics)
+#   - Node Exporter (host metrics, DaemonSet)
+#   - IRC Notify (alert relay to UnrealIRCd)
+#
+# Note: the SNMP exporter config (snmp.yml) is ~2MB, which exceeds the 1MB
+# size limit of a single ConfigMap, so it is not embedded in this manifest.
+# It is kept in a separate file (snmp-config.yaml) and must be delivered
+# separately: either split across multiple ConfigMaps or fetched from Gitea
+# by an init container.
+# =============================================================================
+
+---
+# Namespace that every resource in this manifest is deployed into.
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: "monitoring"
+
+# =============================================================================
+# ConfigMap: Prometheus Configuration
+# =============================================================================
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-config
+  namespace: monitoring
+data:
+  prometheus.yml: |
+    global:
+      scrape_interval: 30s
+      evaluation_interval: 30s
+
+    rule_files:
+      - /etc/prometheus/alerts.yml
+      - /etc/prometheus/recording-rules.yml
+
+    scrape_configs:
+      # noc1 host metrics (external to cluster)
+      - job_name: "node-exporter"
+        static_configs:
+          - targets: ["10.0.56.10:9100"]
+            labels:
+              instance: "noc1"
+              vlan: "mgmt"
+
+      # RKE2 bare-metal cluster (openSUSE Leap 16 on NUCs)
+      - job_name: "rke2-nodes"
+        scrape_timeout: 15s
+        static_configs:
+          - targets: ["10.0.56.11:9100"]
+            labels:
+              instance: "rke2-server"
+              vlan: "mgmt"
+              cluster: "rke2"
+              role: "server"
+          - targets: ["10.0.56.12:9100"]
+            labels:
+              instance: "rke2-agent1"
+              vlan: "mgmt"
+              cluster: "rke2"
+              role: "agent"
+          - targets: ["10.0.56.13:9100"]
+            labels:
+              instance: "rke2-agent2"
+              vlan: "mgmt"
+              cluster: "rke2"
+              role: "agent"
+
+      # In-cluster node-exporter DaemonSet
+      - job_name: "k8s-node-exporter"
+        kubernetes_sd_configs:
+          - role: endpoints
+            namespaces:
+              names: ["monitoring"]
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_endpoints_name]
+            action: keep
+            regex: node-exporter
+          - source_labels: [__meta_kubernetes_endpoint_node_name]
+            target_label: instance
+
+      # pfSense SNMP via snmp-exporter
+      - job_name: "snmp-pfsense"
+        static_configs:
+          - targets: ["10.0.56.1"]
+        metrics_path: /snmp
+        params:
+          module: [if_mib]
+          auth: [bluejay_v2]
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: snmp-exporter.monitoring.svc:9116
+
+      # UniFi Cloud Key SNMP
+      - job_name: "snmp-cloudkey"
+        static_configs:
+          - targets: ["10.0.56.3"]
+        metrics_path: /snmp
+        params:
+          module: [if_mib]
+          auth: [bluejay_v2]
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: snmp-exporter.monitoring.svc:9116
+
+      # UniFi Switch SNMP
+      - job_name: "snmp-switch"
+        static_configs:
+          - targets: ["10.0.56.2"]
+        metrics_path: /snmp
+        params:
+          module: [if_mib]
+          auth: [bluejay_v2]
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: snmp-exporter.monitoring.svc:9116
+
+      # Synology NAS SNMP
+      - job_name: "snmp-nas"
+        static_configs:
+          - targets: ["10.0.58.3"]
+        metrics_path: /snmp
+        params:
+          module: [synology]
+          auth: [public_v2]
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: snmp-exporter.monitoring.svc:9116
+
+      # Prometheus self-monitoring
+      - job_name: "prometheus"
+        static_configs:
+          - targets: ["localhost:9090"]
+
+      # Edge and Pi nodes (PROD and HOME VLANs)
+      - job_name: "edge-nodes"
+        static_configs:
+          - targets: ["10.0.57.17:9100"]
+            labels:
+              instance: "edge1"
+              vlan: "prod"
+              arch: "arm64"
+              role: "ai-inference"
+              puppet_managed: "true"
+              puppet_server: "puppet.iamworkin.lan"
+          - targets: ["10.0.57.16:9100"]
+            labels:
+              instance: "edge2"
+              vlan: "prod"
+              arch: "arm64"
+              role: "ci-runner"
+              puppet_managed: "true"
+              puppet_server: "puppet.iamworkin.lan"
+          - targets: ["10.0.58.25:9100"]
+            labels:
+              instance: "piez"
+              vlan: "home"
+              arch: "arm64"
+              role: "prototyping"
+          - targets: ["10.0.58.113:9100"]
+            labels:
+              instance: "pirelay"
+              vlan: "home"
+              arch: "arm64"
+              role: "relay-controller"
+
+      # =======================================================================
+      # PiManager Application Metrics (relay states, temps, automation)
+      # =======================================================================
+
+      - job_name: "pimanager-app"
+        scrape_interval: 15s
+        metrics_path: /metrics
+        static_configs:
+          - targets: ["10.0.58.25:5000"]
+            labels:
+              instance: "piez"
+              service: "pimanager"
+              vlan: "home"
+              device: "pi4-ezconnect"
+          - targets: ["10.0.58.113:5100"]
+            labels:
+              instance: "pirelay"
+              service: "pimanager"
+              vlan: "home"
+              device: "pi3-ks0212"
+
+      # Epson ET-3750 EcoTank Printer SNMP (scraped through snmp-exporter).
+      # The static `instance` label is the series identity; the device address
+      # is forwarded to the exporter as ?target= and exposed as `target`.
+      # NOTE(review): a 5m scrape interval equals Prometheus' default 5m
+      # staleness lookback, so instant queries may intermittently find no
+      # sample — confirm this interval is intended.
+      - job_name: "snmp-printer"
+        scrape_interval: 5m
+        scrape_timeout: 30s
+        static_configs:
+          - targets: ["10.0.58.107"]
+            labels:
+              instance: "epson-ecotank"
+              vlan: "home"
+              device_type: "printer"
+        metrics_path: /snmp
+        params:
+          module: [printer_mib]
+          auth: [public_v2]
+        relabel_configs:
+          # Hand the printer address to snmp-exporter as the ?target= parameter.
+          - source_labels: [__address__]
+            target_label: __param_target
+          # Expose the device address as `target`; writing it to `instance`
+          # would overwrite the friendly static `instance` label set above.
+          - source_labels: [__param_target]
+            target_label: target
+          # Send the scrape itself to snmp-exporter instead of the device.
+          - target_label: __address__
+            replacement: snmp-exporter.monitoring.svc:9116
+
+      # =============================================================================
+      # Print Services (CUPS + Print.Web on edge2)
+      # =============================================================================
+
+      # CUPS Prometheus exporter (cups_exporter on edge2:9628)
+      - job_name: "cups"
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["10.0.57.16:9628"]
+            labels:
+              instance: "edge2"
+              service: "cups"
+              device_type: "printer"
+              printer_model: "NuPrint 210"
+
+      # Print.Web OTEL metrics (counters: jobs enqueued/completed/failed, bytes, duration histograms)
+      - job_name: "printweb-otel"
+        scrape_interval: 30s
+        metrics_path: /metrics/prometheus
+        static_configs:
+          - targets: ["10.0.57.16:5200"]
+            labels:
+              instance: "print-web"
+              service: "print-web"
+              device_type: "printer"
+              printer_model: "NuPrint 210"
+
+      # Print.Web health (Blazor app on edge2:5200), probed via blackbox-exporter.
+      - job_name: "probe-printweb"
+        metrics_path: /probe
+        params:
+          module: [http_2xx]
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["http://10.0.57.16:5200/"]
+            labels:
+              instance: "print-web"
+              service: "print-web"
+        relabel_configs:
+          # Hand the URL to blackbox-exporter as the ?target= parameter.
+          - source_labels: [__address__]
+            target_label: __param_target
+          # Expose the probed URL as `target`; writing it to `instance` would
+          # overwrite the friendly static `instance` label set above.
+          - source_labels: [__param_target]
+            target_label: target
+          # Send the scrape itself to blackbox-exporter.
+          - target_label: __address__
+            replacement: blackbox-exporter.monitoring.svc:9115
+
+      # CUPS web UI health (port 631), probed via blackbox-exporter.
+      - job_name: "probe-cups"
+        metrics_path: /probe
+        params:
+          module: [http_2xx]
+        scrape_interval: 60s
+        static_configs:
+          - targets: ["http://10.0.57.16:631/"]
+            labels:
+              instance: "cups-edge2"
+              service: "cups"
+        relabel_configs:
+          # Hand the URL to blackbox-exporter as the ?target= parameter.
+          - source_labels: [__address__]
+            target_label: __param_target
+          # Expose the probed URL as `target`; writing it to `instance` would
+          # overwrite the friendly static `instance` label set above.
+          - source_labels: [__param_target]
+            target_label: target
+          # Send the scrape itself to blackbox-exporter.
+          - target_label: __address__
+            replacement: blackbox-exporter.monitoring.svc:9115
+
+      # =============================================================================
+      # AI Stack Health Probes (Blackbox Exporter)
+      # =============================================================================
+
+      # Ollama API — workstation (LOCAL Agent Zero), probed via blackbox-exporter.
+      - job_name: "probe-ollama-local"
+        metrics_path: /probe
+        params:
+          module: [http_ollama]
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["http://10.0.58.100:11434/api/tags"]
+            labels:
+              instance: "ollama-local"
+              service: "ollama"
+              deployment: "local"
+              gpu: "r9700"
+        relabel_configs:
+          # Hand the URL to blackbox-exporter as the ?target= parameter.
+          - source_labels: [__address__]
+            target_label: __param_target
+          # Expose the probed URL as `target`; writing it to `instance` would
+          # overwrite the friendly static `instance` label set above.
+          - source_labels: [__param_target]
+            target_label: target
+          # Send the scrape itself to blackbox-exporter.
+          - target_label: __address__
+            replacement: blackbox-exporter.monitoring.svc:9115
+
+      # Ollama API — edge1 Pi 5 (NUC Agent Zero), probed via blackbox-exporter.
+      - job_name: "probe-ollama-edge1"
+        metrics_path: /probe
+        params:
+          module: [http_ollama]
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["http://10.0.57.17:11434/api/tags"]
+            labels:
+              instance: "ollama-edge1"
+              service: "ollama"
+              deployment: "nuc"
+              gpu: "cpu"
+        relabel_configs:
+          # Hand the URL to blackbox-exporter as the ?target= parameter.
+          - source_labels: [__address__]
+            target_label: __param_target
+          # Expose the probed URL as `target`; writing it to `instance` would
+          # overwrite the friendly static `instance` label set above.
+          - source_labels: [__param_target]
+            target_label: target
+          # Send the scrape itself to blackbox-exporter.
+          - target_label: __address__
+            replacement: blackbox-exporter.monitoring.svc:9115
+
+      # Agent Zero Web UI — local (K3s), probed via blackbox-exporter.
+      - job_name: "probe-agentzero-local"
+        metrics_path: /probe
+        params:
+          module: [http_2xx]
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["http://10.0.58.100:30050/"]
+            labels:
+              instance: "agent-zero-local"
+              service: "agent-zero"
+              deployment: "local"
+        relabel_configs:
+          # Hand the URL to blackbox-exporter as the ?target= parameter.
+          - source_labels: [__address__]
+            target_label: __param_target
+          # Expose the probed URL as `target`; writing it to `instance` would
+          # overwrite the friendly static `instance` label set above.
+          - source_labels: [__param_target]
+            target_label: target
+          # Send the scrape itself to blackbox-exporter.
+          - target_label: __address__
+            replacement: blackbox-exporter.monitoring.svc:9115
+
+      # Agent Zero Web UI — NUC (RKE2), probed through its in-cluster Service
+      # DNS name via blackbox-exporter.
+      - job_name: "probe-agentzero-nuc"
+        metrics_path: /probe
+        params:
+          module: [http_2xx]
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["http://agent-zero.agent-zero.svc.cluster.local/"]
+            labels:
+              instance: "agent-zero-nuc"
+              service: "agent-zero"
+              deployment: "nuc"
+        relabel_configs:
+          # Hand the URL to blackbox-exporter as the ?target= parameter.
+          - source_labels: [__address__]
+            target_label: __param_target
+          # Expose the probed URL as `target`; writing it to `instance` would
+          # overwrite the friendly static `instance` label set above.
+          - source_labels: [__param_target]
+            target_label: target
+          # Send the scrape itself to blackbox-exporter.
+          - target_label: __address__
+            replacement: blackbox-exporter.monitoring.svc:9115
+
+      # =============================================================================
+      # Self-monitoring (K8s monitoring namespace)
+      # =============================================================================
+
+      - job_name: "monitoring-grafana"
+        metrics_path: /metrics
+        static_configs:
+          - targets: ["grafana.monitoring.svc:3000"]
+            labels:
+              instance: "grafana-k8s"
+              service: "grafana"
+
+      - job_name: "monitoring-blackbox"
+        static_configs:
+          - targets: ["blackbox-exporter.monitoring.svc:9115"]
+            labels:
+              instance: "blackbox-k8s"
+              service: "blackbox"
+
+  recording-rules.yml: |
+    groups:
+      - name: node-aggregations
+        interval: 30s
+        rules:
+          - record: instance:node_cpu_usage:avg5m
+            expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
+          - record: instance:node_memory_usage:percent
+            expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
+          - record: instance:node_disk_usage:percent
+            expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100
+          - record: instance:node_network_receive:rate5m
+            expr: rate(node_network_receive_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8
+          - record: instance:node_network_transmit:rate5m
+            expr: rate(node_network_transmit_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8
+      - name: probe-aggregations
+        interval: 30s
+        rules:
+          - record: service:probe_success:min
+            expr: min by(service) (probe_success)
+          - record: service:probe_duration:avg
+            expr: avg by(service) (probe_duration_seconds)
+      - name: print-rates
+        interval: 30s
+        rules:
+          - record: print:jobs_per_minute:rate5m
+            expr: rate(print_jobs_enqueued_total[5m]) * 60
+          - record: print:success_rate:ratio5m
+            expr: rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m])
+          - record: print:job_duration_p95:5m
+            expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m]))
+      # Evaluated every 15s to match the pimanager-app scrape interval.
+      - name: relay-rates
+        interval: 15s
+        rules:
+          - record: relay:state_changes:1h
+            expr: changes(pimanager_relay_state[1h])
+      # The printer page counter is only scraped every 5m (snmp-printer job),
+      # so a 24h increase does not need to be recomputed every 15s.
+      - name: epson-rates
+        interval: 1m
+        rules:
+          - record: epson:pages_per_day:rate24h
+            expr: increase(prtMarkerLifeCount{job="snmp-printer"}[24h])
+
+  alerts.yml: |
+    groups:
+      - name: ai-stack
+        rules:
+          - alert: OllamaDown
+            expr: probe_success{service="ollama"} == 0
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Ollama is down on {{ $labels.deployment }}"
+              description: "Ollama API at {{ $labels.instance }} has been unreachable for 2 minutes. Agent Zero FAISS memory will fail."
+
+          - alert: AgentZeroDown
+            expr: probe_success{service="agent-zero"} == 0
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Agent Zero is down on {{ $labels.deployment }}"
+              description: "Agent Zero web UI at {{ $labels.instance }} has been unreachable for 2 minutes."
+
+          - alert: OllamaSlowResponse
+            expr: probe_duration_seconds{service="ollama"} > 3
+            for: 5m
+            labels:
+              severity: info
+            annotations:
+              summary: "Ollama responding slowly on {{ $labels.deployment }}"
+              description: "Ollama API response time exceeds 3s for 5 minutes. GPU may be overloaded."
+
+      - name: print-services
+        rules:
+          - alert: CUPSExporterDown
+            expr: up{job="cups"} == 0
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "CUPS exporter unreachable on edge2"
+              description: "cups_exporter at edge2:9628 has been down for 2 minutes. CUPS monitoring offline."
+
+          - alert: CUPSWebUIDown
+            expr: probe_success{job="probe-cups"} == 0
+            for: 3m
+            labels:
+              severity: warning
+            annotations:
+              summary: "CUPS web UI down on edge2"
+              description: "CUPS port 631 unreachable for 3 minutes. Network printing unavailable."
+
+          - alert: PrintWebDown
+            expr: probe_success{job="probe-printweb"} == 0
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Print.Web is down on edge2"
+              description: "FlowerCore Print.Web at edge2:5200 unreachable. API/MCP/Blazor printing unavailable."
+
+          - alert: CUPSPrinterStopped
+            expr: cups_printer_state_total{state="stopped"} > 0
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "CUPS printer stopped on edge2"
+              description: "A CUPS printer has been in stopped state for 5 minutes. Check USB connection or paper."
+
+          - alert: CUPSJobBacklog
+            expr: cups_job_active_total > 10
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Print queue backlog on edge2 ({{ $value }} active jobs)"
+              description: "CUPS has {{ $value }} active jobs queued. Possible printer jam, USB disconnect, or paper out."
+
+          - alert: CUPSHighJobRate
+            expr: rate(cups_job_total[5m]) * 60 > 30
+            for: 5m
+            labels:
+              severity: info
+            annotations:
+              summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)"
+              description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop."
+
+      - name: pi-fleet
+        rules:
+          - alert: PiManagerDown
+            expr: up{job="pimanager-app"} == 0
+            for: 3m
+            labels:
+              severity: warning
+            annotations:
+              summary: "PiManager down on {{ $labels.instance }}"
+              description: "PiManager app on {{ $labels.instance }} ({{ $labels.device }}) unreachable for 3 minutes."
+
+          - alert: PiCpuTempHigh
+            expr: pimanager_cpu_temperature_celsius > 75
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Pi CPU temperature high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"
+
+          - alert: PiCpuTempCritical
+            expr: pimanager_cpu_temperature_celsius > 82
+            for: 2m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Pi CPU temperature CRITICAL on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"
+
+          - alert: PiMemoryHigh
+            expr: pimanager_memory_usage_percent > 90
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Pi memory usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
+
+          - alert: PiDiskHigh
+            expr: pimanager_disk_usage_percent > 85
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Pi disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
+
+          # Fires when every relay on a host reports state 0 while the host
+          # advertises at least one channel. `sum by (instance)` keeps only the
+          # instance label, so the join with the fully labelled channel count
+          # must match on instance alone; a plain `and` would never match.
+          - alert: RelayAllOff
+            expr: sum by (instance) (pimanager_relay_state) == 0 and on (instance) pimanager_relay_channel_count > 0
+            for: 0m
+            labels:
+              severity: info
+            annotations:
+              summary: "All relay channels OFF on {{ $labels.instance }}"
+
+          # Any reading below -75 dBm is necessarily non-zero, so a single
+          # comparison selects exactly the same series.
+          - alert: PiWifiWeak
+            expr: pimanager_wifi_signal_dbm < -75
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Weak WiFi signal on {{ $labels.instance }} ({{ $value }}dBm)"
+
+      - name: snmp-devices
+        rules:
+          - alert: EpsonInkLow
+            expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 15 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0
+            for: 0m
+            labels:
+              severity: warning
+              alert_channel: thermal_print
+            annotations:
+              summary: "Epson ink low: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
+
+          - alert: EpsonInkCritical
+            expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 5 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0
+            for: 0m
+            labels:
+              severity: critical
+              alert_channel: thermal_print
+            annotations:
+              summary: "Epson ink CRITICAL: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
+
+          - alert: EpsonPrinterDown
+            expr: up{job="snmp-printer"} == 0
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Epson ET-3750 SNMP unreachable"
+
+          - alert: SynologyDiskLow
+            expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85
+            for: 10m
+            labels:
+              severity: warning
+              alert_channel: thermal_print
+            annotations:
+              summary: "Synology NAS storage high ({{ $value | printf \"%.1f\" }}%)"
+
+          - alert: SynologyDown
+            expr: up{job="snmp-nas"} == 0
+            for: 3m
+            labels:
+              severity: critical
+              alert_channel: thermal_print
+            annotations:
+              summary: "Synology NAS SNMP unreachable"
+
+      - name: infrastructure
+        rules:
+          - alert: NodeDown
+            expr: up{job=~"node-exporter|rke2-nodes|edge-nodes|k8s-node-exporter"} == 0
+            for: 2m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Node {{ $labels.instance }} is down"
+
+          - alert: HighCPU
+            expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "High CPU on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
+
+          - alert: HighMemory
+            expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "High memory usage on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
+
+          - alert: DiskSpaceLow
+            expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 85
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
+
+# =============================================================================
+# ConfigMap: Blackbox Exporter Configuration
+# =============================================================================
+---
+# Probe modules referenced by the `module` param of the probe-* scrape jobs.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: blackbox-config
+  namespace: monitoring
+data:
+  blackbox.yml: |
+    modules:
+      # Plain HTTP check: GET over IPv4, succeeds only on status 200.
+      http_2xx:
+        prober: http
+        timeout: 5s
+        http:
+          method: GET
+          preferred_ip_protocol: ip4
+          valid_http_versions:
+            - "HTTP/1.1"
+            - "HTTP/2.0"
+          valid_status_codes:
+            - 200
+          fail_if_body_not_matches_regexp: []
+      # Same as http_2xx, but the response body must also contain "models".
+      http_ollama:
+        prober: http
+        timeout: 5s
+        http:
+          method: GET
+          preferred_ip_protocol: ip4
+          valid_http_versions:
+            - "HTTP/1.1"
+            - "HTTP/2.0"
+          valid_status_codes:
+            - 200
+          fail_if_body_not_matches_regexp:
+            - '"models"'
+
+# =============================================================================
+# ConfigMap: IRC Notify Script
+# =============================================================================
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: irc-notify-script
+  namespace: monitoring
+data:
+  notify.py: |
+    #!/usr/bin/env python3
+    """HTTP->IRC alert relay with thermal printer forwarding for Grafana webhooks.
+    Listens on :9119, posts to #alerts on UnrealIRCd via raw IRC protocol.
+    Alerts tagged alert_channel=thermal_print also POST to Print.Web /api/print/alert.
+    """
+    import json, socket, sys, time
+    from http.server import HTTPServer, BaseHTTPRequestHandler
+    from urllib.request import Request, urlopen
+    from urllib.error import URLError
+
+    # IRC server address/port, bot nick and target channel used by send_irc().
+    IRC_HOST = "unrealircd.irc.svc.cluster.local"
+    IRC_PORT = 6667
+    IRC_NICK = "grafana-bot"
+    IRC_CHANNEL = "#alerts"
+    # Print.Web endpoint that send_thermal_print() POSTs alert payloads to.
+    PRINT_WEB_URL = "http://10.0.57.16:5200/api/print/alert"
+    # Master switch: when False, send_thermal_print() returns without sending.
+    PRINT_ENABLED = True
+
+    def send_irc(message):
+        """Connect, handle PING, join, send, quit."""
+        try:
+            sock = socket.create_connection((IRC_HOST, IRC_PORT), timeout=15)
+            sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
+            sock.sendall(f"USER {IRC_NICK} 0 * :Grafana Alert Bot\r\n".encode())
+            registered = False
+            deadline = time.time() + 10
+            buf = ""
+            while time.time() < deadline:
+                try:
+                    data = sock.recv(4096).decode("utf-8", errors="replace")
+                    if not data: break
+                    buf += data
+                    for line in buf.split("\r\n"):
+                        if line.startswith("PING"):
+                            sock.sendall(("PONG " + line.split(" ", 1)[1] + "\r\n").encode())
+                    if " 001 " in buf:
+                        registered = True
+                        break
+                except socket.timeout: break
+            if not registered:
+                sock.close()
+                return False
+            sock.sendall(f"JOIN {IRC_CHANNEL}\r\n".encode())
+            time.sleep(0.5)
+            sock.recv(4096)
+            for line in message.split("\n"):
+                if line.strip():
+                    sock.sendall(f"PRIVMSG {IRC_CHANNEL} :{line}\r\n".encode())
+                    time.sleep(0.3)
+            time.sleep(0.5)
+            sock.sendall(b"QUIT :alert delivered\r\n")
+            sock.close()
+            return True
+        except Exception as e:
+            print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
+            return False
+
+    def send_thermal_print(alert):
+        if not PRINT_ENABLED: return
+        labels = alert.get("labels", {})
+        annotations = alert.get("annotations", {})
+        status = alert.get("status", "firing").upper()
+        summary = annotations.get("summary", "")
+        description = annotations.get("description", "")
+        runbook = annotations.get("runbook", "")
+        # Build a useful message: summary + description + runbook steps
+        parts = []
+        if summary: parts.append(summary)
+        if description and description != summary: parts.append(description)
+        if runbook: parts.append("STEPS: " + runbook)
+        message = " | ".join(parts) if parts else labels.get("alertname", "Unknown alert")
+        payload = {
+            "title": labels.get("alertname", "Unknown"),
+            "severity": labels.get("severity", "warning").capitalize(),
+            "host": labels.get("instance", labels.get("host", "unknown")),
+            "message": message,
+            "eventId": alert.get("fingerprint", ""),
+            "source": "Grafana",
+            "status": "RESOLVED" if status == "RESOLVED" else "PROBLEM",
+            "acknowledged": False
+        }
+        try:
+            req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
+                          headers={"Content-Type": "application/json"}, method="POST")
+            resp = urlopen(req, timeout=10)
+            print(f"[irc-notify] Thermal print sent: {resp.read().decode()}", file=sys.stderr)
+        except Exception as e:
+            print(f"[irc-notify] Thermal print failed: {e}", file=sys.stderr)
+
+    def should_print(alert):
+        labels = alert.get("labels", {})
+        if labels.get("alert_channel") == "thermal_print": return True
+        if labels.get("severity", "").lower() in ("critical", "disaster"): return True
+        if alert.get("status", "").upper() == "RESOLVED": return False
+        return False
+
+    class Handler(BaseHTTPRequestHandler):
+        def do_POST(self):
+            length = int(self.headers.get("Content-Length", 0))
+            body = json.loads(self.rfile.read(length)) if length else {}
+            for alert in body.get("alerts", []):
+                status = alert.get("status", "unknown").upper()
+                labels = alert.get("labels", {})
+                name = labels.get("alertname", "Unknown")
+                summary = alert.get("annotations", {}).get("summary", "")
+                desc = alert.get("annotations", {}).get("description", "")
+                severity = labels.get("severity", "")
+                icon = "\x0304[FIRING]\x03" if status == "FIRING" else "\x0303[RESOLVED]\x03"
+                sev_tag = f" [{severity}]" if severity else ""
+                msg = f"{icon}{sev_tag} {name}: {summary}"
+                if desc: msg += f"\n  {desc}"
+                send_irc(msg)
+                if should_print(alert): send_thermal_print(alert)
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.end_headers()
+            self.wfile.write(b'{"status":"ok"}')
+        def do_GET(self):
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.end_headers()
+            self.wfile.write(json.dumps({"service":"irc-notify","thermal_print":PRINT_ENABLED}).encode())
+        def log_message(self, format, *args):
+            print(f"[irc-notify] {args[0]}", file=sys.stderr)
+
+    if __name__ == "__main__":
+        server = HTTPServer(("0.0.0.0", 9119), Handler)
+        print(f"IRC alert relay :9119 -> {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} (thermal: {PRINT_ENABLED})")
+        server.serve_forever()
+
+# =============================================================================
+# SNMP Exporter Auth Secret
+# =============================================================================
+# The full snmp.yml (~2MB, auto-generated) exceeds the 1MB ConfigMap limit.
+# Strategy: store SNMP auth credentials in a Secret, and use an init container
+# to download the full snmp.yml from Gitea, or split into multiple ConfigMaps.
+# For now, we mount a minimal auth-only config and rely on the default modules
+# bundled in the snmp-exporter image. To use custom modules, apply
+# snmp-config.yaml separately (see comments in that file).
+---
+apiVersion: v1
+kind: Secret
+type: Opaque
+metadata:
+  namespace: monitoring
+  name: snmp-auth
+# NOTE(review): these credentials are committed in plaintext. The Grafana
+# credentials in this file are synced by the 1Password operator instead;
+# consider sourcing these the same way and rotating them.
+stringData:
+  # SNMP v2 community string used by prometheus scrape configs
+  SNMP_COMMUNITY_BLUEJAY: "bluejay_monitor"
+  # SNMP v3 user plus its auth/priv passwords (going by the key names).
+  SNMP_V3_USER: "bluejay_snmpv3"
+  SNMP_V3_AUTH_PASS: "BlueJay-SNMP-Auth-2026"
+  SNMP_V3_PRIV_PASS: "BlueJay-SNMP-Priv-2026"
+
+# =============================================================================
+# Grafana Credentials — synced from 1Password via Operator
+# =============================================================================
+# 1Password vault: IAmWorkin > "Grafana"
+# Creates K8s Secret "grafana-credentials" with fields: username, password
+# Grafana Deployment uses secretKeyRef to map username→GF_SECURITY_ADMIN_USER, password→GF_SECURITY_ADMIN_PASSWORD
+---
+apiVersion: onepassword.com/v1
+kind: OnePasswordItem
+metadata:
+  namespace: monitoring
+  name: grafana-credentials
+spec:
+  # Vault "IAmWorkin", item "Grafana".
+  itemPath: "vaults/IAmWorkin/items/Grafana"
+
+# =============================================================================
+# RBAC: ServiceAccount + ClusterRole for Prometheus K8s SD
+# =============================================================================
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  namespace: monitoring
+  # Used by the Prometheus Deployment (serviceAccountName) and named as the
+  # subject of the "prometheus" ClusterRoleBinding in this file.
+  name: prometheus
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: prometheus
+rules:
+  # Read-only access to core-group objects.
+  - apiGroups:
+      - ""
+    resources:
+      - "nodes"
+      - "nodes/proxy"
+      - "services"
+      - "endpoints"
+      - "pods"
+    verbs:
+      - "get"
+      - "list"
+      - "watch"
+  # Read-only access to Ingress objects in both listed API groups.
+  - apiGroups:
+      - "extensions"
+      - "networking.k8s.io"
+    resources:
+      - "ingresses"
+    verbs:
+      - "get"
+      - "list"
+      - "watch"
+  # GET on the non-resource /metrics URL.
+  - nonResourceURLs:
+      - "/metrics"
+    verbs:
+      - "get"
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: prometheus
+# Grants the "prometheus" ClusterRole to the ServiceAccount
+# monitoring/prometheus.
+subjects:
+  - namespace: monitoring
+    name: prometheus
+    kind: ServiceAccount
+roleRef:
+  kind: ClusterRole
+  name: prometheus
+  apiGroup: rbac.authorization.k8s.io
+
+# =============================================================================
+# PVC: Prometheus Data (10Gi, Longhorn)
+# =============================================================================
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  namespace: monitoring
+  # Claimed by the Prometheus Deployment's "data" volume (mounted at /prometheus).
+  name: prometheus-data
+spec:
+  storageClassName: longhorn
+  resources:
+    requests:
+      storage: "10Gi"
+  accessModes:
+    - ReadWriteOnce
+
+# =============================================================================
+# PVC: Grafana Data (2Gi, Longhorn)
+# =============================================================================
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  namespace: monitoring
+  name: grafana-data
+spec:
+  storageClassName: longhorn
+  resources:
+    requests:
+      storage: "2Gi"
+  accessModes:
+    - ReadWriteOnce
+
+# =============================================================================
+# Deployment: Prometheus
+# =============================================================================
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus
+  namespace: monitoring
+  labels:
+    app: prometheus
+spec:
+  # One replica, replaced with the Recreate strategy (old pod is stopped
+  # before the new one starts); the data claim is ReadWriteOnce.
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: prometheus
+  template:
+    metadata:
+      labels:
+        app: prometheus
+    spec:
+      serviceAccountName: prometheus
+      securityContext:
+        fsGroup: 65534  # nobody
+        runAsUser: 65534
+        runAsGroup: 65534
+      containers:
+        - name: prometheus
+          # NOTE(review): ":latest" is not reproducible across restarts;
+          # consider pinning a specific version tag.
+          image: docker.io/prom/prometheus:latest
+          args:
+            - "--config.file=/etc/prometheus/prometheus.yml"
+            - "--storage.tsdb.path=/prometheus"
+            # NOTE(review): retention is time-based only while the data claim
+            # requests 10Gi; confirm 90 days of samples fits.
+            - "--storage.tsdb.retention.time=90d"
+            - "--web.enable-lifecycle"
+          ports:
+            - containerPort: 9090
+              name: http
+          volumeMounts:
+            # Mount the ConfigMap as a directory instead of one subPath mount
+            # per file: subPath mounts never receive ConfigMap updates, so a
+            # reload through --web.enable-lifecycle would keep reading stale
+            # prometheus.yml / alerts.yml / recording-rules.yml. The files
+            # still appear at the same /etc/prometheus/<key> paths.
+            - name: config
+              mountPath: /etc/prometheus
+              readOnly: true
+            - name: data
+              mountPath: /prometheus
+          resources:
+            requests:
+              cpu: 200m
+              memory: 512Mi
+            limits:
+              cpu: "1"
+              memory: 2Gi
+          livenessProbe:
+            httpGet:
+              path: /-/healthy
+              port: 9090
+            initialDelaySeconds: 15
+            periodSeconds: 30
+          readinessProbe:
+            httpGet:
+              path: /-/ready
+              port: 9090
+            initialDelaySeconds: 5
+            periodSeconds: 10
+      volumes:
+        - name: config
+          configMap:
+            name: prometheus-config
+        - name: data
+          persistentVolumeClaim:
+            claimName: prometheus-data
+
+# =============================================================================
+# ConfigMap: Grafana Dashboard Provider
+# =============================================================================
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-provider
+  namespace: monitoring
+data:
+  # Grafana dashboard provisioning config: a single file-type provider named
+  # "default" that loads dashboards from /var/lib/grafana/dashboards, with
+  # updateIntervalSeconds set to 30 and foldersFromFilesStructure enabled.
+  default.yml: |
+    apiVersion: 1
+    providers:
+      - name: 'default'
+        orgId: 1
+        folder: ''
+        type: file
+        disableDeletion: false
+        updateIntervalSeconds: 30
+        options:
+          path: /var/lib/grafana/dashboards
+          foldersFromFilesStructure: true
+
+# =============================================================================
+# ConfigMap: Grafana Dashboards (AI Stack Health)
+# =============================================================================
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboards
+  namespace: monitoring
+data:
+  # Grafana dashboard "AI Stack Health" (uid: ai-stack-health), embedded as a
+  # literal JSON document. Every panel target queries the datasource with
+  # uid "prometheus".
+  ai-stack-health.json: |
+    {
+      "id": null,
+      "panels": [
+        {
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "color": "red", "text": "DOWN" },
+                    "1": { "color": "green", "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
+          "id": 1,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{job=\"probe-ollama-local\"}",
+              "legendFormat": "Status"
+            }
+          ],
+          "title": "Ollama (Local)",
+          "type": "stat"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "color": "red", "text": "DOWN" },
+                    "1": { "color": "green", "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
+          "id": 2,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{job=\"probe-ollama-edge1\"}",
+              "legendFormat": "Status"
+            }
+          ],
+          "title": "Ollama (Edge1)",
+          "type": "stat"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "color": "red", "text": "DOWN" },
+                    "1": { "color": "green", "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
+          "id": 3,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{job=\"probe-agentzero-local\"}",
+              "legendFormat": "Status"
+            }
+          ],
+          "title": "Agent Zero (Local)",
+          "type": "stat"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "color": "red", "text": "DOWN" },
+                    "1": { "color": "green", "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
+          "id": 4,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{job=\"probe-agentzero-nuc\"}",
+              "legendFormat": "Status"
+            }
+          ],
+          "title": "Agent Zero (NUC)",
+          "type": "stat"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "fillOpacity": 15, "lineWidth": 2 },
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 1 },
+                  { "color": "red", "value": 3 }
+                ]
+              },
+              "unit": "s"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
+          "id": 5,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_duration_seconds{service=\"ollama\"}",
+              "legendFormat": "{{ deployment }}"
+            }
+          ],
+          "title": "Ollama Response Time",
+          "type": "timeseries"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "fillOpacity": 15, "lineWidth": 2 },
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 1 },
+                  { "color": "red", "value": 3 }
+                ]
+              },
+              "unit": "s"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
+          "id": 6,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_duration_seconds{service=\"agent-zero\"}",
+              "legendFormat": "{{ deployment }}"
+            }
+          ],
+          "title": "Agent Zero Response Time",
+          "type": "timeseries"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "fillOpacity": 30, "lineWidth": 1, "stacking": { "mode": "none" } },
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "text": "DOWN" },
+                    "1": { "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "max": 1,
+              "min": 0,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 6, "w": 24, "x": 0, "y": 12 },
+          "id": 7,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{service=\"ollama\"}",
+              "legendFormat": "Ollama ({{ deployment }})"
+            },
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{service=\"agent-zero\"}",
+              "legendFormat": "Agent Zero ({{ deployment }})"
+            }
+          ],
+          "title": "Uptime History",
+          "type": "timeseries"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "fillOpacity": 10, "lineWidth": 2 },
+              "max": 100,
+              "min": 0,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 75 },
+                  { "color": "red", "value": 90 }
+                ]
+              },
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
+          "id": 8,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "100 - (avg(rate(node_cpu_seconds_total{instance=\"edge1\",mode=\"idle\"}[5m])) * 100)",
+              "legendFormat": "CPU %"
+            },
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "(1 - node_memory_MemAvailable_bytes{instance=\"edge1\"} / node_memory_MemTotal_bytes{instance=\"edge1\"}) * 100",
+              "legendFormat": "Memory %"
+            },
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "(1 - node_filesystem_avail_bytes{instance=\"edge1\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=\"edge1\",mountpoint=\"/\"}) * 100",
+              "legendFormat": "Disk %"
+            }
+          ],
+          "title": "Edge1 Pi 5 \u2014 CPU / Memory / Disk",
+          "type": "timeseries"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "fillOpacity": 10, "lineWidth": 2 },
+              "unit": "s"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
+          "id": 9,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_dns_lookup_time_seconds",
+              "legendFormat": "{{ job }}"
+            }
+          ],
+          "title": "Probe DNS Lookup Time",
+          "type": "timeseries"
+        }
+      ],
+      "refresh": "30s",
+      "schemaVersion": 39,
+      "tags": ["ai", "ollama", "agent-zero", "blue-jay"],
+      "time": { "from": "now-1h", "to": "now" },
+      "timezone": "browser",
+      "title": "AI Stack Health",
+      "uid": "ai-stack-health",
+      "version": 1
+    }
+
+# =============================================================================
+# ConfigMap: Grafana Dashboard — Edge Nodes
+# =============================================================================
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-edge-nodes
+  namespace: monitoring
+data:
+  # Grafana dashboard "BlueJay Edge Nodes", embedded as a literal JSON
+  # document. Its panels select series with instance=~"edge.*" (or edge1/edge2)
+  # from the datasource with uid "prometheus".
+  bluejay-edge-nodes.json: |
+    {
+      "id": null,
+      "panels": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "0": {
+                      "color": "red",
+                      "text": "DOWN"
+                    },
+                    "1": {
+                      "color": "green",
+                      "text": "UP"
+                    }
+                  },
+                  "type": "value"
+                }
+              ]
+            }
+          },
+          "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
+          "targets": [
+            {
+              "expr": "up{instance=~\"edge.*\"}",
+              "legendFormat": "{{instance}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Edge Node Status",
+          "type": "stat"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "max": 100,
+              "min": 0,
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
+          "targets": [
+            {
+              "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge1.*\"}[5m])) * 100)",
+              "legendFormat": "CPU %",
+              "refId": "A"
+            },
+            {
+              "expr": "node_load1{instance=~\"edge1.*\"}",
+              "legendFormat": "Load 1m",
+              "refId": "B"
+            }
+          ],
+          "title": "edge1 (Pi5 + Hailo) CPU",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "max": 100,
+              "min": 0,
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
+          "targets": [
+            {
+              "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge2.*\"}[5m])) * 100)",
+              "legendFormat": "CPU %",
+              "refId": "A"
+            },
+            {
+              "expr": "node_load1{instance=~\"edge2.*\"}",
+              "legendFormat": "Load 1m",
+              "refId": "B"
+            }
+          ],
+          "title": "edge2 (Pi4) CPU",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "max": 100,
+              "min": 0,
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
+          "targets": [
+            {
+              "expr": "(1 - node_memory_MemAvailable_bytes{instance=~\"edge.*\"} / node_memory_MemTotal_bytes{instance=~\"edge.*\"}) * 100",
+              "legendFormat": "{{instance}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Edge Memory Usage",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "max": 100,
+              "min": 0,
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
+          "targets": [
+            {
+              "expr": "100 - (node_filesystem_avail_bytes{instance=~\"edge.*\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=~\"edge.*\",mountpoint=\"/\"} * 100)",
+              "legendFormat": "{{instance}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Edge Disk Usage",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "unit": "celsius"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
+          "targets": [
+            {
+              "expr": "node_hwmon_temp_celsius{instance=~\"edge.*\"}",
+              "legendFormat": "{{instance}} {{chip}} {{sensor}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Edge CPU Temperature",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "unit": "bps"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
+          "targets": [
+            {
+              "expr": "rate(node_network_receive_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8",
+              "legendFormat": "{{instance}} {{device}} RX",
+              "refId": "A"
+            },
+            {
+              "expr": "-rate(node_network_transmit_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8",
+              "legendFormat": "{{instance}} {{device}} TX",
+              "refId": "B"
+            }
+          ],
+          "title": "Edge Network Traffic",
+          "type": "timeseries"
+        }
+      ],
+      "refresh": "30s",
+      "schemaVersion": 40,
+      "tags": ["bluejay", "edge"],
+      "timezone": "browser",
+      "title": "BlueJay Edge Nodes",
+      "uid": "7e1603b9-e918-4b3f-a22b-163132fd5cee"
+    }
+
+# =============================================================================
+# ConfigMap: Grafana Dashboard — Network Overview
+# =============================================================================
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-network-overview
+  namespace: monitoring
+data:
+  # Grafana dashboard "BlueJay Network Overview", embedded as a literal JSON
+  # document. Every panel uses the datasource with uid "prometheus".
+  bluejay-network-overview.json: |
+    {
+      "id": null,
+      "panels": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "noValue": "0",
+              "thresholds": {
+                "steps": [
+                  { "color": "green", "value": null }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
+          "targets": [
+            {
+              "expr": "count(up == 1)",
+              "legendFormat": "Up",
+              "refId": "A"
+            },
+            {
+              "expr": "count(up == 0)",
+              "legendFormat": "Down",
+              "refId": "B"
+            }
+          ],
+          "title": "Target Health",
+          "type": "stat"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "max": 4,
+              "min": 0,
+              "thresholds": {
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 2 },
+                  { "color": "red", "value": 3 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 6, "w": 6, "x": 0, "y": 4 },
+          "targets": [
+            {
+              "expr": "laLoad{instance=\"10.0.56.1\", laNames=\"Load-1\"}",
+              "refId": "A"
+            }
+          ],
+          "title": "pfSense CPU Load (1m)",
+          "type": "gauge"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "max": 100,
+              "min": 0,
+              "thresholds": {
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 70 },
+                  { "color": "red", "value": 90 }
+                ]
+              },
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 6, "w": 6, "x": 6, "y": 4 },
+          "targets": [
+            {
+              "expr": "100 - (memAvailReal{instance=\"10.0.56.1\"} / memTotalReal{instance=\"10.0.56.1\"} * 100)",
+              "refId": "A"
+            }
+          ],
+          "title": "pfSense Memory Used %",
+          "type": "gauge"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "max": 100,
+              "min": 0,
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 6, "w": 12, "x": 12, "y": 4 },
+          "targets": [
+            {
+              "expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=\"noc1\"}[5m])) * 100)",
+              "legendFormat": "CPU %",
+              "refId": "A"
+            }
+          ],
+          "title": "noc1 CPU Usage",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "max": 100,
+              "min": 0,
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 },
+          "targets": [
+            {
+              "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
+              "legendFormat": "{{instance}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Node Memory Usage",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "max": 100,
+              "min": 0,
+              "thresholds": {
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 70 },
+                  { "color": "red", "value": 90 }
+                ]
+              },
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 },
+          "targets": [
+            {
+              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
+              "legendFormat": "{{instance}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Node Disk Usage %",
+          "type": "bargauge"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "unit": "bps"
+            }
+          },
+          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 },
+          "targets": [
+            {
+              "expr": "rate(node_network_receive_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8",
+              "legendFormat": "{{instance}} {{device}} RX",
+              "refId": "A"
+            },
+            {
+              "expr": "-rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8",
+              "legendFormat": "{{instance}} {{device}} TX",
+              "refId": "B"
+            }
+          ],
+          "title": "Network Traffic",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 26 },
+          "targets": [
+            {
+              "expr": "up",
+              "format": "table",
+              "instant": true,
+              "refId": "A"
+            }
+          ],
+          "title": "Prometheus Targets",
+          "type": "table"
+        }
+      ],
+      "refresh": "30s",
+      "schemaVersion": 40,
+      "tags": ["bluejay", "network"],
+      "timezone": "browser",
+      "title": "BlueJay Network Overview",
+      "uid": "b83a122c-6eb5-4fae-a632-77cbf753ad05"
+    }
+
+# =============================================================================
+# ConfigMap: Grafana Dashboard — Operations
+# =============================================================================
+# Holds the "BlueJay Operations" dashboard JSON (key: bluejay-operations.json).
+# Rows: Infrastructure Overview, Kubernetes & Services, Network & SNMP.
+# - All data panels query the datasource with uid "prometheus", except
+#   "Zabbix Active Problems", which uses the alexanderzobnin-zabbix-datasource
+#   plugin with the hard-coded datasource uid "bffjila3zkdfka".
+#   NOTE(review): confirm the Zabbix plugin and a datasource with that uid are
+#   provisioned for the K8s Grafana; otherwise that panel presumably cannot
+#   load.
+# - The pfSense "Out" series are negated rates, so they plot as negative
+#   values; all pfSense rates are multiplied by 8 and shown with unit "bps".
+# - Panels carry no "id" fields. NOTE(review): presumably Grafana assigns them
+#   on load — confirm.
+# The JSON body sits inside a literal block scalar; JSON has no comment syntax,
+# so notes belong here in the YAML.
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-operations
+  namespace: monitoring
+data:
+  bluejay-operations.json: |
+    {
+      "annotations": {
+        "list": []
+      },
+      "id": null,
+      "panels": [
+        {
+          "collapsed": false,
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+          "title": "Infrastructure Overview",
+          "type": "row"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "noValue": "0",
+              "thresholds": {
+                "steps": [
+                  { "color": "green", "value": null }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 8, "x": 0, "y": 1 },
+          "targets": [
+            {
+              "expr": "count(up == 1)",
+              "legendFormat": "Up",
+              "refId": "A"
+            },
+            {
+              "expr": "count(up == 0)",
+              "legendFormat": "Down",
+              "refId": "B"
+            }
+          ],
+          "title": "All Targets Up/Down",
+          "type": "stat"
+        },
+        {
+          "datasource": {
+            "type": "alexanderzobnin-zabbix-datasource",
+            "uid": "bffjila3zkdfka"
+          },
+          "gridPos": { "h": 8, "w": 16, "x": 8, "y": 1 },
+          "targets": [
+            {
+              "application": { "filter": "" },
+              "group": { "filter": "/.*/" },
+              "host": { "filter": "/.*/" },
+              "queryType": 5,
+              "refId": "A",
+              "trigger": { "filter": "/.*/" }
+            }
+          ],
+          "title": "Zabbix Active Problems",
+          "type": "alexanderzobnin-zabbix-triggers-panel"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 },
+          "targets": [
+            {
+              "expr": "node_load1{instance=\"noc1\"}",
+              "legendFormat": "1m",
+              "refId": "A"
+            },
+            {
+              "expr": "node_load5{instance=\"noc1\"}",
+              "legendFormat": "5m",
+              "refId": "B"
+            },
+            {
+              "expr": "node_load15{instance=\"noc1\"}",
+              "legendFormat": "15m",
+              "refId": "C"
+            }
+          ],
+          "title": "noc1 Load Average",
+          "type": "timeseries"
+        },
+        {
+          "collapsed": false,
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 },
+          "title": "Kubernetes & Services",
+          "type": "row"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {},
+            "overrides": [
+              {
+                "matcher": { "id": "byName", "options": "Value" },
+                "properties": [
+                  {
+                    "id": "mappings",
+                    "value": [
+                      {
+                        "options": {
+                          "0": { "color": "red", "text": "DOWN" },
+                          "1": { "color": "green", "text": "UP" }
+                        },
+                        "type": "value"
+                      }
+                    ]
+                  }
+                ]
+              }
+            ]
+          },
+          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 },
+          "targets": [
+            {
+              "expr": "up",
+              "format": "table",
+              "instant": true,
+              "refId": "A"
+            }
+          ],
+          "title": "K8s Services Uptime (Prometheus Targets)",
+          "type": "table"
+        },
+        {
+          "collapsed": false,
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 },
+          "title": "Network & SNMP",
+          "type": "row"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "unit": "bps"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 },
+          "targets": [
+            {
+              "expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8",
+              "legendFormat": "WAN In",
+              "refId": "A"
+            },
+            {
+              "expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8",
+              "legendFormat": "WAN Out",
+              "refId": "B"
+            }
+          ],
+          "title": "pfSense WAN Traffic",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "unit": "bps"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 },
+          "targets": [
+            {
+              "expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8",
+              "legendFormat": "{{ifAlias}} In",
+              "refId": "A"
+            },
+            {
+              "expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8",
+              "legendFormat": "{{ifAlias}} Out",
+              "refId": "B"
+            }
+          ],
+          "title": "pfSense LAN Traffic",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "max": 100,
+              "min": 0,
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 },
+          "targets": [
+            {
+              "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
+              "legendFormat": "{{instance}}",
+              "refId": "A"
+            }
+          ],
+          "title": "All Nodes Memory",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "max": 100,
+              "min": 0,
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 },
+          "targets": [
+            {
+              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
+              "legendFormat": "{{instance}}",
+              "refId": "A"
+            }
+          ],
+          "title": "All Nodes Disk",
+          "type": "timeseries"
+        }
+      ],
+      "refresh": "1m",
+      "schemaVersion": 40,
+      "tags": ["bluejay", "operations", "zabbix"],
+      "timezone": "browser",
+      "title": "BlueJay Operations",
+      "uid": "a781c4e4-c3fe-4ac7-be93-21363a41b97d"
+    }
+
+# =============================================================================
+# ConfigMap: Grafana Dashboard — Epson Printer
+# =============================================================================
+# Holds the "Epson ET-3750 EcoTank Printer" dashboard JSON
+# (key: epson-ecotank-printer.json, dashboard uid: epson-ecotank).
+# Every panel queries job="snmp-printer" through the datasource with uid
+# "prometheus":
+#   - Ink Levels (gauge) / Ink Level History: prtMarkerSuppliesLevel
+#   - Lifetime Page Count: prtMarkerLifeCount
+#   - Printer Model: prtGeneralPrinterName (shown via the series name)
+#   - Critical Events: prtAlertCriticalEvents (red at >= 1)
+#   - Serial Number: prtGeneralSerialNumber (shown via the series name)
+# The per-color overrides match series by the exact legend text produced by
+# {{prtMarkerSuppliesDescription}} ("Black Ink Bottle", "Cyan Ink Bottle", ...).
+# NOTE(review): the ink panels use unit "percent" with min 0 / max 100, which
+# assumes prtMarkerSuppliesLevel is already a percentage — confirm against the
+# printer's SNMP output.
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-printer
+  namespace: monitoring
+data:
+  epson-ecotank-printer.json: |
+    {
+      "id": null,
+      "panels": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "max": 100,
+              "min": 0,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "orange", "value": 10 },
+                  { "color": "yellow", "value": 20 },
+                  { "color": "green", "value": 40 }
+                ]
+              },
+              "unit": "percent"
+            },
+            "overrides": [
+              {
+                "matcher": { "id": "byName", "options": "Black Ink Bottle" },
+                "properties": [
+                  { "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } }
+                ]
+              },
+              {
+                "matcher": { "id": "byName", "options": "Cyan Ink Bottle" },
+                "properties": [
+                  { "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } }
+                ]
+              },
+              {
+                "matcher": { "id": "byName", "options": "Magenta Ink Bottle" },
+                "properties": [
+                  { "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }
+                ]
+              },
+              {
+                "matcher": { "id": "byName", "options": "Yellow Ink Bottle" },
+                "properties": [
+                  { "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }
+                ]
+              }
+            ]
+          },
+          "gridPos": { "h": 10, "w": 12, "x": 0, "y": 0 },
+          "id": 1,
+          "options": {
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": ["lastNotNull"]
+            },
+            "showThresholdLabels": false,
+            "showThresholdMarkers": true
+          },
+          "targets": [
+            {
+              "expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}",
+              "legendFormat": "{{prtMarkerSuppliesDescription}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Ink Levels",
+          "type": "gauge"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "custom": {
+                "fillOpacity": 20,
+                "lineWidth": 2,
+                "spanNulls": true
+              },
+              "max": 100,
+              "min": 0,
+              "unit": "percent"
+            },
+            "overrides": [
+              {
+                "matcher": { "id": "byName", "options": "Black Ink Bottle" },
+                "properties": [
+                  { "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } }
+                ]
+              },
+              {
+                "matcher": { "id": "byName", "options": "Cyan Ink Bottle" },
+                "properties": [
+                  { "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } }
+                ]
+              },
+              {
+                "matcher": { "id": "byName", "options": "Magenta Ink Bottle" },
+                "properties": [
+                  { "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }
+                ]
+              },
+              {
+                "matcher": { "id": "byName", "options": "Yellow Ink Bottle" },
+                "properties": [
+                  { "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }
+                ]
+              }
+            ]
+          },
+          "gridPos": { "h": 10, "w": 12, "x": 12, "y": 0 },
+          "id": 2,
+          "targets": [
+            {
+              "expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}",
+              "legendFormat": "{{prtMarkerSuppliesDescription}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Ink Level History",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 10000 },
+                  { "color": "red", "value": 50000 }
+                ]
+              },
+              "unit": "short"
+            }
+          },
+          "gridPos": { "h": 5, "w": 6, "x": 0, "y": 10 },
+          "id": 3,
+          "options": {
+            "colorMode": "background",
+            "reduceOptions": {
+              "calcs": ["lastNotNull"]
+            },
+            "textMode": "value_and_name"
+          },
+          "targets": [
+            {
+              "expr": "prtMarkerLifeCount{job=\"snmp-printer\"}",
+              "legendFormat": "Pages",
+              "refId": "A"
+            }
+          ],
+          "title": "Lifetime Page Count",
+          "type": "stat"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "1": { "text": "Online" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "thresholds": {
+                "steps": [
+                  { "color": "blue", "value": null }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 5, "w": 6, "x": 6, "y": 10 },
+          "id": 4,
+          "options": {
+            "colorMode": "background",
+            "reduceOptions": {
+              "calcs": ["lastNotNull"]
+            },
+            "textMode": "name"
+          },
+          "targets": [
+            {
+              "expr": "prtGeneralPrinterName{job=\"snmp-printer\"}",
+              "legendFormat": "{{prtGeneralPrinterName}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Printer Model",
+          "type": "stat"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "thresholds": {
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "red", "value": 1 }
+                ]
+              },
+              "unit": "short"
+            }
+          },
+          "gridPos": { "h": 5, "w": 6, "x": 12, "y": 10 },
+          "id": 5,
+          "options": {
+            "colorMode": "background",
+            "reduceOptions": {
+              "calcs": ["lastNotNull"]
+            }
+          },
+          "targets": [
+            {
+              "expr": "prtAlertCriticalEvents{job=\"snmp-printer\"}",
+              "legendFormat": "Critical Alerts",
+              "refId": "A"
+            }
+          ],
+          "title": "Critical Events",
+          "type": "stat"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "thresholds": {
+                "steps": [
+                  { "color": "blue", "value": null }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 5, "w": 6, "x": 18, "y": 10 },
+          "id": 6,
+          "options": {
+            "colorMode": "background",
+            "reduceOptions": {
+              "calcs": ["lastNotNull"]
+            },
+            "textMode": "name"
+          },
+          "targets": [
+            {
+              "expr": "prtGeneralSerialNumber{job=\"snmp-printer\"}",
+              "legendFormat": "{{prtGeneralSerialNumber}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Serial Number",
+          "type": "stat"
+        }
+      ],
+      "refresh": "5m",
+      "schemaVersion": 39,
+      "tags": ["printer", "snmp", "bluejay"],
+      "time": { "from": "now-24h", "to": "now" },
+      "timezone": "browser",
+      "title": "Epson ET-3750 EcoTank Printer",
+      "uid": "epson-ecotank"
+    }
+
+# =============================================================================
+# ConfigMap: Grafana Dashboard — Infrastructure Overview
+# =============================================================================
+# Holds the "Infrastructure Overview" dashboard JSON (key: infra-overview.json,
+# dashboard uid: infra-overview). Rows: AI Stack, K8s Cluster, Network,
+# Services, Alerting. All queries use the datasource with uid "prometheus".
+# - "Nodes Up" appends "or vector(0)" so an outage renders as a red 0 rather
+#   than "No data"; green starts at 1. Raise that step to the expected node
+#   count if a partial outage should not show green.
+# - Tags use "bluejay", matching the BlueJay Operations and printer dashboards,
+#   so one tag filter finds all of them.
+# The JSON body sits inside a literal block scalar; JSON has no comment syntax,
+# so notes belong here in the YAML.
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-infra-overview
+  namespace: monitoring
+data:
+  infra-overview.json: |
+    {
+      "id": null,
+      "panels": [
+        {
+          "collapsed": false,
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+          "id": 100,
+          "title": "AI Stack",
+          "type": "row"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "color": "red", "text": "DOWN" },
+                    "1": { "color": "green", "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
+          "id": 1,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{job=\"probe-ollama-local\"}",
+              "legendFormat": "Status"
+            }
+          ],
+          "title": "Ollama (Local)",
+          "type": "stat"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "color": "red", "text": "DOWN" },
+                    "1": { "color": "green", "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
+          "id": 2,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{job=\"probe-ollama-edge1\"}",
+              "legendFormat": "Status"
+            }
+          ],
+          "title": "Ollama (Edge1)",
+          "type": "stat"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "color": "red", "text": "DOWN" },
+                    "1": { "color": "green", "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
+          "id": 3,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{job=\"probe-agentzero-local\"}",
+              "legendFormat": "Status"
+            }
+          ],
+          "title": "Agent Zero (Local)",
+          "type": "stat"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "color": "red", "text": "DOWN" },
+                    "1": { "color": "green", "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
+          "id": 4,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{job=\"probe-agentzero-nuc\"}",
+              "legendFormat": "Status"
+            }
+          ],
+          "title": "Agent Zero (NUC)",
+          "type": "stat"
+        },
+        {
+          "collapsed": false,
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
+          "id": 101,
+          "title": "K8s Cluster",
+          "type": "row"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 6, "w": 8, "x": 0, "y": 6 },
+          "id": 5,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "count(up{job=\"node-exporter\"} == 1) or vector(0)",
+              "legendFormat": "Nodes Up"
+            }
+          ],
+          "title": "Nodes Up (node-exporter)",
+          "type": "stat"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "fillOpacity": 15, "lineWidth": 2 },
+              "max": 100,
+              "min": 0,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 70 },
+                  { "color": "red", "value": 90 }
+                ]
+              },
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 6, "w": 8, "x": 8, "y": 6 },
+          "id": 6,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) by (instance) * 100)",
+              "legendFormat": "{{ instance }}"
+            }
+          ],
+          "title": "Node CPU Usage %",
+          "type": "timeseries"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "fillOpacity": 15, "lineWidth": 2 },
+              "max": 100,
+              "min": 0,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 70 },
+                  { "color": "red", "value": 90 }
+                ]
+              },
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 6, "w": 8, "x": 16, "y": 6 },
+          "id": 7,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
+              "legendFormat": "{{ instance }}"
+            }
+          ],
+          "title": "Node Memory Usage %",
+          "type": "timeseries"
+        },
+        {
+          "collapsed": false,
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 },
+          "id": 102,
+          "title": "Network",
+          "type": "row"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "fillOpacity": 10, "lineWidth": 2 },
+              "unit": "Bps"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 },
+          "id": 8,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "rate(ifHCInOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])",
+              "legendFormat": "WAN In"
+            },
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "rate(ifHCOutOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])",
+              "legendFormat": "WAN Out"
+            }
+          ],
+          "title": "pfSense WAN Bandwidth",
+          "type": "timeseries"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 },
+          "id": 9,
+          "options": {
+            "showHeader": true,
+            "sortBy": [{ "displayName": "Value", "desc": false }]
+          },
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "up",
+              "format": "table",
+              "instant": true,
+              "legendFormat": ""
+            }
+          ],
+          "title": "Target Health (up)",
+          "transformations": [
+            {
+              "id": "organize",
+              "options": {
+                "excludeByName": { "Time": true, "__name__": true },
+                "renameByName": { "job": "Job", "instance": "Instance", "Value": "Up" }
+              }
+            }
+          ],
+          "type": "table"
+        },
+        {
+          "collapsed": false,
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
+          "id": 103,
+          "title": "Services",
+          "type": "row"
+        },
+        {
+          "gridPos": { "h": 4, "w": 24, "x": 0, "y": 22 },
+          "id": 10,
+          "options": {
+            "content": "### ArgoCD Applications\n\nView sync status and health of all 11 ArgoCD-managed applications:\n\n[Open ArgoCD Dashboard](https://argocd.iamworkin.lan)\n\n| App | Expected Status |\n|-----|----------------|\n| gitea | Healthy / Synced |\n| zabbix | Healthy / Synced |\n| guacamole | Healthy / Synced |\n| irc | Healthy / Synced |\n| mail | Healthy / Synced |\n| matrix | Healthy / Synced |\n| teamspeak | Healthy / Synced |\n| pki-web | Healthy / Synced |\n| intranet | Healthy / Synced |\n| telephony | Healthy / Synced |\n| infra-noc-services | Healthy / Synced |",
+            "mode": "markdown"
+          },
+          "title": "ArgoCD App Status",
+          "type": "text"
+        },
+        {
+          "collapsed": false,
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 },
+          "id": 104,
+          "title": "Alerting",
+          "type": "row"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "orange", "value": 1 },
+                  { "color": "red", "value": 3 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 6, "w": 24, "x": 0, "y": 27 },
+          "id": 11,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "count(ALERTS{alertstate=\"firing\"}) or vector(0)",
+              "legendFormat": "Firing Alerts"
+            }
+          ],
+          "title": "Firing Alerts",
+          "type": "stat"
+        }
+      ],
+      "refresh": "30s",
+      "schemaVersion": 39,
+      "tags": ["infrastructure", "bluejay", "overview"],
+      "time": { "from": "now-1h", "to": "now" },
+      "timezone": "browser",
+      "title": "Infrastructure Overview",
+      "uid": "infra-overview",
+      "version": 1
+    }
+
+# =============================================================================
+# ConfigMap: Grafana Datasource Provisioning
+# =============================================================================
+# Provisions the in-cluster Prometheus as Grafana's default datasource.
+# The explicit uid is required: the provisioned dashboards and alert rules in
+# this file reference the datasource by uid ("prometheus"). Without it Grafana
+# generates a random uid and those references do not resolve.
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-datasource-provisioning
+  namespace: monitoring
+data:
+  datasource.yml: |
+    apiVersion: 1
+    datasources:
+      - name: Prometheus
+        type: prometheus
+        uid: prometheus
+        access: proxy
+        url: http://prometheus.monitoring.svc:9090
+        isDefault: true
+        editable: true
+
+# =============================================================================
+# ConfigMap: Grafana Alerting Provisioning (contact points + policies + rules)
+# =============================================================================
+# Makes alert rules declarative — survives pod rebuilds without API recreation
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-alerting-provisioning
+  namespace: monitoring
+data:
+  alerting.yml: |
+    apiVersion: 1
+    contactPoints:
+      - orgId: 1
+        # Quoted on purpose: in a plain scalar, " #" starts a YAML comment, so
+        # an unquoted value would be read as just "IRC".
+        name: "IRC #alerts"
+        receivers:
+          - uid: irc-alerts-webhook
+            type: webhook
+            settings:
+              url: http://irc-notify.monitoring.svc:9119
+              httpMethod: POST
+            disableResolveMessage: false
+      - orgId: 1
+        name: Thermal Printer
+        receivers:
+          - uid: thermal-print-001
+            type: webhook
+            settings:
+              # NOTE(review): same endpoint as the IRC receiver — confirm the
+              # thermal printer is really meant to go through irc-notify.
+              url: http://irc-notify.monitoring.svc:9119
+              httpMethod: POST
+            disableResolveMessage: true
+    policies:
+      - orgId: 1
+        # Must equal the contact point name above, quoting included.
+        receiver: "IRC #alerts"
+        group_by: ['alertname']
+        group_wait: 30s
+        group_interval: 5m
+        repeat_interval: 1h
+        routes:
+          - receiver: Thermal Printer
+            matchers: ['alert_channel = thermal_print']
+            group_wait: 1m
+            group_interval: 10m
+            repeat_interval: 4h
+            continue: true
+    groups:
+      - orgId: 1
+        name: AI Stack
+        folder: AI Stack Alerts
+        interval: 1m
+        rules:
+          - uid: ollama-down-local
+            title: Ollama DOWN (Local)
+            condition: C
+            for: 2m
+            noDataState: Alerting
+            execErrState: OK
+            annotations:
+              summary: Ollama DOWN on workstation (R9700)
+              description: Agent Zero LOCAL cannot reach Ollama. FAISS memory will fail.
+              runbook: "1. Check PC power/wake 2. ssh stoltz@10.0.58.100 3. systemctl status ollama 4. journalctl -u ollama --since 5min"
+            labels:
+              severity: warning
+              service: ollama
+            data:
+              - refId: A
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: prometheus
+                model: {expr: 'probe_success{job="probe-ollama-local"}', instant: true, refId: A}
+              - refId: B
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
+          - uid: ollama-down-edge1
+            title: Ollama DOWN (Edge1)
+            condition: C
+            for: 2m
+            noDataState: Alerting
+            execErrState: OK
+            annotations:
+              summary: Ollama DOWN on edge1 Pi 5
+              description: Agent Zero NUC cannot reach Ollama.
+              runbook: "1. ssh stoltz@10.0.57.17 2. systemctl status ollama 3. journalctl -u ollama --since 5min 4. cat /sys/class/thermal/thermal_zone0/temp"
+            labels:
+              severity: warning
+              service: ollama
+            data:
+              - refId: A
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: prometheus
+                model: {expr: 'probe_success{job="probe-ollama-edge1"}', instant: true, refId: A}
+              - refId: B
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
+          - uid: a0-down-local
+            title: Agent Zero DOWN (Local)
+            condition: C
+            for: 2m
+            noDataState: Alerting
+            execErrState: OK
+            annotations:
+              summary: Agent Zero LOCAL DOWN
+              description: K3s web UI unreachable.
+              runbook: "1. ssh stoltz@10.0.58.100 2. docker ps | grep agent-zero 3. docker logs agent-zero --tail 20 4. Check Ollama first (A0 crashes if Ollama down)"
+            labels:
+              severity: warning
+              service: agent-zero
+            data:
+              - refId: A
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: prometheus
+                model: {expr: 'probe_success{job="probe-agentzero-local"}', instant: true, refId: A}
+              - refId: B
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
+          - uid: a0-down-nuc
+            title: Agent Zero DOWN (NUC)
+            condition: C
+            for: 2m
+            noDataState: Alerting
+            execErrState: OK
+            annotations:
+              summary: Agent Zero NUC DOWN
+              description: RKE2 web UI unreachable.
+              runbook: "1. ssh root@10.0.56.200 via noc1 2. kubectl get pods -n agent-zero 3. kubectl logs -n agent-zero deploy/agent-zero --tail 20"
+            labels:
+              severity: warning
+              service: agent-zero
+            data:
+              - refId: A
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: prometheus
+                model: {expr: 'probe_success{job="probe-agentzero-nuc"}', instant: true, refId: A}
+              - refId: B
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
+      - orgId: 1
+        name: Infrastructure
+        folder: AI Stack Alerts
+        interval: 1m
+        rules:
+          - uid: node-down
+            title: Node DOWN
+            condition: C
+            for: 2m
+            noDataState: Alerting
+            execErrState: OK
+            annotations:
+              summary: Node down
+              description: Node exporter unreachable for 2 minutes. Host may be down or network issue.
+              runbook: "1. Ping host IP 2. SSH via noc1 jumpbox (root@10.0.56.10) 3. systemctl status node_exporter 4. Check network/VLAN 5. Physical power check if unreachable"
+            labels:
+              severity: critical
+              service: infrastructure
+              alert_channel: thermal_print
+            data:
+              - refId: A
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: prometheus
+                model: {expr: 'up{job=~"node-exporter|rke2-nodes|edge-nodes"}', instant: true, refId: A}
+              - refId: B
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 120, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
+          - uid: high-cpu
+            title: High CPU (>85%)
+            condition: C
+            for: 10m
+            noDataState: NoData
+            execErrState: OK
+            annotations:
+              summary: High CPU
+              description: CPU above 85% for 10 minutes. Performance degradation likely.
+              runbook: "1. SSH to host 2. top -b -n1 | head -20 3. Check for runaway process 4. kubectl top pods (if K8s node)"
+            labels:
+              severity: warning
+              service: infrastructure
+            data:
+              - refId: A
+                relativeTimeRange: {from: 600, to: 0}
+                datasourceUid: prometheus
+                model: {expr: 'instance:node_cpu_usage:avg5m', instant: true, refId: A}
+              - refId: B
+                relativeTimeRange: {from: 600, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 600, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
+          - uid: high-memory
+            title: High Memory (>90%)
+            condition: C
+            for: 5m
+            noDataState: NoData
+            execErrState: OK
+            annotations:
+              summary: High memory usage
+              description: Memory above 90% for 5 minutes. OOM kills imminent.
+              runbook: "1. SSH to host 2. free -h 3. ps aux --sort=-%mem | head 10 4. Check for memory leak (growing RSS)"
+            labels:
+              severity: warning
+              service: infrastructure
+            data:
+              - refId: A
+                relativeTimeRange: {from: 600, to: 0}
+                datasourceUid: prometheus
+                model: {expr: 'instance:node_memory_usage:percent', instant: true, refId: A}
+              - refId: B
+                relativeTimeRange: {from: 600, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 600, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [90], type: gt}}], refId: C}
+          - uid: disk-low
+            title: Disk Space Low (>85%)
+            condition: C
+            for: 10m
+            noDataState: NoData
+            execErrState: OK
+            annotations:
+              summary: Disk usage high
+              description: Root disk above 85% for 10 minutes. Service disruption if full.
+              runbook: "1. SSH to host 2. df -h / 3. du -sh /* | sort -rh | head 4. journalctl --vacuum-size=100M 5. podman system prune"
+            labels:
+              severity: warning
+              service: infrastructure
+            data:
+              - refId: A
+                relativeTimeRange: {from: 600, to: 0}
+                datasourceUid: prometheus
+                model: {expr: 'instance:node_disk_usage:percent', instant: true, refId: A}
+              - refId: B
+                relativeTimeRange: {from: 600, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 600, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
+
+# =============================================================================
+# Deployment: Grafana
+# =============================================================================
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  namespace: monitoring
+  name: grafana
+  labels: {app: grafana}
+spec:
+  # One replica, replaced stop-then-start (Recreate) instead of rolled.
+  replicas: 1
+  strategy: {type: Recreate}
+  selector:
+    matchLabels: {app: grafana}
+  template:
+    metadata:
+      labels: {app: grafana}
+    spec:
+      # Process runs as UID/GID 472; fsGroup applies the same GID to volumes.
+      securityContext:
+        runAsUser: 472
+        runAsGroup: 472
+        fsGroup: 472  # grafana group
+      volumes:
+        # Persistent Grafana state
+        - name: data
+          persistentVolumeClaim: {claimName: grafana-data}
+        # Dashboard provider definition
+        - name: dashboard-provider
+          configMap: {name: grafana-dashboard-provider}
+        # One ConfigMap per dashboard folder
+        - name: dashboards-ai-stack
+          configMap: {name: grafana-dashboards}
+        - name: dashboards-edge-nodes
+          configMap: {name: grafana-dashboard-edge-nodes}
+        - name: dashboards-network
+          configMap: {name: grafana-dashboard-network-overview}
+        - name: dashboards-operations
+          configMap: {name: grafana-dashboard-operations}
+        - name: dashboards-printer
+          configMap: {name: grafana-dashboard-printer}
+        - name: dashboards-infra-overview
+          configMap: {name: grafana-dashboard-infra-overview}
+        # Datasource and alert-rule provisioning
+        - name: datasource-provisioning
+          configMap: {name: grafana-datasource-provisioning}
+        - name: alerting-provisioning
+          configMap: {name: grafana-alerting-provisioning}
+      containers:
+        - name: grafana
+          image: docker.io/grafana/grafana:latest
+          ports:
+            - name: http
+              containerPort: 3000
+          env:
+            # Admin login is read from Secret "grafana-credentials", which the
+            # 1Password Operator creates from a OnePasswordItem.
+            - name: GF_SECURITY_ADMIN_USER
+              valueFrom:
+                secretKeyRef:
+                  key: username
+                  name: grafana-credentials
+            - name: GF_SECURITY_ADMIN_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  key: password
+                  name: grafana-credentials
+            - name: GF_SERVER_ROOT_URL
+              value: "https://grafana.iamworkin.lan"
+            - name: GF_SERVER_SERVE_FROM_SUB_PATH
+              value: "false"
+            # GF_INSTALL_PLUGINS is deliberately not set: it needs internet
+            # access at startup, which breaks under a restrictive
+            # NetworkPolicy. If the Zabbix plugin is needed, install it by
+            # hand after first boot:
+            # kubectl exec -n monitoring deploy/grafana -- grafana cli plugins install alexanderzobnin-zabbix-app
+          resources:
+            limits:
+              cpu: 500m
+              memory: 512Mi
+            requests:
+              cpu: 100m
+              memory: 128Mi
+          # Both probes hit Grafana's health endpoint on the HTTP port.
+          readinessProbe:
+            httpGet: {path: /api/health, port: 3000}
+            initialDelaySeconds: 10
+            periodSeconds: 10
+          livenessProbe:
+            httpGet: {path: /api/health, port: 3000}
+            initialDelaySeconds: 30
+            periodSeconds: 30
+          volumeMounts:
+            - name: data
+              mountPath: /var/lib/grafana
+            # Everything below is provisioning content and is mounted read-only.
+            - name: dashboard-provider
+              readOnly: true
+              mountPath: /etc/grafana/provisioning/dashboards
+            - name: dashboards-ai-stack
+              readOnly: true
+              mountPath: /var/lib/grafana/dashboards/ai-stack
+            - name: dashboards-edge-nodes
+              readOnly: true
+              mountPath: /var/lib/grafana/dashboards/edge-nodes
+            - name: dashboards-network
+              readOnly: true
+              mountPath: /var/lib/grafana/dashboards/network
+            - name: dashboards-operations
+              readOnly: true
+              mountPath: /var/lib/grafana/dashboards/operations
+            - name: dashboards-printer
+              readOnly: true
+              mountPath: /var/lib/grafana/dashboards/printer
+            - name: dashboards-infra-overview
+              readOnly: true
+              mountPath: /var/lib/grafana/dashboards/infra-overview
+            - name: datasource-provisioning
+              readOnly: true
+              mountPath: /etc/grafana/provisioning/datasources
+            - name: alerting-provisioning
+              readOnly: true
+              mountPath: /etc/grafana/provisioning/alerting
+
+# =============================================================================
+# Deployment: Blackbox Exporter
+# =============================================================================
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  namespace: monitoring
+  name: blackbox-exporter
+  labels: {app: blackbox-exporter}
+spec:
+  replicas: 1
+  selector:
+    matchLabels: {app: blackbox-exporter}
+  template:
+    metadata:
+      labels: {app: blackbox-exporter}
+    spec:
+      volumes:
+        # Probe module definitions come from the blackbox-config ConfigMap.
+        - name: config
+          configMap: {name: blackbox-config}
+      containers:
+        - name: blackbox-exporter
+          image: quay.io/prometheus/blackbox-exporter:latest
+          args:
+            - "--config.file=/config/blackbox.yml"
+          ports:
+            - name: http
+              containerPort: 9115
+          volumeMounts:
+            # Only the single blackbox.yml key is projected (subPath mount).
+            - name: config
+              readOnly: true
+              subPath: blackbox.yml
+              mountPath: /config/blackbox.yml
+          resources:
+            limits:
+              cpu: 200m
+              memory: 128Mi
+            requests:
+              cpu: 50m
+              memory: 32Mi
+          # Readiness and liveness both request "/" on the exporter port.
+          readinessProbe:
+            httpGet: {path: /, port: 9115}
+            initialDelaySeconds: 3
+            periodSeconds: 10
+          livenessProbe:
+            httpGet: {path: /, port: 9115}
+            initialDelaySeconds: 5
+            periodSeconds: 30
+
+# =============================================================================
+# PVC: SNMP Exporter Config (100Mi, Longhorn)
+# =============================================================================
+# The custom snmp.yml (~2MB) exceeds the 1MB ConfigMap limit.
+# This PVC stores the config file. To load a custom config:
+#   kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
+# Then restart the pod to pick up the new config.
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  namespace: monitoring
+  name: snmp-config
+spec:
+  # 100Mi Longhorn volume holding snmp.yml; single-node read-write access.
+  storageClassName: longhorn
+  accessModes: [ReadWriteOnce]
+  resources:
+    requests: {storage: 100Mi}
+
+# =============================================================================
+# Deployment: SNMP Exporter
+# =============================================================================
+# Uses a PVC-mounted config at /config/snmp.yml. An init container copies the
+# default config from the image if the PVC has no usable snmp.yml (first deploy).
+# To load the custom noc1 snmp.yml (~2MB):
+#   kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
+# Then restart: kubectl rollout restart deploy/snmp-exporter -n monitoring
+#
+# Two settings exist to keep that procedure working:
+#   - /config is mounted read-write in the snmp-exporter container, because
+#     kubectl cp writes through that container's filesystem.
+#   - strategy is Recreate (same as the Grafana Deployment): the snmp-config
+#     PVC is ReadWriteOnce, so during a rolling update the replacement pod
+#     could wait indefinitely for the volume if it lands on a different node.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: snmp-exporter
+  namespace: monitoring
+  labels:
+    app: snmp-exporter
+spec:
+  replicas: 1
+  # Stop the old pod before starting the new one so the RWO volume is free.
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: snmp-exporter
+  template:
+    metadata:
+      labels:
+        app: snmp-exporter
+    spec:
+      initContainers:
+        # Seed the PVC with the image's default snmp.yml when no non-empty
+        # config exists yet. "-s" (exists AND size > 0) is used instead of
+        # "-f" so a zero-byte file left by an interrupted copy is replaced
+        # rather than handed to the exporter.
+        - name: init-config
+          image: docker.io/prom/snmp-exporter:latest
+          command:
+            - sh
+            - -c
+            - |
+              if [ ! -s /config/snmp.yml ]; then
+                echo "No custom config found, copying default from image..."
+                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
+                echo "Default snmp.yml copied to PVC."
+              else
+                echo "Custom snmp.yml already exists on PVC, skipping copy."
+              fi
+          volumeMounts:
+            - name: snmp-config
+              mountPath: /config
+      containers:
+        - name: snmp-exporter
+          image: docker.io/prom/snmp-exporter:latest
+          args:
+            - "--config.file=/config/snmp.yml"
+          ports:
+            - containerPort: 9116
+              name: http
+          volumeMounts:
+            # Intentionally NOT readOnly: the documented "kubectl cp" upload
+            # targets this container and fails on a read-only mount.
+            - name: snmp-config
+              mountPath: /config
+          resources:
+            requests:
+              cpu: 50m
+              memory: 64Mi
+            limits:
+              cpu: 200m
+              memory: 256Mi
+          livenessProbe:
+            httpGet:
+              path: /
+              port: 9116
+            initialDelaySeconds: 5
+            periodSeconds: 30
+          readinessProbe:
+            httpGet:
+              path: /
+              port: 9116
+            initialDelaySeconds: 3
+            periodSeconds: 10
+      volumes:
+        - name: snmp-config
+          persistentVolumeClaim:
+            claimName: snmp-config
+
+# =============================================================================
+# Deployment: IRC Notify (alert relay)
+# =============================================================================
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  namespace: monitoring
+  name: irc-notify
+  labels: {app: irc-notify}
+spec:
+  replicas: 1
+  selector:
+    matchLabels: {app: irc-notify}
+  template:
+    metadata:
+      labels: {app: irc-notify}
+    spec:
+      volumes:
+        # The relay script is shipped in the irc-notify-script ConfigMap.
+        - name: script
+          configMap: {name: irc-notify-script}
+      containers:
+        # Stock Python image running the mounted notify.py directly.
+        - name: irc-notify
+          image: docker.io/library/python:3.12-slim
+          command:
+            - "python3"
+            - "/app/notify.py"
+          ports:
+            - name: http
+              containerPort: 9119
+          volumeMounts:
+            - name: script
+              readOnly: true
+              subPath: notify.py
+              mountPath: /app/notify.py
+          resources:
+            limits:
+              cpu: 100m
+              memory: 64Mi
+            requests:
+              cpu: 25m
+              memory: 32Mi
+          # Probes only check that the listener port accepts TCP connections.
+          readinessProbe:
+            tcpSocket: {port: 9119}
+            initialDelaySeconds: 3
+            periodSeconds: 10
+          livenessProbe:
+            tcpSocket: {port: 9119}
+            initialDelaySeconds: 5
+            periodSeconds: 30
+
+# =============================================================================
+# DaemonSet: Node Exporter (runs on every RKE2 node)
+# =============================================================================
+# Port 9101 avoids conflict with host-level node-exporters already on :9100.
+# The rke2-nodes Prometheus job scrapes the host instances on :9100; this
+# DaemonSet provides K8s service-discovery-based scraping on :9101.
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  namespace: monitoring
+  name: node-exporter
+  labels: {app: node-exporter}
+spec:
+  selector:
+    matchLabels: {app: node-exporter}
+  # Replace exporter pods one node at a time.
+  updateStrategy:
+    type: RollingUpdate
+    rollingUpdate: {maxUnavailable: 1}
+  template:
+    metadata:
+      labels: {app: node-exporter}
+    spec:
+      # Shares the host's network and PID namespaces.
+      hostNetwork: true
+      hostPID: true
+      # A key-less "Exists" toleration matches every taint.
+      tolerations:
+        - operator: Exists
+      securityContext:
+        runAsUser: 0
+        runAsNonRoot: false
+      volumes:
+        # Host filesystem views consumed by the --path.* flags below.
+        - name: rootfs
+          hostPath: {path: /}
+        - name: proc
+          hostPath: {path: /proc}
+        - name: sys
+          hostPath: {path: /sys}
+      containers:
+        - name: node-exporter
+          image: docker.io/prom/node-exporter:latest
+          args:
+            - "--path.rootfs=/host"
+            - "--path.sysfs=/host/sys"
+            - "--path.procfs=/host/proc"
+            - "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/containers|run/containers|var/lib/rancher|var/lib/longhorn)($|/)"
+            - "--collector.netclass.ignored-devices=^(veth|cali|flannel|cni).*$"
+            - "--no-collector.btrfs"
+            # 9101, not 9100: the host-level exporters already own :9100.
+            - "--web.listen-address=:9101"
+          ports:
+            - name: metrics
+              containerPort: 9101
+              hostPort: 9101
+          securityContext:
+            readOnlyRootFilesystem: true
+            privileged: true
+          resources:
+            limits:
+              cpu: 200m
+              memory: 128Mi
+            requests:
+              cpu: 50m
+              memory: 32Mi
+          volumeMounts:
+            # All host paths are mounted read-only.
+            - name: rootfs
+              readOnly: true
+              mountPropagation: HostToContainer
+              mountPath: /host
+            - name: proc
+              readOnly: true
+              mountPath: /host/proc
+            - name: sys
+              readOnly: true
+              mountPath: /host/sys
+
+# =============================================================================
+# Service: Prometheus (ClusterIP :9090)
+# =============================================================================
+---
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: monitoring
+  name: prometheus
+  labels: {app: prometheus}
+spec:
+  # Cluster-internal endpoint for pods labelled app=prometheus.
+  type: ClusterIP
+  selector: {app: prometheus}
+  ports:
+    - name: http
+      protocol: TCP
+      port: 9090
+      targetPort: 9090
+
+# =============================================================================
+# Service: Grafana (ClusterIP :3000)
+# =============================================================================
+---
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: monitoring
+  name: grafana
+  labels: {app: grafana}
+spec:
+  # Cluster-internal endpoint for pods labelled app=grafana.
+  type: ClusterIP
+  selector: {app: grafana}
+  ports:
+    - name: http
+      protocol: TCP
+      port: 3000
+      targetPort: 3000
+
+# =============================================================================
+# Service: Blackbox Exporter (ClusterIP :9115)
+# =============================================================================
+---
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: monitoring
+  name: blackbox-exporter
+  labels: {app: blackbox-exporter}
+spec:
+  # Cluster-internal endpoint for pods labelled app=blackbox-exporter.
+  type: ClusterIP
+  selector: {app: blackbox-exporter}
+  ports:
+    - name: http
+      protocol: TCP
+      port: 9115
+      targetPort: 9115
+
+# =============================================================================
+# Service: SNMP Exporter (ClusterIP :9116)
+# =============================================================================
+---
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: monitoring
+  name: snmp-exporter
+  labels: {app: snmp-exporter}
+spec:
+  # Cluster-internal endpoint for pods labelled app=snmp-exporter.
+  type: ClusterIP
+  selector: {app: snmp-exporter}
+  ports:
+    - name: http
+      protocol: TCP
+      port: 9116
+      targetPort: 9116
+
+# =============================================================================
+# Service: Node Exporter (headless for Prometheus SD)
+# =============================================================================
+---
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: monitoring
+  name: node-exporter
+  labels: {app: node-exporter}
+spec:
+  # Headless (clusterIP: None): no virtual IP is allocated, so lookups
+  # resolve to the individual node-exporter pod endpoints.
+  type: ClusterIP
+  clusterIP: None
+  selector: {app: node-exporter}
+  ports:
+    - name: metrics
+      protocol: TCP
+      port: 9101
+      targetPort: 9101
+
+# =============================================================================
+# Service: IRC Notify (ClusterIP :9119)
+# =============================================================================
+---
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: monitoring
+  name: irc-notify
+  labels: {app: irc-notify}
+spec:
+  # Cluster-internal endpoint for pods labelled app=irc-notify.
+  type: ClusterIP
+  selector: {app: irc-notify}
+  ports:
+    - name: http
+      protocol: TCP
+      port: 9119
+      targetPort: 9119
+
+# =============================================================================
+# TLS Certificates (cert-manager + step-ca ACME)
+# =============================================================================
+---
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  namespace: monitoring
+  name: grafana-tls
+spec:
+  # Issued by the step-ca-acme ClusterIssuer; key pair is written to the
+  # grafana-tls Secret.
+  dnsNames:
+    - grafana.iamworkin.lan
+  issuerRef:
+    kind: ClusterIssuer
+    name: step-ca-acme
+  secretName: grafana-tls
+---
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  namespace: monitoring
+  name: prometheus-tls
+spec:
+  # Issued by the step-ca-acme ClusterIssuer; key pair is written to the
+  # prometheus-tls Secret.
+  dnsNames:
+    - prometheus.iamworkin.lan
+  issuerRef:
+    kind: ClusterIssuer
+    name: step-ca-acme
+  secretName: prometheus-tls
+
+# =============================================================================
+# Traefik IngressRoute: Grafana
+# =============================================================================
+---
+apiVersion: traefik.io/v1alpha1
+kind: IngressRoute
+metadata:
+  namespace: monitoring
+  name: grafana
+spec:
+  # HTTPS entry point only, terminated with the grafana-tls Secret.
+  entryPoints: [websecure]
+  tls:
+    secretName: grafana-tls
+  routes:
+    # Host-based rule forwarding to the grafana Service on port 3000.
+    - match: Host(`grafana.iamworkin.lan`)
+      kind: Rule
+      services:
+        - name: grafana
+          port: 3000
+
+# =============================================================================
+# Traefik IngressRoute: Prometheus
+# =============================================================================
+---
+apiVersion: traefik.io/v1alpha1
+kind: IngressRoute
+metadata:
+  namespace: monitoring
+  name: prometheus
+spec:
+  # HTTPS entry point only, terminated with the prometheus-tls Secret.
+  entryPoints: [websecure]
+  tls:
+    secretName: prometheus-tls
+  routes:
+    # Host-based rule forwarding to the prometheus Service on port 9090.
+    - match: Host(`prometheus.iamworkin.lan`)
+      kind: Rule
+      services:
+        - name: prometheus
+          port: 9090
+
+# =============================================================================
+# NetworkPolicy: monitoring namespace
+# =============================================================================
+---
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  namespace: monitoring
+  name: monitoring-netpol
+spec:
+  # Empty selector: the policy covers every pod in the namespace, in both
+  # directions.
+  podSelector: {}
+  policyTypes: [Ingress, Egress]
+  ingress:
+    # All-port ingress sources, expressed as one rule with three peers
+    # (peers within a rule are OR-ed, exactly like separate rules).
+    - from:
+        # Traefik (IngressRoutes AND ACME solver pods)
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: traefik-system
+        # Intra-namespace (prometheus→exporters, grafana→prometheus, grafana→irc-notify)
+        - podSelector: {}
+        # cert-manager (ACME HTTP-01 self-check)
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: cert-manager
+  egress:
+    # DNS to any namespace, UDP and TCP
+    - ports:
+        - {port: 53, protocol: UDP}
+        - {port: 53, protocol: TCP}
+      to:
+        - namespaceSelector: {}
+    # Internal VLANs, all ports
+    - to:
+        # MGMT VLAN (noc1, pfSense, switches, SNMP, node-exporter)
+        - ipBlock:
+            cidr: 10.0.56.0/24
+        # PROD VLAN (edge nodes)
+        - ipBlock:
+            cidr: 10.0.57.0/24
+        # HOME VLAN (workstation, printer, NAS)
+        - ipBlock:
+            cidr: 10.0.58.0/24
+    # Intra-namespace
+    - to:
+        - podSelector: {}
+    # Blackbox probes to other namespaces (agent-zero, etc)
+    - ports:
+        - {port: 80, protocol: TCP}
+      to:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: agent-zero
+    # IRC (irc-notify → UnrealIRCd in irc namespace via K8s DNS)
+    - ports:
+        - {port: 6667, protocol: TCP}
+        - {port: 6697, protocol: TCP}
+      to:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: irc
+    # Step-CA ACME (cert renewal)
+    - ports:
+        - {port: 9443, protocol: TCP}
+      to:
+        - ipBlock:
+            cidr: 10.0.56.10/32
+    # Internet (optional: Grafana plugin install, ACME); private ranges are
+    # carved out so this rule opens nothing internal.
+    - to:
+        - ipBlock:
+            cidr: 0.0.0.0/0
+            except:
+              - 10.0.0.0/8
+              - 172.16.0.0/12
+              - 192.168.0.0/16
+
+# =============================================================================
+# Job: SNMP Config Loader (ArgoCD PostSync hook)
+# =============================================================================
+# Runs after every successful ArgoCD sync (PostSync hook) to refresh the SNMP
+# config PVC. Attempts to download the custom snmp.yml from noc1:
+#   - download OK     → the PVC copy is replaced atomically with the new file
+#   - download fails  → whatever snmp.yml is already on the PVC is kept
+#   - no config at all → the default bundled in the snmp-exporter image is
+#     copied in
+# The running snmp-exporter only reads a changed file after a restart
+# (kubectl rollout restart deploy/snmp-exporter -n monitoring).
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: snmp-config-loader
+  namespace: monitoring
+  annotations:
+    argocd.argoproj.io/hook: PostSync
+    argocd.argoproj.io/hook-delete-policy: HookSucceeded
+spec:
+  backoffLimit: 0
+  template:
+    metadata:
+      labels:
+        app: snmp-config-loader
+    spec:
+      restartPolicy: Never
+      # The snmp-config PVC is ReadWriteOnce and is normally already attached
+      # to the node running snmp-exporter. Schedule this pod onto that same
+      # node so the volume can be mounted instead of waiting on a
+      # multi-attach error.
+      affinity:
+        podAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchLabels:
+                  app: snmp-exporter
+              topologyKey: kubernetes.io/hostname
+      # Makes the volume group-writable for every container in this pod.
+      # NOTE(review): curlimages/curl is expected to run as a non-root user,
+      # which otherwise cannot write to a root-owned volume — confirm.
+      securityContext:
+        fsGroup: 65534
+      initContainers:
+        # Try to download custom snmp.yml from noc1
+        - name: download-config
+          image: docker.io/curlimages/curl:latest
+          command:
+            - sh
+            - -c
+            - |
+              # Download to a scratch file and rename only on success, so a
+              # failed or empty download never damages the config in use.
+              tmp=/config/.snmp.yml.download
+              echo "Attempting to download custom snmp.yml from noc1..."
+              if curl -sf --connect-timeout 10 --max-time 30 \
+                   http://10.0.56.10:9116/config -o "$tmp" 2>/dev/null \
+                 && [ -s "$tmp" ]; then
+                mv -f "$tmp" /config/snmp.yml
+                echo "Custom snmp.yml downloaded from noc1 successfully."
+              else
+                echo "Download failed or empty, keeping existing snmp.yml (default from image is used only if none exists)."
+                rm -f "$tmp"
+              fi
+          volumeMounts:
+            - name: snmp-config
+              mountPath: /config
+      containers:
+        # If the PVC still has no usable config, copy the default from the image
+        - name: fallback-default
+          image: docker.io/prom/snmp-exporter:latest
+          command:
+            - sh
+            - -c
+            - |
+              if [ -f /config/snmp.yml ] && [ -s /config/snmp.yml ]; then
+                echo "Custom config already present, nothing to do."
+              else
+                echo "Copying default snmp.yml from image to PVC..."
+                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
+                echo "Default config copied."
+              fi
+              echo "SNMP config loader complete."
+          volumeMounts:
+            - name: snmp-config
+              mountPath: /config
+      volumes:
+        - name: snmp-config
+          persistentVolumeClaim:
+            claimName: snmp-config