# bluejay-infra/apps/monitoring/noc-monitoring.yaml

# =============================================================================
# NOC Monitoring Stack — K8s Migration Target
# =============================================================================
# Migrates the noc1 Podman monitoring pod to RKE2 K8s.
# Source: noc1 (10.0.56.10) /opt/monitoring/
#
# Components:
#   - Prometheus (metrics, alerting)
#   - Grafana (dashboards)
#   - Blackbox Exporter (HTTP probes)
#   - SNMP Exporter (network device metrics)
#   - Node Exporter (host metrics, DaemonSet)
#   - IRC Notify (alert relay to UnrealIRCd)
#
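# Note: SNMP exporter config (snmp.yml) is ~2MB, exceeding the 1 MiB ConfigMap
# size limit, so it is not embedded in this file. It is stored in a separate
# file (snmp-config.yaml) and must be applied as a standalone ConfigMap or
# mounted via an init container that downloads it from Gitea. The 1 MiB limit
# applies to every ConfigMap, so the standalone-ConfigMap route only works if
# the file is first trimmed or split to fit under it.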
# =============================================================================

---
# Namespace that scopes the monitoring stack; the prometheus-config ConfigMap
# in this file is created inside it.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring

# =============================================================================
# ConfigMap: Prometheus Configuration
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    # Defaults for every scrape job and rule group; individual jobs and
    # groups override them with their own scrape_interval / interval.
    global:
      scrape_interval: 30s
      evaluation_interval: 30s

    # Rule files loaded by Prometheus. The file names match the alerts.yml and
    # recording-rules.yml keys of this ConfigMap (presumably mounted at
    # /etc/prometheus — confirm against the Prometheus Deployment).
    rule_files:
      - /etc/prometheus/alerts.yml
      - /etc/prometheus/recording-rules.yml

    scrape_configs:
      # noc1 host metrics (external to cluster)
      - job_name: "node-exporter"
        static_configs:
          - targets: ["10.0.56.10:9100"]
            labels:
              # Explicit instance label, so series are keyed by host name
              # rather than the default host:port target address.
              instance: "noc1"
              vlan: "mgmt"

      # RKE2 bare-metal cluster (openSUSE Leap 16 on NUCs)
      - job_name: "rke2-nodes"
        # Per-scrape timeout for this job; it cannot exceed the 30s global
        # scrape_interval this job inherits.
        scrape_timeout: 15s
        # One static_configs entry per node so each target carries its own
        # instance and role labels.
        static_configs:
          - targets: ["10.0.56.11:9100"]
            labels:
              instance: "rke2-server"
              vlan: "mgmt"
              cluster: "rke2"
              role: "server"
          - targets: ["10.0.56.12:9100"]
            labels:
              instance: "rke2-agent1"
              vlan: "mgmt"
              cluster: "rke2"
              role: "agent"
          - targets: ["10.0.56.13:9100"]
            labels:
              instance: "rke2-agent2"
              vlan: "mgmt"
              cluster: "rke2"
              role: "agent"

      # Mac mini macOS runner node (INFRA VLAN)
      - job_name: "macmini-node"
        # Per-scrape timeout for this job (global scrape_interval is 30s).
        scrape_timeout: 15s
        static_configs:
          - targets: ["10.0.56.115:9100"]
            labels:
              instance: "macmini"
              host: "macmini.iamworkin.lan"
              # NOTE(review): the other 10.0.56.x targets in this file are
              # labelled vlan "mgmt" — confirm "infra" is intended here.
              vlan: "infra"
              arch: "arm64"
              role: "macos-runner"
              # Quoted so the value stays a string; label values must be
              # strings.
              puppet_managed: "true"
              puppet_server: "puppet.iamworkin.lan"

      # In-cluster node-exporter DaemonSet
      - job_name: "k8s-node-exporter"
        # Discover Endpoints objects, restricted to the monitoring namespace.
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names: ["monitoring"]
        relabel_configs:
          # Keep only the Endpoints object named node-exporter; every other
          # discovered endpoint in the namespace is dropped.
          - source_labels: [__meta_kubernetes_endpoints_name]
            action: keep
            regex: node-exporter
          # Use the hosting node's name as the instance label.
          - source_labels: [__meta_kubernetes_endpoint_node_name]
            target_label: instance

      # pfSense SNMP via snmp-exporter
      - job_name: "snmp-pfsense"
        static_configs:
          - targets: ["10.0.56.1"]
        metrics_path: /snmp
        params:
          module: [if_mib]
          auth: [bluejay_v2]
        # The listed target is the SNMP device, not the scrape endpoint: its
        # address is copied into the ?target= query parameter and the instance
        # label, then the scrape itself is sent to the snmp-exporter Service.
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: snmp-exporter.monitoring.svc:9116

      # UniFi Cloud Key SNMP — DISABLED 2026-04-26
      # The Cloud Key Gen2+ runs unifi-core (controller) only — not a network
      # device — and does NOT run an SNMP agent on UDP/161. Scrapes were
      # silently failing with "connection refused" from 10.42.x.x:161 every
      # 30s, polluting up{} = 0 and lastError on the Targets page. Hardware
      # health (CPU/mem/disk) for the Cloud Key host should come from
      # node_exporter via SSH — not SNMP.
      # - job_name: "snmp-cloudkey"
      #   static_configs:
      #     - targets: ["10.0.56.3"]
      #   metrics_path: /snmp
      #   params:
      #     module: [if_mib]
      #     auth: [bluejay_v2]
      #   relabel_configs:
      #     - source_labels: [__address__]
      #       target_label: __param_target
      #     - source_labels: [__param_target]
      #       target_label: instance
      #     - target_label: __address__
      #       replacement: snmp-exporter.monitoring.svc:9116

      # UniFi Switch SNMP
      - job_name: "snmp-switch"
        static_configs:
          - targets: ["10.0.56.2"]
        metrics_path: /snmp
        params:
          module: [if_mib]
          auth: [bluejay_v2]
        # Device address becomes ?target= and the instance label; the HTTP
        # scrape is redirected to the snmp-exporter Service.
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: snmp-exporter.monitoring.svc:9116

      # Synology NAS SNMP
      - job_name: "snmp-nas"
        static_configs:
          - targets: ["10.0.58.3"]
        metrics_path: /snmp
        params:
          # Uses the synology module instead of the if_mib module that the
          # pfSense and switch jobs request.
          module: [synology]
          auth: [bluejay_v2]
        # Device address becomes ?target= and the instance label; the HTTP
        # scrape is redirected to the snmp-exporter Service.
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: snmp-exporter.monitoring.svc:9116

      # Prometheus self-monitoring
      - job_name: "prometheus"
        static_configs:
          # Prometheus scrapes its own metrics endpoint over loopback.
          - targets: ["localhost:9090"]

      # Edge nodes (PROD VLAN) plus the piez / pirelay hosts (HOME VLAN).
      # All four targets share this one job; the vlan label tells them apart.
      - job_name: "edge-nodes"
        static_configs:
          - targets: ["10.0.57.17:9100"]
            labels:
              instance: "edge1"
              vlan: "prod"
              arch: "arm64"
              role: "ai-inference"
              puppet_managed: "true"
              puppet_server: "puppet.iamworkin.lan"
          - targets: ["10.0.57.16:9100"]
            labels:
              instance: "edge2"
              vlan: "prod"
              arch: "arm64"
              role: "ci-runner"
              puppet_managed: "true"
              puppet_server: "puppet.iamworkin.lan"
          # The two HOME VLAN hosts carry no puppet_* labels.
          - targets: ["10.0.58.25:9100"]
            labels:
              instance: "piez"
              vlan: "home"
              arch: "arm64"
              role: "prototyping"
          - targets: ["10.0.58.113:9100"]
            labels:
              instance: "pirelay"
              vlan: "home"
              arch: "arm64"
              role: "relay-controller"

      # =======================================================================
      # PiManager Application Metrics (relay states, temps, automation)
      # =======================================================================

      - job_name: "pimanager-app"
        # Scraped twice as often as the 30s global default.
        scrape_interval: 15s
        metrics_path: /metrics
        static_configs:
          # Same hosts as the piez / pirelay node-exporter targets, but on the
          # application ports (5000 / 5100) instead of 9100.
          - targets: ["10.0.58.25:5000"]
            labels:
              instance: "piez"
              service: "pimanager"
              vlan: "home"
              device: "pi4-ezconnect"
          - targets: ["10.0.58.113:5100"]
            labels:
              instance: "pirelay"
              service: "pimanager"
              vlan: "home"
              device: "pi3-ks0212"

      # Epson ET-3750 EcoTank Printer SNMP
      - job_name: "snmp-printer"
        # Polled far less often than the 30s global default, with a longer
        # per-scrape timeout.
        scrape_interval: 5m
        scrape_timeout: 30s
        static_configs:
          - targets: ["10.0.58.107"]
            # No static `instance` label: the relabel rules below always set
            # instance to the printer address, so a static value (formerly
            # "epson-ecotank") was silently overwritten and never reached any
            # series. To key series by a friendly name instead, drop the
            # instance relabel rule rather than re-adding a static label.
            labels:
              vlan: "home"
              device_type: "printer"
        metrics_path: /snmp
        params:
          module: [printer_mib]
          auth: [public_v2]
        # Device address becomes ?target= and the instance label; the HTTP
        # scrape is redirected to the snmp-exporter Service.
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: snmp-exporter.monitoring.svc:9116

      # =============================================================================
      # Print Services (CUPS + Print.Web on edge2)
      # =============================================================================

      # CUPS Prometheus exporter (cups_exporter on edge2:9628)
      # Scraped directly rather than through blackbox, so up{job="cups"}
      # reflects exporter reachability; the CUPSExporterDown alert in
      # alerts.yml keys on exactly that.
      - job_name: "cups"
        scrape_interval: 30s
        static_configs:
          - targets: ["10.0.57.16:9628"]
            labels:
              instance: "edge2"
              service: "cups"
              device_type: "printer"
              printer_model: "NuPrint 210"

      # Print.Web OTEL metrics (print counters/histograms plus Ollama runner gauges)
      # The job name is load-bearing: recording rules and alerts in this
      # ConfigMap select these series with job="printweb-otel".
      - job_name: "printweb-otel"
        scrape_interval: 30s
        # Non-default metrics path.
        metrics_path: /metrics/prometheus
        static_configs:
          - targets: ["10.0.57.16:5200"]
            labels:
              instance: "print-web"
              service: "print-web"
              device_type: "printer"
              printer_model: "NuPrint 210"

      # Print.Web health (Blazor app on edge2:5200)
      - job_name: "probe-printweb"
        metrics_path: /probe
        params:
          module: [http_2xx]
        scrape_interval: 30s
        static_configs:
          - targets: ["http://10.0.57.16:5200/"]
            # No static `instance` label: the relabel rules below always set
            # instance to the probed URL, so a static value (formerly
            # "print-web") was silently overwritten and never reached any
            # series.
            labels:
              service: "print-web"
        # The listed target is the URL to probe: it is copied into ?target=
        # and the instance label, then the scrape itself is sent to the
        # blackbox-exporter Service.
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter.monitoring.svc:9115

      # FlowerCore.RemoteDesktop web health (public cluster VIP)
      # Module is https_internal — desktop.iamworkin.lan uses a step-ca leaf
      # cert; blackbox does NOT trust step-ca root, so http_2xx fails with
      # x509 unknown authority and probe_success=0 even when /health 200s.
      - job_name: "probe-remotedesktop"
        metrics_path: /probe
        params:
          module: [https_internal]
        scrape_interval: 30s
        static_configs:
          - targets: ["https://desktop.iamworkin.lan/health"]
            labels:
              # The RemoteDesktopWebDown alert matches on this exact instance
              # value, so keep the two in sync.
              instance: "https://desktop.iamworkin.lan/health"
              service: "remotedesktop-web"
        # Unlike the other blackbox jobs in this file there is no
        # __param_target -> instance rule here, so the static instance label
        # above is the one that ends up on the series.
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - target_label: __address__
            replacement: blackbox-exporter.monitoring.svc:9115

      # FlowerCore.RemoteDesktop /metrics (direct scrape for counters)
      - job_name: "fc-remotedesktop"
        metrics_path: /metrics
        scheme: https
        scrape_interval: 30s
        # NOTE(review): certificate verification is disabled for this scrape.
        # The host serves a step-ca leaf certificate (per the
        # probe-remotedesktop job comment); pointing tls_config.ca_file at the
        # step-ca root would keep verification on — confirm whether the root
        # is available inside the Prometheus pod.
        tls_config:
          insecure_skip_verify: true
        static_configs:
          # No port given, so the https scheme's default port applies.
          - targets: ["desktop.iamworkin.lan"]
            labels:
              service: "remotedesktop-web"

      # CUPS web UI health (port 631)
      - job_name: "probe-cups"
        metrics_path: /probe
        params:
          module: [http_2xx]
        # Probed half as often as the 30s global default.
        scrape_interval: 60s
        static_configs:
          - targets: ["http://10.0.57.16:631/"]
            # No static `instance` label: the relabel rules below always set
            # instance to the probed URL, so a static value (formerly
            # "cups-edge2") was silently overwritten and never reached any
            # series.
            labels:
              service: "cups"
        # The listed target is the URL to probe: it is copied into ?target=
        # and the instance label, then the scrape itself is sent to the
        # blackbox-exporter Service.
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter.monitoring.svc:9115

      # =============================================================================
      # AI Stack Health Probes (Blackbox Exporter)
      # =============================================================================

      # NOTE: probe-ollama-local and probe-agentzero-local were REMOVED
      # 2026-04-26. They pointed at 10.0.58.100 (HOME VLAN) which is not
      # reachable from cluster pods (firewalled). They had been firing as
      # OllamaDown / AgentZeroDown since 2026-04-24. Workstation/AI-laptop
      # Ollama and Agent Zero should be monitored via host-side Puppet
      # (node_exporter on the box) once the AI laptop is running 24/7.

      # Ollama API — edge1 Pi 5 (NUC Agent Zero)
      - job_name: "probe-ollama-edge1"
        metrics_path: /probe
        params:
          module: [http_ollama]
        scrape_interval: 30s
        static_configs:
          - targets: ["http://10.0.57.17:11434/api/tags"]
            # No static `instance` label: the relabel rules below always set
            # instance to the probed URL, so a static value (formerly
            # "ollama-edge1") was silently overwritten and never reached any
            # series. The service / deployment labels are what the ai-stack
            # alerts select and print.
            labels:
              service: "ollama"
              deployment: "nuc"
              gpu: "cpu"
        # The listed target is the URL to probe: it is copied into ?target=
        # and the instance label, then the scrape itself is sent to the
        # blackbox-exporter Service.
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter.monitoring.svc:9115

      # Agent Zero Web UI — in-cluster (RKE2)
      # Target uses short svc form (agent-zero.agent-zero.svc) NOT
      # cluster.local FQDN — the *.cluster.local form gets rewritten to
      # 10.0.56.200 (Traefik VIP) by the CoreDNS iamworkin.lan template +
      # ndots:5 search-suffix expansion. Memory: feedback_coredns_ndots_template_collision.
      - job_name: "probe-agentzero-nuc"
        metrics_path: /probe
        params:
          module: [http_2xx]
        scrape_interval: 30s
        static_configs:
          - targets: ["http://agent-zero.agent-zero.svc:80/"]
            # No static `instance` label: the relabel rules below always set
            # instance to the probed URL, so a static value (formerly
            # "agent-zero-nuc") was silently overwritten and never reached any
            # series. The service / deployment labels are what the ai-stack
            # alerts select and print.
            labels:
              service: "agent-zero"
              deployment: "nuc"
        # The listed target is the URL to probe: it is copied into ?target=
        # and the instance label, then the scrape itself is sent to the
        # blackbox-exporter Service.
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter.monitoring.svc:9115

      # =============================================================================
      # K8s Cluster State (kube-state-metrics, cert-manager, traefik)
      # =============================================================================
      # Use in-cluster ClusterIP service DNS — NOT NodePorts — so a same-node
      # NodePort hairpin doesn't break the scrape (hit on rke2-agent1 hosting
      # both prometheus and traefik on 2026-04-26: 10.0.56.12:30900 timed out
      # from prometheus while .11/.13 worked). NodePorts at 30900-30902 are
      # still useful for noc1-Podman-style external scrapers, but in-cluster
      # we should always use the svc DNS form.

      # kube-state-metrics — exposes K8s object state (pods, deployments, nodes)
      # Required for KubeContainerRestartingFrequently / KubePodNotReady alerts.
      - job_name: "kube-state-metrics"
        scrape_interval: 30s
        static_configs:
          # In-cluster Service DNS name, not a NodePort. The
          # kube_deployment_status_replicas_ready series used by the
          # LinuxRunnerOffline alert presumably come from this target.
          - targets: ["kube-state-metrics.kube-system.svc:8080"]
            labels:
              cluster: "rke2"

      # cert-manager — exposes certmanager_certificate_ready_status,
      # certmanager_certificate_expiration_timestamp_seconds, etc. Drives the
      # CertManagerCertificateNotReady / CertManagerCertificateRenewalFailed
      # alerts. Memory: project_cert_manager_prometheus_scrape.
      - job_name: "cert-manager"
        scrape_interval: 30s
        static_configs:
          # In-cluster Service DNS name in the cert-manager namespace, not a
          # NodePort.
          - targets: ["cert-manager-metrics.cert-manager.svc:9402"]
            labels:
              cluster: "rke2"

      # Traefik — request rates, latency, TLS cert metadata, router state.
      # ClusterIP svc routes to one of the traefik pods; per-pod scrape via
      # the headless `traefik-metrics` selector would be nicer for failover
      # visibility but the single-replica scrape is enough for steady-state.
      - job_name: "traefik"
        # Scraped twice as often as the 30s global default.
        scrape_interval: 15s
        static_configs:
          # Single Service address, so each scrape reaches whichever pod the
          # Service routes to.
          - targets: ["traefik-metrics.traefik-system.svc:9100"]
            labels:
              service: "traefik"
              cluster: "rke2"

      # Longhorn — exposes longhorn_volume_robustness, longhorn_backup_*,
      # longhorn_node_status_*. Enables LonghornVolumeUnhealthy +
      # LonghornBackupFailed alerts (no real visibility into Longhorn
      # health before this — was relying on K8s events which are noisy
      # transient lifecycle messages, not actionable signals).
      - job_name: "longhorn"
        scrape_interval: 30s
        static_configs:
          # In-cluster Service DNS name in the longhorn-system namespace, not
          # a NodePort.
          - targets: ["longhorn-backend.longhorn-system.svc:9500"]
            labels:
              service: "longhorn"
              cluster: "rke2"

      # FC web services through Traefik — single probe surface to spot any
      # iamworkin.lan host returning non-200. Uses https_internal because all
      # certs are step-ca leaves; blackbox would x509-fail with http_2xx.
      # Some services need explicit healthcheck paths because root returns
      # 404 (acme, guac) or 401 (grafana, prometheus). Drop them or point at
      # the right endpoint — don't lower valid_status_codes globally because
      # 401 from a healthy pod and 401 from an outage look identical.
      - job_name: "probe-traefik-services"
        metrics_path: /probe
        params:
          module: [https_internal]
        # Probed half as often as the 30s global default.
        scrape_interval: 60s
        static_configs:
          - targets:
              # Root-reachable services (200 or 3xx)
              - "https://gitea.iamworkin.lan/"
              - "https://argocd.iamworkin.lan/"
              - "https://intranet.iamworkin.lan/"
              - "https://signage.iamworkin.lan/"
              - "https://kiosk.iamworkin.lan/"
              - "https://media.iamworkin.lan/"
              - "https://mysql.iamworkin.lan/"
              - "https://php.iamworkin.lan/"
              - "https://zabbix.iamworkin.lan/"
              - "https://desktop.iamworkin.lan/"
              - "https://print.iamworkin.lan/"
              - "https://dns.iamworkin.lan/"
              - "https://chat.iamworkin.lan/"
              - "https://dist.iamworkin.lan/"
              - "https://dms.iamworkin.lan/"
              - "https://menuboard.iamworkin.lan/"
              - "https://messageboard.iamworkin.lan/"
              - "https://presentations.iamworkin.lan/"
              - "https://retail.iamworkin.lan/"
              - "https://ttsreader.iamworkin.lan/"
              # Explicit healthcheck paths
              - "https://fc-llm-bridge.iamworkin.lan/healthz"
              - "https://acme.iamworkin.lan/health"
              # NOTE: services intentionally NOT in this probe surface
              #   - grafana.iamworkin.lan: every endpoint (incl. /api/health
              #     and /login) returns 401 behind Traefik basic-auth.
              #     Health covered by in-cluster monitoring-grafana scrape.
              #   - prometheus.iamworkin.lan: same auth pattern. Health covered
              #     by the prometheus self-scrape job.
              #   - guac.iamworkin.lan: deprecated — Guacamole moved to
              #     desktop.iamworkin.lan/guacamole/ (memory:
              #     feedback_traefik_cross_namespace_refs_disabled).
            # One shared label for every URL in the list; no service label is
            # set on these targets.
            labels:
              probe_type: "traefik-service"
        relabel_configs:
          # Full URL goes to blackbox as ?target=.
          - source_labels: [__address__]
            target_label: __param_target
          # instance is reduced to the bare hostname: the capture group drops
          # the scheme, any port and the path.
          - source_labels: [__param_target]
            regex: "https?://([^/:]+).*"
            target_label: instance
          # The scrape itself is sent to the blackbox-exporter Service.
          - target_label: __address__
            replacement: blackbox-exporter.monitoring.svc:9115

      # =============================================================================
      # Self-monitoring (K8s monitoring namespace)
      # =============================================================================

      # Direct scrape of Grafana's own metrics endpoint through its in-cluster
      # Service (the Traefik-fronted hostname is deliberately not probed).
      - job_name: "monitoring-grafana"
        metrics_path: /metrics
        static_configs:
          - targets: ["grafana.monitoring.svc:3000"]
            labels:
              instance: "grafana-k8s"
              service: "grafana"

      # Scrapes the blackbox exporter's own metrics (no /probe path or target
      # parameter), at the same Service address the probe jobs are sent to.
      - job_name: "monitoring-blackbox"
        static_configs:
          - targets: ["blackbox-exporter.monitoring.svc:9115"]
            labels:
              instance: "blackbox-k8s"
              service: "blackbox"

  recording-rules.yml: |
    groups:
      # Host-level convenience series derived from node_* metrics.
      - name: node-aggregations
        interval: 30s
        rules:
          # Percent of CPU time not idle: 100 minus the idle rate averaged
          # over all idle-mode series of an instance, over 5m.
          - record: instance:node_cpu_usage:avg5m
            expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
          # Percent of memory not reported as available.
          - record: instance:node_memory_usage:percent
            expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
          # Root filesystem only (mountpoint="/").
          - record: instance:node_disk_usage:percent
            expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100
          # Byte rates multiplied by 8, i.e. bits per second. Devices matching
          # lo / veth* / cali* / flannel* are excluded; there is no
          # aggregation, so the result keeps one series per remaining device.
          - record: instance:node_network_receive:rate5m
            expr: rate(node_network_receive_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8
          - record: instance:node_network_transmit:rate5m
            expr: rate(node_network_transmit_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8
      # Per-service roll-ups of the blackbox probe results.
      - name: probe-aggregations
        interval: 30s
        rules:
          # min over probe_success: 0 as soon as any probe carrying that
          # service label fails. Probes without a service label are grouped
          # together under an empty service value.
          - record: service:probe_success:min
            expr: min by(service) (probe_success)
          # Mean probe duration per service label.
          - record: service:probe_duration:avg
            expr: avg by(service) (probe_duration_seconds)
      # Derived series for the print pipeline counters and histograms.
      - name: print-rates
        interval: 30s
        rules:
          # rate() is per second; * 60 converts to jobs per minute.
          - record: print:jobs_per_minute:rate5m
            expr: rate(print_jobs_enqueued_total[5m]) * 60
          # Completed / enqueued over the same 5m window. With nothing
          # enqueued in the window both rates are 0 and the ratio is NaN.
          - record: print:success_rate:ratio5m
            expr: rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m])
          # 95th percentile from the duration histogram buckets; the unit
          # follows the source metric (presumably milliseconds, per its name).
          - record: print:job_duration_p95:5m
            expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m]))
          # Largest remaining keep-alive per instance and model, restricted to
          # the printweb-otel scrape job.
          - record: print:ollama_runner_keepalive_remaining_seconds:max
            expr: max by(instance, model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})
      # Evaluated every 15s, matching the pimanager-app scrape interval.
      - name: relay-rates
        interval: 15s
        rules:
          # Number of times each relay state series changed value in the last
          # hour.
          - record: relay:state_changes:1h
            expr: changes(pimanager_relay_state[1h])
          # Increase of the printer's life counter over 24h, from the
          # snmp-printer job. NOTE(review): the rule name says rate24h but the
          # expression is increase(), i.e. a 24h delta rather than a
          # per-second rate.
          - record: epson:pages_per_day:rate24h
            expr: increase(prtMarkerLifeCount{job="snmp-printer"}[24h])

  alerts.yml: |
    groups:
      # Alerts on the blackbox probes of the AI stack. They select series by
      # the static `service` label set on the probe jobs in prometheus.yml and
      # print the `deployment` label from the same place.
      - name: ai-stack
        rules:
          # Fires when a probe with service="ollama" has failed for 2 minutes.
          - alert: OllamaDown
            expr: probe_success{service="ollama"} == 0
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "Ollama is down on {{ $labels.deployment }}"
              description: "Ollama API at {{ $labels.instance }} has been unreachable for 2 minutes. Agent Zero FAISS memory will fail."

          # Fires when a probe with service="agent-zero" has failed for 2
          # minutes.
          - alert: AgentZeroDown
            expr: probe_success{service="agent-zero"} == 0
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "Agent Zero is down on {{ $labels.deployment }}"
              description: "Agent Zero web UI at {{ $labels.instance }} has been unreachable for 2 minutes."

          # Fires when the probe has taken longer than 3 seconds for 5
          # minutes. NOTE(review): the only ollama probe target defined in
          # this ConfigMap is labelled gpu: "cpu", so the "GPU may be
          # overloaded" wording may be stale — confirm.
          - alert: OllamaSlowResponse
            expr: probe_duration_seconds{service="ollama"} > 3
            for: 5m
            labels:
              severity: info
            annotations:
              summary: "Ollama responding slowly on {{ $labels.deployment }}"
              description: "Ollama API response time exceeds 3s for 5 minutes. GPU may be overloaded."

      # Alerts for the CUPS exporter, the CUPS / Print.Web probes and the
      # Print.Web OTEL metrics.
      - name: print-services
        rules:
          # up == 0 on the direct cups scrape job: the exporter itself is
          # unreachable.
          - alert: CUPSExporterDown
            expr: up{job="cups"} == 0
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "CUPS exporter unreachable on edge2"
              description: "cups_exporter at edge2:9628 has been down for 2 minutes. CUPS monitoring offline."

          # Blackbox probe of the CUPS web UI, selected by job name.
          - alert: CUPSWebUIDown
            expr: probe_success{job="probe-cups"} == 0
            for: 3m
            labels:
              severity: warning
            annotations:
              summary: "CUPS web UI down on edge2"
              description: "CUPS port 631 unreachable for 3 minutes. Network printing unavailable."

          # Blackbox probe of Print.Web, selected by job name.
          - alert: PrintWebDown
            expr: probe_success{job="probe-printweb"} == 0
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "Print.Web is down on edge2"
              description: "FlowerCore Print.Web at edge2:5200 unreachable. API/MCP/Blazor printing unavailable."

          # Any printer reported in the "stopped" state for 5 minutes.
          - alert: CUPSPrinterStopped
            expr: cups_printer_state_total{state="stopped"} > 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "CUPS printer stopped on edge2"
              description: "A CUPS printer has been in stopped state for 5 minutes. Check USB connection or paper."

          # More than 10 active jobs for 2 minutes.
          - alert: CUPSJobBacklog
            expr: cups_job_active_total > 10
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "Print queue backlog on edge2 ({{ $value }} active jobs)"
              description: "CUPS has {{ $value }} active jobs queued. Possible printer jam, USB disconnect, or paper out."

          # Paper roll lifecycle alerts (XL Track I, 2026-04-26).
          # Source-of-truth gauge: print_paper_remaining_percent (Print.Web OTEL,
          # hydrated on startup from the active PaperRoll row).
          # alert_channel=thermal_print routes through irc-notify -> Print.Web
          # /api/print/alert so the printer announces its own paper-out warning
          # on its remaining paper. Self-referential humor + operator nudge.
          #
          # Low covers the open band 5 < x < 10 only, so it hands over to
          # PrintPaperRollCritical (x <= 5) instead of firing alongside it.
          - alert: PrintPaperRollLow
            expr: print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5
            for: 5m
            labels:
              severity: warning
              alert_channel: thermal_print
            annotations:
              summary: "Print roll low on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
              description: "NuPrint 210 paper roll has {{ $value | printf \"%.1f\" }}% remaining. Operator should load a fresh roll soon. Run /api/paper/status for the precise mm + estimated jobs left."

          # At or below 5%; shorter `for` than the Low alert.
          - alert: PrintPaperRollCritical
            expr: print_paper_remaining_percent{job="printweb-otel"} <= 5
            for: 2m
            labels:
              severity: critical
              alert_channel: thermal_print
            annotations:
              summary: "Print roll critical on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
              description: "NuPrint 210 paper roll at {{ $value | printf \"%.1f\" }}% — load a new roll NOW. The 50ft roll has a ~12% red-stripe zone; once paper passes that, the printer can run dry mid-job."

          # Any growth of the dead-letter counter within the last 15 minutes.
          - alert: PrintJobDeadLetter
            expr: increase(print_jobs_dead_letter_total[15m]) > 0
            for: 1m
            labels:
              severity: warning
              alert_channel: thermal_print
            annotations:
              summary: "Print job(s) entered dead-letter on edge2 ({{ $value | printf \"%.0f\" }} in last 15m)"
              description: "{{ $value | printf \"%.0f\" }} print job(s) exhausted MaxRetries and need operator action. Open /print-log, filter Status=DeadLetter, click 'Retry From Start' after fixing the underlying cause (paper jam, USB disconnect, printer power-cycle)."

          # rate() is per second; * 60 converts to jobs per minute before the
          # comparison with 30.
          - alert: CUPSHighJobRate
            expr: rate(cups_job_total[5m]) * 60 > 30
            for: 5m
            labels:
              severity: info
            annotations:
              summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)"
              description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop."

          # Threshold is 600 seconds, i.e. the 10 minutes named in the summary.
          - alert: PrintOllamaRunnerLongKeepAlive
            expr: ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"} > 600
            for: 2m
            labels:
              severity: warning
              alert_channel: thermal_print
            annotations:
              summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
              description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."

      # Alerts for the macmini-* GitHub Actions runners.
      - name: macmini-runners
        rules:
          # Two ways to fire:
          #   1. a macmini-* runner series reports 0 (carries its runner
          #      label), or
          #   2. no macmini-* series exists at all. absent() only copies labels
          #      from equality matchers, and the selector uses a regex match,
          #      so this branch produces a series without a runner label.
          # The summary template therefore falls back to fixed text when the
          # runner label is empty instead of rendering "()".
          - alert: MacMiniRunnerOffline
            expr: (flowercore_github_runner_online{runner=~"macmini-.*"} == 0) or absent(flowercore_github_runner_online{runner=~"macmini-.*"})
            for: 10m
            labels:
              severity: warning
              service: github-runner
            annotations:
              summary: "Mac mini GitHub runner offline ({{ if $labels.runner }}{{ $labels.runner }}{{ else }}no macmini-* runner series reported{{ end }})"
              description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-<slug>.plist; runners survive reboot and do not require a GUI session."

      # Alerts for the in-cluster Linux CI runner Deployments.
      - name: linux-runners
        rules:
          # The deployment regex matches the bare "github-runner" Deployment
          # (empty first alternative) and each listed "github-runner-<suffix>"
          # one, all in the github-runner namespace.
          # NOTE(review): "ready replicas == 0" is also true for a Deployment
          # intentionally scaled to zero — confirm that never happens here.
          - alert: LinuxRunnerOffline
            expr: |
              kube_deployment_status_replicas_ready{
                namespace="github-runner",
                deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"
              } == 0
            for: 5m
            labels:
              severity: warning
              alert_channel: irc
              service: github-runner
              team: ci
            annotations:
              summary: "Linux CI runner offline: {{ $labels.deployment }}"
              description: "Deployment {{ $labels.deployment }} in namespace github-runner has 0 ready replicas for more than 5 minutes. CI jobs targeting this repo will queue until the runner pod restarts and re-registers with GitHub. Check pods with: kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }}. Check logs with: kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50. Common causes: PAT missing repo access, runner CrashLoopBackOff, or node/resource pressure."

      - name: remote-desktop
        rules:
          # The instance matcher equals the static instance label set on the
          # probe-remotedesktop scrape job; changing one requires changing the
          # other.
          - alert: RemoteDesktopWebDown
            expr: probe_success{job="probe-remotedesktop",instance="https://desktop.iamworkin.lan/health"} == 0
            for: 3m
            labels:
              severity: warning
            annotations:
              summary: "FlowerCore RemoteDesktop web is down"
              description: "https://desktop.iamworkin.lan/health probe has failed for 3 minutes. Catalog + session launch surface offline."

          # absent() yields a value only when no series with this metric name
          # exists at all, whatever its labels; the alert needs that to hold
          # for 10 minutes.
          - alert: RemoteDesktopMetricsStale
            expr: absent(fc_desktop_session_events_total)
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "RemoteDesktop /metrics scrape returning no data"
              description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape target is misconfigured or the web deployment stopped exporting metrics. Zabbix template carries the same 10m no-data trigger for cross-monitor parity."

          # PUBLISHER QUIRK: fc_desktop_pool_depleted / _deficit emit one
          # series per template per status (Ready/Warming/BelowDesiredSize/
          # Disabled), and the historical series for non-current statuses
          # stay at their last value. So just `_depleted > 0` fires forever
          # on any template that ever entered a bad state.
          #
          # SAFE PATTERN: alert only when the canonical "Ready" status
          # gauge does NOT report ready=1 for the enabled template. This
          # is the publisher's own canary — _ready{status="Ready"}==1 is
          # always the current "everything is fine" signal.
          - alert: RemoteDesktopPoolDepleted
            expr: |
              group by(template) (fc_desktop_pool_ready{enabled="true"})
              unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1)
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "RemoteDesktop pool depleted ({{ $labels.template }})"
              description: "Pool for template {{ $labels.template }} has no Ready warm pod for 5 minutes. New launches will cold-start. Check pod-scheduling failures, image pull issues, or exhausted node capacity."

          # Same pattern, but only fires when template explicitly reports
          # a sustained Warning-level alert state (current-status series).
          - alert: RemoteDesktopPoolDeficitSustained
            expr: |
              fc_desktop_pool_deficit{enabled="true",alert_level="Warning"} > 0
              unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1)
            for: 10m
            labels:
              severity: info
            annotations:
              summary: "RemoteDesktop pool {{ $labels.template }} below desired for 10m"
              description: "Pool {{ $labels.template }} has a persistent deficit of {{ $value }} warm pods AND no Ready series. Likely image pull, NFS affinity, or claim-init issue."

          - alert: RemoteDesktopSessionChurnSpike
            expr: sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20
            for: 5m
            labels:
              severity: info
            annotations:
              summary: "RemoteDesktop launch rate high ({{ $value | printf \"%.0f\" }}/min)"
              description: "Launch events exceed 20/min for 5 minutes. Could be a user-facing feature launch, a pooled template thrashing, or a runaway automation loop."

          - alert: RemoteDesktopRecordingEventsDropped
            expr: absent_over_time(fc_desktop_session_events_total{event="recording"}[30m]) and on() (sum(fc_desktop_session_events_total{event="launch"}) > 0)
            for: 15m
            labels:
              severity: info
            annotations:
              summary: "RemoteDesktop recording events silent for 30m despite active launches"
              description: "No recording events in 30 minutes while launches are happening. Recording may be silently disabled on all templates (SessionRecordingEnabled=false), the guacd NFS mount may be unhealthy, or the retention sweep isn't emitting events. Not an error by itself — worth checking."

          # Match by job — instance label carries full URL incl. /health,
          # not just hostname, so a hostname-only match never fires.
          - alert: RemoteDesktopTlsExpiry
            expr: probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time() < 2 * 86400
            for: 6h
            labels:
              severity: critical
            annotations:
              summary: "desktop.iamworkin.lan TLS cert expires within 2 days"
              description: "The desktop.iamworkin.lan cert is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, and pfSense DNS overrides per the ACME DNS-01 gate."

          # Two independent conditions joined by the outer `or`:
          #   1. relative growth of longhorn_volume_actual_size_bytes over the
          #      last hour exceeds 20% (clamp_min(..., 1) avoids dividing by 0);
          #   2. actual size exceeds 80% of longhorn_volume_capacity_bytes.
          # Each branch is multiplied by a PVC info series (joined on the
          # `volume` label, which label_replace copies from `volumename`) so
          # the result carries namespace + persistentvolumeclaim and is limited
          # to RemoteDesktop PVCs: either those labelled
          # flowercore.io/managed-by=remotedesktop, or — as a fallback — the
          # fc-desktop PVCs matched by name.
          - alert: LonghornPVCGrowthRapid
            expr: |
              (
                (
                  (
                    longhorn_volume_actual_size_bytes
                    - (longhorn_volume_actual_size_bytes offset 1h)
                  )
                  / clamp_min(longhorn_volume_actual_size_bytes offset 1h, 1)
                )
                * on(volume) group_left(namespace, persistentvolumeclaim) (
                  (
                    label_replace(kube_persistentvolumeclaim_info{storageclass="longhorn"}, "volume", "$1", "volumename", "(.+)")
                    * on(namespace, persistentvolumeclaim) group_left()
                      kube_persistentvolumeclaim_labels{label_flowercore_io_managed_by="remotedesktop"}
                  )
                  or
                  label_replace(kube_persistentvolumeclaim_info{namespace="fc-desktop", storageclass="longhorn", persistentvolumeclaim=~"fc-profile-.*|remotedesktop-data"}, "volume", "$1", "volumename", "(.+)")
                )
              ) > 0.20
              or
              (
                (
                  longhorn_volume_actual_size_bytes
                  / on(volume) clamp_min(longhorn_volume_capacity_bytes, 1)
                )
                * on(volume) group_left(namespace, persistentvolumeclaim) (
                  (
                    label_replace(kube_persistentvolumeclaim_info{storageclass="longhorn"}, "volume", "$1", "volumename", "(.+)")
                    * on(namespace, persistentvolumeclaim) group_left()
                      kube_persistentvolumeclaim_labels{label_flowercore_io_managed_by="remotedesktop"}
                  )
                  or
                  label_replace(kube_persistentvolumeclaim_info{namespace="fc-desktop", storageclass="longhorn", persistentvolumeclaim=~"fc-profile-.*|remotedesktop-data"}, "volume", "$1", "volumename", "(.+)")
                )
              ) > 0.80
            for: 5m
            labels:
              severity: warning
              alert_channel: thermal_print
              service: remotedesktop
            annotations:
              summary: "RemoteDesktop Longhorn PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} growing rapidly"
              description: "Longhorn volume {{ $labels.volume }} backing RemoteDesktop PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} grew more than 20% in 1h or is over 80% capacity. Check for runaway SQLite/user-profile growth; this alert was added after the 2026-05-16 RemoteDesktop web SQLite Error 13 incident."
              runbook: "1. kubectl -n {{ $labels.namespace }} describe pvc {{ $labels.persistentvolumeclaim }} 2. Open Longhorn UI volume {{ $labels.volume }} 3. Check RemoteDesktop web/user-volume SQLite files for permission or runaway growth 4. Expand PVC only after confirming the writer is healthy"
              todo: "2026-05-19 metric gate: live noc1 Prometheus currently exposes kube_persistentvolumeclaim_info and kube_persistentvolumeclaim_resource_requests_storage_bytes, but not longhorn_volume_actual_size_bytes, longhorn_volume_capacity_bytes, kube_persistentvolumeclaim_labels, or kubelet_volume_stats_used_bytes. Keep the fc-desktop PVC fallback until kube-state-metrics label allowlist exposes flowercore.io/managed-by=remotedesktop."

      # Raspberry Pi fleet alerts, driven by the PiManager app's own
      # exporter (job "pimanager-app"): reachability, CPU temperature,
      # memory, disk, relay state and WiFi signal.
      - name: pi-fleet
        rules:
          # Scrape of the PiManager app has been failing for 3 minutes.
          - alert: PiManagerDown
            expr: up{job="pimanager-app"} == 0
            for: 3m
            labels:
              severity: warning
            annotations:
              summary: "PiManager down on {{ $labels.instance }}"
              description: "PiManager app on {{ $labels.instance }} ({{ $labels.device }}) unreachable for 3 minutes."

          # Warning tier: above 75C for 5 minutes.
          - alert: PiCpuTempHigh
            expr: pimanager_cpu_temperature_celsius > 75
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Pi CPU temperature high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"

          # Critical tier: above 82C for 2 minutes. The warning rule above
          # keeps matching at this temperature too.
          - alert: PiCpuTempCritical
            expr: pimanager_cpu_temperature_celsius > 82
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: "Pi CPU temperature CRITICAL on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"

          - alert: PiMemoryHigh
            expr: pimanager_memory_usage_percent > 90
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Pi memory usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"

          - alert: PiDiskHigh
            expr: pimanager_disk_usage_percent > 85
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "Pi disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"

          # Fires immediately (for: 0m) when every relay channel on an
          # instance reads 0 while the instance reports at least one channel.
          # `on(instance)` is required: the left side is aggregated down to
          # the `instance` label only, whereas pimanager_relay_channel_count
          # still carries its full scrape label set (job, ...). Without it the
          # default one-to-one match compares all labels, finds no pairs, and
          # the alert can never fire.
          - alert: RelayAllOff
            expr: sum by (instance) (pimanager_relay_state) == 0 and on(instance) pimanager_relay_channel_count > 0
            for: 0m
            labels:
              severity: info
            annotations:
              summary: "All relay channels OFF on {{ $labels.instance }}"

          # Signal weaker than -75 dBm for 10 minutes.
          # NOTE(review): the `!= 0` clause is already implied by `< -75`;
          # presumably 0 is the exporter's "no WiFi" sentinel — confirm.
          - alert: PiWifiWeak
            expr: pimanager_wifi_signal_dbm < -75 and pimanager_wifi_signal_dbm != 0
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "Weak WiFi signal on {{ $labels.instance }} ({{ $value }}dBm)"

      # Alerts on devices scraped through the SNMP exporter: the Epson
      # printer (job "snmp-printer") and the Synology NAS (job "snmp-nas").
      - name: snmp-devices
        rules:
          # Level below 15 but above 0. The `> 0` half excludes zero and
          # negative readings.
          # NOTE(review): the summary prints the raw level as a percentage,
          # which assumes the printer reports supplies on a 0-100 scale —
          # confirm against prtMarkerSuppliesMaxCapacity.
          - alert: EpsonInkLow
            expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 15 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0
            for: 0m
            labels:
              severity: warning
              alert_channel: thermal_print
            annotations:
              summary: "Epson ink low: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"

          # Same shape with a threshold of 5; EpsonInkLow also matches in
          # this range, so both alerts are active together.
          - alert: EpsonInkCritical
            expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 5 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0
            for: 0m
            labels:
              severity: critical
              alert_channel: thermal_print
            annotations:
              summary: "Epson ink CRITICAL: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"

          # for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min
          # of idle and SNMP times out, so 5m for: would page nightly. A
          # genuine printer outage (jam, disconnected) lasts well over 30m.
          - alert: EpsonPrinterDown
            expr: up{job="snmp-printer"} == 0
            for: 30m
            labels:
              severity: warning
            annotations:
              summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)"

          # Used/size ratio per storage entry, as a percentage, above 85
          # for 10 minutes.
          - alert: SynologyDiskLow
            expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85
            for: 10m
            labels:
              severity: warning
              alert_channel: thermal_print
            annotations:
              summary: "Synology NAS storage high ({{ $value | printf \"%.1f\" }}%)"

          # SNMP scrape of the NAS has been failing for 3 minutes.
          - alert: SynologyDown
            expr: up{job="snmp-nas"} == 0
            for: 3m
            labels:
              severity: critical
              alert_channel: thermal_print
            annotations:
              summary: "Synology NAS SNMP unreachable"

      # Host-level alerts built on node-exporter metrics: scrape
      # reachability, CPU, memory and root-filesystem usage.
      - name: infrastructure
        rules:
          # Any target of the listed node-exporter jobs failing its scrape
          # for 2 minutes.
          - alert: NodeDown
            expr: up{job=~"node-exporter|rke2-nodes|edge-nodes|k8s-node-exporter"} == 0
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: "Node {{ $labels.instance }} is down"

          # 100 minus the average idle fraction across all CPUs of an
          # instance (as a percentage) gives the busy percentage.
          - alert: HighCPU
            expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "High CPU on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"

          # Percentage of memory that is not "available" (MemAvailable).
          - alert: HighMemory
            expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "High memory usage on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"

          # Used percentage of the filesystem mounted at "/" only; other
          # mountpoints are not covered by this rule.
          - alert: DiskSpaceLow
            expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 85
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"

      # Puppet agent + service alerts.
      # Mirror of FlowerCore.Notes/scripts/monitoring/alerts.yml `puppet` group
      # so a future migration to in-cluster Prometheus inherits the ruleset.
      # Source-of-truth for the live Podman Prometheus on noc1 is the Notes file.
      # See feedback_monitoring_k8s_target_vs_live_podman.
      - name: puppet
        rules:
          # Warning tier: last-run age above 7200s (2h), sustained for 30m.
          # $value is the age in seconds, rendered with humanizeDuration.
          - alert: PuppetAgentReportStale
            expr: puppet_last_run_age_seconds > 7200
            for: 30m
            labels:
              severity: warning
              alert_channel: irc
            annotations:
              summary: "Puppet agent {{ $labels.instance }} hasn't reported in over 2h"
              description: "Last run age: {{ $value | humanizeDuration }}. The puppet agent on {{ $labels.instance }} may be stopped, the node may be powered off, or noc1 may be unreachable from this node."
              runbook: "1. SSH to node (via noc1 jumpbox if needed) 2. sudo systemctl status puppet 3. sudo puppet agent -t --noop to force a run 4. Check r10k: ssh fcadmin@10.0.56.10 'sudo podman logs openvoxserver --tail 50' 5. Verify noc1 reachability: ping puppet.iamworkin.lan"

          # Critical tier: last-run age above 86400s (24h), sustained for 1h.
          # The warning rule above keeps matching at this age too.
          - alert: PuppetAgentReportCritical
            expr: puppet_last_run_age_seconds > 86400
            for: 1h
            labels:
              severity: critical
              alert_channel: irc
            annotations:
              summary: "Puppet agent {{ $labels.instance }} silent for over 24h — node is unmanaged"
              description: "Last run age: {{ $value | humanizeDuration }}. Node {{ $labels.instance }} has not submitted a Puppet report in over 24 hours. Config drift is accumulating — investigate immediately. If intentional (maintenance), add to the exclusion filter or silence in Grafana."
              runbook: "URGENT: 1. Check node power state 2. SSH via noc1 jumpbox: ssh fcadmin@10.0.56.10 then ssh <node> 3. sudo systemctl status puppet 4. sudo systemctl start puppet + sudo puppet agent -t 5. Check for network partitions (VLAN connectivity to 10.0.56.10) 6. If node was recently reimaged: sudo puppet agent -t to re-register with new SSL cert"

          # Sprint 33 Cx-7 Phase B (2026-05-25 postmortem follow-up):
          # Detects puppet.service in failed state — distinct from PuppetAgentReportStale
          # which catches "agent hasn't run." This catches "systemd gave up restarting it"
          # (CA-verify loop or other fatal exit). Requires node-exporter systemd collector
          # enabled with --collector.systemd. If `node_systemd_unit_state` has no series
          # for a node, the collector is disabled there — flag in postmortem follow-up.
          - alert: PuppetServiceFailed
            expr: node_systemd_unit_state{name="puppet.service",state="failed"} == 1
            for: 5m
            labels:
              severity: warning
              alert_channel: irc
            annotations:
              summary: "Puppet service failed on {{ $labels.instance }}"
              description: "puppet.service on {{ $labels.instance }} has been in failed state for 5+ minutes. systemd has stopped auto-restarting (CA-verify-loop or other exit). Manual `systemctl status puppet` confirms. Run `sudo systemctl start puppet` to recover; investigate journal for root cause."
              runbook_url: "https://github.com/astoltz/FlowerCore.Notes/blob/master/memory/feedback_puppet_service_dead_after_ca_loop_alert_misreads.md"

      # K8s pod-state alerts. Require kube-state-metrics scrape (added
      # 2026-04-26 — see scrape_configs above). Would have surfaced the
      # agent-zero ollama-proxy 172x crash-loop instead of letting it
      # silently churn for ~3 days.
      - name: kubernetes-state
        rules:
          # More than 5 restarts of one container within the last hour,
          # sustained for 15 minutes.
          - alert: KubeContainerRestartingFrequently
            expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
            for: 15m
            labels:
              severity: warning
            annotations:
              summary: "{{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} restarting >5x/hr"
              description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."

          # Tighter window: more than 3 restarts in 15 minutes, held for 5m.
          - alert: KubeContainerCrashLooping
            expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
            for: 5m
            labels:
              severity: critical
              alert_channel: thermal_print
            annotations:
              summary: "{{ $labels.namespace }}/{{ $labels.pod }} crashlooping ({{ $value | printf \"%.0f\" }} restarts/15m)"
              description: "Container {{ $labels.container }} restarted {{ $value | printf \"%.0f\" }} times in 15 minutes — actively crashlooping."

          # Pod phase is Pending, Failed or Unknown for 15 minutes.
          - alert: KubePodNotReady
            expr: sum by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Failed|Unknown"}) > 0
            for: 15m
            labels:
              severity: warning
            annotations:
              summary: "{{ $labels.namespace }}/{{ $labels.pod }} not Ready for >15m"
              description: "Pod is in a non-Running, non-Succeeded phase for over 15 minutes. Common causes: ImagePullBackOff (registry/Nexus down, wrong image tag), pending PVC, scheduling failure (taint/resources)."

          # At least one container of the pod is waiting with an image-pull
          # reason for 10 minutes.
          - alert: KubePodImagePullBackOff
            expr: sum by(namespace, pod) (kube_pod_container_status_waiting_reason{reason=~"ImagePullBackOff|ErrImagePull"}) > 0
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "{{ $labels.namespace }}/{{ $labels.pod }} ImagePullBackOff for >10m"
              description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."

          # A `!=` comparison between two vectors keeps the LEFT-hand sample,
          # so {{ $value }} here is the desired (spec) replica count, not the
          # available count, and there is no `spec_replicas` label to print.
          - alert: KubeDeploymentReplicasMismatch
            expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
            for: 15m
            labels:
              severity: warning
            annotations:
              summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
              description: "Spec wants {{ $value }} replicas but the available count has differed for 15m. Likely a rollout stuck on probe failure, scheduling, or PVC."

          # Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
          # cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
          # outage (21h) hit because no alert fired on the rising multus working
          # set — only downstream blackbox / Traefik / service alerts. With
          # 1Gi limit (bluejay-infra@eb8693e), 80% = ~800MiB; steady-state
          # runs ~150-250MiB so this only fires when an avalanche starts.
          - alert: MultusMemoryPressure
            expr: |
              container_memory_working_set_bytes{container="kube-multus"}
                / container_spec_memory_limit_bytes{container="kube-multus"} > 0.8
            for: 5m
            labels:
              severity: critical
              alert_channel: thermal_print
            annotations:
              summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m"
              description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)."

          # Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the
          # operator-leak avalanche pattern BEFORE it cascades into a multus
          # CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder)
          # emitting pods without ownerReferences will accumulate them when
          # the operator crashes. >25 pending pods in any namespace for 30m
          # is the signal to investigate the reconciler.
          - alert: NamespacePendingPodBacklog
            expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25
            for: 30m
            labels:
              severity: warning
            annotations:
              summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m"
              description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade."

      # Longhorn storage health alerts. Required: longhorn scrape job
      # (added 2026-04-26 — see scrape_configs above). The K8s events
      # for "snapshot becomes not ready to use" are transient lifecycle
      # noise, not actionable — these alerts use the actual Longhorn
      # gauges that reflect persistent state.
      - name: longhorn-storage
        rules:
          # longhorn_volume_robustness is a numeric gauge:
          # 0=unknown, 1=healthy, 2=degraded, 3=faulted. The state is the
          # sample VALUE, not a `robustness` label, so the rules compare the
          # value directly. Detached volumes report 0 — that's normal for
          # unattached PVCs — and therefore never match ==2 / ==3.
          - alert: LonghornVolumeDegraded
            expr: longhorn_volume_robustness == 2
            for: 15m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} degraded for >15m"
              description: "Volume {{ $labels.volume }} on node {{ $labels.node }} has been degraded (one or more replicas unhealthy) for 15+ minutes. Auto-rebuild may need help — check 'kubectl describe volume.longhorn.io {{ $labels.volume }} -n longhorn-system'."

          - alert: LonghornVolumeFaulted
            expr: longhorn_volume_robustness == 3
            for: 5m
            labels:
              severity: critical
              alert_channel: thermal_print
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} FAULTED"
              description: "Volume {{ $labels.volume }} on node {{ $labels.node }} is faulted — all replicas unavailable. Data inaccessible. Manual intervention required."

          # No backup in 36h indicates the daily-backup recurringJob is
          # silently failing. Allows for one missed run + slack.
          # NOTE(review): the expression subtracts
          # longhorn_backup_actual_size_bytes (a byte count) from time(), not
          # a backup timestamp, and selects on a `state` label — confirm the
          # deployed Longhorn version exposes that label and replace the
          # size term with a real last-completed-backup timestamp before
          # relying on this alert.
          - alert: LonghornBackupStale
            expr: |
              (time() - max by(volume) (longhorn_backup_state{state="Completed"} * on(backup) group_left() longhorn_backup_actual_size_bytes)) > 36 * 3600
            for: 1h
            labels:
              severity: warning
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} has no completed backup in >36h"
              description: "Daily backup recurringJob (cron 0 2 * * *) appears to have skipped this volume. Check 'kubectl get backups.longhorn.io -n longhorn-system' and the daily-backup CronJob logs."

          # The "ready" condition series reads 0 and carries a non-empty
          # condition_reason, which the description prints.
          - alert: LonghornNodeUnhealthy
            expr: longhorn_node_status{condition="ready",condition_reason!=""} == 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn node {{ $labels.node }} not Ready"
              description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."

      # ============================================================
      # FC Signage Marquee Performance — Track 3 + 8 (2026-05-06)
      # Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
      # Source-of-truth for the live Podman Prometheus on noc1 is the
      # Notes file; this K8s ConfigMap exists so a future migration to
      # in-cluster Prometheus inherits the ruleset automatically.
      # See feedback_monitoring_k8s_target_vs_live_podman.
      # ============================================================
      - name: fc-signage-marquee
        rules:
          # Dropped frames per rendered frame over 5m, per
          # renderer/phase/node_id; the _count series of the render-latency
          # histogram serves as the frame count. Fires above 5%.
          # The trailing `unless on() absent_over_time(...[7d])` clause drops
          # every result when the metric had no samples at all in 7 days.
          # NOTE(review): in that situation rate() already returns nothing,
          # so the guard looks redundant — confirm before removing.
          - alert: MarqueeDroppedFramesHigh
            expr: |
              (
                sum by (renderer, phase, node_id) (rate(marquee_dropped_frames_total[5m]))
                /
                sum by (renderer, phase, node_id) (rate(marquee_render_latency_ms_count[5m]))
              ) > 0.05
              unless on()
              absent_over_time(marquee_dropped_frames_total[7d])
            for: 5m
            labels:
              severity: warning
              service: signage
              alert_channel: irc
            annotations:
              summary: "Marquee dropped-frame rate >5% on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
              description: "Renderer {{ $labels.renderer }} on {{ $labels.node_id }} drops >5% of frames during {{ $labels.phase }}. Animation visibly stuttery."

          # 99th-percentile render latency from the histogram buckets, per
          # renderer/phase/node_id, compared against 16 (the metric is named
          # in milliseconds). Same 7-day absence guard as above.
          - alert: MarqueeRenderLatencyP99High
            expr: |
              histogram_quantile(
                0.99,
                sum by (renderer, phase, node_id, le) (rate(marquee_render_latency_ms_bucket[5m]))
              ) > 16
              unless on()
              absent_over_time(marquee_render_latency_ms_bucket[7d])
            for: 10m
            labels:
              severity: warning
              service: signage
              alert_channel: irc
            annotations:
              summary: "Marquee render latency p99 > 16ms on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
              description: "Per-frame render latency p99 has exceeded the Pi-class 16ms budget for 10 minutes."

          # Relative deviation |median observed - target| / target, where the
          # target is the per-phase average of
          # marquee_animation_duration_target_ms joined onto the per-renderer
          # median via on(phase) group_left(). Fires above 10%.
          - alert: MarqueeAnimationDurationDrift
            expr: |
              abs(
                histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m])))
                -
                on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
              )
              /
              on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
              > 0.10
              unless on()
              absent_over_time(marquee_animation_duration_ms_bucket[7d])
            for: 15m
            labels:
              severity: info
              service: signage
              alert_channel: irc
            annotations:
              summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
              description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."

# =============================================================================
# ConfigMap: Blackbox Exporter Configuration
# =============================================================================
---
# Holds blackbox.yml, which defines three HTTP probe modules:
# http_2xx, http_ollama and https_internal.
apiVersion: v1
kind: ConfigMap
metadata:
  name: blackbox-config
  namespace: monitoring
data:
  blackbox.yml: |
    modules:
      # http_2xx — plain GET over IPv4 that must answer 200 within 5s.
      # The empty fail_if_body_not_matches_regexp list imposes no body check.
      http_2xx:
        prober: http
        timeout: 5s
        http:
          valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
          valid_status_codes: [200]
          method: GET
          fail_if_body_not_matches_regexp: []
          preferred_ip_protocol: ip4
      # http_ollama — like http_2xx, but the probe also fails unless the
      # response body contains the literal text "models" (quotes included).
      http_ollama:
        prober: http
        timeout: 5s
        http:
          valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
          valid_status_codes: [200]
          method: GET
          fail_if_body_not_matches_regexp:
            - '"models"'
          preferred_ip_protocol: ip4
      # https_internal — for Traefik-fronted services with step-ca leaf
      # certs. blackbox does not trust the step-ca root CA, so http_2xx
      # against any *.iamworkin.lan host fails with x509 unknown authority.
      # Redirects + multiple status codes are accepted because some hosts
      # 302 to /login or /scalar.
      # Certificate verification is switched off for this module
      # (insecure_skip_verify), so it checks reachability only, not the
      # validity of the certificate chain.
      https_internal:
        prober: http
        timeout: 10s
        http:
          valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
          valid_status_codes: [200, 301, 302, 303, 307, 308]
          method: GET
          follow_redirects: true
          preferred_ip_protocol: ip4
          tls_config:
            insecure_skip_verify: true

# =============================================================================
# ConfigMap: IRC Notify Script
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: irc-notify-script
  namespace: monitoring
data:
  notify.py: |
    #!/usr/bin/env python3
    """HTTP->IRC alert relay with thermal-printer DIGEST forwarding.

    Listens on :9119, posts to #alerts on UnrealIRCd, forwards to Print.Web
    /api/print/alert. Thermal printing is BATCHED into hourly digests by
    default so the printer no longer spam-fires per Grafana webhook.

    Routing (per Grafana webhook alert):
      - IRC: always per-event (operator likes the stream)
      - Thermal printer:
          * severity in {critical,disaster,page} OR
            label alert_channel=thermal_print_immediate -> print NOW
          * label alert_channel=thermal_print -> enqueue into hourly digest
          * everything else -> IRC only
      - RESOLVED webhooks remove the alert from the digest buffer

    Env vars (defaults preserve old behavior on first deploy):
      THERMAL_PRINT_ENABLED  default "true"   - master kill switch
      BATCH_INTERVAL_MIN     default "60"     - minutes between digest prints
      BATCH_MAX_PENDING      default "50"     - force-flush threshold

    HTTP surface:
      POST /         - Grafana webhook entry
      POST /flush    - manual digest flush (idempotent)
      GET  /         - status + config + buffer depth + stats
    """
    import json, os, socket, sys, threading, time
    from collections import defaultdict
    from datetime import datetime, timezone
    from http.server import HTTPServer, BaseHTTPRequestHandler
    from urllib.request import Request, urlopen

    # --- Digest / thermal-printer tunables (read once at import time) ---
    # Enabled only when the variable is exactly "true" (case-insensitive);
    # any other value, e.g. "1" or "yes", disables thermal printing.
    THERMAL_PRINT_ENABLED = os.environ.get("THERMAL_PRINT_ENABLED", "true").lower() == "true"
    # int() raises ValueError at startup if either value is not an integer.
    BATCH_INTERVAL_MIN    = int(os.environ.get("BATCH_INTERVAL_MIN", "60"))
    BATCH_MAX_PENDING     = int(os.environ.get("BATCH_MAX_PENDING", "50"))

    # --- IRC server and Print.Web endpoint, each overridable via env ---
    IRC_HOST      = os.environ.get("IRC_HOST", "unrealircd.irc.svc")
    IRC_PORT      = int(os.environ.get("IRC_PORT", "6667"))
    IRC_NICK      = os.environ.get("IRC_NICK", "grafana-bot")
    IRC_CHANNEL   = os.environ.get("IRC_CHANNEL", "#alerts")
    PRINT_WEB_URL = os.environ.get("PRINT_WEB_URL", "http://10.0.57.16:5200/api/print/alert")

    # --- Mutable module state ---
    # Lock created alongside the digest buffer; presumably guards _buffer —
    # verify against the code that uses it.
    _buffer_lock = threading.Lock()
    _buffer = {}   # fingerprint -> {"alert": dict, "first_seen": float, "last_seen": float}
    # Initialised to process start time.
    _last_flush_time = time.time()
    # Event counters plus the process start timestamp.
    _stats = {"webhooks_received": 0, "irc_sent": 0, "print_immediate": 0,
              "digest_flushed": 0, "buffer_dedup": 0, "buffer_added": 0,
              "buffer_resolved": 0, "started_at": time.time()}

    def send_irc(message):
        """Post ``message`` to IRC_CHANNEL over a one-shot IRC connection.

        Connects to IRC_HOST:IRC_PORT, registers as IRC_NICK, waits up to
        10 seconds for the server's 001 welcome numeric (answering PINGs in
        the meantime), joins IRC_CHANNEL, sends every non-blank line of
        ``message`` as its own PRIVMSG and finally QUITs.

        Args:
            message: Text to relay; may contain several lines.

        Returns:
            True once all lines and the QUIT have been written (and
            ``_stats["irc_sent"]`` has been incremented); False when
            registration does not complete or any error occurs. Errors are
            logged to stderr, never raised.
        """
        try:
            # The context manager closes the socket on every exit path,
            # including exceptions raised half-way through the exchange.
            with socket.create_connection((IRC_HOST, IRC_PORT), timeout=15) as sock:
                sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
                sock.sendall(f"USER {IRC_NICK} 0 * :Grafana Alert Bot\r\n".encode())

                registered = False
                deadline = time.time() + 10
                pending = ""  # trailing partial line carried over to the next recv
                while not registered and time.time() < deadline:
                    try:
                        data = sock.recv(4096).decode("utf-8", errors="replace")
                    except socket.timeout:
                        break
                    if not data:
                        break
                    pending += data
                    # Consume complete lines exactly once so a PING seen in an
                    # earlier chunk is not answered again on every later recv.
                    *lines, pending = pending.split("\r\n")
                    for line in lines:
                        if line.startswith("PING"):
                            # partition() tolerates a bare "PING" with no token.
                            token = line.partition(" ")[2]
                            sock.sendall(("PONG " + token + "\r\n").encode())
                        if " 001 " in line:
                            registered = True
                    if " 001 " in pending:
                        registered = True
                if not registered:
                    return False

                sock.sendall(f"JOIN {IRC_CHANNEL}\r\n".encode())
                time.sleep(0.5)
                try:
                    # Best-effort drain of the JOIN reply; a quiet server
                    # must not abort delivery of the alert itself.
                    sock.recv(4096)
                except socket.timeout:
                    pass

                # splitlines() also breaks on "\r", so alert text cannot
                # smuggle a CR and inject additional IRC commands.
                for line in message.splitlines():
                    if line.strip():
                        sock.sendall(f"PRIVMSG {IRC_CHANNEL} :{line}\r\n".encode())
                        time.sleep(0.3)  # pause between lines
                time.sleep(0.5)
                sock.sendall(b"QUIT :alert delivered\r\n")
            _stats["irc_sent"] += 1
            return True
        except Exception as e:
            print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
            return False

    def post_thermal(payload, kind):
        """POST `payload` as JSON to the thermal-print endpoint PRINT_WEB_URL.

        Args:
            payload: JSON-serialisable dict. Its "title" is used only in log lines.
            kind: Label used in log lines; the value "immediate" also increments
                _stats["print_immediate"].

        Returns:
            True if the request completed without raising (urlopen raises on
            HTTP error statuses); False if thermal printing is disabled or the
            request failed. Never raises for request errors.
        """
        # str() keeps the log lines working when the title is missing or not a string.
        title = str(payload.get("title", "?"))
        if not THERMAL_PRINT_ENABLED:
            print(f"[irc-notify] thermal disabled; skip {kind} ({title[:40]})", file=sys.stderr)
            return False
        try:
            req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
                          headers={"Content-Type": "application/json"}, method="POST")
            # The response body is not needed; the context manager closes the
            # connection instead of leaving it to the garbage collector.
            with urlopen(req, timeout=10):
                pass
            if kind == "immediate":
                _stats["print_immediate"] += 1
            print(f"[irc-notify] thermal {kind} sent: {title[:50]}", file=sys.stderr)
            return True
        except Exception as e:
            print(f"[irc-notify] thermal {kind} failed: {e}", file=sys.stderr)
            return False

    def fingerprint_of(alert):
        """Return the key used to deduplicate `alert` in the digest buffer.

        A non-empty "fingerprint" field is returned as-is. Otherwise the key is
        "<alertname>/<namespace>/<target>", where target is the first non-empty
        label among pod, instance, deployment, statefulset, namespace.
        """
        explicit = alert.get("fingerprint", "")
        if explicit:
            return explicit

        labels = alert.get("labels", {})
        target = ""
        for key in ("pod", "instance", "deployment", "statefulset", "namespace"):
            candidate = labels.get(key)
            if candidate:
                target = candidate
                break

        alertname = labels.get("alertname", "?")
        namespace = labels.get("namespace", "")
        return f"{alertname}/{namespace}/{target}"

    def is_critical(alert):
        """True when the alert's severity label is critical, disaster or page (any case)."""
        severity = alert.get("labels", {}).get("severity", "")
        return severity.lower() in {"critical", "disaster", "page"}

    def is_immediate_label(alert):
        """True when the alert carries the label alert_channel=thermal_print_immediate."""
        labels = alert.get("labels", {})
        channel = labels.get("alert_channel")
        return channel == "thermal_print_immediate"

    def is_batched_label(alert):
        """True when the alert carries the label alert_channel=thermal_print."""
        labels = alert.get("labels", {})
        channel = labels.get("alert_channel")
        return channel == "thermal_print"

    def add_to_digest(alert):
        """Record `alert` in the digest buffer, keyed by its fingerprint.

        Returns True only when a new fingerprint was inserted. Returns False
        when thermal printing is disabled, when the alert is resolved (its
        entry, if any, is removed), or when the fingerprint was already
        buffered (its entry is refreshed with the latest alert and timestamp).
        """
        if not THERMAL_PRINT_ENABLED:
            return False

        key = fingerprint_of(alert)
        resolved = alert.get("status", "firing").lower() == "resolved"

        with _buffer_lock:
            entry = _buffer.get(key)

            if resolved:
                # A resolved alert never grows the buffer; it only evicts.
                if entry is not None:
                    del _buffer[key]
                    _stats["buffer_resolved"] += 1
                return False

            if entry is not None:
                # Repeat notification: refresh in place, keep first_seen.
                entry["last_seen"] = time.time()
                entry["alert"] = alert
                _stats["buffer_dedup"] += 1
                return False

            _buffer[key] = {"alert": alert, "first_seen": time.time(), "last_seen": time.time()}
            _stats["buffer_added"] += 1
            return True

    def build_digest_payload():
        """Build the print payload summarising everything currently buffered.

        Entries are grouped by alertname (sorted); each group becomes one line
        listing its severities, its size and up to five targets. Returns None
        when the buffer is empty, otherwise the dict to hand to post_thermal.
        The buffer itself is not modified.
        """
        with _buffer_lock:
            entries = list(_buffer.values())
        if len(entries) == 0:
            return None

        grouped = defaultdict(list)
        for entry in entries:
            alertname = entry["alert"].get("labels", {}).get("alertname", "Unknown")
            grouped[alertname].append(entry)

        target_keys = ("pod", "instance", "deployment", "statefulset", "namespace")
        summary = []
        for alertname in sorted(grouped):
            members = grouped[alertname]
            shown = []
            for member in members[:5]:
                member_labels = member["alert"].get("labels", {})
                # First non-empty identifying label, or "?" when none is set.
                shown.append(next((member_labels[k] for k in target_keys if member_labels.get(k)), "?"))
            overflow = ""
            if len(members) > 5:
                overflow = f" (+{len(members)-5})"
            severities = sorted({m["alert"].get("labels", {}).get("severity", "warning") for m in members})
            summary.append(f"[{'/'.join(severities)}] {alertname} x{len(members)}: {', '.join(shown)}{overflow}")

        stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
        title = f"Alert digest: {len(entries)} firing"
        text = [f"=== {title} ===", f"as of {stamp}", ""]
        text.extend(summary)
        text.append("")
        text.append("Stream: #alerts (IRC)  |  Triage: grafana-noc1.iamworkin.lan")
        text.append("Force-flush: POST irc-notify.monitoring.svc:9119/flush")
        message = "\n".join(text)

        return {
            "title": title,
            "severity": "Warning",
            "host": "monitoring",
            "message": message,
            "eventId": f"digest-{int(time.time())}",
            "source": "Grafana digest",
            "status": "PROBLEM",
            "acknowledged": False,
        }

    def flush_digest():
        """Print one digest of the buffered alerts and drop them from the buffer.

        Returns:
            True if a digest was built and post_thermal reported success;
            False if the buffer was empty or the POST failed. Entries that were
            buffered when the flush started are dropped either way (a failed
            POST is not retried).
        """
        # Remember exactly which entries exist before the digest is built.
        # post_thermal can block for seconds, and the HTTP thread may buffer
        # new alerts meanwhile; clearing the whole buffer afterwards would
        # erase those alerts without ever printing them.
        with _buffer_lock:
            snapshot = dict(_buffer)
        payload = build_digest_payload()
        if payload is None:
            print("[irc-notify] flush: buffer empty, no digest sent", file=sys.stderr)
            return False
        sent = post_thermal(payload, "digest")
        with _buffer_lock:
            for fp, entry in snapshot.items():
                # Identity check: an alert that resolved and re-fired during
                # the POST has a fresh entry object and must be kept. An entry
                # added between the snapshot and the payload build also stays,
                # so at worst it is printed again in the next digest.
                if _buffer.get(fp) is entry:
                    del _buffer[fp]
        if sent:
            _stats["digest_flushed"] += 1
        return sent

    def digest_loop():
        """Background loop: flush the digest on schedule or when the buffer is full.

        Every 15 seconds, flushes if BATCH_INTERVAL_MIN minutes have passed
        since the last flush, or else if the buffer holds at least
        BATCH_MAX_PENDING entries. Any exception is logged and the loop
        resumes after 60 seconds. Never returns.
        """
        global _last_flush_time
        while True:
            try:
                tick = time.time()
                notice = None
                if tick - _last_flush_time >= BATCH_INTERVAL_MIN * 60:
                    notice = f"[irc-notify] digest tick: interval reached ({BATCH_INTERVAL_MIN}m); buffer={len(_buffer)}"
                elif len(_buffer) >= BATCH_MAX_PENDING:
                    notice = f"[irc-notify] digest tick: buffer full ({len(_buffer)}); force flush"
                if notice is not None:
                    print(notice, file=sys.stderr)
                    flush_digest()
                    # The interval restarts from the start of this tick, not
                    # from when the flush finished.
                    _last_flush_time = tick
                time.sleep(15)
            except Exception as e:
                print(f"[irc-notify] digest loop error: {e}", file=sys.stderr)
                time.sleep(60)

    class Handler(BaseHTTPRequestHandler):
        """HTTP front end of irc-notify.

        POST /flush        - flush the digest buffer now
        POST <other path>  - alert webhook; body is a JSON object with "alerts"
        GET  <any path>    - JSON status: config, buffer depth, stats
        """

        def _send_json(self, code, body_bytes):
            """Send a complete response with status `code` and a JSON body."""
            self.send_response(code)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(body_bytes)

        def do_POST(self):
            """Handle a manual flush or an alert webhook.

            Each alert is relayed to IRC, then routed to the thermal digest
            buffer according to its status, severity and alert_channel label.
            A body that is not a valid JSON object is answered with 400 so the
            sender gets a response instead of a dropped connection.
            """
            global _last_flush_time
            if self.path == "/flush":
                ok = flush_digest()
                self._send_json(200, json.dumps({"flushed": ok, "buffer_after": len(_buffer)}).encode())
                return
            _stats["webhooks_received"] += 1
            try:
                length = int(self.headers.get("Content-Length", 0))
                # length > 0: a negative value would make read() block until EOF.
                body = json.loads(self.rfile.read(length)) if length > 0 else {}
            except ValueError as e:
                # Covers a non-numeric Content-Length, invalid JSON and
                # undecodable bytes (all ValueError subclasses).
                print(f"[irc-notify] rejecting webhook: bad request body: {e}", file=sys.stderr)
                self._send_json(400, b'{"status":"error","error":"invalid JSON body"}')
                return
            if not isinstance(body, dict):
                print("[irc-notify] rejecting webhook: body is not a JSON object", file=sys.stderr)
                self._send_json(400, b'{"status":"error","error":"expected a JSON object"}')
                return
            # `or []` also covers an explicit "alerts": null.
            for alert in body.get("alerts") or []:
                if not isinstance(alert, dict):
                    continue
                status = alert.get("status", "unknown").upper()
                labels = alert.get("labels", {})
                name = labels.get("alertname", "Unknown")
                summary = alert.get("annotations", {}).get("summary", "")
                desc = alert.get("annotations", {}).get("description", "")
                severity = labels.get("severity", "")
                # \x03NN is the IRC colour code: 04 for FIRING, 03 for anything else.
                icon = "\x0304[FIRING]\x03" if status == "FIRING" else "\x0303[RESOLVED]\x03"
                sev_tag = f" [{severity}]" if severity else ""
                msg = f"{icon}{sev_tag} {name}: {summary}"
                if desc: msg += f"\n  {desc}"
                send_irc(msg)
                # Thermal routing: nothing is printed as a standalone job.
                # Critical alerts and alerts labelled alert_channel=thermal_print
                # are buffered for the next digest. alert_channel=
                # thermal_print_immediate additionally flushes the digest right
                # away when its fingerprint was not already buffered.
                # Everything else is IRC-only.
                if status == "RESOLVED":
                    add_to_digest(alert)  # removes from buffer
                    continue
                if is_immediate_label(alert):
                    # NOTE(review): flush_digest() removes the entry it just
                    # printed, so a later webhook repeating the same fingerprint
                    # is treated as new and flushes again. Confirm whether one
                    # print per repeat notification is intended.
                    new_in_buffer = add_to_digest(alert)
                    if new_in_buffer:
                        flush_digest()
                        _last_flush_time = time.time()
                elif is_critical(alert) or is_batched_label(alert):
                    add_to_digest(alert)
                # else: IRC-only (warnings without thermal_print label)
            self._send_json(200, b'{"status":"ok"}')

        def do_GET(self):
            """Return service status (config, buffer summary, counters) as JSON."""
            with _buffer_lock:
                alertnames = sorted({it["alert"].get("labels", {}).get("alertname", "?") for it in _buffer.values()})
                depth = len(_buffer)
            info = {
                "service": "irc-notify",
                "config": {"thermal_print_enabled": THERMAL_PRINT_ENABLED,
                           "batch_interval_min": BATCH_INTERVAL_MIN,
                           "batch_max_pending": BATCH_MAX_PENDING,
                           "irc_target": f"{IRC_HOST}:{IRC_PORT} {IRC_CHANNEL}",
                           "print_web_url": PRINT_WEB_URL},
                "buffer": {"depth": depth, "alertnames": alertnames,
                           "seconds_since_last_flush": int(time.time() - _last_flush_time),
                           "seconds_until_next_flush": max(0, int(BATCH_INTERVAL_MIN*60 - (time.time() - _last_flush_time)))},
                "stats": _stats,
            }
            self._send_json(200, json.dumps(info, indent=2).encode())

        def log_message(self, format, *args):
            """Log only the first format argument (the request line for access logs)."""
            # Fall back to the format string itself when no arguments are given.
            print(f"[irc-notify] {args[0] if args else format}", file=sys.stderr)

    if __name__ == "__main__":
        # Start the periodic digest flusher as a daemon thread so it cannot
        # keep the process alive, then serve HTTP on port 9119 in this thread.
        flusher = threading.Thread(target=digest_loop, daemon=True)
        flusher.start()
        server = HTTPServer(("0.0.0.0", 9119), Handler)
        thermal_state = "ON" if THERMAL_PRINT_ENABLED else "OFF"
        print(f"[irc-notify] :9119 -> IRC {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} | thermal={thermal_state} | digest={BATCH_INTERVAL_MIN}m max={BATCH_MAX_PENDING}", file=sys.stderr)
        server.serve_forever()

# =============================================================================
# SNMP Exporter Auth Secret
# =============================================================================
# The full snmp.yml (~2MB, auto-generated) exceeds the 1MB ConfigMap limit.
# Strategy: store SNMP auth credentials in a Secret, and use an init container
# to download the full snmp.yml from Gitea, or split into multiple ConfigMaps.
# For now, we mount a minimal auth-only config and rely on the default modules
# bundled in the snmp-exporter image. To use custom modules, apply
# snmp-config.yaml separately (see comments in that file).
---
apiVersion: v1
kind: Secret
metadata:
  name: snmp-auth
  namespace: monitoring
type: Opaque
# NOTE(review): these credentials are committed to the repository in
# plaintext. The Grafana credentials in this file are synced from 1Password
# through a OnePasswordItem instead; consider the same approach here and
# rotating these values.
stringData:
  # SNMP v2 community string used by prometheus scrape configs
  SNMP_COMMUNITY_BLUEJAY: bluejay_monitor
  # SNMP v3 user with its auth and priv passphrases (as the key names indicate)
  SNMP_V3_USER: bluejay_snmpv3
  SNMP_V3_AUTH_PASS: BlueJay-SNMP-Auth-2026
  SNMP_V3_PRIV_PASS: BlueJay-SNMP-Priv-2026

# =============================================================================
# Grafana Credentials — synced from 1Password via Operator
# =============================================================================
# 1Password vault: IAmWorkin > "Grafana"
# Creates K8s Secret "grafana-credentials" with fields: username, password
# Grafana Deployment uses secretKeyRef to map username→GF_SECURITY_ADMIN_USER, password→GF_SECURITY_ADMIN_PASSWORD
---
apiVersion: onepassword.com/v1
kind: OnePasswordItem
metadata:
  # Same name as the "grafana-credentials" Secret described in the header above.
  name: grafana-credentials
  namespace: monitoring
spec:
  # 1Password item to sync: vault "IAmWorkin", item "Grafana".
  itemPath: vaults/IAmWorkin/items/Grafana

# =============================================================================
# RBAC: ServiceAccount + ClusterRole for Prometheus K8s SD
# =============================================================================
---
# Identity the Prometheus Deployment runs as (serviceAccountName: prometheus).
# The ClusterRoleBinding named "prometheus" grants it the ClusterRole
# "prometheus".
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
# Read-only, cluster-wide permissions for the "prometheus" ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core API group: nodes, services, endpoints and pods.
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs:
      - get
      - list
      - watch
  # Ingress objects.
  - apiGroups:
      - extensions
      - networking.k8s.io
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  # The API server's own /metrics endpoint.
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
---
# Grants the ClusterRole "prometheus" to the ServiceAccount "prometheus" in
# the monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitoring

# =============================================================================
# PVC: Prometheus Data (10Gi, Longhorn)
# =============================================================================
---
# Mounted at /prometheus by the Prometheus Deployment (claimName: prometheus-data).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-data
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      # NOTE(review): Prometheus runs with --storage.tsdb.retention.time=90d
      # and no size-based retention flag; confirm 10Gi holds 90 days of data.
      storage: 10Gi

# =============================================================================
# PVC: Grafana Data (2Gi, Longhorn)
# =============================================================================
---
# Presumably mounted by the Grafana Deployment, which is not part of this
# section of the file - verify against that Deployment.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana-data
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      storage: 2Gi

# =============================================================================
# Deployment: Prometheus
# =============================================================================
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  replicas: 1
  # Recreate stops the old pod before starting the new one; the data volume
  # is a ReadWriteOnce PVC.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      securityContext:
        fsGroup: 65534  # nobody
        runAsUser: 65534
        runAsGroup: 65534
      containers:
        - name: prometheus
          # NOTE(review): `latest` is a floating tag, so a pod restart can pull
          # a different Prometheus release. Consider pinning the version that
          # is currently running.
          image: docker.io/prom/prometheus:latest
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention.time=90d"
            - "--web.enable-lifecycle"
          ports:
            - containerPort: 9090
              name: http
          volumeMounts:
            # Mount the whole ConfigMap as a directory instead of one subPath
            # mount per file: subPath mounts never receive ConfigMap updates,
            # so a reload through --web.enable-lifecycle would re-read stale
            # files. prometheus.yml, alerts.yml and recording-rules.yml keep
            # the same paths under /etc/prometheus.
            - name: config
              mountPath: /etc/prometheus
              readOnly: true
            - name: data
              mountPath: /prometheus
          resources:
            requests:
              cpu: 200m
              memory: 512Mi
            limits:
              cpu: "1"
              memory: 2Gi
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 15
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: config
          configMap:
            name: prometheus-config
        - name: data
          persistentVolumeClaim:
            claimName: prometheus-data

# =============================================================================
# ConfigMap: Grafana Dashboard Provider
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-provider
  namespace: monitoring
data:
  # File-based dashboard provider: points Grafana at
  # /var/lib/grafana/dashboards with a 30-second update interval.
  # Comments are kept outside the block scalar so the file content Grafana
  # reads stays unchanged.
  default.yml: |
    apiVersion: 1
    providers:
      - name: 'default'
        orgId: 1
        folder: ''
        type: file
        disableDeletion: false
        updateIntervalSeconds: 30
        options:
          path: /var/lib/grafana/dashboards
          foldersFromFilesStructure: true

# =============================================================================
# ConfigMap: Grafana Dashboards (AI Stack Health)
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
  namespace: monitoring
data:
  ai-stack-health.json: |
    {
      "id": null,
      "panels": [
        {
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  },
                  "type": "value"
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
          "id": 1,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-ollama-local\"}",
              "legendFormat": "Status"
            }
          ],
          "title": "Ollama (Local)",
          "type": "stat"
        },
        {
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  },
                  "type": "value"
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
          "id": 2,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-ollama-edge1\"}",
              "legendFormat": "Status"
            }
          ],
          "title": "Ollama (Edge1)",
          "type": "stat"
        },
        {
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  },
                  "type": "value"
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
          "id": 3,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-agentzero-local\"}",
              "legendFormat": "Status"
            }
          ],
          "title": "Agent Zero (Local)",
          "type": "stat"
        },
        {
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  },
                  "type": "value"
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
          "id": 4,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-agentzero-nuc\"}",
              "legendFormat": "Status"
            }
          ],
          "title": "Agent Zero (NUC)",
          "type": "stat"
        },
        {
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 15, "lineWidth": 2 },
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 1 },
                  { "color": "red", "value": 3 }
                ]
              },
              "unit": "s"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
          "id": 5,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_duration_seconds{service=\"ollama\"}",
              "legendFormat": "{{ deployment }}"
            }
          ],
          "title": "Ollama Response Time",
          "type": "timeseries"
        },
        {
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 15, "lineWidth": 2 },
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 1 },
                  { "color": "red", "value": 3 }
                ]
              },
              "unit": "s"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
          "id": 6,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_duration_seconds{service=\"agent-zero\"}",
              "legendFormat": "{{ deployment }}"
            }
          ],
          "title": "Agent Zero Response Time",
          "type": "timeseries"
        },
        {
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 30, "lineWidth": 1, "stacking": { "mode": "none" } },
              "mappings": [
                {
                  "options": {
                    "0": { "text": "DOWN" },
                    "1": { "text": "UP" }
                  },
                  "type": "value"
                }
              ],
              "max": 1,
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 6, "w": 24, "x": 0, "y": 12 },
          "id": 7,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{service=\"ollama\"}",
              "legendFormat": "Ollama ({{ deployment }})"
            },
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{service=\"agent-zero\"}",
              "legendFormat": "Agent Zero ({{ deployment }})"
            }
          ],
          "title": "Uptime History",
          "type": "timeseries"
        },
        {
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 10, "lineWidth": 2 },
              "max": 100,
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 75 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
          "id": 8,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "100 - (avg(rate(node_cpu_seconds_total{instance=\"edge1\",mode=\"idle\"}[5m])) * 100)",
              "legendFormat": "CPU %"
            },
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "(1 - node_memory_MemAvailable_bytes{instance=\"edge1\"} / node_memory_MemTotal_bytes{instance=\"edge1\"}) * 100",
              "legendFormat": "Memory %"
            },
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "(1 - node_filesystem_avail_bytes{instance=\"edge1\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=\"edge1\",mountpoint=\"/\"}) * 100",
              "legendFormat": "Disk %"
            }
          ],
          "title": "Edge1 Pi 5 \u2014 CPU / Memory / Disk",
          "type": "timeseries"
        },
        {
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 10, "lineWidth": 2 },
              "unit": "s"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
          "id": 9,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_dns_lookup_time_seconds",
              "legendFormat": "{{ job }}"
            }
          ],
          "title": "Probe DNS Lookup Time",
          "type": "timeseries"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "tags": ["ai", "ollama", "agent-zero", "blue-jay"],
      "time": { "from": "now-1h", "to": "now" },
      "timezone": "browser",
      "title": "AI Stack Health",
      "uid": "ai-stack-health",
      "version": 1
    }

# =============================================================================
# ConfigMap: Grafana Dashboard — Edge Nodes
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-edge-nodes
  namespace: monitoring
data:
  bluejay-edge-nodes.json: |
    {
      "id": null,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "0": {
                      "color": "red",
                      "text": "DOWN"
                    },
                    "1": {
                      "color": "green",
                      "text": "UP"
                    }
                  },
                  "type": "value"
                }
              ]
            }
          },
          "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
          "targets": [
            {
              "expr": "up{instance=~\"edge.*\"}",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ],
          "title": "Edge Node Status",
          "type": "stat"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "max": 100,
              "min": 0,
              "unit": "percent"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
          "targets": [
            {
              "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge1.*\"}[5m])) * 100)",
              "legendFormat": "CPU %",
              "refId": "A"
            },
            {
              "expr": "node_load1{instance=~\"edge1.*\"}",
              "legendFormat": "Load 1m",
              "refId": "B"
            }
          ],
          "title": "edge1 (Pi5 + Hailo) CPU",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "max": 100,
              "min": 0,
              "unit": "percent"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
          "targets": [
            {
              "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge2.*\"}[5m])) * 100)",
              "legendFormat": "CPU %",
              "refId": "A"
            },
            {
              "expr": "node_load1{instance=~\"edge2.*\"}",
              "legendFormat": "Load 1m",
              "refId": "B"
            }
          ],
          "title": "edge2 (Pi4) CPU",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "max": 100,
              "min": 0,
              "unit": "percent"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
          "targets": [
            {
              "expr": "(1 - node_memory_MemAvailable_bytes{instance=~\"edge.*\"} / node_memory_MemTotal_bytes{instance=~\"edge.*\"}) * 100",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ],
          "title": "Edge Memory Usage",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "max": 100,
              "min": 0,
              "unit": "percent"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
          "targets": [
            {
              "expr": "100 - (node_filesystem_avail_bytes{instance=~\"edge.*\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=~\"edge.*\",mountpoint=\"/\"} * 100)",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ],
          "title": "Edge Disk Usage",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "unit": "celsius"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
          "targets": [
            {
              "expr": "node_hwmon_temp_celsius{instance=~\"edge.*\"}",
              "legendFormat": "{{instance}} {{chip}} {{sensor}}",
              "refId": "A"
            }
          ],
          "title": "Edge CPU Temperature",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "unit": "bps"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
          "targets": [
            {
              "expr": "rate(node_network_receive_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8",
              "legendFormat": "{{instance}} {{device}} RX",
              "refId": "A"
            },
            {
              "expr": "-rate(node_network_transmit_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8",
              "legendFormat": "{{instance}} {{device}} TX",
              "refId": "B"
            }
          ],
          "title": "Edge Network Traffic",
          "type": "timeseries"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 40,
      "tags": ["bluejay", "edge"],
      "timezone": "browser",
      "title": "BlueJay Edge Nodes",
      "uid": "7e1603b9-e918-4b3f-a22b-163132fd5cee"
    }

# =============================================================================
# ConfigMap: Grafana Dashboard — Network Overview
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-network-overview
  namespace: monitoring
# Dashboard "BlueJay Network Overview" stored as one JSON document.
# Everything below the `|` is literal JSON and is handed to Grafana verbatim,
# so it cannot carry comments; notes about the panels live here instead.
#
# Panels, top to bottom:
#   - Target Health: number of scrape targets with up == 1 and up == 0.
#     count() over an empty selection returns no series at all, so both
#     queries end in `or vector(0)`; without it the "Down" value disappears
#     whenever every target is healthy. "noValue" is kept as a second
#     fallback, matching the equivalent panel on the Operations dashboard.
#   - pfSense gauges: laLoad and memAvailReal / memTotalReal for instance
#     10.0.56.1.
#   - noc1 CPU usage, per-instance memory usage, root filesystem usage.
#   - Network Traffic: byte counters converted to bits/s (* 8); TX is negated
#     so it plots below the axis. lo, veth*, cali* and flannel* are excluded.
#   - Prometheus Targets: instant `up` query rendered as a table.
data:
  bluejay-network-overview.json: |
    {
      "id": null,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "noValue": "0",
              "thresholds": {
                "steps": [
                  { "color": "green", "value": null }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
          "targets": [
            {
              "expr": "count(up == 1) or vector(0)",
              "legendFormat": "Up",
              "refId": "A"
            },
            {
              "expr": "count(up == 0) or vector(0)",
              "legendFormat": "Down",
              "refId": "B"
            }
          ],
          "title": "Target Health",
          "type": "stat"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "max": 4,
              "min": 0,
              "thresholds": {
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 2 },
                  { "color": "red", "value": 3 }
                ]
              }
            }
          },
          "gridPos": { "h": 6, "w": 6, "x": 0, "y": 4 },
          "targets": [
            {
              "expr": "laLoad{instance=\"10.0.56.1\", laNames=\"Load-1\"}",
              "refId": "A"
            }
          ],
          "title": "pfSense CPU Load (1m)",
          "type": "gauge"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "max": 100,
              "min": 0,
              "thresholds": {
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            }
          },
          "gridPos": { "h": 6, "w": 6, "x": 6, "y": 4 },
          "targets": [
            {
              "expr": "100 - (memAvailReal{instance=\"10.0.56.1\"} / memTotalReal{instance=\"10.0.56.1\"} * 100)",
              "refId": "A"
            }
          ],
          "title": "pfSense Memory Used %",
          "type": "gauge"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "max": 100,
              "min": 0,
              "unit": "percent"
            }
          },
          "gridPos": { "h": 6, "w": 12, "x": 12, "y": 4 },
          "targets": [
            {
              "expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=\"noc1\"}[5m])) * 100)",
              "legendFormat": "CPU %",
              "refId": "A"
            }
          ],
          "title": "noc1 CPU Usage",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "max": 100,
              "min": 0,
              "unit": "percent"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 },
          "targets": [
            {
              "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ],
          "title": "Node Memory Usage",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "max": 100,
              "min": 0,
              "thresholds": {
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 },
          "targets": [
            {
              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ],
          "title": "Node Disk Usage %",
          "type": "bargauge"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "unit": "bps"
            }
          },
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 },
          "targets": [
            {
              "expr": "rate(node_network_receive_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8",
              "legendFormat": "{{instance}} {{device}} RX",
              "refId": "A"
            },
            {
              "expr": "-rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8",
              "legendFormat": "{{instance}} {{device}} TX",
              "refId": "B"
            }
          ],
          "title": "Network Traffic",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 26 },
          "targets": [
            {
              "expr": "up",
              "format": "table",
              "instant": true,
              "refId": "A"
            }
          ],
          "title": "Prometheus Targets",
          "type": "table"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 40,
      "tags": ["bluejay", "network"],
      "timezone": "browser",
      "title": "BlueJay Network Overview",
      "uid": "b83a122c-6eb5-4fae-a632-77cbf753ad05"
    }

# =============================================================================
# ConfigMap: Grafana Dashboard — Operations
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-operations
  namespace: monitoring
# Dashboard "BlueJay Operations" stored as one JSON document.
# Everything below the `|` is literal JSON and is handed to Grafana verbatim,
# so it cannot carry comments; notes about the panels live here instead.
#
# Rows and panels:
#   Infrastructure Overview
#     - All Targets Up/Down: count of targets with up == 1 and up == 0.
#       count() over an empty selection returns no series, so both queries
#       end in `or vector(0)` to keep the "Down" value visible (as 0) when
#       every target is healthy. "noValue" remains as a panel-level fallback.
#     - Zabbix Active Problems: the only non-Prometheus panel. It references
#       datasource type alexanderzobnin-zabbix-datasource by a fixed uid.
#       NOTE(review): that uid looks generated rather than chosen; confirm the
#       Grafana instance in this cluster provisions the Zabbix datasource
#       (and plugin) under exactly this uid, otherwise the panel will not
#       resolve its datasource.
#     - noc1 Load Average: node_load1/5/15 for instance "noc1".
#   Kubernetes & Services
#     - Instant `up` table; the "Value" column maps 0 -> DOWN, 1 -> UP.
#   Network & SNMP
#     - pfSense WAN / LAN traffic from ifHCInOctets / ifHCOutOctets on
#       instance 10.0.56.1, split by whether ifAlias matches ".*WAN.*".
#       Octets are converted to bits/s (* 8); outbound is negated.
#     - All Nodes Memory / Disk usage percentages.
data:
  bluejay-operations.json: |
    {
      "annotations": {
        "list": []
      },
      "id": null,
      "panels": [
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
          "title": "Infrastructure Overview",
          "type": "row"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "noValue": "0",
              "thresholds": {
                "steps": [
                  { "color": "green", "value": null }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 8, "x": 0, "y": 1 },
          "targets": [
            {
              "expr": "count(up == 1) or vector(0)",
              "legendFormat": "Up",
              "refId": "A"
            },
            {
              "expr": "count(up == 0) or vector(0)",
              "legendFormat": "Down",
              "refId": "B"
            }
          ],
          "title": "All Targets Up/Down",
          "type": "stat"
        },
        {
          "datasource": {
            "type": "alexanderzobnin-zabbix-datasource",
            "uid": "bffjila3zkdfka"
          },
          "gridPos": { "h": 8, "w": 16, "x": 8, "y": 1 },
          "targets": [
            {
              "application": { "filter": "" },
              "group": { "filter": "/.*/" },
              "host": { "filter": "/.*/" },
              "queryType": 5,
              "refId": "A",
              "trigger": { "filter": "/.*/" }
            }
          ],
          "title": "Zabbix Active Problems",
          "type": "alexanderzobnin-zabbix-triggers-panel"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 },
          "targets": [
            {
              "expr": "node_load1{instance=\"noc1\"}",
              "legendFormat": "1m",
              "refId": "A"
            },
            {
              "expr": "node_load5{instance=\"noc1\"}",
              "legendFormat": "5m",
              "refId": "B"
            },
            {
              "expr": "node_load15{instance=\"noc1\"}",
              "legendFormat": "15m",
              "refId": "C"
            }
          ],
          "title": "noc1 Load Average",
          "type": "timeseries"
        },
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 },
          "title": "Kubernetes & Services",
          "type": "row"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {},
            "overrides": [
              {
                "matcher": { "id": "byName", "options": "Value" },
                "properties": [
                  {
                    "id": "mappings",
                    "value": [
                      {
                        "options": {
                          "0": { "color": "red", "text": "DOWN" },
                          "1": { "color": "green", "text": "UP" }
                        },
                        "type": "value"
                      }
                    ]
                  }
                ]
              }
            ]
          },
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 },
          "targets": [
            {
              "expr": "up",
              "format": "table",
              "instant": true,
              "refId": "A"
            }
          ],
          "title": "K8s Services Uptime (Prometheus Targets)",
          "type": "table"
        },
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 },
          "title": "Network & SNMP",
          "type": "row"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "unit": "bps"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 },
          "targets": [
            {
              "expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8",
              "legendFormat": "WAN In",
              "refId": "A"
            },
            {
              "expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8",
              "legendFormat": "WAN Out",
              "refId": "B"
            }
          ],
          "title": "pfSense WAN Traffic",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "unit": "bps"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 },
          "targets": [
            {
              "expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8",
              "legendFormat": "{{ifAlias}} In",
              "refId": "A"
            },
            {
              "expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8",
              "legendFormat": "{{ifAlias}} Out",
              "refId": "B"
            }
          ],
          "title": "pfSense LAN Traffic",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "max": 100,
              "min": 0,
              "unit": "percent"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 },
          "targets": [
            {
              "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ],
          "title": "All Nodes Memory",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "max": 100,
              "min": 0,
              "unit": "percent"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 },
          "targets": [
            {
              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
              "legendFormat": "{{instance}}",
              "refId": "A"
            }
          ],
          "title": "All Nodes Disk",
          "type": "timeseries"
        }
      ],
      "refresh": "1m",
      "schemaVersion": 40,
      "tags": ["bluejay", "operations", "zabbix"],
      "timezone": "browser",
      "title": "BlueJay Operations",
      "uid": "a781c4e4-c3fe-4ac7-be93-21363a41b97d"
    }

# =============================================================================
# ConfigMap: Grafana Dashboard — Epson Printer
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-printer
  namespace: monitoring
# Dashboard "Epson ET-3750 EcoTank Printer" (uid "epson-ecotank") stored as one
# JSON document. Everything below the `|` is literal JSON and is handed to
# Grafana verbatim, so it cannot carry comments; notes live here instead.
#
# Every query selects job="snmp-printer". Panels by id:
#   1  Ink Levels         gauge of prtMarkerSuppliesLevel, one per series.
#   2  Ink Level History  the same metric as a time series.
#   3  Lifetime Page Count   prtMarkerLifeCount (yellow >= 10000, red >= 50000).
#   4  Printer Model      shows the prtGeneralPrinterName label (textMode name).
#   5  Critical Events    prtAlertCriticalEvents; turns red at 1 or more.
#   6  Serial Number      shows the prtGeneralSerialNumber label.
#
# Panels 1 and 2 name their series via {{prtMarkerSuppliesDescription}} and
# apply fixed colours with byName overrides for "Black Ink Bottle",
# "Cyan Ink Bottle", "Magenta Ink Bottle" and "Yellow Ink Bottle". The
# overrides only take effect when the label value matches those strings
# exactly. NOTE(review): presumably these are the descriptions the printer
# reports over SNMP; verify against a live scrape.
#
# Ink panels use unit "percent" with min 0 / max 100.
# NOTE(review): this assumes prtMarkerSuppliesLevel is already on a 0-100
# scale; TODO confirm.
#
# Default time range is the last 24h with a 5m refresh. schemaVersion here is
# 39, while the neighbouring BlueJay dashboards in this file declare 40.
data:
  epson-ecotank-printer.json: |
    {
      "id": null,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "max": 100,
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "orange", "value": 10 },
                  { "color": "yellow", "value": 20 },
                  { "color": "green", "value": 40 }
                ]
              },
              "unit": "percent"
            },
            "overrides": [
              {
                "matcher": { "id": "byName", "options": "Black Ink Bottle" },
                "properties": [
                  { "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } }
                ]
              },
              {
                "matcher": { "id": "byName", "options": "Cyan Ink Bottle" },
                "properties": [
                  { "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } }
                ]
              },
              {
                "matcher": { "id": "byName", "options": "Magenta Ink Bottle" },
                "properties": [
                  { "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }
                ]
              },
              {
                "matcher": { "id": "byName", "options": "Yellow Ink Bottle" },
                "properties": [
                  { "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }
                ]
              }
            ]
          },
          "gridPos": { "h": 10, "w": 12, "x": 0, "y": 0 },
          "id": 1,
          "options": {
            "orientation": "horizontal",
            "reduceOptions": {
              "calcs": ["lastNotNull"]
            },
            "showThresholdLabels": false,
            "showThresholdMarkers": true
          },
          "targets": [
            {
              "expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}",
              "legendFormat": "{{prtMarkerSuppliesDescription}}",
              "refId": "A"
            }
          ],
          "title": "Ink Levels",
          "type": "gauge"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "custom": {
                "fillOpacity": 20,
                "lineWidth": 2,
                "spanNulls": true
              },
              "max": 100,
              "min": 0,
              "unit": "percent"
            },
            "overrides": [
              {
                "matcher": { "id": "byName", "options": "Black Ink Bottle" },
                "properties": [
                  { "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } }
                ]
              },
              {
                "matcher": { "id": "byName", "options": "Cyan Ink Bottle" },
                "properties": [
                  { "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } }
                ]
              },
              {
                "matcher": { "id": "byName", "options": "Magenta Ink Bottle" },
                "properties": [
                  { "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }
                ]
              },
              {
                "matcher": { "id": "byName", "options": "Yellow Ink Bottle" },
                "properties": [
                  { "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }
                ]
              }
            ]
          },
          "gridPos": { "h": 10, "w": 12, "x": 12, "y": 0 },
          "id": 2,
          "targets": [
            {
              "expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}",
              "legendFormat": "{{prtMarkerSuppliesDescription}}",
              "refId": "A"
            }
          ],
          "title": "Ink Level History",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 10000 },
                  { "color": "red", "value": 50000 }
                ]
              },
              "unit": "short"
            }
          },
          "gridPos": { "h": 5, "w": 6, "x": 0, "y": 10 },
          "id": 3,
          "options": {
            "colorMode": "background",
            "reduceOptions": {
              "calcs": ["lastNotNull"]
            },
            "textMode": "value_and_name"
          },
          "targets": [
            {
              "expr": "prtMarkerLifeCount{job=\"snmp-printer\"}",
              "legendFormat": "Pages",
              "refId": "A"
            }
          ],
          "title": "Lifetime Page Count",
          "type": "stat"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "1": { "text": "Online" }
                  },
                  "type": "value"
                }
              ],
              "thresholds": {
                "steps": [
                  { "color": "blue", "value": null }
                ]
              }
            }
          },
          "gridPos": { "h": 5, "w": 6, "x": 6, "y": 10 },
          "id": 4,
          "options": {
            "colorMode": "background",
            "reduceOptions": {
              "calcs": ["lastNotNull"]
            },
            "textMode": "name"
          },
          "targets": [
            {
              "expr": "prtGeneralPrinterName{job=\"snmp-printer\"}",
              "legendFormat": "{{prtGeneralPrinterName}}",
              "refId": "A"
            }
          ],
          "title": "Printer Model",
          "type": "stat"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "red", "value": 1 }
                ]
              },
              "unit": "short"
            }
          },
          "gridPos": { "h": 5, "w": 6, "x": 12, "y": 10 },
          "id": 5,
          "options": {
            "colorMode": "background",
            "reduceOptions": {
              "calcs": ["lastNotNull"]
            }
          },
          "targets": [
            {
              "expr": "prtAlertCriticalEvents{job=\"snmp-printer\"}",
              "legendFormat": "Critical Alerts",
              "refId": "A"
            }
          ],
          "title": "Critical Events",
          "type": "stat"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "steps": [
                  { "color": "blue", "value": null }
                ]
              }
            }
          },
          "gridPos": { "h": 5, "w": 6, "x": 18, "y": 10 },
          "id": 6,
          "options": {
            "colorMode": "background",
            "reduceOptions": {
              "calcs": ["lastNotNull"]
            },
            "textMode": "name"
          },
          "targets": [
            {
              "expr": "prtGeneralSerialNumber{job=\"snmp-printer\"}",
              "legendFormat": "{{prtGeneralSerialNumber}}",
              "refId": "A"
            }
          ],
          "title": "Serial Number",
          "type": "stat"
        }
      ],
      "refresh": "5m",
      "schemaVersion": 39,
      "tags": ["printer", "snmp", "bluejay"],
      "time": { "from": "now-24h", "to": "now" },
      "timezone": "browser",
      "title": "Epson ET-3750 EcoTank Printer",
      "uid": "epson-ecotank"
    }

# =============================================================================
# ConfigMap: Grafana Dashboard — Infrastructure Overview
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-infra-overview
  namespace: monitoring
data:
  infra-overview.json: |
    {
      "id": null,
      "panels": [
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
          "id": 100,
          "title": "AI Stack",
          "type": "row"
        },
        {
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  },
                  "type": "value"
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
          "id": 1,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-ollama-local\"}",
              "legendFormat": "Status"
            }
          ],
          "title": "Ollama (Local)",
          "type": "stat"
        },
        {
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  },
                  "type": "value"
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
          "id": 2,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-ollama-edge1\"}",
              "legendFormat": "Status"
            }
          ],
          "title": "Ollama (Edge1)",
          "type": "stat"
        },
        {
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  },
                  "type": "value"
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
          "id": 3,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-agentzero-local\"}",
              "legendFormat": "Status"
            }
          ],
          "title": "Agent Zero (Local)",
          "type": "stat"
        },
        {
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "options": {
                    "0": { "color": "red", "text": "DOWN" },
                    "1": { "color": "green", "text": "UP" }
                  },
                  "type": "value"
                }
              ],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            }
          },
          "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
          "id": 4,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "probe_success{job=\"probe-agentzero-nuc\"}",
              "legendFormat": "Status"
            }
          ],
          "title": "Agent Zero (NUC)",
          "type": "stat"
        },
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
          "id": 101,
          "title": "K8s Cluster",
          "type": "row"
        },
        {
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 30 },
                  { "color": "red", "value": 50 }
                ]
              }
            }
          },
          "gridPos": { "h": 6, "w": 8, "x": 0, "y": 6 },
          "id": 5,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "count(up{job=\"node-exporter\"} == 1)",
              "legendFormat": "Nodes Up"
            }
          ],
          "title": "Nodes Up (node-exporter)",
          "type": "stat"
        },
        {
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 15, "lineWidth": 2 },
              "max": 100,
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            }
          },
          "gridPos": { "h": 6, "w": 8, "x": 8, "y": 6 },
          "id": 6,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) by (instance) * 100)",
              "legendFormat": "{{ instance }}"
            }
          ],
          "title": "Node CPU Usage %",
          "type": "timeseries"
        },
        {
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 15, "lineWidth": 2 },
              "max": 100,
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            }
          },
          "gridPos": { "h": 6, "w": 8, "x": 16, "y": 6 },
          "id": 7,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
              "legendFormat": "{{ instance }}"
            }
          ],
          "title": "Node Memory Usage %",
          "type": "timeseries"
        },
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 },
          "id": 102,
          "title": "Network",
          "type": "row"
        },
        {
          "fieldConfig": {
            "defaults": {
              "custom": { "fillOpacity": 10, "lineWidth": 2 },
              "unit": "Bps"
            }
          },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 },
          "id": 8,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "rate(ifHCInOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])",
              "legendFormat": "WAN In"
            },
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "rate(ifHCOutOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])",
              "legendFormat": "WAN Out"
            }
          ],
          "title": "pfSense WAN Bandwidth",
          "type": "timeseries"
        },
        {
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "green", "value": 1 }
                ]
              }
            },
            "overrides": []
          },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 },
          "id": 9,
          "options": {
            "showHeader": true,
            "sortBy": [{ "displayName": "Value", "desc": false }]
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "up",
              "format": "table",
              "instant": true,
              "legendFormat": ""
            }
          ],
          "title": "Target Health (up)",
          "transformations": [
            {
              "id": "organize",
              "options": {
                "excludeByName": { "Time": true, "__name__": true },
                "renameByName": { "job": "Job", "instance": "Instance", "Value": "Up" }
              }
            }
          ],
          "type": "table"
        },
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
          "id": 103,
          "title": "Services",
          "type": "row"
        },
        {
          "gridPos": { "h": 4, "w": 24, "x": 0, "y": 22 },
          "id": 10,
          "options": {
            "content": "### ArgoCD Applications\n\nView sync status and health of all 11 ArgoCD-managed applications:\n\n[Open ArgoCD Dashboard](https://argocd.iamworkin.lan)\n\n| App | Expected Status |\n|-----|----------------|\n| gitea | Healthy / Synced |\n| zabbix | Healthy / Synced |\n| guacamole | Healthy / Synced |\n| irc | Healthy / Synced |\n| mail | Healthy / Synced |\n| matrix | Healthy / Synced |\n| teamspeak | Healthy / Synced |\n| pki-web | Healthy / Synced |\n| intranet | Healthy / Synced |\n| telephony | Healthy / Synced |\n| infra-noc-services | Healthy / Synced |",
            "mode": "markdown"
          },
          "title": "ArgoCD App Status",
          "type": "text"
        },
        {
          "collapsed": false,
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 },
          "id": 104,
          "title": "Alerting",
          "type": "row"
        },
        {
          "fieldConfig": {
            "defaults": {
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "orange", "value": 1 },
                  { "color": "red", "value": 3 }
                ]
              }
            }
          },
          "gridPos": { "h": 6, "w": 24, "x": 0, "y": 27 },
          "id": 11,
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "count(ALERTS{alertstate=\"firing\"}) or vector(0)",
              "legendFormat": "Firing Alerts"
            }
          ],
          "title": "Firing Alerts",
          "type": "stat"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "tags": ["infrastructure", "blue-jay", "overview"],
      "time": { "from": "now-1h", "to": "now" },
      "timezone": "browser",
      "title": "Infrastructure Overview",
      "uid": "infra-overview",
      "version": 1
    }

# =============================================================================
# ConfigMap: Grafana Datasource Provisioning
# =============================================================================
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasource-provisioning
  namespace: monitoring
data:
  # Grafana datasource provisioning file: declares the in-cluster Prometheus
  # service as the default datasource.
  datasource.yml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        # Pin the UID. Dashboard panels ("uid": "prometheus") and alert rules
        # (datasourceUid: prometheus) look this datasource up by UID; without
        # an explicit value Grafana generates a random one and those
        # references fail to resolve.
        uid: prometheus
        access: proxy
        url: http://prometheus.monitoring.svc:9090
        isDefault: true
        editable: true

# =============================================================================
# ConfigMap: Grafana Alerting Provisioning (contact points + policies + rules)
# =============================================================================
# Makes alert rules declarative — survives pod rebuilds without API recreation
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting-provisioning
  namespace: monitoring
data:
  # Grafana alerting provisioning file for org 1: contact points, the
  # notification policy tree, and the alert rule groups.
  #
  # Every rule below uses the same three-step pipeline:
  #   A = instant query against the "prometheus" datasource
  #   B = reduce(A, last)
  #   C = threshold on B (referenced by the rule's `condition`)
  alerting.yml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        # Quoted on purpose: in a plain scalar " #alerts" starts a YAML comment,
        # which silently truncates the name to "IRC". Must match the root
        # policy's `receiver` below.
        name: "IRC #alerts"
        receivers:
          - uid: irc-alerts-webhook
            type: webhook
            settings:
              url: http://irc-notify.monitoring.svc:9119
              httpMethod: POST
            disableResolveMessage: false
      - orgId: 1
        name: Thermal Printer
        receivers:
          - uid: thermal-print-001
            type: webhook
            settings:
              # NOTE(review): same webhook URL as the IRC contact point —
              # confirm irc-notify is the intended relay for the printer.
              url: http://irc-notify.monitoring.svc:9119
              httpMethod: POST
            disableResolveMessage: true
    policies:
      - orgId: 1
        # Default receiver; quoted for the same reason as the contact point name.
        receiver: "IRC #alerts"
        group_by: ['alertname']
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 1h
        routes:
          # Rules labelled alert_channel=thermal_print are routed here.
          - receiver: Thermal Printer
            matchers: ['alert_channel = thermal_print']
            group_wait: 1m
            group_interval: 10m
            repeat_interval: 4h
            continue: true
    groups:
      - orgId: 1
        name: AI Stack
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: ollama-down-local
            title: Ollama DOWN (Local)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Ollama DOWN on workstation (R9700)
              description: Agent Zero LOCAL cannot reach Ollama. FAISS memory will fail.
              runbook: "1. Check PC power/wake 2. ssh stoltz@10.0.58.100 3. systemctl status ollama 4. journalctl -u ollama --since 5min"
            labels:
              severity: warning
              service: ollama
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-ollama-local"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: ollama-down-edge1
            title: Ollama DOWN (Edge1)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Ollama DOWN on edge1 Pi 5
              description: Agent Zero NUC cannot reach Ollama.
              runbook: "1. ssh stoltz@10.0.57.17 2. systemctl status ollama 3. journalctl -u ollama --since 5min 4. cat /sys/class/thermal/thermal_zone0/temp"
            labels:
              severity: warning
              service: ollama
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-ollama-edge1"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: a0-down-local
            title: Agent Zero DOWN (Local)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero LOCAL DOWN
              description: K3s web UI unreachable.
              runbook: "1. ssh stoltz@10.0.58.100 2. docker ps | grep agent-zero 3. docker logs agent-zero --tail 20 4. Check Ollama first (A0 crashes if Ollama down)"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-agentzero-local"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: a0-down-nuc
            title: Agent Zero DOWN (NUC)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero NUC DOWN
              description: RKE2 web UI unreachable.
              runbook: "1. ssh root@10.0.56.200 via noc1 2. kubectl get pods -n agent-zero 3. kubectl logs -n agent-zero deploy/agent-zero --tail 20"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-agentzero-nuc"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: print-ollama-runner-long-keepalive
            title: Print.Web Ollama runner keep-alive >10m
            condition: C
            for: 2m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: Print.Web Ollama runner held too long
              description: "A Print.Web Ollama runner has more than 10 minutes of keep-alive remaining. Check active AI requests before the Pi 5 Ollama lane thrashes."
              runbook: "1. Open https://print.iamworkin.lan/admin 2. Check Ollama Fleet model + remaining keep-alive 3. Query Prometheus: ai_ollama_runner_keepalive_remaining_seconds{job=\"printweb-otel\"} 4. Stop duplicate model callers before restarting Ollama."
            labels:
              severity: warning
              service: print-web
              alert_channel: thermal_print
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max by(model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                # 600 seconds = the 10-minute limit named in the title.
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
      - orgId: 1
        name: CI Runners
        folder: CI Alerts
        interval: 1m
        rules:
          - uid: linux-runner-offline
            title: LinuxRunnerOffline
            condition: C
            for: 5m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Linux CI runner offline: {{ $labels.deployment }}"
              description: "A github-runner namespace Deployment has 0 ready replicas for more than 5 minutes. CI jobs targeting that repo will queue until the runner pod restarts and re-registers."
              runbook: "1. kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }} 2. kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50 3. Verify PAT repo access if registration returns 404 4. Verify no RWO PVC is shared by scaled runners"
            labels:
              severity: warning
              service: github-runner
              alert_channel: irc
              team: ci
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                # Query the raw ready-replica count and let step C compare it.
                # A PromQL `== 0` filter keeps the sample value (0), which can
                # never satisfy a `gt 0` threshold, so the rule would never fire.
                model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                # Fires per deployment when ready replicas < 1 (i.e. 0).
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
      - orgId: 1
        name: Infrastructure
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: node-down
            title: Node DOWN
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Node down
              description: Node exporter unreachable for 2 minutes. Host may be down or network issue.
              runbook: "1. Ping host IP 2. SSH via noc1 jumpbox (root@10.0.56.10) 3. systemctl status node_exporter 4. Check network/VLAN 5. Physical power check if unreachable"
            labels:
              severity: critical
              service: infrastructure
              alert_channel: thermal_print
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'up{job=~"node-exporter|rke2-nodes|edge-nodes"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: macmini-runner-offline
            title: MacMiniRunnerOffline
            condition: C
            for: 10m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Mac mini GitHub runner offline
              description: "One or more macmini-* GitHub Actions runners have not reported online for more than 10 minutes. LaunchDaemons survive reboot and do not require the bluejay GUI session."
              runbook: "1. ssh fcadmin@macmini.iamworkin.lan 2. launchctl print system/io.flowercore.github-runner-<slug> 3. Check /Users/fcadmin/Library/Logs/github-runners/<slug>/stderr.log 4. Re-register the repo runner if .runner is missing"
            labels:
              severity: warning
              service: github-runner
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                # Aggregate first, then fall back. min(...) has an empty label
                # set, so `or vector(0)` only contributes when no runner series
                # exist. With `or vector(0)` inside min(), the label-less 0
                # sample is always part of the input and pins the result at 0,
                # which makes the alert fire permanently.
                model: {expr: 'min(flowercore_github_runner_online{runner=~"macmini-.*"}) or vector(0)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: high-cpu
            title: High CPU (>85%)
            condition: C
            for: 10m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: High CPU
              description: CPU above 85% for 10 minutes. Performance degradation likely.
              runbook: "1. SSH to host 2. top -b -n1 | head -20 3. Check for runaway process 4. kubectl top pods (if K8s node)"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_cpu_usage:avg5m', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
          - uid: high-memory
            title: High Memory (>90%)
            condition: C
            for: 5m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: High memory usage
              description: Memory above 90% for 5 minutes. OOM kills imminent.
              runbook: "1. SSH to host 2. free -h 3. ps aux --sort=-%mem | head 10 4. Check for memory leak (growing RSS)"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_memory_usage:percent', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [90], type: gt}}], refId: C}
          - uid: disk-low
            title: Disk Space Low (>85%)
            condition: C
            for: 10m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: Disk usage high
              description: Root disk above 85% for 10 minutes. Service disruption if full.
              runbook: "1. SSH to host 2. df -h / 3. du -sh /* | sort -rh | head 4. journalctl --vacuum-size=100M 5. podman system prune"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_disk_usage:percent', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
      - orgId: 1
        name: RemoteDesktop
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: remotedesktop-web-down
            title: RemoteDesktop Web DOWN
            condition: C
            for: 3m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: FlowerCore RemoteDesktop /health probe failing
              description: "https://desktop.iamworkin.lan/health has failed for 3 minutes. Catalog + session launch surface offline."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remotedesktop-web 2. kubectl -n fc-desktop logs deploy/remotedesktop-web --tail=50 3. Check Traefik IngressRoute + step-ca cert 4. Rollout restart if pod is stuck"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-remotedesktop"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 180, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}

          - uid: remotedesktop-metrics-stale
            title: RemoteDesktop metrics stale
            condition: C
            for: 10m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: RemoteDesktop /metrics returning no series
              description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape is misconfigured or the web deployment stopped exporting metrics. Cross-checked by Zabbix template's identical 10m no-data trigger."
              runbook: "1. curl -sk https://desktop.iamworkin.lan/metrics | head 2. kubectl -n monitoring exec deploy/prometheus -- wget -qO- localhost:9090/api/v1/targets?scrapePool=fc-remotedesktop 3. Check monitoring-netpol egress allows to fc-desktop:8080"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'count(fc_desktop_session_events_total) or vector(0)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}

          - uid: remotedesktop-pool-depleted
            title: RemoteDesktop pool depleted
            condition: C
            for: 5m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop warm pool depleted for 5m
              description: "A RemoteDesktop warm pool has fc_desktop_pool_depleted=1 for 5 minutes. New launches will cold-start. Check pod scheduling, image pull, node capacity."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop --sort-by=.status.startTime 2. kubectl -n fc-desktop describe desktoppool <name> 3. Verify localhost/fc-desktop:* images imported on all 3 RKE2 nodes"
            labels:
              severity: warning
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max(fc_desktop_pool_depleted)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.5], type: gt}}], refId: C}

          - uid: remotedesktop-pool-deficit-sustained
            title: RemoteDesktop pool below desired
            condition: C
            for: 10m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop pool sustained deficit
              description: "A pool has fc_desktop_pool_deficit>0 for 10 minutes. Operator is reconciling but can't reach desired size — likely image pull, NFS affinity, or claim-init issue."
              runbook: "1. kubectl -n fc-desktop get pods -l flowercore.io/pool=<pool> 2. kubectl logs -n fc-desktop deploy/remotedesktop-operator 3. Check claim-init hook env on template"
            labels:
              severity: info
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'max(fc_desktop_pool_deficit)', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}

          - uid: remotedesktop-session-churn-spike
            title: RemoteDesktop launch rate spike
            condition: C
            for: 5m
            noDataState: OK
            execErrState: OK
            annotations:
              summary: RemoteDesktop launch rate exceeds 20/min
              description: "Launch events >20/min for 5 minutes. Could be a user-facing feature launch, pooled template thrashing, or runaway automation loop."
              runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop -o wide | wc -l 2. curl -sk https://desktop.iamworkin.lan/api/sessions/active 3. Check operator logs for reconcile loops"
            labels:
              severity: info
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                # rate() is per second; `* 60` converts it to launches per minute.
                model: {expr: 'sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [20], type: gt}}], refId: C}

          - uid: remotedesktop-tls-expiry
            title: RemoteDesktop TLS cert expiring
            condition: C
            for: 6h
            noDataState: OK
            execErrState: OK
            annotations:
              summary: desktop.iamworkin.lan cert <2d to expiry
              description: "The desktop.iamworkin.lan certificate is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, FlowerCore.DNS preflight for dnsNames."
              runbook: "1. kubectl -n fc-desktop get certificate remotedesktop-web-tls 2. kubectl -n cert-manager logs deploy/cert-manager --tail=50 3. Verify pfSense DNS override for desktop.iamworkin.lan"
            labels:
              severity: critical
              service: remotedesktop
            data:
              - refId: A
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: prometheus
                # Seconds until expiry divided by 86400 = days remaining.
                model: {expr: '(probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time()) / 86400', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 21600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [2], type: lt}}], refId: C}

# =============================================================================
# Deployment: Grafana
# =============================================================================
---
# Grafana (dashboards). Single replica; /var/lib/grafana is backed by the
# grafana-data PVC, while dashboards, datasources and alerting config are
# mounted read-only from ConfigMaps.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: monitoring
  name: grafana
  labels:
    app: grafana
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  strategy:
    # The old pod is terminated before its replacement is created.
    type: Recreate
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        runAsUser: 472
        runAsGroup: 472
        fsGroup: 472  # grafana group
      volumes:
        # Grafana data directory
        - name: data
          persistentVolumeClaim:
            claimName: grafana-data
        # Dashboard provider definition
        - name: dashboard-provider
          configMap:
            name: grafana-dashboard-provider
        # One ConfigMap per dashboard folder
        - name: dashboards-ai-stack
          configMap:
            name: grafana-dashboards
        - name: dashboards-edge-nodes
          configMap:
            name: grafana-dashboard-edge-nodes
        - name: dashboards-network
          configMap:
            name: grafana-dashboard-network-overview
        - name: dashboards-operations
          configMap:
            name: grafana-dashboard-operations
        - name: dashboards-printer
          configMap:
            name: grafana-dashboard-printer
        - name: dashboards-infra-overview
          configMap:
            name: grafana-dashboard-infra-overview
        - name: dashboards-remotedesktop
          configMap:
            name: grafana-dashboard-remotedesktop
        # Datasource and alerting provisioning
        - name: datasource-provisioning
          configMap:
            name: grafana-datasource-provisioning
        - name: alerting-provisioning
          configMap:
            name: grafana-alerting-provisioning
      containers:
        - name: grafana
          image: docker.io/grafana/grafana:latest
          ports:
            - name: http
              containerPort: 3000
          resources:
            limits:
              cpu: 500m
              memory: 512Mi
            requests:
              cpu: 100m
              memory: 128Mi
          # Both probes poll Grafana's health endpoint on the HTTP port.
          readinessProbe:
            initialDelaySeconds: 10
            periodSeconds: 10
            httpGet:
              port: 3000
              path: /api/health
          livenessProbe:
            initialDelaySeconds: 30
            periodSeconds: 30
            httpGet:
              port: 3000
              path: /api/health
          env:
            # Admin credentials are read from the "grafana-credentials" Secret
            # (created by the 1Password Operator from a OnePasswordItem).
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  key: username
                  name: grafana-credentials
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  key: password
                  name: grafana-credentials
            - name: GF_SERVER_ROOT_URL
              value: 'https://grafana.iamworkin.lan'
            - name: GF_SERVER_SERVE_FROM_SUB_PATH
              value: 'false'
            # Zabbix plugin is not installed via GF_INSTALL_PLUGINS, because that
            # needs internet access at startup and breaks under a restrictive
            # NetworkPolicy. Install it by hand after first boot if needed:
            # kubectl exec -n monitoring deploy/grafana -- grafana cli plugins install alexanderzobnin-zabbix-app
          volumeMounts:
            - name: data
              mountPath: /var/lib/grafana
            - name: dashboard-provider
              readOnly: true
              mountPath: /etc/grafana/provisioning/dashboards
            - name: dashboards-ai-stack
              readOnly: true
              mountPath: /var/lib/grafana/dashboards/ai-stack
            - name: dashboards-edge-nodes
              readOnly: true
              mountPath: /var/lib/grafana/dashboards/edge-nodes
            - name: dashboards-network
              readOnly: true
              mountPath: /var/lib/grafana/dashboards/network
            - name: dashboards-operations
              readOnly: true
              mountPath: /var/lib/grafana/dashboards/operations
            - name: dashboards-printer
              readOnly: true
              mountPath: /var/lib/grafana/dashboards/printer
            - name: dashboards-infra-overview
              readOnly: true
              mountPath: /var/lib/grafana/dashboards/infra-overview
            - name: dashboards-remotedesktop
              readOnly: true
              mountPath: /var/lib/grafana/dashboards/remotedesktop
            - name: datasource-provisioning
              readOnly: true
              mountPath: /etc/grafana/provisioning/datasources
            - name: alerting-provisioning
              readOnly: true
              mountPath: /etc/grafana/provisioning/alerting

# =============================================================================
# Deployment: Blackbox Exporter
# =============================================================================
---
# Blackbox exporter: one replica listening on :9115, configured from the
# blackbox-config ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: monitoring
  name: blackbox-exporter
  labels:
    app: blackbox-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      volumes:
        - name: config
          configMap:
            name: blackbox-config
      containers:
        - name: blackbox-exporter
          image: quay.io/prometheus/blackbox-exporter:latest
          args:
            - '--config.file=/config/blackbox.yml'
          ports:
            - name: http
              containerPort: 9115
          resources:
            limits:
              cpu: 200m
              memory: 128Mi
            requests:
              cpu: 50m
              memory: 32Mi
          # Only the blackbox.yml key is projected, as a single file.
          volumeMounts:
            - name: config
              readOnly: true
              subPath: blackbox.yml
              mountPath: /config/blackbox.yml
          readinessProbe:
            initialDelaySeconds: 3
            periodSeconds: 10
            httpGet:
              port: 9115
              path: /
          livenessProbe:
            initialDelaySeconds: 5
            periodSeconds: 30
            httpGet:
              port: 9115
              path: /

# =============================================================================
# PVC: SNMP Exporter Config (100Mi, Longhorn)
# =============================================================================
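# The custom snmp.yml (~2MB) exceeds the 1MB ConfigMap limit.
# This PVC stores the config file. To load a custom config:
#   kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
# NOTE: the snmp-exporter container mounts /config read-only, so the copy has
# to go through a pod/container that mounts this PVC read-write.
# Then restart the pod to pick up the new config.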
---
# 100Mi ReadWriteOnce Longhorn volume holding snmp.yml.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: monitoring
  name: snmp-config
spec:
  storageClassName: longhorn
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 100Mi

# =============================================================================
# Deployment: SNMP Exporter
# =============================================================================
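# Uses a PVC-mounted config at /config/snmp.yml. An init container copies the
# default config from the image when /config/snmp.yml does not exist yet
# (first deploy).
# To load the custom noc1 snmp.yml (~2MB):
#   kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
# NOTE: the exporter container mounts /config read-only, so the copy above
# fails against that container as written; write the file from a pod or
# container that mounts the snmp-config PVC read-write.
# Then restart: kubectl rollout restart deploy/snmp-exporter -n monitoring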
---
# SNMP exporter: one replica on :9116 reading /config/snmp.yml from the
# snmp-config PVC. An init container seeds the file on first deploy.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: snmp-exporter
  namespace: monitoring
  labels:
    app: snmp-exporter
spec:
  replicas: 1
  # Recreate instead of the default RollingUpdate: the snmp-config PVC is
  # ReadWriteOnce, so with a rolling update a replacement pod scheduled onto a
  # different node cannot attach the volume while the old pod still holds it,
  # and the old pod is not removed until the new one is Ready — the rollout
  # stalls. Same strategy as the PVC-backed Grafana Deployment.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: snmp-exporter
  template:
    metadata:
      labels:
        app: snmp-exporter
    spec:
      initContainers:
        # Copy default snmp.yml from the image if /config/snmp.yml is missing
        # (first deploy). An existing file is never overwritten.
        # NOTE(review): this container writes to the PVC; if the image runs as
        # a non-root user and the volume root is root-owned, the cp will fail —
        # confirm, and add a pod-level securityContext.fsGroup if so.
        - name: init-config
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              if [ ! -f /config/snmp.yml ]; then
                echo "No custom config found, copying default from image..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default snmp.yml copied to PVC."
              else
                echo "Custom snmp.yml already exists on PVC, skipping copy."
              fi
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      containers:
        - name: snmp-exporter
          image: docker.io/prom/snmp-exporter:latest
          args:
            - "--config.file=/config/snmp.yml"
          ports:
            - containerPort: 9116
              name: http
          volumeMounts:
            # Read-only here: the exporter only reads the config. Writers (the
            # init container above) mount the same PVC read-write.
            - name: snmp-config
              mountPath: /config
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          livenessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 5
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 3
            periodSeconds: 10
      volumes:
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config

# =============================================================================
# Deployment: IRC Notify (alert relay)
# =============================================================================
---
# IRC notify: runs notify.py (from the irc-notify-script ConfigMap) on a stock
# Python image, listening on :9119.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: monitoring
  name: irc-notify
  labels:
    app: irc-notify
spec:
  replicas: 1
  selector:
    matchLabels:
      app: irc-notify
  template:
    metadata:
      labels:
        app: irc-notify
    spec:
      volumes:
        - name: script
          configMap:
            name: irc-notify-script
      containers:
        - name: irc-notify
          image: docker.io/library/python:3.12-slim
          command:
            - python3
            - /app/notify.py
          ports:
            - name: http
              containerPort: 9119
          resources:
            limits:
              cpu: 100m
              memory: 64Mi
            requests:
              cpu: 25m
              memory: 32Mi
          # Only the notify.py key is projected, as a single file.
          volumeMounts:
            - name: script
              readOnly: true
              subPath: notify.py
              mountPath: /app/notify.py
          # Probes only check that the TCP port accepts connections.
          readinessProbe:
            initialDelaySeconds: 3
            periodSeconds: 10
            tcpSocket:
              port: 9119
          livenessProbe:
            initialDelaySeconds: 5
            periodSeconds: 30
            tcpSocket:
              port: 9119

# =============================================================================
# DaemonSet: Node Exporter (runs on every RKE2 node)
# =============================================================================
# Port 9101 avoids conflict with host-level node-exporters already on :9100.
# The rke2-nodes Prometheus job scrapes the host instances on :9100; this
# DaemonSet provides K8s service-discovery-based scraping on :9101.
---
# Node exporter on every node: host network + host PID, host filesystem
# mounted read-only under /host, metrics served on :9101.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  namespace: monitoring
  name: node-exporter
  labels:
    app: node-exporter
spec:
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      hostNetwork: true
      hostPID: true
      # A bare "Exists" toleration matches every taint.
      tolerations:
        - operator: Exists
      securityContext:
        runAsUser: 0
        runAsNonRoot: false
      volumes:
        - name: rootfs
          hostPath:
            path: /
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
      containers:
        - name: node-exporter
          image: docker.io/prom/node-exporter:latest
          securityContext:
            readOnlyRootFilesystem: true
            privileged: true
          ports:
            - name: metrics
              containerPort: 9101
              hostPort: 9101
          resources:
            limits:
              cpu: 200m
              memory: 128Mi
            requests:
              cpu: 50m
              memory: 32Mi
          args:
            # Point the collectors at the host mounts below.
            - "--path.rootfs=/host"
            - "--path.sysfs=/host/sys"
            - "--path.procfs=/host/proc"
            # Skip pseudo/container filesystems and virtual CNI interfaces.
            - "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/containers|run/containers|var/lib/rancher|var/lib/longhorn)($|/)"
            - "--collector.netclass.ignored-devices=^(veth|cali|flannel|cni).*$"
            - "--no-collector.btrfs"
            - "--web.listen-address=:9101"
          volumeMounts:
            - name: rootfs
              readOnly: true
              mountPath: /host
              mountPropagation: HostToContainer
            - name: proc
              readOnly: true
              mountPath: /host/proc
            - name: sys
              readOnly: true
              mountPath: /host/sys

# =============================================================================
# Service: Prometheus (ClusterIP :9090)
# =============================================================================
---
# Cluster-internal endpoint for pods labelled app=prometheus.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: prometheus
  labels:
    app: prometheus
spec:
  type: ClusterIP
  selector:
    app: prometheus
  ports:
    - name: http
      protocol: TCP
      port: 9090
      targetPort: 9090

# =============================================================================
# Service: Grafana (ClusterIP :3000)
# =============================================================================
---
# Cluster-internal endpoint for pods labelled app=grafana.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: grafana
  labels:
    app: grafana
spec:
  type: ClusterIP
  selector:
    app: grafana
  ports:
    - name: http
      protocol: TCP
      port: 3000
      targetPort: 3000

# =============================================================================
# Service: Blackbox Exporter (ClusterIP :9115)
# =============================================================================
---
# Cluster-internal endpoint for pods labelled app=blackbox-exporter.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: blackbox-exporter
  labels:
    app: blackbox-exporter
spec:
  type: ClusterIP
  selector:
    app: blackbox-exporter
  ports:
    - name: http
      protocol: TCP
      port: 9115
      targetPort: 9115

# =============================================================================
# Service: SNMP Exporter (ClusterIP :9116)
# =============================================================================
---
# Cluster-internal endpoint for pods labelled app=snmp-exporter.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: snmp-exporter
  labels:
    app: snmp-exporter
spec:
  type: ClusterIP
  selector:
    app: snmp-exporter
  ports:
    - name: http
      protocol: TCP
      port: 9116
      targetPort: 9116

# =============================================================================
# Service: Node Exporter (headless for Prometheus SD)
# =============================================================================
---
# Headless (clusterIP: None): no virtual IP is allocated for this Service.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: node-exporter
  labels:
    app: node-exporter
spec:
  type: ClusterIP
  clusterIP: None
  selector:
    app: node-exporter
  ports:
    - name: metrics
      protocol: TCP
      port: 9101
      targetPort: 9101

# =============================================================================
# Service: IRC Notify (ClusterIP :9119)
# =============================================================================
---
# Cluster-internal endpoint for pods labelled app=irc-notify.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: irc-notify
  labels:
    app: irc-notify
spec:
  type: ClusterIP
  selector:
    app: irc-notify
  ports:
    - name: http
      protocol: TCP
      port: 9119
      targetPort: 9119

# =============================================================================
# TLS Certificates (cert-manager + step-ca ACME)
# =============================================================================
---
# Certificate for grafana.iamworkin.lan, issued by the step-ca-acme
# ClusterIssuer and stored in the grafana-tls Secret.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  namespace: monitoring
  name: grafana-tls
spec:
  dnsNames:
    - grafana.iamworkin.lan
  secretName: grafana-tls
  issuerRef:
    kind: ClusterIssuer
    name: step-ca-acme
---
# Certificate for prometheus.iamworkin.lan, issued by the step-ca-acme
# ClusterIssuer and stored in the prometheus-tls Secret.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  namespace: monitoring
  name: prometheus-tls
spec:
  dnsNames:
    - prometheus.iamworkin.lan
  secretName: prometheus-tls
  issuerRef:
    kind: ClusterIssuer
    name: step-ca-acme

# =============================================================================
# Traefik IngressRoute: Grafana
# =============================================================================
---
# Routes grafana.iamworkin.lan on the websecure entry point to the grafana
# Service, terminating TLS with the grafana-tls Secret.
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  namespace: monitoring
  name: grafana
spec:
  tls:
    secretName: grafana-tls
  entryPoints:
    - websecure
  routes:
    - match: Host(`grafana.iamworkin.lan`)
      kind: Rule
      services:
        - port: 3000
          name: grafana

# =============================================================================
# Traefik IngressRoute: Prometheus
# =============================================================================
---
# Routes prometheus.iamworkin.lan on the websecure entry point to the
# prometheus Service, terminating TLS with the prometheus-tls Secret.
# NOTE(review): no middleware is attached to this route — confirm Prometheus is
# meant to be reachable through Traefik without authentication.
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  namespace: monitoring
  name: prometheus
spec:
  tls:
    secretName: prometheus-tls
  entryPoints:
    - websecure
  routes:
    - match: Host(`prometheus.iamworkin.lan`)
      kind: Rule
      services:
        - port: 9090
          name: prometheus

# =============================================================================
# NetworkPolicy: monitoring namespace
# =============================================================================
---
# Default ingress + egress policy for the monitoring namespace. Both policy
# types are listed, so any traffic not matched by a rule below is denied for
# the selected pods.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: monitoring-netpol
  namespace: monitoring
spec:
  # Empty selector: applies to every pod in the namespace.
  podSelector: {}
  policyTypes:
    - Ingress
    - Egress
  # Ingress rules carry no `ports`, so each one admits all ports from its peer.
  ingress:
    # Allow from Traefik (IngressRoutes AND ACME solver pods)
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
    # Allow intra-namespace (prometheus→exporters, grafana→prometheus, grafana→irc-notify)
    - from:
        - podSelector: {}
    # Allow from cert-manager (ACME HTTP-01 self-check)
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: cert-manager
  egress:
    # DNS — port 53 UDP/TCP to pods in any namespace (empty namespaceSelector).
    - to:
        - namespaceSelector: {}
      ports:
        - port: 53
          protocol: UDP
        - port: 53
          protocol: TCP
    # MGMT VLAN (noc1, pfSense, switches, SNMP, node-exporter)
    # No `ports` on the three VLAN rules: every port/protocol is allowed.
    - to:
        - ipBlock:
            cidr: 10.0.56.0/24
    # PROD VLAN (edge nodes)
    - to:
        - ipBlock:
            cidr: 10.0.57.0/24
    # HOME VLAN (workstation, printer, NAS)
    - to:
        - ipBlock:
            cidr: 10.0.58.0/24
    # Intra-namespace
    - to:
        - podSelector: {}
    # Blackbox probes to other namespaces (agent-zero, etc)
    # Only the agent-zero namespace on TCP 80 is matched by this rule.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: agent-zero
      ports:
        - port: 80
          protocol: TCP
    # FlowerCore.RemoteDesktop /metrics scrape via the fc-desktop
    # ClusterIP Service (remotedesktop-web:8080). Also covers the
    # Traefik VIP hairpin path since after kube-proxy DNAT, the egress
    # destination is the backend pod IP on the service port (see
    # feedback_netpol_dnat_backend_port).
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: fc-desktop
      ports:
        - port: 8080
          protocol: TCP
    # Traefik backend ports — needed for in-cluster egress to public
    # iamworkin.lan hostnames that CoreDNS wildcard resolves to the
    # LoadBalancer VIP. Post-DNAT destination is a Traefik pod on 8080/8443.
    # namespaceSelector and podSelector sit in the SAME peer entry, so a
    # destination must match both (Traefik pods in traefik-system only).
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
          podSelector:
            matchLabels:
              app.kubernetes.io/name: traefik
      ports:
        - port: 8080
          protocol: TCP
        - port: 8443
          protocol: TCP
    # Traefik /metrics endpoint (port 9100) — separate from the data-path
    # ports above. Required for the in-cluster `traefik` scrape job.
    # Unlike the rule above, this one matches any pod in traefik-system.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
      ports:
        - port: 9100
          protocol: TCP
    # kube-state-metrics — required for kubernetes-state alert group.
    # Matches any pod in kube-system on TCP 8080.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: kube-system
      ports:
        - port: 8080
          protocol: TCP
    # cert-manager metrics — required for CertManagerCertificate* alerts.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: cert-manager
      ports:
        - port: 9402
          protocol: TCP
    # Longhorn manager metrics — required for Longhorn* alerts.
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: longhorn-system
      ports:
        - port: 9500
          protocol: TCP
    # IRC (irc-notify → UnrealIRCd in irc namespace via K8s DNS)
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: irc
      ports:
        - port: 6667
          protocol: TCP
        - port: 6697
          protocol: TCP
    # Step-CA ACME (cert renewal)
    # 10.0.56.10 is already inside the all-ports MGMT VLAN rule above; this
    # entry documents the dependency explicitly.
    - to:
        - ipBlock:
            cidr: 10.0.56.10/32
      ports:
        - port: 9443
          protocol: TCP
    # Internet (optional: Grafana plugin install, ACME)
    # All ports to any IPv4 address outside the RFC 1918 private ranges.
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.0.0.0/8
              - 172.16.0.0/12
              - 192.168.0.0/16

# =============================================================================
# Job: SNMP Config Loader (ArgoCD PostSync hook)
# =============================================================================
# Runs as an ArgoCD PostSync hook (i.e. after every successful sync, not only
# the first) to populate the SNMP config PVC.
# Attempts to download custom snmp.yml from noc1; falls back to the default
# config bundled in the snmp-exporter image.
---
# ArgoCD PostSync hook that refreshes /config/snmp.yml on the snmp-config PVC.
# Step 1 (init container) tries to fetch the config served by noc1's
# snmp-exporter; step 2 (main container) copies the image default only when
# the PVC still has no usable snmp.yml afterwards.
apiVersion: batch/v1
kind: Job
metadata:
  name: snmp-config-loader
  namespace: monitoring
  annotations:
    argocd.argoproj.io/hook: PostSync
    argocd.argoproj.io/hook-delete-policy: HookSucceeded
spec:
  # No retries: a failed pod fails the hook immediately.
  backoffLimit: 0
  template:
    metadata:
      labels:
        app: snmp-config-loader
    spec:
      restartPolicy: Never
      # Prefer the node that already runs snmp-exporter: the snmp-config PVC is
      # ReadWriteOnce, so on any other node this pod could hang waiting for the
      # volume to attach. Soft preference only, so scheduling still succeeds
      # when no exporter pod exists.
      affinity:
        podAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                topologyKey: kubernetes.io/hostname
                labelSelector:
                  matchLabels:
                    app: snmp-exporter
      initContainers:
        # Try to download custom snmp.yml from noc1.
        # The download goes to a temporary file and is moved into place only
        # when it succeeded and is non-empty, so a failed or partial download
        # never deletes or truncates a config that is already on the PVC.
        # NOTE(review): snmp_exporter's /config endpoint may redact secrets
        # (community strings, passwords) — confirm the downloaded file is
        # usable as-is.
        - name: download-config
          image: docker.io/curlimages/curl:latest
          command:
            - sh
            - -c
            - |
              tmp=/config/snmp.yml.download
              echo "Attempting to download custom snmp.yml from noc1..."
              if curl -sf --connect-timeout 10 --max-time 30 \
                   http://10.0.56.10:9116/config -o "$tmp" 2>/dev/null \
                 && [ -s "$tmp" ]; then
                mv -f "$tmp" /config/snmp.yml
                echo "Custom snmp.yml downloaded from noc1 successfully."
              else
                rm -f "$tmp"
                if [ -s /config/snmp.yml ]; then
                  echo "Download failed or empty, keeping existing snmp.yml on PVC."
                else
                  echo "Download failed or empty, will use default from image."
                fi
              fi
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      containers:
        # If the PVC still has no non-empty snmp.yml, copy the default config
        # from the image.
        - name: fallback-default
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              if [ -f /config/snmp.yml ] && [ -s /config/snmp.yml ]; then
                echo "Custom config already present, nothing to do."
              else
                echo "Copying default snmp.yml from image to PVC..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default config copied."
              fi
              echo "SNMP config loader complete."
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      volumes:
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config