diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml
index ad4f791..f77d7cd 100644
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -104,21 +104,27 @@ data:
           - target_label: __address__
             replacement: snmp-exporter.monitoring.svc:9116
 
-      # UniFi Cloud Key SNMP
-      - job_name: "snmp-cloudkey"
-        static_configs:
-          - targets: ["10.0.56.3"]
-        metrics_path: /snmp
-        params:
-          module: [if_mib]
-          auth: [bluejay_v2]
-        relabel_configs:
-          - source_labels: [__address__]
-            target_label: __param_target
-          - source_labels: [__param_target]
-            target_label: instance
-          - target_label: __address__
-            replacement: snmp-exporter.monitoring.svc:9116
+      # UniFi Cloud Key SNMP — DISABLED 2026-04-26
+      # The Cloud Key Gen2+ runs unifi-core (controller) only — not a network
+      # device — and does NOT run an SNMP agent on UDP/161. Scrapes were
+      # silently failing with "connection refused" from 10.42.x.x:161 every
+      # 30s, polluting up{} = 0 and lastError on the Targets page. Hardware
+      # health (CPU/mem/disk) for the Cloud Key host should come from
+      # node_exporter via SSH — not SNMP.
+      # - job_name: "snmp-cloudkey"
+      #   static_configs:
+      #     - targets: ["10.0.56.3"]
+      #   metrics_path: /snmp
+      #   params:
+      #     module: [if_mib]
+      #     auth: [bluejay_v2]
+      #   relabel_configs:
+      #     - source_labels: [__address__]
+      #       target_label: __param_target
+      #     - source_labels: [__param_target]
+      #       target_label: instance
+      #     - target_label: __address__
+      #       replacement: snmp-exporter.monitoring.svc:9116
 
       # UniFi Switch SNMP
       - job_name: "snmp-switch"
@@ -279,10 +285,13 @@ data:
             replacement: blackbox-exporter.monitoring.svc:9115
 
       # FlowerCore.RemoteDesktop web health (public cluster VIP)
+      # Module is https_internal — desktop.iamworkin.lan uses a step-ca leaf
+      # cert; blackbox does NOT trust step-ca root, so http_2xx fails with
+      # x509 unknown authority and probe_success=0 even when /health 200s.
       - job_name: "probe-remotedesktop"
         metrics_path: /probe
         params:
-          module: [http_2xx]
+          module: [https_internal]
         scrape_interval: 30s
         static_configs:
           - targets: ["https://desktop.iamworkin.lan/health"]
@@ -330,26 +339,12 @@ data:
       # AI Stack Health Probes (Blackbox Exporter)
       # =============================================================================
 
-      # Ollama API — workstation (LOCAL Agent Zero)
-      - job_name: "probe-ollama-local"
-        metrics_path: /probe
-        params:
-          module: [http_ollama]
-        scrape_interval: 30s
-        static_configs:
-          - targets: ["http://10.0.58.100:11434/api/tags"]
-            labels:
-              instance: "ollama-local"
-              service: "ollama"
-              deployment: "local"
-              gpu: "r9700"
-        relabel_configs:
-          - source_labels: [__address__]
-            target_label: __param_target
-          - source_labels: [__param_target]
-            target_label: instance
-          - target_label: __address__
-            replacement: blackbox-exporter.monitoring.svc:9115
+      # NOTE: probe-ollama-local and probe-agentzero-local were REMOVED
+      # 2026-04-26. They pointed at 10.0.58.100 (HOME VLAN) which is not
+      # reachable from cluster pods (firewalled). They had been firing as
+      # OllamaDown / AgentZeroDown since 2026-04-24. Workstation/AI-laptop
+      # Ollama and Agent Zero should be monitored via host-side Puppet
+      # (node_exporter on the box) once the AI laptop is running 24/7.
 
       # Ollama API — edge1 Pi 5 (NUC Agent Zero)
       - job_name: "probe-ollama-edge1"
@@ -372,34 +367,18 @@ data:
           - target_label: __address__
             replacement: blackbox-exporter.monitoring.svc:9115
 
-      # Agent Zero Web UI — local (K3s)
-      - job_name: "probe-agentzero-local"
-        metrics_path: /probe
-        params:
-          module: [http_2xx]
-        scrape_interval: 30s
-        static_configs:
-          - targets: ["http://10.0.58.100:30050/"]
-            labels:
-              instance: "agent-zero-local"
-              service: "agent-zero"
-              deployment: "local"
-        relabel_configs:
-          - source_labels: [__address__]
-            target_label: __param_target
-          - source_labels: [__param_target]
-            target_label: instance
-          - target_label: __address__
-            replacement: blackbox-exporter.monitoring.svc:9115
-
-      # Agent Zero Web UI — NUC (RKE2 via Traefik)
+      # Agent Zero Web UI — in-cluster (RKE2)
+      # Target uses short svc form (agent-zero.agent-zero.svc) NOT
+      # cluster.local FQDN — the *.cluster.local form gets rewritten to
+      # 10.0.56.200 (Traefik VIP) by the CoreDNS iamworkin.lan template +
+      # ndots:5 search-suffix expansion. Memory: feedback_coredns_ndots_template_collision.
       - job_name: "probe-agentzero-nuc"
         metrics_path: /probe
         params:
           module: [http_2xx]
         scrape_interval: 30s
         static_configs:
-          - targets: ["http://agent-zero.agent-zero.svc.cluster.local/"]
+          - targets: ["http://agent-zero.agent-zero.svc:80/"]
             labels:
               instance: "agent-zero-nuc"
               service: "agent-zero"
@@ -412,6 +391,84 @@ data:
           - target_label: __address__
             replacement: blackbox-exporter.monitoring.svc:9115
 
+      # =============================================================================
+      # K8s Cluster State (kube-state-metrics, cert-manager, traefik)
+      # =============================================================================
+      # All exposed as NodePorts via the *-metrics-nodeport.yaml manifests in
+      # this dir. Where a job lists a single node IP, kube-proxy routes the
+      # scrape to whichever node the underlying pod runs on.
+
+      # kube-state-metrics: K8s object state (pods, deployments, nodes). The
+      # KubeContainerRestartingFrequently / KubePodNotReady alerts depend on it.
+      - job_name: "kube-state-metrics"
+        static_configs:
+          - targets: ["10.0.56.11:30901"]  # one node IP, port 30901
+            labels:
+              cluster: "rke2"
+        scrape_interval: 30s
+
+      # cert-manager: source of certmanager_certificate_ready_status,
+      # certmanager_certificate_expiration_timestamp_seconds, etc. Feeds the
+      # CertManagerCertificateNotReady / CertManagerCertificateRenewalFailed
+      # alerts. Memory: project_cert_manager_prometheus_scrape.
+      - job_name: "cert-manager"
+        static_configs:
+          - targets: ["10.0.56.11:30902"]  # one node IP, port 30902
+            labels:
+              cluster: "rke2"
+        scrape_interval: 30s
+
+      # Traefik: request rates, latency, TLS cert metadata, router state. One replica per node; every node IP is scraped so failover is visible.
+      # NOTE(review): assumes port 30900 on each node reaches that node's own replica (externalTrafficPolicy: Local) — TODO confirm.
+      - job_name: "traefik"
+        static_configs:
+          - labels:
+              cluster: "rke2"
+              service: "traefik"
+            targets:
+              - "10.0.56.11:30900"
+              - "10.0.56.12:30900"
+              - "10.0.56.13:30900"
+        scrape_interval: 15s
+
+      # FC web services through Traefik — single probe surface to spot any
+      # iamworkin.lan host that fails its probe. Uses https_internal because all
+      # certs are step-ca leaves; blackbox would x509-fail with http_2xx.
+      - job_name: "probe-traefik-services"
+        metrics_path: /probe
+        params:
+          module: [https_internal]
+        scrape_interval: 60s
+        static_configs:
+          - targets:
+              - "https://gitea.iamworkin.lan/"
+              - "https://argocd.iamworkin.lan/"
+              - "https://intranet.iamworkin.lan/"
+              - "https://signage.iamworkin.lan/"
+              - "https://kiosk.iamworkin.lan/"
+              - "https://media.iamworkin.lan/"
+              - "https://mysql.iamworkin.lan/"
+              - "https://php.iamworkin.lan/"
+              - "https://zabbix.iamworkin.lan/"
+              - "https://guac.iamworkin.lan/"
+              - "https://desktop.iamworkin.lan/"
+              - "https://print.iamworkin.lan/"
+              - "https://dns.iamworkin.lan/"
+              - "https://fc-llm-bridge.iamworkin.lan/healthz"
+              - "https://acme.iamworkin.lan:9443/health"
+              - "https://prometheus.iamworkin.lan/"
+              - "https://grafana.iamworkin.lan/"
+            labels:
+              probe_type: "traefik-service"
+        relabel_configs:
+          - source_labels: [__address__]  # step 1: pass the listed URL to blackbox as the "target" URL parameter
+            target_label: __param_target
+          - source_labels: [__param_target]
+            regex: "https?://([^/:]+).*"  # step 2: capture the host only (scheme, port and path are dropped)
+            target_label: instance
+          - target_label: __address__  # step 3: the scrape itself goes to the blackbox exporter
+            replacement: blackbox-exporter.monitoring.svc:9115
+
       # =============================================================================
       # Self-monitoring (K8s monitoring namespace)
       # =============================================================================
@@ -589,23 +646,31 @@ data:
               summary: "RemoteDesktop /metrics scrape returning no data"
               description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape target is misconfigured or the web deployment stopped exporting metrics. Zabbix template carries the same 10m no-data trigger for cross-monitor parity."
 
+          # fc_desktop_pool_depleted is emitted as state-as-label: one series
+          # per template per status (Ready/Warming/BelowDesiredSize/Disabled).
+          # The publisher does NOT reset old series to 0 when a template
+          # changes state — it emits a new series with the new labels, so a
+          # template that was Warming yesterday keeps a Warming series at 1.
+          # Matching alert_level="Critical" (= BelowDesiredSize) drops those
+          # non-depleted leftovers; the Deficit rule below filters the same way.
+          # NOTE(review): a leftover Critical-labeled series would still match — confirm the publisher clears it.
           - alert: RemoteDesktopPoolDepleted
-            expr: fc_desktop_pool_depleted > 0
+            expr: fc_desktop_pool_depleted{alert_level="Critical",enabled="true"} > 0
             for: 5m
             labels:
               severity: warning
             annotations:
-              summary: "RemoteDesktop pool {{ $labels.pool }} depleted ({{ $labels.template }})"
-              description: "Pool {{ $labels.pool }} has been depleted for 5 minutes. New launches will cold-start. Operator should check for pod-scheduling failures, image pull issues, or exhausted node capacity before warm-pool parity is expected back."
+              summary: "RemoteDesktop pool depleted ({{ $labels.template }})"
+              description: "Pool for template {{ $labels.template }} (status={{ $labels.status }}) has been depleted for 5 minutes. New launches will cold-start. Operator should check for pod-scheduling failures, image pull issues, or exhausted node capacity before warm-pool parity is expected back."
 
           - alert: RemoteDesktopPoolDeficitSustained
-            expr: fc_desktop_pool_deficit > 0
+            expr: fc_desktop_pool_deficit{alert_level=~"Warning|Critical",enabled="true"} > 0  # only enabled templates whose alert_level is Warning or Critical
             for: 10m
             labels:
               severity: info
             annotations:
-              summary: "RemoteDesktop pool {{ $labels.pool }} below desired for 10m"
-              description: "Pool {{ $labels.pool }} has a persistent deficit of {{ $value }} warm pods. The operator is reconciling but can't reach desired size — likely an image pull, NFS affinity, or claim-init issue."
+              summary: "RemoteDesktop pool {{ $labels.template }} below desired for 10m"
+              description: "Pool {{ $labels.template }} (status={{ $labels.status }}) has a persistent deficit of {{ $value }} warm pods. The operator is reconciling but can't reach desired size — likely an image pull, NFS affinity, or claim-init issue."
 
           - alert: RemoteDesktopSessionChurnSpike
             expr: sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20
@@ -625,8 +690,10 @@ data:
               summary: "RemoteDesktop recording events silent for 30m despite active launches"
               description: "No recording events in 30 minutes while launches are happening. Recording may be silently disabled on all templates (SessionRecordingEnabled=false), the guacd NFS mount may be unhealthy, or the retention sweep isn't emitting events. Not an error by itself — worth checking."
 
+          # Match by job — instance label carries full URL incl. /health,
+          # not just hostname, so a hostname-only match never fires.
           - alert: RemoteDesktopTlsExpiry
-            expr: probe_ssl_earliest_cert_expiry{instance="https://desktop.iamworkin.lan"} - time() < 2 * 86400
+            expr: probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time() < 2 * 86400
             for: 6h
             labels:
               severity: critical
@@ -713,13 +780,16 @@ data:
             annotations:
               summary: "Epson ink CRITICAL: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
 
+          # for: 30m is meant to absorb sleep cycles: the EcoTank sleeps after ~5 min idle and SNMP
+          # times out, so the previous for: 5m paged nightly. A genuine outage (jam, disconnected) lasts well over 30m.
+          # NOTE(review): presumably sleep-related timeouts clear within 30m — confirm, otherwise this still fires overnight.
           - alert: EpsonPrinterDown
             expr: up{job="snmp-printer"} == 0
-            for: 5m
+            for: 30m
             labels:
               severity: warning
             annotations:
-              summary: "Epson ET-3750 SNMP unreachable"
+              summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)"
 
           - alert: SynologyDiskLow
             expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85
@@ -773,6 +843,58 @@ data:
             annotations:
               summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
 
+      # K8s pod-state alerts built on kube-state-metrics series (scrape job
+      # added 2026-04-26 — see scrape_configs above). Would have surfaced the
+      # agent-zero ollama-proxy 172x crash-loop instead of letting it
+      # silently churn for ~3 days.
+      - name: kubernetes-state
+        rules:
+          - alert: KubeContainerRestartingFrequently
+            expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
+            for: 15m  # slow-burn variant; KubeContainerCrashLooping covers the 15m window
+            labels:
+              severity: warning
+            annotations:
+              summary: "{{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} restarting >5x/hr"
+              description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
+
+          - alert: KubeContainerCrashLooping
+            expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
+            for: 5m
+            labels:
+              severity: critical
+              alert_channel: thermal_print
+            annotations:
+              summary: "{{ $labels.namespace }}/{{ $labels.pod }} crashlooping ({{ $value | printf \"%.0f\" }} restarts/15m)"
+              description: "Container {{ $labels.container }} restarted {{ $value | printf \"%.0f\" }} times in 15 minutes — actively crashlooping."
+
+          - alert: KubePodNotReady
+            expr: sum by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Failed|Unknown"}) > 0
+            for: 15m  # phase-based only: a Running pod that fails its readiness probe is not matched
+            labels:
+              severity: warning
+            annotations:
+              summary: "{{ $labels.namespace }}/{{ $labels.pod }} stuck in Pending/Failed/Unknown phase for >15m"
+              description: "Pod is in a non-Running, non-Succeeded phase for over 15 minutes. Common causes: ImagePullBackOff (registry/Nexus down, wrong image tag), pending PVC, scheduling failure (taint/resources)."
+
+          - alert: KubePodImagePullBackOff
+            expr: sum by(namespace, pod) (kube_pod_container_status_waiting_reason{reason=~"ImagePullBackOff|ErrImagePull"}) > 0
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "{{ $labels.namespace }}/{{ $labels.pod }} ImagePullBackOff for >10m"
+              description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
+
+          - alert: KubeDeploymentReplicasMismatch
+            expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available  # "!=" keeps the left-hand sample, so $value is the spec (desired) count
+            for: 15m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
+              description: "Spec wants {{ $value }} replicas but the available replica count has differed for over 15 minutes. Likely a rollout stuck on probe failure, scheduling, or PVC."
+
 # =============================================================================
 # ConfigMap: Blackbox Exporter Configuration
 # =============================================================================
@@ -804,6 +926,22 @@ data:
           fail_if_body_not_matches_regexp:
             - '"models"'
           preferred_ip_protocol: ip4
+      # https_internal: HTTP probe for Traefik-fronted *.iamworkin.lan hosts
+      # whose leaf certs come from step-ca. blackbox does not trust the
+      # step-ca root CA, so http_2xx fails there with x509 unknown authority.
+      # Redirects are followed and 3xx codes accepted because some hosts 302
+      # to /login or /scalar.
+      https_internal:
+        timeout: 10s
+        prober: http
+        http:
+          method: GET
+          follow_redirects: true
+          valid_status_codes: [200, 301, 302, 303, 307, 308]
+          valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
+          preferred_ip_protocol: ip4
+          tls_config:
+            insecure_skip_verify: true  # NOTE(review): skips all certificate verification; trusting the step-ca root via ca_file would be stricter
 
 # =============================================================================
 # ConfigMap: IRC Notify Script