diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index ad4f791..f77d7cd 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -104,21 +104,27 @@ data: - target_label: __address__ replacement: snmp-exporter.monitoring.svc:9116 - # UniFi Cloud Key SNMP - - job_name: "snmp-cloudkey" - static_configs: - - targets: ["10.0.56.3"] - metrics_path: /snmp - params: - module: [if_mib] - auth: [bluejay_v2] - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: snmp-exporter.monitoring.svc:9116 + # UniFi Cloud Key SNMP — DISABLED 2026-04-26 + # The Cloud Key Gen2+ runs unifi-core (controller) only — not a network + # device — and does NOT run an SNMP agent on UDP/161. Scrapes were + # silently failing with "connection refused" from 10.42.x.x:161 every + # 30s, polluting up{} = 0 and lastError on the Targets page. Hardware + # health (CPU/mem/disk) for the Cloud Key host should come from + # node_exporter via SSH — not SNMP. + # - job_name: "snmp-cloudkey" + # static_configs: + # - targets: ["10.0.56.3"] + # metrics_path: /snmp + # params: + # module: [if_mib] + # auth: [bluejay_v2] + # relabel_configs: + # - source_labels: [__address__] + # target_label: __param_target + # - source_labels: [__param_target] + # target_label: instance + # - target_label: __address__ + # replacement: snmp-exporter.monitoring.svc:9116 # UniFi Switch SNMP - job_name: "snmp-switch" @@ -279,10 +285,13 @@ data: replacement: blackbox-exporter.monitoring.svc:9115 # FlowerCore.RemoteDesktop web health (public cluster VIP) + # Module is https_internal — desktop.iamworkin.lan uses a step-ca leaf + # cert; blackbox does NOT trust step-ca root, so http_2xx fails with + # x509 unknown authority and probe_success=0 even when /health 200s. 
- job_name: "probe-remotedesktop" metrics_path: /probe params: - module: [http_2xx] + module: [https_internal] scrape_interval: 30s static_configs: - targets: ["https://desktop.iamworkin.lan/health"] @@ -330,26 +339,12 @@ data: # AI Stack Health Probes (Blackbox Exporter) # ============================================================================= - # Ollama API — workstation (LOCAL Agent Zero) - - job_name: "probe-ollama-local" - metrics_path: /probe - params: - module: [http_ollama] - scrape_interval: 30s - static_configs: - - targets: ["http://10.0.58.100:11434/api/tags"] - labels: - instance: "ollama-local" - service: "ollama" - deployment: "local" - gpu: "r9700" - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: blackbox-exporter.monitoring.svc:9115 + # NOTE: probe-ollama-local and probe-agentzero-local were REMOVED + # 2026-04-26. They pointed at 10.0.58.100 (HOME VLAN) which is not + # reachable from cluster pods (firewalled). They had been firing as + # OllamaDown / AgentZeroDown since 2026-04-24. Workstation/AI-laptop + # Ollama and Agent Zero should be monitored via host-side Puppet + # (node_exporter on the box) once the AI laptop is running 24/7. 
# Ollama API — edge1 Pi 5 (NUC Agent Zero) - job_name: "probe-ollama-edge1" @@ -372,34 +367,18 @@ data: - target_label: __address__ replacement: blackbox-exporter.monitoring.svc:9115 - # Agent Zero Web UI — local (K3s) - - job_name: "probe-agentzero-local" - metrics_path: /probe - params: - module: [http_2xx] - scrape_interval: 30s - static_configs: - - targets: ["http://10.0.58.100:30050/"] - labels: - instance: "agent-zero-local" - service: "agent-zero" - deployment: "local" - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: blackbox-exporter.monitoring.svc:9115 - - # Agent Zero Web UI — NUC (RKE2 via Traefik) + # Agent Zero Web UI — in-cluster (RKE2) + # Target uses short svc form (agent-zero.agent-zero.svc) NOT + # cluster.local FQDN — the *.cluster.local form gets rewritten to + # 10.0.56.200 (Traefik VIP) by the CoreDNS iamworkin.lan template + + # ndots:5 search-suffix expansion. Memory: feedback_coredns_ndots_template_collision. - job_name: "probe-agentzero-nuc" metrics_path: /probe params: module: [http_2xx] scrape_interval: 30s static_configs: - - targets: ["http://agent-zero.agent-zero.svc.cluster.local/"] + - targets: ["http://agent-zero.agent-zero.svc:80/"] labels: instance: "agent-zero-nuc" service: "agent-zero" @@ -412,6 +391,84 @@ data: - target_label: __address__ replacement: blackbox-exporter.monitoring.svc:9115 + # ============================================================================= + # K8s Cluster State (kube-state-metrics, cert-manager, traefik) + # ============================================================================= + # All exposed as NodePorts via the *-metrics-nodeport.yaml manifests in + # this dir. Single-node target — kube-proxy routes to whichever node + # the underlying pod runs on. 
+ + # kube-state-metrics — exposes K8s object state (pods, deployments, nodes) + # Required for KubeContainerRestartingFrequently / KubePodNotReady alerts. + - job_name: "kube-state-metrics" + scrape_interval: 30s + static_configs: + - targets: ["10.0.56.11:30901"] + labels: + cluster: "rke2" + + # cert-manager — exposes certmanager_certificate_ready_status, + # certmanager_certificate_expiration_timestamp_seconds, etc. Drives the + # CertManagerCertificateNotReady / CertManagerCertificateRenewalFailed + # alerts. Memory: project_cert_manager_prometheus_scrape. + - job_name: "cert-manager" + scrape_interval: 30s + static_configs: + - targets: ["10.0.56.11:30902"] + labels: + cluster: "rke2" + + # Traefik — request rates, latency, TLS cert metadata, router state. + # Three replicas (one per node) — scrape all so failover is visible. NOTE(review): these are NodePort targets; unless the Service sets externalTrafficPolicy: Local, a node's port may be balanced to any replica, so a target would not map to one fixed pod and counters could appear to reset — confirm against the traefik metrics NodePort manifest. + - job_name: "traefik" + scrape_interval: 15s + static_configs: + - targets: + - "10.0.56.11:30900" + - "10.0.56.12:30900" + - "10.0.56.13:30900" + labels: + service: "traefik" + cluster: "rke2" + + # FC web services through Traefik — single probe surface to spot any + # iamworkin.lan host returning non-200. Uses https_internal because all + # certs are step-ca leaves; blackbox would x509-fail with http_2xx. 
+ - job_name: "probe-traefik-services" + metrics_path: /probe + params: + module: [https_internal] + scrape_interval: 60s + static_configs: + - targets: + - "https://gitea.iamworkin.lan/" + - "https://argocd.iamworkin.lan/" + - "https://intranet.iamworkin.lan/" + - "https://signage.iamworkin.lan/" + - "https://kiosk.iamworkin.lan/" + - "https://media.iamworkin.lan/" + - "https://mysql.iamworkin.lan/" + - "https://php.iamworkin.lan/" + - "https://zabbix.iamworkin.lan/" + - "https://guac.iamworkin.lan/" + - "https://desktop.iamworkin.lan/" + - "https://print.iamworkin.lan/" + - "https://dns.iamworkin.lan/" + - "https://fc-llm-bridge.iamworkin.lan/healthz" + - "https://acme.iamworkin.lan:9443/health" + - "https://prometheus.iamworkin.lan/" + - "https://grafana.iamworkin.lan/" + labels: + probe_type: "traefik-service" + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + regex: "https?://([^/:]+).*" + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter.monitoring.svc:9115 + # ============================================================================= # Self-monitoring (K8s monitoring namespace) # ============================================================================= @@ -589,23 +646,31 @@ data: summary: "RemoteDesktop /metrics scrape returning no data" description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape target is misconfigured or the web deployment stopped exporting metrics. Zabbix template carries the same 10m no-data trigger for cross-monitor parity." + # fc_desktop_pool_depleted is emitted as state-as-label: one series + # per template per status (Ready/Warming/BelowDesiredSize/Disabled). + # The publisher does NOT reset old series to 0 when a template + # transitions states — it just emits a new series with new labels. 
+ # So a template that was Warming yesterday still has its + # Warming-labeled series stuck at 1 even when current status=Ready. + # Filter on the Critical alert_level (= BelowDesiredSize) so only + # genuine current-state depletion fires. The Deficit rule below uses the same kind of filter but matches Warning|Critical. NOTE(review): by the same reasoning a template that was BelowDesiredSize earlier would keep a stale Critical series at 1 — confirm the publisher clears it, otherwise this can still fire on stale state. - alert: RemoteDesktopPoolDepleted - expr: fc_desktop_pool_depleted > 0 + expr: fc_desktop_pool_depleted{alert_level="Critical",enabled="true"} > 0 for: 5m labels: severity: warning annotations: - summary: "RemoteDesktop pool {{ $labels.pool }} depleted ({{ $labels.template }})" - description: "Pool {{ $labels.pool }} has been depleted for 5 minutes. New launches will cold-start. Operator should check for pod-scheduling failures, image pull issues, or exhausted node capacity before warm-pool parity is expected back." + summary: "RemoteDesktop pool depleted ({{ $labels.template }})" + description: "Pool for template {{ $labels.template }} (status={{ $labels.status }}) has been depleted for 5 minutes. New launches will cold-start. Operator should check for pod-scheduling failures, image pull issues, or exhausted node capacity before warm-pool parity is expected back." - alert: RemoteDesktopPoolDeficitSustained - expr: fc_desktop_pool_deficit > 0 + expr: fc_desktop_pool_deficit{alert_level=~"Warning|Critical",enabled="true"} > 0 for: 10m labels: severity: info annotations: - summary: "RemoteDesktop pool {{ $labels.pool }} below desired for 10m" - description: "Pool {{ $labels.pool }} has a persistent deficit of {{ $value }} warm pods. The operator is reconciling but can't reach desired size — likely an image pull, NFS affinity, or claim-init issue." + summary: "RemoteDesktop pool {{ $labels.template }} below desired for 10m" + description: "Pool {{ $labels.template }} (status={{ $labels.status }}) has a persistent deficit of {{ $value }} warm pods. The operator is reconciling but can't reach desired size — likely an image pull, NFS affinity, or claim-init issue." 
- alert: RemoteDesktopSessionChurnSpike expr: sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20 @@ -625,8 +690,10 @@ data: summary: "RemoteDesktop recording events silent for 30m despite active launches" description: "No recording events in 30 minutes while launches are happening. Recording may be silently disabled on all templates (SessionRecordingEnabled=false), the guacd NFS mount may be unhealthy, or the retention sweep isn't emitting events. Not an error by itself — worth checking." + # Match by job — instance label carries full URL incl. /health, + # not just hostname, so a hostname-only match never fires. - alert: RemoteDesktopTlsExpiry - expr: probe_ssl_earliest_cert_expiry{instance="https://desktop.iamworkin.lan"} - time() < 2 * 86400 + expr: probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time() < 2 * 86400 for: 6h labels: severity: critical @@ -713,13 +780,16 @@ data: annotations: summary: "Epson ink CRITICAL: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%" + # for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min + # of idle and SNMP times out, so 5m for: would page nightly. A + # genuine printer outage (jam, disconnected) lasts well over 30m. - alert: EpsonPrinterDown expr: up{job="snmp-printer"} == 0 - for: 5m + for: 30m labels: severity: warning annotations: - summary: "Epson ET-3750 SNMP unreachable" + summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)" - alert: SynologyDiskLow expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85 @@ -773,6 +843,58 @@ data: annotations: summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)" + # K8s pod-state alerts. Require kube-state-metrics scrape (added + # 2026-04-26 — see scrape_configs above). Would have surfaced the + # agent-zero ollama-proxy 172x crash-loop instead of letting it + # silently churn for ~3 days. 
+ - name: kubernetes-state + rules: + - alert: KubeContainerRestartingFrequently + expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 + for: 15m + labels: + severity: warning + annotations: + summary: "{{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} restarting >5x/hr" + description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason." + + - alert: KubeContainerCrashLooping + expr: increase(kube_pod_container_status_restarts_total[15m]) > 3 + for: 5m + labels: + severity: critical + alert_channel: thermal_print + annotations: + summary: "{{ $labels.namespace }}/{{ $labels.pod }} crashlooping ({{ $value | printf \"%.0f\" }} restarts/15m)" + description: "Container {{ $labels.container }} restarted {{ $value | printf \"%.0f\" }} times in 15 minutes — actively crashlooping." + + - alert: KubePodNotReady + expr: sum by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Failed|Unknown"}) > 0 + for: 15m + labels: + severity: warning + annotations: + summary: "{{ $labels.namespace }}/{{ $labels.pod }} not Ready for >15m" + description: "Pod is in a non-Running, non-Succeeded phase for over 15 minutes. Common causes: ImagePullBackOff (registry/Nexus down, wrong image tag), pending PVC, scheduling failure (taint/resources)." + + - alert: KubePodImagePullBackOff + expr: sum by(namespace, pod) (kube_pod_container_status_waiting_reason{reason=~"ImagePullBackOff|ErrImagePull"}) > 0 + for: 10m + labels: + severity: warning + annotations: + summary: "{{ $labels.namespace }}/{{ $labels.pod }} ImagePullBackOff for >10m" + description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan." 
+ + - alert: KubeDeploymentReplicasMismatch + expr: kube_deployment_status_replicas_available != kube_deployment_spec_replicas # available on the LHS so $value reports the available count + for: 15m + labels: + severity: warning + annotations: + summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch" + description: "{{ $value }} replica(s) available, which does not match the desired spec.replicas. Likely a rollout stuck on probe failure, scheduling, or PVC." + # ============================================================================= # ConfigMap: Blackbox Exporter Configuration # ============================================================================= @@ -804,6 +926,22 @@ data: fail_if_body_not_matches_regexp: - '"models"' preferred_ip_protocol: ip4 + # https_internal — for Traefik-fronted services with step-ca leaf + # certs. blackbox does not trust the step-ca root CA, so http_2xx + # against any *.iamworkin.lan host fails with x509 unknown authority. + # Redirects + multiple status codes are accepted because some hosts + # 302 to /login or /scalar. NOTE(review): insecure_skip_verify below turns off certificate-chain validation for these probes; trusting the step-ca root via tls_config.ca_file would keep it — confirm whether the root can be mounted. + https_internal: + prober: http + timeout: 10s + http: + valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] + valid_status_codes: [200, 301, 302, 303, 307, 308] + method: GET + follow_redirects: true + preferred_ip_protocol: ip4 + tls_config: + insecure_skip_verify: true # ============================================================================= # ConfigMap: IRC Notify Script