From 05a273d3a6d7ade97068e2ac948789011a33041b Mon Sep 17 00:00:00 2001 From: Andrew Stoltz Date: Sun, 26 Apr 2026 13:05:32 -0500 Subject: [PATCH] monitoring: switch K8s scrapes to ClusterIP svc + fix probe paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to ab6ade4. Three issues uncovered after the rollout: 1. NodePort hairpin breaks scrape from same-node pod. Prometheus on rke2-agent1 could reach traefik-metrics on .11/.13 NodePort 30900 but timed out on its OWN node's NodePort. Same problem would hit kube-state-metrics + cert-manager whenever prometheus reschedules. Fix: scrape via ClusterIP svc DNS instead of NodePort. NodePorts stay in place for external/Podman scrapers. 2. probe-traefik-services failed for grafana, prometheus, guac with non-200/3xx codes. grafana + prometheus are behind Traefik basic-auth (every endpoint returns 401), so drop them from the probe surface — health is covered by the in-cluster monitoring-* scrape jobs. guac.iamworkin.lan was deprecated when Guacamole moved under desktop.iamworkin.lan/guacamole/ — drop it. 3. acme probe target was wrong: it used the explicit :9443 port. Probe /health on the default HTTPS port instead (root returns 404). Coverage adds (probe-traefik-services): chat, dist, dms, menuboard, messageboard, presentations, retail, ttsreader. All of these have IngressRoutes serving root at 200/3xx. NetworkPolicy egress rules added so the new ClusterIP svc scrapes work: - traefik-system: port 9100 (metrics) — separate from data-path 8080/8443 - kube-system: port 8080 (kube-state-metrics) - cert-manager: port 9402 (controller metrics) Out-of-band fix during this audit: - Print.Web on edge2 was inactive (clean exit at 12:55 CDT, root cause unclear — systemd Stopping signal). Restarted. Service back on 5200. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/monitoring/noc-monitoring.yaml | 75 +++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index f77d7cd..461ced3 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -394,16 +394,19 @@ data: # ============================================================================= # K8s Cluster State (kube-state-metrics, cert-manager, traefik) # ============================================================================= - # All exposed as NodePorts via the *-metrics-nodeport.yaml manifests in - # this dir. Single-node target — kube-proxy routes to whichever node - # the underlying pod runs on. + # Use in-cluster ClusterIP service DNS — NOT NodePorts — so a same-node + # NodePort hairpin doesn't break the scrape (hit on rke2-agent1 hosting + # both prometheus and traefik on 2026-04-26: 10.0.56.12:30900 timed out + # from prometheus while .11/.13 worked). NodePorts at 30900-30902 are + # still useful for noc1-Podman-style external scrapers, but in-cluster + # we should always use the svc DNS form. # kube-state-metrics — exposes K8s object state (pods, deployments, nodes) # Required for KubeContainerRestartingFrequently / KubePodNotReady alerts. - job_name: "kube-state-metrics" scrape_interval: 30s static_configs: - - targets: ["10.0.56.11:30901"] + - targets: ["kube-state-metrics.kube-system.svc:8080"] labels: cluster: "rke2" @@ -414,19 +417,18 @@ data: - job_name: "cert-manager" scrape_interval: 30s static_configs: - - targets: ["10.0.56.11:30902"] + - targets: ["cert-manager-metrics.cert-manager.svc:9402"] labels: cluster: "rke2" # Traefik — request rates, latency, TLS cert metadata, router state. - # Three replicas (one per node) — scrape all so failover is visible. 
+ # ClusterIP svc routes each scrape to just ONE of the traefik pods, not all + # replicas; per-pod scrape via the headless `traefik-metrics` selector would be + # nicer for failover visibility, but a one-pod sample is enough for steady-state. - job_name: "traefik" scrape_interval: 15s static_configs: - - targets: - - "10.0.56.11:30900" - - "10.0.56.12:30900" - - "10.0.56.13:30900" + - targets: ["traefik-metrics.traefik-system.svc:9100"] labels: service: "traefik" cluster: "rke2" @@ -434,6 +436,10 @@ data: # FC web services through Traefik — single probe surface to spot any # iamworkin.lan host returning non-200. Uses https_internal because all # certs are step-ca leaves; blackbox would x509-fail with http_2xx. + # Some services need explicit healthcheck paths because root returns + # 404 (acme, guac) or 401 (grafana, prometheus). Drop them or point at + # the right endpoint — don't widen valid_status_codes globally because + # 401 from a healthy pod and 401 from an outage look identical. - job_name: "probe-traefik-services" metrics_path: /probe params: @@ -441,6 +447,7 @@ data: scrape_interval: 60s static_configs: - targets: + # Root-reachable services (200 or 3xx) - "https://gitea.iamworkin.lan/" - "https://argocd.iamworkin.lan/" - "https://intranet.iamworkin.lan/" @@ -450,14 +457,29 @@ data: - "https://mysql.iamworkin.lan/" - "https://php.iamworkin.lan/" - "https://zabbix.iamworkin.lan/" - - "https://guac.iamworkin.lan/" - "https://desktop.iamworkin.lan/" - "https://print.iamworkin.lan/" - "https://dns.iamworkin.lan/" + - "https://chat.iamworkin.lan/" + - "https://dist.iamworkin.lan/" + - "https://dms.iamworkin.lan/" + - "https://menuboard.iamworkin.lan/" + - "https://messageboard.iamworkin.lan/" + - "https://presentations.iamworkin.lan/" + - "https://retail.iamworkin.lan/" + - "https://ttsreader.iamworkin.lan/" + # Explicit healthcheck paths - "https://fc-llm-bridge.iamworkin.lan/healthz" - - "https://acme.iamworkin.lan:9443/health" - - "https://prometheus.iamworkin.lan/" - 
"https://grafana.iamworkin.lan/" + - "https://acme.iamworkin.lan/health" + # NOTE: services intentionally NOT in this probe surface + # - grafana.iamworkin.lan: every endpoint (incl. /api/health + # and /login) returns 401 behind Traefik basic-auth. + # Health covered by in-cluster monitoring-grafana scrape. + # - prometheus.iamworkin.lan: same auth pattern. Health covered + # by the prometheus self-scrape job. + # - guac.iamworkin.lan: deprecated — Guacamole moved to + # desktop.iamworkin.lan/guacamole/ (memory: + # feedback_traefik_cross_namespace_refs_disabled). labels: probe_type: "traefik-service" relabel_configs: @@ -4164,6 +4186,31 @@ spec: protocol: TCP - port: 8443 protocol: TCP + # Traefik /metrics endpoint (port 9100) — separate from the data-path + # ports above. Required for the in-cluster `traefik` scrape job. + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: traefik-system + ports: + - port: 9100 + protocol: TCP + # kube-state-metrics — required for kubernetes-state alert group. + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + ports: + - port: 8080 + protocol: TCP + # cert-manager metrics — required for CertManagerCertificate* alerts. + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: cert-manager + ports: + - port: 9402 + protocol: TCP # IRC (irc-notify → UnrealIRCd in irc namespace via K8s DNS) - to: - namespaceSelector: