monitoring: stabilize firing alerts + add cluster-state coverage
Live audit on 2026-04-26 found 14 firing alerts caused by stale probe targets, blackbox TLS verify failures, and stale state-as-label series. It also found three K8s scrape sources (kube-state-metrics, cert-manager, traefik) that exposed NodePorts but were not in any scrape config.

Fixes
- probe-remotedesktop: switch http_2xx -> https_internal. Blackbox does not trust step-ca root, so /health was failing with x509 unknown authority while the app served 200s.
- probe-agentzero-nuc: short svc form (agent-zero.agent-zero.svc:80) instead of *.cluster.local. The FQDN form was being rewritten to the Traefik VIP by the CoreDNS iamworkin.lan template + ndots:5 search expansion, and the probe then timed out after 5s.
- probe-agentzero-local + probe-ollama-local: removed. 10.0.58.100 is on HOME VLAN and not reachable from cluster pods. Workstation/AI-laptop Ollama monitoring belongs to host-side Puppet, not cluster blackbox.
- snmp-cloudkey: commented out. The Cloud Key Gen2+ runs unifi-core (controller), not an SNMP agent. Was generating "connection refused" every 30s.
- RemoteDesktopPoolDepleted / RemoteDesktopPoolDeficitSustained: filter on alert_level=Critical / Warning|Critical + enabled=true. The publisher emits one series per template per status without resetting old series to 0, so the historical Warming/BelowDesiredSize series stayed at 1 and the alert kept firing on stale labels.
- RemoteDesktopTlsExpiry: match by job, not hostname-only instance. The probe sets instance=https://desktop.iamworkin.lan/health so a hostname-only label match never fired.
- EpsonPrinterDown for: 5m -> 30m. EcoTank sleeps after ~5 min idle and SNMP times out, so 5m guaranteed nightly noise.

Coverage adds
- kube-state-metrics scrape (NodePort 30901). Required for the new pod-state alerts and a long list of standard K8s SLO queries.
- cert-manager scrape (NodePort 30902). Required for the CertManagerCertificateNotReady / RenewalFailed alert pair documented in project_cert_manager_prometheus_scrape.
- traefik scrape (NodePort 30900) on all three nodes.
- probe-traefik-services: HTTPS probe (https_internal) over the 17 main iamworkin.lan hosts so any Traefik-fronted service returning non-200 shows up as a single named probe failure.
- blackbox-config: add the https_internal module that the new probes reference (was only in the FlowerCore.Notes scripts/monitoring copy, not in the live ConfigMap).

New alerts (kubernetes-state group)
- KubeContainerRestartingFrequently (>5 restarts/h)
- KubeContainerCrashLooping (>3 restarts/15m, thermal print)
- KubePodNotReady (Pending/Failed/Unknown >15m)
- KubePodImagePullBackOff (>10m)
- KubeDeploymentReplicasMismatch (>15m)

Without these, the agent-zero ollama-proxy 172x restart loop was invisible for ~3 days. Same gap would have hidden the fc-php php84-app-probe ImagePullBackOff orphan (cleaned up out of band).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -104,21 +104,27 @@ data:
|
|||||||
- target_label: __address__
|
- target_label: __address__
|
||||||
replacement: snmp-exporter.monitoring.svc:9116
|
replacement: snmp-exporter.monitoring.svc:9116
|
||||||
|
|
||||||
# UniFi Cloud Key SNMP
|
# UniFi Cloud Key SNMP — DISABLED 2026-04-26
|
||||||
- job_name: "snmp-cloudkey"
|
# The Cloud Key Gen2+ runs unifi-core (controller) only — not a network
|
||||||
static_configs:
|
# device — and does NOT run an SNMP agent on UDP/161. Scrapes were
|
||||||
- targets: ["10.0.56.3"]
|
# silently failing with "connection refused" from 10.42.x.x:161 every
|
||||||
metrics_path: /snmp
|
# 30s, leaving up == 0 and a lastError on the Targets page. Hardware
|
||||||
params:
|
# health (CPU/mem/disk) for the Cloud Key host should come from
|
||||||
module: [if_mib]
|
# node_exporter via SSH — not SNMP.
|
||||||
auth: [bluejay_v2]
|
# - job_name: "snmp-cloudkey"
|
||||||
relabel_configs:
|
# static_configs:
|
||||||
- source_labels: [__address__]
|
# - targets: ["10.0.56.3"]
|
||||||
target_label: __param_target
|
# metrics_path: /snmp
|
||||||
- source_labels: [__param_target]
|
# params:
|
||||||
target_label: instance
|
# module: [if_mib]
|
||||||
- target_label: __address__
|
# auth: [bluejay_v2]
|
||||||
replacement: snmp-exporter.monitoring.svc:9116
|
# relabel_configs:
|
||||||
|
# - source_labels: [__address__]
|
||||||
|
# target_label: __param_target
|
||||||
|
# - source_labels: [__param_target]
|
||||||
|
# target_label: instance
|
||||||
|
# - target_label: __address__
|
||||||
|
# replacement: snmp-exporter.monitoring.svc:9116
|
||||||
|
|
||||||
# UniFi Switch SNMP
|
# UniFi Switch SNMP
|
||||||
- job_name: "snmp-switch"
|
- job_name: "snmp-switch"
|
||||||
@@ -279,10 +285,13 @@ data:
|
|||||||
replacement: blackbox-exporter.monitoring.svc:9115
|
replacement: blackbox-exporter.monitoring.svc:9115
|
||||||
|
|
||||||
# FlowerCore.RemoteDesktop web health (public cluster VIP)
|
# FlowerCore.RemoteDesktop web health (public cluster VIP)
|
||||||
|
# Module is https_internal — desktop.iamworkin.lan uses a step-ca leaf
|
||||||
|
# cert; blackbox does NOT trust step-ca root, so http_2xx fails with
|
||||||
|
# x509 unknown authority and probe_success=0 even when /health 200s.
|
||||||
- job_name: "probe-remotedesktop"
|
- job_name: "probe-remotedesktop"
|
||||||
metrics_path: /probe
|
metrics_path: /probe
|
||||||
params:
|
params:
|
||||||
module: [http_2xx]
|
module: [https_internal]
|
||||||
scrape_interval: 30s
|
scrape_interval: 30s
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ["https://desktop.iamworkin.lan/health"]
|
- targets: ["https://desktop.iamworkin.lan/health"]
|
||||||
@@ -330,26 +339,12 @@ data:
|
|||||||
# AI Stack Health Probes (Blackbox Exporter)
|
# AI Stack Health Probes (Blackbox Exporter)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
# Ollama API — workstation (LOCAL Agent Zero)
|
# NOTE: probe-ollama-local and probe-agentzero-local were REMOVED
|
||||||
- job_name: "probe-ollama-local"
|
# 2026-04-26. They pointed at 10.0.58.100 (HOME VLAN) which is not
|
||||||
metrics_path: /probe
|
# reachable from cluster pods (firewalled). They had been firing as
|
||||||
params:
|
# OllamaDown / AgentZeroDown since 2026-04-24. Workstation/AI-laptop
|
||||||
module: [http_ollama]
|
# Ollama and Agent Zero should be monitored via host-side Puppet
|
||||||
scrape_interval: 30s
|
# (node_exporter on the box) once the AI laptop is running 24/7.
|
||||||
static_configs:
|
|
||||||
- targets: ["http://10.0.58.100:11434/api/tags"]
|
|
||||||
labels:
|
|
||||||
instance: "ollama-local"
|
|
||||||
service: "ollama"
|
|
||||||
deployment: "local"
|
|
||||||
gpu: "r9700"
|
|
||||||
relabel_configs:
|
|
||||||
- source_labels: [__address__]
|
|
||||||
target_label: __param_target
|
|
||||||
- source_labels: [__param_target]
|
|
||||||
target_label: instance
|
|
||||||
- target_label: __address__
|
|
||||||
replacement: blackbox-exporter.monitoring.svc:9115
|
|
||||||
|
|
||||||
# Ollama API — edge1 Pi 5 (NUC Agent Zero)
|
# Ollama API — edge1 Pi 5 (NUC Agent Zero)
|
||||||
- job_name: "probe-ollama-edge1"
|
- job_name: "probe-ollama-edge1"
|
||||||
@@ -372,34 +367,18 @@ data:
|
|||||||
- target_label: __address__
|
- target_label: __address__
|
||||||
replacement: blackbox-exporter.monitoring.svc:9115
|
replacement: blackbox-exporter.monitoring.svc:9115
|
||||||
|
|
||||||
# Agent Zero Web UI — local (K3s)
|
# Agent Zero Web UI — in-cluster (RKE2)
|
||||||
- job_name: "probe-agentzero-local"
|
# Target uses short svc form (agent-zero.agent-zero.svc) NOT
|
||||||
metrics_path: /probe
|
# cluster.local FQDN — the *.cluster.local form gets rewritten to
|
||||||
params:
|
# 10.0.56.200 (Traefik VIP) by the CoreDNS iamworkin.lan template +
|
||||||
module: [http_2xx]
|
# ndots:5 search-suffix expansion. Memory: feedback_coredns_ndots_template_collision.
|
||||||
scrape_interval: 30s
|
|
||||||
static_configs:
|
|
||||||
- targets: ["http://10.0.58.100:30050/"]
|
|
||||||
labels:
|
|
||||||
instance: "agent-zero-local"
|
|
||||||
service: "agent-zero"
|
|
||||||
deployment: "local"
|
|
||||||
relabel_configs:
|
|
||||||
- source_labels: [__address__]
|
|
||||||
target_label: __param_target
|
|
||||||
- source_labels: [__param_target]
|
|
||||||
target_label: instance
|
|
||||||
- target_label: __address__
|
|
||||||
replacement: blackbox-exporter.monitoring.svc:9115
|
|
||||||
|
|
||||||
# Agent Zero Web UI — NUC (RKE2 via Traefik)
|
|
||||||
- job_name: "probe-agentzero-nuc"
|
- job_name: "probe-agentzero-nuc"
|
||||||
metrics_path: /probe
|
metrics_path: /probe
|
||||||
params:
|
params:
|
||||||
module: [http_2xx]
|
module: [http_2xx]
|
||||||
scrape_interval: 30s
|
scrape_interval: 30s
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ["http://agent-zero.agent-zero.svc.cluster.local/"]
|
- targets: ["http://agent-zero.agent-zero.svc:80/"]
|
||||||
labels:
|
labels:
|
||||||
instance: "agent-zero-nuc"
|
instance: "agent-zero-nuc"
|
||||||
service: "agent-zero"
|
service: "agent-zero"
|
||||||
@@ -412,6 +391,84 @@ data:
|
|||||||
- target_label: __address__
|
- target_label: __address__
|
||||||
replacement: blackbox-exporter.monitoring.svc:9115
|
replacement: blackbox-exporter.monitoring.svc:9115
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# K8s Cluster State (kube-state-metrics, cert-manager, traefik)
|
||||||
|
# =============================================================================
|
||||||
|
# All exposed as NodePorts via the *-metrics-nodeport.yaml manifests in
|
||||||
|
# this dir. Single-node target — kube-proxy routes to whichever node
|
||||||
|
# the underlying pod runs on.
|
||||||
|
|
||||||
|
# kube-state-metrics — exposes K8s object state (pods, deployments, nodes)
|
||||||
|
# Required for KubeContainerRestartingFrequently / KubePodNotReady alerts.
|
||||||
|
- job_name: "kube-state-metrics"
|
||||||
|
scrape_interval: 30s
|
||||||
|
static_configs:
|
||||||
|
- targets: ["10.0.56.11:30901"]
|
||||||
|
labels:
|
||||||
|
cluster: "rke2"
|
||||||
|
|
||||||
|
# cert-manager — exposes certmanager_certificate_ready_status,
|
||||||
|
# certmanager_certificate_expiration_timestamp_seconds, etc. Drives the
|
||||||
|
# CertManagerCertificateNotReady / CertManagerCertificateRenewalFailed
|
||||||
|
# alerts. Memory: project_cert_manager_prometheus_scrape.
|
||||||
|
- job_name: "cert-manager"
|
||||||
|
scrape_interval: 30s
|
||||||
|
static_configs:
|
||||||
|
- targets: ["10.0.56.11:30902"]
|
||||||
|
labels:
|
||||||
|
cluster: "rke2"
|
||||||
|
|
||||||
|
# Traefik — request rates, latency, TLS cert metadata, router state.
|
||||||
|
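# Three replicas (one per node) — scrape all so failover is visible.
# NOTE(review): a NodePort only reaches the node-local replica when the
# Service sets externalTrafficPolicy: Local; otherwise each target may be
# answered by any replica — confirm against the traefik NodePort manifest.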
|
||||||
|
- job_name: "traefik"
|
||||||
|
scrape_interval: 15s
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
- "10.0.56.11:30900"
|
||||||
|
- "10.0.56.12:30900"
|
||||||
|
- "10.0.56.13:30900"
|
||||||
|
labels:
|
||||||
|
service: "traefik"
|
||||||
|
cluster: "rke2"
|
||||||
|
|
||||||
|
# FC web services through Traefik — single probe surface to spot any
|
||||||
|
# iamworkin.lan host returning non-200. Uses https_internal because all
|
||||||
|
# certs are step-ca leaves; blackbox would x509-fail with http_2xx.
|
||||||
|
- job_name: "probe-traefik-services"
|
||||||
|
metrics_path: /probe
|
||||||
|
params:
|
||||||
|
module: [https_internal]
|
||||||
|
scrape_interval: 60s
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
- "https://gitea.iamworkin.lan/"
|
||||||
|
- "https://argocd.iamworkin.lan/"
|
||||||
|
- "https://intranet.iamworkin.lan/"
|
||||||
|
- "https://signage.iamworkin.lan/"
|
||||||
|
- "https://kiosk.iamworkin.lan/"
|
||||||
|
- "https://media.iamworkin.lan/"
|
||||||
|
- "https://mysql.iamworkin.lan/"
|
||||||
|
- "https://php.iamworkin.lan/"
|
||||||
|
- "https://zabbix.iamworkin.lan/"
|
||||||
|
- "https://guac.iamworkin.lan/"
|
||||||
|
- "https://desktop.iamworkin.lan/"
|
||||||
|
- "https://print.iamworkin.lan/"
|
||||||
|
- "https://dns.iamworkin.lan/"
|
||||||
|
- "https://fc-llm-bridge.iamworkin.lan/healthz"
|
||||||
|
- "https://acme.iamworkin.lan:9443/health"
|
||||||
|
- "https://prometheus.iamworkin.lan/"
|
||||||
|
- "https://grafana.iamworkin.lan/"
|
||||||
|
labels:
|
||||||
|
probe_type: "traefik-service"
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: [__address__]
|
||||||
|
target_label: __param_target
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
regex: "https?://([^/:]+).*"
|
||||||
|
target_label: instance
|
||||||
|
- target_label: __address__
|
||||||
|
replacement: blackbox-exporter.monitoring.svc:9115
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Self-monitoring (K8s monitoring namespace)
|
# Self-monitoring (K8s monitoring namespace)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -589,23 +646,31 @@ data:
|
|||||||
summary: "RemoteDesktop /metrics scrape returning no data"
|
summary: "RemoteDesktop /metrics scrape returning no data"
|
||||||
description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape target is misconfigured or the web deployment stopped exporting metrics. Zabbix template carries the same 10m no-data trigger for cross-monitor parity."
|
description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape target is misconfigured or the web deployment stopped exporting metrics. Zabbix template carries the same 10m no-data trigger for cross-monitor parity."
|
||||||
|
|
||||||
|
# fc_desktop_pool_depleted is emitted as state-as-label: one series
|
||||||
|
# per template per status (Ready/Warming/BelowDesiredSize/Disabled).
|
||||||
|
# The publisher does NOT reset old series to 0 when a template
|
||||||
|
# transitions states — it just emits a new series with new labels.
|
||||||
|
# So a template that was Warming yesterday still has its
|
||||||
|
# Warming-labeled series stuck at 1 even when current status=Ready.
|
||||||
|
# Filter on the Critical alert_level (= BelowDesiredSize) so only
|
||||||
|
# genuine current-state depletion fires. Same fix on Deficit below.
|
||||||
- alert: RemoteDesktopPoolDepleted
|
- alert: RemoteDesktopPoolDepleted
|
||||||
expr: fc_desktop_pool_depleted > 0
|
expr: fc_desktop_pool_depleted{alert_level="Critical",enabled="true"} > 0
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "RemoteDesktop pool {{ $labels.pool }} depleted ({{ $labels.template }})"
|
summary: "RemoteDesktop pool depleted ({{ $labels.template }})"
|
||||||
description: "Pool {{ $labels.pool }} has been depleted for 5 minutes. New launches will cold-start. Operator should check for pod-scheduling failures, image pull issues, or exhausted node capacity before warm-pool parity is expected back."
|
description: "Pool for template {{ $labels.template }} (status={{ $labels.status }}) has been depleted for 5 minutes. New launches will cold-start. Operator should check for pod-scheduling failures, image pull issues, or exhausted node capacity before warm-pool parity is expected back."
|
||||||
|
|
||||||
- alert: RemoteDesktopPoolDeficitSustained
|
- alert: RemoteDesktopPoolDeficitSustained
|
||||||
expr: fc_desktop_pool_deficit > 0
|
expr: fc_desktop_pool_deficit{alert_level=~"Warning|Critical",enabled="true"} > 0
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
annotations:
|
annotations:
|
||||||
summary: "RemoteDesktop pool {{ $labels.pool }} below desired for 10m"
|
summary: "RemoteDesktop pool {{ $labels.template }} below desired for 10m"
|
||||||
description: "Pool {{ $labels.pool }} has a persistent deficit of {{ $value }} warm pods. The operator is reconciling but can't reach desired size — likely an image pull, NFS affinity, or claim-init issue."
|
description: "Pool {{ $labels.template }} (status={{ $labels.status }}) has a persistent deficit of {{ $value }} warm pods. The operator is reconciling but can't reach desired size — likely an image pull, NFS affinity, or claim-init issue."
|
||||||
|
|
||||||
- alert: RemoteDesktopSessionChurnSpike
|
- alert: RemoteDesktopSessionChurnSpike
|
||||||
expr: sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20
|
expr: sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20
|
||||||
@@ -625,8 +690,10 @@ data:
|
|||||||
summary: "RemoteDesktop recording events silent for 30m despite active launches"
|
summary: "RemoteDesktop recording events silent for 30m despite active launches"
|
||||||
description: "No recording events in 30 minutes while launches are happening. Recording may be silently disabled on all templates (SessionRecordingEnabled=false), the guacd NFS mount may be unhealthy, or the retention sweep isn't emitting events. Not an error by itself — worth checking."
|
description: "No recording events in 30 minutes while launches are happening. Recording may be silently disabled on all templates (SessionRecordingEnabled=false), the guacd NFS mount may be unhealthy, or the retention sweep isn't emitting events. Not an error by itself — worth checking."
|
||||||
|
|
||||||
|
# Match by job — instance label carries full URL incl. /health,
|
||||||
|
# not just hostname, so a hostname-only match never fires.
|
||||||
- alert: RemoteDesktopTlsExpiry
|
- alert: RemoteDesktopTlsExpiry
|
||||||
expr: probe_ssl_earliest_cert_expiry{instance="https://desktop.iamworkin.lan"} - time() < 2 * 86400
|
expr: probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time() < 2 * 86400
|
||||||
for: 6h
|
for: 6h
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@@ -713,13 +780,16 @@ data:
|
|||||||
annotations:
|
annotations:
|
||||||
summary: "Epson ink CRITICAL: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
|
summary: "Epson ink CRITICAL: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
|
||||||
|
|
||||||
|
# for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min
|
||||||
|
# of idle and SNMP times out, so `for: 5m` would page nightly. A
|
||||||
|
# genuine printer outage (jam, disconnected) lasts well over 30m.
|
||||||
- alert: EpsonPrinterDown
|
- alert: EpsonPrinterDown
|
||||||
expr: up{job="snmp-printer"} == 0
|
expr: up{job="snmp-printer"} == 0
|
||||||
for: 5m
|
for: 30m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Epson ET-3750 SNMP unreachable"
|
summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)"
|
||||||
|
|
||||||
- alert: SynologyDiskLow
|
- alert: SynologyDiskLow
|
||||||
expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85
|
expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85
|
||||||
@@ -773,6 +843,58 @@ data:
|
|||||||
annotations:
|
annotations:
|
||||||
summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
||||||
|
|
||||||
|
# K8s pod-state alerts. Require kube-state-metrics scrape (added
|
||||||
|
# 2026-04-26 — see scrape_configs above). Would have surfaced the
|
||||||
|
# agent-zero ollama-proxy 172x crash-loop instead of letting it
|
||||||
|
# silently churn for ~3 days.
|
||||||
|
- name: kubernetes-state
|
||||||
|
rules:
|
||||||
|
- alert: KubeContainerRestartingFrequently
|
||||||
|
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "{{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} restarting >5x/hr"
|
||||||
|
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
|
||||||
|
|
||||||
|
- alert: KubeContainerCrashLooping
|
||||||
|
expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
alert_channel: thermal_print
|
||||||
|
annotations:
|
||||||
|
summary: "{{ $labels.namespace }}/{{ $labels.pod }} crashlooping ({{ $value | printf \"%.0f\" }} restarts/15m)"
|
||||||
|
description: "Container {{ $labels.container }} restarted {{ $value | printf \"%.0f\" }} times in 15 minutes — actively crashlooping."
|
||||||
|
|
||||||
|
- alert: KubePodNotReady
|
||||||
|
expr: sum by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Failed|Unknown"}) > 0
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "{{ $labels.namespace }}/{{ $labels.pod }} not Ready for >15m"
|
||||||
|
description: "Pod is in a non-Running, non-Succeeded phase for over 15 minutes. Common causes: ImagePullBackOff (registry/Nexus down, wrong image tag), pending PVC, scheduling failure (taint/resources)."
|
||||||
|
|
||||||
|
- alert: KubePodImagePullBackOff
|
||||||
|
expr: sum by(namespace, pod) (kube_pod_container_status_waiting_reason{reason=~"ImagePullBackOff|ErrImagePull"}) > 0
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "{{ $labels.namespace }}/{{ $labels.pod }} ImagePullBackOff for >10m"
|
||||||
|
description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
|
||||||
|
|
||||||
|
- alert: KubeDeploymentReplicasMismatch
|
||||||
|
expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
|
||||||
|
# For `a != b`, $value is the left-hand sample (kube_deployment_spec_replicas), and the series carries no spec_replicas label.
description: "Spec wants {{ $value }} replica(s) but the available replica count has not matched for 15 minutes. Likely a rollout stuck on probe failure, scheduling, or PVC."
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# ConfigMap: Blackbox Exporter Configuration
|
# ConfigMap: Blackbox Exporter Configuration
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -804,6 +926,22 @@ data:
|
|||||||
fail_if_body_not_matches_regexp:
|
fail_if_body_not_matches_regexp:
|
||||||
- '"models"'
|
- '"models"'
|
||||||
preferred_ip_protocol: ip4
|
preferred_ip_protocol: ip4
|
||||||
|
# https_internal — for Traefik-fronted services with step-ca leaf
|
||||||
|
# certs. blackbox does not trust the step-ca root CA, so http_2xx
|
||||||
|
# against any *.iamworkin.lan host fails with x509 unknown authority.
|
||||||
|
# Redirects + multiple status codes are accepted because some hosts
|
||||||
|
# 302 to /login or /scalar.
|
||||||
|
https_internal:
|
||||||
|
prober: http
|
||||||
|
timeout: 10s
|
||||||
|
http:
|
||||||
|
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
|
||||||
|
valid_status_codes: [200, 301, 302, 303, 307, 308]
|
||||||
|
method: GET
|
||||||
|
follow_redirects: true
|
||||||
|
preferred_ip_protocol: ip4
|
||||||
|
tls_config:
|
||||||
|
insecure_skip_verify: true
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# ConfigMap: IRC Notify Script
|
# ConfigMap: IRC Notify Script
|
||||||
|
|||||||
Reference in New Issue
Block a user