fix(monitoring): probe OIDC-safe health routes
This commit is contained in:
@@ -481,22 +481,25 @@ data:
|
||||
- "https://intranet.iamworkin.lan/"
|
||||
- "https://signage.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
||||
- "https://kiosk.iamworkin.lan/"
|
||||
- "https://media.iamworkin.lan/"
|
||||
- "https://media.iamworkin.lan/" # OIDC lane must add /healthz before flipping auth; live /healthz 404 on 2026-06-04
|
||||
- "https://mysql.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
||||
- "https://php.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
||||
- "https://zabbix.iamworkin.lan/"
|
||||
- "https://desktop.iamworkin.lan/"
|
||||
- "https://print.iamworkin.lan/"
|
||||
- "https://dns.iamworkin.lan/"
|
||||
- "https://chat.iamworkin.lan/"
|
||||
- "https://dist.iamworkin.lan/"
|
||||
- "https://dms.iamworkin.lan/"
|
||||
- "https://print.iamworkin.lan/healthz" # root 401 behind API key auth; /healthz anonymous 200
|
||||
- "https://dns.iamworkin.lan/" # OIDC lane must add /healthz before flipping auth; live /healthz 404 on 2026-06-04
|
||||
- "https://chat.iamworkin.lan/healthz" # OIDC is staged; keep the blackbox probe off the root path before OIDC enforcement flips
|
||||
- "https://dist.iamworkin.lan/healthz" # the distribution OIDC flip outage was caused by /healthz being auth-gated; probe the anonymous health route
|
||||
- "https://dms.iamworkin.lan/healthz" # preparing for the future OIDC posture; the /healthz route is already live and anonymous
|
||||
- "https://menuboard.iamworkin.lan/"
|
||||
- "https://messageboard.iamworkin.lan/"
|
||||
- "https://presentations.iamworkin.lan/"
|
||||
- "https://retail.iamworkin.lan/"
|
||||
- "https://ttsreader.iamworkin.lan/"
|
||||
# Explicit healthcheck paths
|
||||
- "https://library.iamworkin.lan/health"
|
||||
- "https://aistation.iamworkin.lan/healthz"
|
||||
- "https://knowledge.iamworkin.lan/healthz"
|
||||
- "https://fc-llm-bridge.iamworkin.lan/healthz"
|
||||
- "https://acme.iamworkin.lan/health"
|
||||
# NOTE: services intentionally NOT in this probe surface
|
||||
@@ -1020,7 +1023,12 @@ data:
|
||||
- name: kubernetes-state
|
||||
rules:
|
||||
- alert: KubeContainerRestartingFrequently
|
||||
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
|
||||
# Exclude github-runner: ephemeral runners register, run one job,
|
||||
# exit cleanly, and restart by design. Also require kube_pod_info so
|
||||
# deleted rollout pods do not keep firing from retained restart series.
|
||||
expr: |
|
||||
increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[1h]) > 5
|
||||
and on(namespace, pod) kube_pod_info
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -1029,7 +1037,12 @@ data:
|
||||
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
|
||||
|
||||
- alert: KubeContainerCrashLooping
|
||||
expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
|
||||
# Same github-runner/delete-retention exclusions as the hourly
|
||||
# restart rule above; real runner failures are covered by the
|
||||
# dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts.
|
||||
expr: |
|
||||
increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[15m]) > 3
|
||||
and on(namespace, pod) kube_pod_info
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -1057,7 +1070,10 @@ data:
|
||||
description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
|
||||
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
|
||||
# github-runner has explicit runner-offline alerts; the generic
|
||||
# replica-mismatch rule should not page on intentionally ephemeral
|
||||
# 0/1 runner churn between CI jobs.
|
||||
expr: kube_deployment_spec_replicas{namespace!="github-runner"} != kube_deployment_status_replicas_available{namespace!="github-runner"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
Reference in New Issue
Block a user