fix(monitoring): probe OIDC-safe health routes

This commit is contained in:
Andrew Stoltz
2026-06-04 00:23:48 -05:00
parent fe38c2641f
commit b87df27844
3 changed files with 118 additions and 12 deletions

View File

@@ -93,6 +93,7 @@ spec:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
flowercore.io/healthz-auth-policy: "allow-anonymous"
spec:
securityContext:
runAsNonRoot: true
@@ -123,9 +124,9 @@ spec:
value: "Production"
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
value: "false"
# AuthentiK/OIDC is wired but not enforced until the
# knowledge-oidc-client Secret is provisioned and
# FlowerCore__Auth__Enabled is flipped to true.
# AuthentiK/OIDC is enforced. /healthz stays anonymous by contract;
# see flowercore.io/healthz-auth-policy above and the Sprint 58
# OIDC readiness probe audit.
- name: FlowerCore__Auth__Enabled
value: "true"
- name: FlowerCore__Auth__Oidc__Enabled

View File

@@ -481,22 +481,25 @@ data:
- "https://intranet.iamworkin.lan/"
- "https://signage.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
- "https://kiosk.iamworkin.lan/"
- "https://media.iamworkin.lan/healthz" # root auth-gated by OIDC; /healthz anon 200
- "https://media.iamworkin.lan/healthz" # root auth-gated by OIDC; /healthz anonymous 200
- "https://mysql.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
- "https://php.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
- "https://zabbix.iamworkin.lan/"
- "https://desktop.iamworkin.lan/"
- "https://print.iamworkin.lan/"
- "https://dns.iamworkin.lan/healthz" # root auth-gated by OIDC; /healthz anon 200
- "https://chat.iamworkin.lan/"
- "https://dist.iamworkin.lan/healthz" # root/admin auth-gated by OIDC; /healthz anon 200
- "https://dms.iamworkin.lan/"
- "https://print.iamworkin.lan/healthz" # root 401 behind API key auth; /healthz anonymous 200
- "https://dns.iamworkin.lan/healthz" # root auth-gated by OIDC; /healthz anonymous 200
- "https://chat.iamworkin.lan/healthz" # OIDC staged; keep blackbox off root before enforcement flips
- "https://dist.iamworkin.lan/healthz" # root/admin auth-gated by OIDC; /healthz anonymous 200
- "https://dms.iamworkin.lan/healthz" # future OIDC posture; health route is already anonymous/live
- "https://menuboard.iamworkin.lan/"
- "https://messageboard.iamworkin.lan/"
- "https://presentations.iamworkin.lan/"
- "https://retail.iamworkin.lan/"
- "https://ttsreader.iamworkin.lan/"
# Explicit healthcheck paths
- "https://library.iamworkin.lan/health"
- "https://aistation.iamworkin.lan/healthz"
- "https://knowledge.iamworkin.lan/healthz"
- "https://fc-llm-bridge.iamworkin.lan/healthz"
- "https://acme.iamworkin.lan/health"
# NOTE: services intentionally NOT in this probe surface
@@ -1020,7 +1023,12 @@ data:
- name: kubernetes-state
rules:
- alert: KubeContainerRestartingFrequently
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
# Exclude github-runner: ephemeral runners register, run one job,
# exit cleanly, and restart by design. Also require kube_pod_info so
# deleted rollout pods do not keep firing from retained restart series.
expr: |
increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[1h]) > 5
and on(namespace, pod) kube_pod_info
for: 15m
labels:
severity: warning
@@ -1029,7 +1037,12 @@ data:
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
- alert: KubeContainerCrashLooping
expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
# Same github-runner/delete-retention exclusions as the hourly
# restart rule above; real runner failures are covered by the
# dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts.
expr: |
increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[15m]) > 3
and on(namespace, pod) kube_pod_info
for: 5m
labels:
severity: critical
@@ -1057,7 +1070,10 @@ data:
description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
- alert: KubeDeploymentReplicasMismatch
expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
# github-runner has explicit runner-offline alerts; the generic
# replica-mismatch rule should not page on intentionally ephemeral
# 0/1 runner churn between CI jobs.
expr: kube_deployment_spec_replicas{namespace!="github-runner"} != kube_deployment_status_replicas_available{namespace!="github-runner"}
for: 15m
labels:
severity: warning