fix(monitoring): probe OIDC-safe health routes

2026-06-04 00:23:48 -05:00
parent fe38c2641f
commit b87df27844
3 changed files with 118 additions and 12 deletions
--- a/apps/knowledge/knowledge.yaml
+++ b/apps/knowledge/knowledge.yaml
@@ -93,6 +93,7 @@ spec:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
+        flowercore.io/healthz-auth-policy: "allow-anonymous"
    spec:
      securityContext:
        runAsNonRoot: true
@@ -123,9 +124,9 @@ spec:
              value: "Production"
            - name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
              value: "false"
-            # AuthentiK/OIDC is wired but not enforced until the
-            # knowledge-oidc-client Secret is provisioned and
-            # FlowerCore__Auth__Enabled is flipped to true.
+            # AuthentiK/OIDC is enforced. /healthz stays anonymous by contract;
+            # see flowercore.io/healthz-auth-policy above and the Sprint 58
+            # OIDC readiness probe audit.
            - name: FlowerCore__Auth__Enabled
              value: "true"
            - name: FlowerCore__Auth__Oidc__Enabled
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -481,22 +481,25 @@ data:
              - "https://intranet.iamworkin.lan/"
              - "https://signage.iamworkin.lan/healthz"   # root 401 auth-gated 2026-06-01; /healthz anon 200
              - "https://kiosk.iamworkin.lan/"
-              - "https://media.iamworkin.lan/healthz"   # root auth-gated by OIDC; /healthz anon 200
+              - "https://media.iamworkin.lan/healthz"    # root auth-gated by OIDC; /healthz anonymous 200
              - "https://mysql.iamworkin.lan/healthz"   # root 401 auth-gated 2026-06-01; /healthz anon 200
              - "https://php.iamworkin.lan/healthz"     # root 401 auth-gated 2026-06-01; /healthz anon 200
              - "https://zabbix.iamworkin.lan/"
              - "https://desktop.iamworkin.lan/"
-              - "https://print.iamworkin.lan/"
-              - "https://dns.iamworkin.lan/healthz"     # root auth-gated by OIDC; /healthz anon 200
-              - "https://chat.iamworkin.lan/"
-              - "https://dist.iamworkin.lan/healthz"    # root/admin auth-gated by OIDC; /healthz anon 200
-              - "https://dms.iamworkin.lan/"
+              - "https://print.iamworkin.lan/healthz"    # root 401 behind API key auth; /healthz anonymous 200
+              - "https://dns.iamworkin.lan/healthz"      # root auth-gated by OIDC; /healthz anonymous 200
+              - "https://chat.iamworkin.lan/healthz"     # OIDC staged, not yet enforced; probe /healthz so the root auth flip cannot break this check
+              - "https://dist.iamworkin.lan/healthz"     # root/admin auth-gated by OIDC; /healthz anonymous 200
+              - "https://dms.iamworkin.lan/healthz"      # OIDC planned, not yet enforced; /healthz is already live and anonymous
              - "https://menuboard.iamworkin.lan/"
              - "https://messageboard.iamworkin.lan/"
              - "https://presentations.iamworkin.lan/"
              - "https://retail.iamworkin.lan/"
              - "https://ttsreader.iamworkin.lan/"
              # Explicit healthcheck paths
+              - "https://library.iamworkin.lan/health"
+              - "https://aistation.iamworkin.lan/healthz"
+              - "https://knowledge.iamworkin.lan/healthz"
              - "https://fc-llm-bridge.iamworkin.lan/healthz"
              - "https://acme.iamworkin.lan/health"
              # NOTE: services intentionally NOT in this probe surface
@@ -1020,7 +1023,12 @@ data:
      - name: kubernetes-state
        rules:
          - alert: KubeContainerRestartingFrequently
-            expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
+            # Exclude the github-runner namespace: ephemeral runners register, run
+            # one job, exit cleanly, and restart by design. The kube_pod_info join
+            # stops deleted rollout pods firing from retained restart series.
+            expr: |
+              increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[1h]) > 5
+              and on(namespace, pod) kube_pod_info
            for: 15m
            labels:
              severity: warning
@@ -1029,7 +1037,12 @@ data:
              description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."

          - alert: KubeContainerCrashLooping
-            expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
+            # Same exclusions as the hourly restart rule above (github-runner
+            # namespace, pods that no longer exist); real runner failures are
+            # covered by the LinuxRunnerOffline/MacMiniRunnerOffline alerts.
+            expr: |
+              increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[15m]) > 3
+              and on(namespace, pod) kube_pod_info
            for: 5m
            labels:
              severity: critical
@@ -1057,7 +1070,10 @@ data:
              description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."

          - alert: KubeDeploymentReplicasMismatch
-            expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
+            # The github-runner namespace has dedicated runner-offline alerts, so
+            # this generic replica-mismatch rule should not fire on intentionally
+            # ephemeral 0/1 runner churn between CI jobs.
+            expr: kube_deployment_spec_replicas{namespace!="github-runner"} != kube_deployment_status_replicas_available{namespace!="github-runner"}
            for: 15m
            labels:
              severity: warning