# Patch summary (reviewer notes; free text placed ahead of the first "diff --git" header).
#
# apps/knowledge/knowledge.yaml
#   * Adds the annotation flowercore.io/healthz-auth-policy: "allow-anonymous" next to the
#     existing prometheus.io/* annotations.
#   * Rewrites the comment above FlowerCore__Auth__Enabled to state that OIDC is enforced.
#     The env value itself appears only as unchanged context ("true").
#
# apps/monitoring/noc-monitoring.yaml
#   * Probe targets: print, chat and dms move from "/" to "/healthz". library (/health),
#     aistation (/healthz) and knowledge (/healthz) are added under "Explicit healthcheck
#     paths". media, dns and dist only have their trailing comment reworded.
#   * KubeContainerRestartingFrequently and KubeContainerCrashLooping now filter out
#     namespace "github-runner" and AND the result with kube_pod_info on (namespace, pod).
#   * KubeDeploymentReplicasMismatch excludes "github-runner" on both sides of the comparison.
#
# tests/bluejay-infra-lint/FleetManifestLintTests.cs
#   * Five new [Fact] tests pin the strings and fields listed above.
#   * Two new private helpers: PodAnnotation reads spec.template.metadata.annotations.<name>;
#     ProbeHttpGetPath reads <probeKey>.httpGet.path and yields null when either mapping is
#     missing.
#
# NOTE(review): OidcEnforcedDeployments_WithHttpHealthzProbes_... inspects readinessProbe and
#   startupProbe only; livenessProbe is not checked. Confirm that is intentional.
# NOTE(review): Monitoring_BlackboxTargets... asserts the root URL is absent only for print;
#   the chat and dms root targets are removed here as well but are not guarded the same way.
# NOTE(review): the dist target comment says root/admin is OIDC-gated, while the Distribution
#   test requires FlowerCore__Auth__Enabled to be "false". Confirm both describe the same state.
diff --git a/apps/knowledge/knowledge.yaml b/apps/knowledge/knowledge.yaml index b5fa027..b6cde07 100644 --- a/apps/knowledge/knowledge.yaml +++ b/apps/knowledge/knowledge.yaml @@ -93,6 +93,7 @@ spec: prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" + flowercore.io/healthz-auth-policy: "allow-anonymous" spec: securityContext: runAsNonRoot: true @@ -123,9 +124,9 @@ spec: value: "Production" - name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT value: "false" - # AuthentiK/OIDC is wired but not enforced until the - # knowledge-oidc-client Secret is provisioned and - # FlowerCore__Auth__Enabled is flipped to true. + # AuthentiK/OIDC is enforced. /healthz stays anonymous by contract; + # see flowercore.io/healthz-auth-policy above and the Sprint 58 + # OIDC readiness probe audit. - name: FlowerCore__Auth__Enabled value: "true" - name: FlowerCore__Auth__Oidc__Enabled diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index 9e4668e..09a6f83 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -481,22 +481,25 @@ data: - "https://intranet.iamworkin.lan/" - "https://signage.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200 - "https://kiosk.iamworkin.lan/" - - "https://media.iamworkin.lan/healthz" # root auth-gated by OIDC; /healthz anon 200 + - "https://media.iamworkin.lan/healthz" # root auth-gated by OIDC; /healthz anonymous 200 - "https://mysql.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200 - "https://php.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200 - "https://zabbix.iamworkin.lan/" - "https://desktop.iamworkin.lan/" - - "https://print.iamworkin.lan/" - - "https://dns.iamworkin.lan/healthz" # root auth-gated by OIDC; /healthz anon 200 - - "https://chat.iamworkin.lan/" - - "https://dist.iamworkin.lan/healthz" # root/admin auth-gated by OIDC; /healthz anon 200 - - 
"https://dms.iamworkin.lan/" + - "https://print.iamworkin.lan/healthz" # root 401 behind API key auth; /healthz anonymous 200 + - "https://dns.iamworkin.lan/healthz" # root auth-gated by OIDC; /healthz anonymous 200 + - "https://chat.iamworkin.lan/healthz" # OIDC staged; keep blackbox off root before enforcement flips + - "https://dist.iamworkin.lan/healthz" # root/admin auth-gated by OIDC; /healthz anonymous 200 + - "https://dms.iamworkin.lan/healthz" # future OIDC posture; health route is already anonymous/live - "https://menuboard.iamworkin.lan/" - "https://messageboard.iamworkin.lan/" - "https://presentations.iamworkin.lan/" - "https://retail.iamworkin.lan/" - "https://ttsreader.iamworkin.lan/" # Explicit healthcheck paths + - "https://library.iamworkin.lan/health" + - "https://aistation.iamworkin.lan/healthz" + - "https://knowledge.iamworkin.lan/healthz" - "https://fc-llm-bridge.iamworkin.lan/healthz" - "https://acme.iamworkin.lan/health" # NOTE: services intentionally NOT in this probe surface @@ -1020,7 +1023,12 @@ data: - name: kubernetes-state rules: - alert: KubeContainerRestartingFrequently - expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 + # Exclude github-runner: ephemeral runners register, run one job, + # exit cleanly, and restart by design. Also require kube_pod_info so + # deleted rollout pods do not keep firing from retained restart series. + expr: | + increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[1h]) > 5 + and on(namespace, pod) kube_pod_info for: 15m labels: severity: warning @@ -1029,7 +1037,12 @@ data: description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason." 
- alert: KubeContainerCrashLooping - expr: increase(kube_pod_container_status_restarts_total[15m]) > 3 + # Same github-runner/delete-retention exclusions as the hourly + # restart rule above; real runner failures are covered by the + # dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts. + expr: | + increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[15m]) > 3 + and on(namespace, pod) kube_pod_info for: 5m labels: severity: critical @@ -1057,7 +1070,10 @@ data: description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan." - alert: KubeDeploymentReplicasMismatch - expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available + # github-runner has explicit runner-offline alerts; the generic + # replica-mismatch rule should not page on intentionally ephemeral + # 0/1 runner churn between CI jobs. + expr: kube_deployment_spec_replicas{namespace!="github-runner"} != kube_deployment_status_replicas_available{namespace!="github-runner"} for: 15m labels: severity: warning diff --git a/tests/bluejay-infra-lint/FleetManifestLintTests.cs b/tests/bluejay-infra-lint/FleetManifestLintTests.cs index 379b6ed..3e21938 100644 --- a/tests/bluejay-infra-lint/FleetManifestLintTests.cs +++ b/tests/bluejay-infra-lint/FleetManifestLintTests.cs @@ -423,6 +423,82 @@ public sealed class FleetManifestLintTests monitoring.Should().Contain("alert_channel: irc"); } + [Fact] + public void Monitoring_GenericKubernetesAlerts_MustExcludeEphemeralGithubRunnerNamespace() + { + var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml")); + + monitoring.Should().Contain("kube_pod_container_status_restarts_total{namespace!=\"github-runner\"}"); + monitoring.Should().Contain("and on(namespace, pod) kube_pod_info"); + monitoring.Should().Contain("kube_deployment_spec_replicas{namespace!=\"github-runner\"} != 
kube_deployment_status_replicas_available{namespace!=\"github-runner\"}"); + monitoring.Should().Contain("dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts"); + } + + [Fact] + public void Monitoring_BlackboxTargetsForOidcSensitiveServices_MustUseAnonymousHealthRoutesWhenAvailable() + { + var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml")); + + monitoring.Should().Contain("https://chat.iamworkin.lan/healthz"); + monitoring.Should().Contain("https://dist.iamworkin.lan/healthz"); + monitoring.Should().Contain("https://dms.iamworkin.lan/healthz"); + monitoring.Should().Contain("https://print.iamworkin.lan/healthz"); + monitoring.Should().Contain("https://knowledge.iamworkin.lan/healthz"); + monitoring.Should().Contain("https://library.iamworkin.lan/health"); + monitoring.Should().Contain("https://aistation.iamworkin.lan/healthz"); + monitoring.Should().NotContain("https://print.iamworkin.lan/\""); + } + + [Fact] + public void OidcEnforcedDeployments_WithHttpHealthzProbes_MustDeclareAnonymousHealthzContract() + { + var violations = Inventory.Documents + .Where(document => document.Kind == "Deployment") + .SelectMany(document => document.MainContainerMappings() + .Where(container => string.Equals(EnvValue(container, "FlowerCore__Auth__Enabled"), "true", StringComparison.OrdinalIgnoreCase)) + .Where(container => string.Equals(EnvValue(container, "FlowerCore__Auth__Oidc__Enabled"), "true", StringComparison.OrdinalIgnoreCase)) + .Where(container => ProbeHttpGetPath(container, "readinessProbe") == "/healthz" + || ProbeHttpGetPath(container, "startupProbe") == "/healthz") + .Where(_ => !string.Equals( + PodAnnotation(document, "flowercore.io/healthz-auth-policy"), + "allow-anonymous", + StringComparison.Ordinal)) + .Select(container => + { + var containerName = ManifestNodeExtensions.Scalar(container, "name") ?? 
""; + return $"{document.Descriptor} container '{containerName}' enforces OIDC while probing /healthz but lacks flowercore.io/healthz-auth-policy: allow-anonymous."; + })) + .ToList(); + + violations.Should().BeEmpty(); + } + + [Fact] + public void Knowledge_OidcEnforcement_MustKeepHealthzAnonymousContractVisibleInManifest() + { + var knowledge = Inventory.Documents + .Single(document => document.Kind == "Deployment" && document.Namespace == "knowledge" && document.Name == "knowledge-web"); + var container = knowledge.MainContainerMappings().Should().ContainSingle().Subject; + + EnvValue(container, "FlowerCore__Auth__Enabled").Should().Be("true"); + EnvValue(container, "FlowerCore__Auth__Oidc__Enabled").Should().Be("true"); + ProbeHttpGetPath(container, "readinessProbe").Should().Be("/healthz"); + PodAnnotation(knowledge, "flowercore.io/healthz-auth-policy").Should().Be("allow-anonymous"); + } + + [Fact] + public void Distribution_OidcEnforcement_MustStayOffUntilHealthzAllowAnonymousProofLands() + { + var distribution = Inventory.Documents + .Single(document => document.Kind == "Deployment" && document.Namespace == "fc-distribution" && document.Name == "fc-distribution"); + var container = distribution.MainContainerMappings().Should().ContainSingle().Subject; + + EnvValue(container, "FlowerCore__Auth__Oidc__Enabled").Should().Be("true"); + EnvValue(container, "FlowerCore__Auth__Enabled").Should().Be("false"); + ProbeHttpGetPath(container, "readinessProbe").Should().Be("/healthz"); + PodAnnotation(distribution, "flowercore.io/healthz-auth-policy").Should().NotBe("allow-anonymous"); + } + [Fact] public void StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults() { @@ -926,6 +1002,19 @@ public sealed class FleetManifestLintTests .SingleOrDefault(env => string.Equals(ManifestNodeExtensions.Scalar(env, "name"), name, StringComparison.Ordinal)); } + private static string? 
PodAnnotation(ManifestDocument document, string name) + { + return document.Scalar("spec", "template", "metadata", "annotations", name); + } + + private static string? ProbeHttpGetPath(YamlMappingNode container, string probeKey) + { + return ManifestNodeExtensions.TryGetMapping(container, probeKey, out var probe) + && ManifestNodeExtensions.TryGetMapping(probe, "httpGet", out var httpGet) + ? ManifestNodeExtensions.Scalar(httpGet, "path") + : null; + } + private static IReadOnlyList FcDeviceManagementDocuments() { return Inventory.Documents