fix(monitoring): probe OIDC-safe health routes
Sprint 58 Cx-12. Rebased over OIDC GitOps main; YAML parse and focused bluejay-infra lint tests passed.
This commit was merged in pull request #35.
This commit is contained in:
@@ -93,6 +93,7 @@ spec:
|
|||||||
prometheus.io/scrape: "true"
|
prometheus.io/scrape: "true"
|
||||||
prometheus.io/port: "8080"
|
prometheus.io/port: "8080"
|
||||||
prometheus.io/path: "/metrics"
|
prometheus.io/path: "/metrics"
|
||||||
|
flowercore.io/healthz-auth-policy: "allow-anonymous"
|
||||||
spec:
|
spec:
|
||||||
securityContext:
|
securityContext:
|
||||||
runAsNonRoot: true
|
runAsNonRoot: true
|
||||||
@@ -123,9 +124,9 @@ spec:
|
|||||||
value: "Production"
|
value: "Production"
|
||||||
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
||||||
value: "false"
|
value: "false"
|
||||||
# AuthentiK/OIDC is wired but not enforced until the
|
# AuthentiK/OIDC is enforced. /healthz stays anonymous by contract;
|
||||||
# knowledge-oidc-client Secret is provisioned and
|
# see flowercore.io/healthz-auth-policy above and the Sprint 58
|
||||||
# FlowerCore__Auth__Enabled is flipped to true.
|
# OIDC readiness probe audit.
|
||||||
- name: FlowerCore__Auth__Enabled
|
- name: FlowerCore__Auth__Enabled
|
||||||
value: "true"
|
value: "true"
|
||||||
- name: FlowerCore__Auth__Oidc__Enabled
|
- name: FlowerCore__Auth__Oidc__Enabled
|
||||||
|
|||||||
@@ -481,22 +481,25 @@ data:
|
|||||||
- "https://intranet.iamworkin.lan/"
|
- "https://intranet.iamworkin.lan/"
|
||||||
- "https://signage.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
- "https://signage.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
||||||
- "https://kiosk.iamworkin.lan/"
|
- "https://kiosk.iamworkin.lan/"
|
||||||
- "https://media.iamworkin.lan/healthz" # root auth-gated by OIDC; /healthz anon 200
|
- "https://media.iamworkin.lan/healthz" # root auth-gated by OIDC; /healthz anonymous 200
|
||||||
- "https://mysql.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
- "https://mysql.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
||||||
- "https://php.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
- "https://php.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
||||||
- "https://zabbix.iamworkin.lan/"
|
- "https://zabbix.iamworkin.lan/"
|
||||||
- "https://desktop.iamworkin.lan/"
|
- "https://desktop.iamworkin.lan/"
|
||||||
- "https://print.iamworkin.lan/"
|
- "https://print.iamworkin.lan/healthz" # root 401 behind API key auth; /healthz anonymous 200
|
||||||
- "https://dns.iamworkin.lan/healthz" # root auth-gated by OIDC; /healthz anon 200
|
- "https://dns.iamworkin.lan/healthz" # root auth-gated by OIDC; /healthz anonymous 200
|
||||||
- "https://chat.iamworkin.lan/"
|
- "https://chat.iamworkin.lan/healthz" # OIDC staged; keep blackbox off root before enforcement flips
|
||||||
- "https://dist.iamworkin.lan/healthz" # root/admin auth-gated by OIDC; /healthz anon 200
|
- "https://dist.iamworkin.lan/healthz" # root/admin auth-gated by OIDC; /healthz anonymous 200
|
||||||
- "https://dms.iamworkin.lan/"
|
- "https://dms.iamworkin.lan/healthz" # future OIDC posture; health route is already anonymous/live
|
||||||
- "https://menuboard.iamworkin.lan/"
|
- "https://menuboard.iamworkin.lan/"
|
||||||
- "https://messageboard.iamworkin.lan/"
|
- "https://messageboard.iamworkin.lan/"
|
||||||
- "https://presentations.iamworkin.lan/"
|
- "https://presentations.iamworkin.lan/"
|
||||||
- "https://retail.iamworkin.lan/"
|
- "https://retail.iamworkin.lan/"
|
||||||
- "https://ttsreader.iamworkin.lan/"
|
- "https://ttsreader.iamworkin.lan/"
|
||||||
# Explicit healthcheck paths
|
# Explicit healthcheck paths
|
||||||
|
- "https://library.iamworkin.lan/health"
|
||||||
|
- "https://aistation.iamworkin.lan/healthz"
|
||||||
|
- "https://knowledge.iamworkin.lan/healthz"
|
||||||
- "https://fc-llm-bridge.iamworkin.lan/healthz"
|
- "https://fc-llm-bridge.iamworkin.lan/healthz"
|
||||||
- "https://acme.iamworkin.lan/health"
|
- "https://acme.iamworkin.lan/health"
|
||||||
# NOTE: services intentionally NOT in this probe surface
|
# NOTE: services intentionally NOT in this probe surface
|
||||||
@@ -1020,7 +1023,12 @@ data:
|
|||||||
- name: kubernetes-state
|
- name: kubernetes-state
|
||||||
rules:
|
rules:
|
||||||
- alert: KubeContainerRestartingFrequently
|
- alert: KubeContainerRestartingFrequently
|
||||||
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
|
# Exclude github-runner: ephemeral runners register, run one job,
|
||||||
|
# exit cleanly, and restart by design. Also require kube_pod_info so
|
||||||
|
# deleted rollout pods do not keep firing from retained restart series.
|
||||||
|
expr: |
|
||||||
|
increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[1h]) > 5
|
||||||
|
and on(namespace, pod) kube_pod_info
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -1029,7 +1037,12 @@ data:
|
|||||||
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
|
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
|
||||||
|
|
||||||
- alert: KubeContainerCrashLooping
|
- alert: KubeContainerCrashLooping
|
||||||
expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
|
# Same github-runner/delete-retention exclusions as the hourly
|
||||||
|
# restart rule above; real runner failures are covered by the
|
||||||
|
# dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts.
|
||||||
|
expr: |
|
||||||
|
increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[15m]) > 3
|
||||||
|
and on(namespace, pod) kube_pod_info
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@@ -1057,7 +1070,10 @@ data:
|
|||||||
description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
|
description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
|
||||||
|
|
||||||
- alert: KubeDeploymentReplicasMismatch
|
- alert: KubeDeploymentReplicasMismatch
|
||||||
expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
|
# github-runner has explicit runner-offline alerts; the generic
|
||||||
|
# replica-mismatch rule should not page on intentionally ephemeral
|
||||||
|
# 0/1 runner churn between CI jobs.
|
||||||
|
expr: kube_deployment_spec_replicas{namespace!="github-runner"} != kube_deployment_status_replicas_available{namespace!="github-runner"}
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|||||||
@@ -423,6 +423,82 @@ public sealed class FleetManifestLintTests
|
|||||||
monitoring.Should().Contain("alert_channel: irc");
|
monitoring.Should().Contain("alert_channel: irc");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Lint: the generic Kubernetes alert rules in apps/monitoring/noc-monitoring.yaml
// must carry the github-runner namespace exclusion, the kube_pod_info join, and the
// comment pointing at the dedicated runner-offline alerts. The check is purely
// textual: each fragment below must appear verbatim in the manifest.
[Fact]
public void Monitoring_GenericKubernetesAlerts_MustExcludeEphemeralGithubRunnerNamespace()
{
    var manifestPath = Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml");
    var manifestText = File.ReadAllText(manifestPath);

    var requiredFragments = new[]
    {
        // Restart-based rules filter out the github-runner namespace.
        "kube_pod_container_status_restarts_total{namespace!=\"github-runner\"}",
        // Restart-based rules are joined against kube_pod_info.
        "and on(namespace, pod) kube_pod_info",
        // The replica-mismatch rule filters both sides of the comparison.
        "kube_deployment_spec_replicas{namespace!=\"github-runner\"} != kube_deployment_status_replicas_available{namespace!=\"github-runner\"}",
        // The rationale comment naming the dedicated runner alerts stays in place.
        "dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts",
    };

    foreach (var fragment in requiredFragments)
    {
        manifestText.Should().Contain(fragment);
    }
}
|
||||||
|
|
||||||
|
// Lint: blackbox probe targets in apps/monitoring/noc-monitoring.yaml for services that
// sit behind authentication must point at their anonymous health routes.
// The check is textual: every expected health URL must be present, and the bare root
// URL must be absent for each service whose target was moved from "/" to "/healthz"
// (print, chat, dms).
[Fact]
public void Monitoring_BlackboxTargetsForOidcSensitiveServices_MustUseAnonymousHealthRoutesWhenAvailable()
{
    var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml"));

    monitoring.Should().Contain("https://chat.iamworkin.lan/healthz");
    monitoring.Should().Contain("https://dist.iamworkin.lan/healthz");
    monitoring.Should().Contain("https://dms.iamworkin.lan/healthz");
    monitoring.Should().Contain("https://print.iamworkin.lan/healthz");
    monitoring.Should().Contain("https://knowledge.iamworkin.lan/healthz");
    monitoring.Should().Contain("https://library.iamworkin.lan/health");
    monitoring.Should().Contain("https://aistation.iamworkin.lan/healthz");

    // The trailing quote pins the bare root target (`.../"`), so the /healthz entries
    // above do not match. All three migrated services are guarded, not just print;
    // otherwise a re-added chat or dms root probe would slip through this test.
    monitoring.Should().NotContain("https://print.iamworkin.lan/\"");
    monitoring.Should().NotContain("https://chat.iamworkin.lan/\"");
    monitoring.Should().NotContain("https://dms.iamworkin.lan/\"");
}
|
||||||
|
|
||||||
|
// Lint: any Deployment whose main container has both FlowerCore__Auth__Enabled and
// FlowerCore__Auth__Oidc__Enabled set to "true" (case-insensitive) and that uses an
// HTTP GET probe on /healthz must carry the pod-template annotation
// flowercore.io/healthz-auth-policy: allow-anonymous (exact, case-sensitive match).
// Each offending container produces one violation message; the test passes only when
// there are none.
[Fact]
public void OidcEnforcedDeployments_WithHttpHealthzProbes_MustDeclareAnonymousHealthzContract()
{
    // All three kubelet HTTP probe kinds are inspected. livenessProbe is included
    // alongside readiness/startup so a liveness-only /healthz probe cannot bypass
    // the contract check.
    var healthzProbeKeys = new[] { "readinessProbe", "startupProbe", "livenessProbe" };

    var violations = Inventory.Documents
        .Where(document => document.Kind == "Deployment")
        .SelectMany(document => document.MainContainerMappings()
            .Where(container => string.Equals(EnvValue(container, "FlowerCore__Auth__Enabled"), "true", StringComparison.OrdinalIgnoreCase))
            .Where(container => string.Equals(EnvValue(container, "FlowerCore__Auth__Oidc__Enabled"), "true", StringComparison.OrdinalIgnoreCase))
            .Where(container => healthzProbeKeys.Any(probeKey => ProbeHttpGetPath(container, probeKey) == "/healthz"))
            // The annotation lives on the pod template, so it is looked up per document.
            .Where(_ => !string.Equals(
                PodAnnotation(document, "flowercore.io/healthz-auth-policy"),
                "allow-anonymous",
                StringComparison.Ordinal))
            .Select(container =>
            {
                var containerName = ManifestNodeExtensions.Scalar(container, "name") ?? "<unnamed>";
                return $"{document.Descriptor} container '{containerName}' enforces OIDC while probing /healthz but lacks flowercore.io/healthz-auth-policy: allow-anonymous.";
            }))
        .ToList();

    violations.Should().BeEmpty();
}
|
||||||
|
|
||||||
|
// Lint: the knowledge/knowledge-web Deployment must have exactly one main container
// with both auth flags set to "true", a readiness probe on /healthz, and the
// allow-anonymous healthz annotation on its pod template.
[Fact]
public void Knowledge_OidcEnforcement_MustKeepHealthzAnonymousContractVisibleInManifest()
{
    var deployment = Inventory.Documents.Single(document =>
        document.Kind == "Deployment"
        && document.Namespace == "knowledge"
        && document.Name == "knowledge-web");
    var mainContainer = deployment.MainContainerMappings().Should().ContainSingle().Subject;

    // Each value is read immediately before its assertion so failures surface in
    // the same order as the checks are listed.
    var authEnabled = EnvValue(mainContainer, "FlowerCore__Auth__Enabled");
    authEnabled.Should().Be("true");

    var oidcEnabled = EnvValue(mainContainer, "FlowerCore__Auth__Oidc__Enabled");
    oidcEnabled.Should().Be("true");

    var readinessPath = ProbeHttpGetPath(mainContainer, "readinessProbe");
    readinessPath.Should().Be("/healthz");

    var healthzPolicy = PodAnnotation(deployment, "flowercore.io/healthz-auth-policy");
    healthzPolicy.Should().Be("allow-anonymous");
}
|
||||||
|
|
||||||
|
// Lint: the fc-distribution/fc-distribution Deployment must have exactly one main
// container with FlowerCore__Auth__Oidc__Enabled "true" but FlowerCore__Auth__Enabled
// "false", a readiness probe on /healthz, and must NOT yet carry the allow-anonymous
// healthz annotation on its pod template.
[Fact]
public void Distribution_OidcEnforcement_MustStayOffUntilHealthzAllowAnonymousProofLands()
{
    var deployment = Inventory.Documents.Single(document =>
        document.Kind == "Deployment"
        && document.Namespace == "fc-distribution"
        && document.Name == "fc-distribution");
    var mainContainer = deployment.MainContainerMappings().Should().ContainSingle().Subject;

    // Each value is read immediately before its assertion so failures surface in
    // the same order as the checks are listed.
    var oidcEnabled = EnvValue(mainContainer, "FlowerCore__Auth__Oidc__Enabled");
    oidcEnabled.Should().Be("true");

    var authEnabled = EnvValue(mainContainer, "FlowerCore__Auth__Enabled");
    authEnabled.Should().Be("false");

    var readinessPath = ProbeHttpGetPath(mainContainer, "readinessProbe");
    readinessPath.Should().Be("/healthz");

    var healthzPolicy = PodAnnotation(deployment, "flowercore.io/healthz-auth-policy");
    healthzPolicy.Should().NotBe("allow-anonymous");
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults()
|
public void StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults()
|
||||||
{
|
{
|
||||||
@@ -926,6 +1002,19 @@ public sealed class FleetManifestLintTests
|
|||||||
.SingleOrDefault(env => string.Equals(ManifestNodeExtensions.Scalar(env, "name"), name, StringComparison.Ordinal));
|
.SingleOrDefault(env => string.Equals(ManifestNodeExtensions.Scalar(env, "name"), name, StringComparison.Ordinal));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Looks up the scalar at spec.template.metadata.annotations.<name> in the given
// document and returns whatever document.Scalar yields for that path.
private static string? PodAnnotation(ManifestDocument document, string name)
    => document.Scalar("spec", "template", "metadata", "annotations", name);
|
||||||
|
|
||||||
|
// Returns the httpGet.path scalar of the probe mapping stored under probeKey on the
// container, or null when the container has no such probe mapping or the probe has
// no httpGet mapping.
private static string? ProbeHttpGetPath(YamlMappingNode container, string probeKey)
{
    if (!ManifestNodeExtensions.TryGetMapping(container, probeKey, out var probeNode))
    {
        return null;
    }

    if (!ManifestNodeExtensions.TryGetMapping(probeNode, "httpGet", out var httpGetNode))
    {
        return null;
    }

    return ManifestNodeExtensions.Scalar(httpGetNode, "path");
}
|
||||||
|
|
||||||
private static IReadOnlyList<ManifestDocument> FcDeviceManagementDocuments()
|
private static IReadOnlyList<ManifestDocument> FcDeviceManagementDocuments()
|
||||||
{
|
{
|
||||||
return Inventory.Documents
|
return Inventory.Documents
|
||||||
|
|||||||
Reference in New Issue
Block a user