From 0ed9b989fa00838e8a26fc02af17974fae906005 Mon Sep 17 00:00:00 2001 From: Andrew Stoltz Date: Wed, 3 Jun 2026 22:46:33 -0500 Subject: [PATCH] monitoring: mirror Sprint 57 coverage rules --- apps/monitoring/noc-monitoring.yaml | 58 +++++++- .../MonitoringCoverageLintTests.cs | 124 ++++++++++++++++++ 2 files changed, 177 insertions(+), 5 deletions(-) create mode 100644 tests/bluejay-infra-lint/MonitoringCoverageLintTests.cs diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index 08441bb..7859b3f 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -480,14 +480,16 @@ data: - "https://argocd.iamworkin.lan/" - "https://intranet.iamworkin.lan/" - "https://signage.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200 + - "https://signalcontrol.iamworkin.lan/health" # FlowerCore.SignalControl explicit health route - "https://kiosk.iamworkin.lan/" - "https://media.iamworkin.lan/" - "https://mysql.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200 - "https://php.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200 + - "https://dns.iamworkin.lan/" - "https://zabbix.iamworkin.lan/" + - "https://flowercore.iamworkin.lan/healthz" - "https://desktop.iamworkin.lan/" - "https://print.iamworkin.lan/" - - "https://dns.iamworkin.lan/" - "https://chat.iamworkin.lan/" - "https://dist.iamworkin.lan/" - "https://dms.iamworkin.lan/" @@ -496,9 +498,15 @@ data: - "https://presentations.iamworkin.lan/" - "https://retail.iamworkin.lan/" - "https://ttsreader.iamworkin.lan/" + - "https://updates.iamworkin.lan/api/v1/manifests/_schema" # Explicit healthcheck paths - "https://fc-llm-bridge.iamworkin.lan/healthz" - "https://acme.iamworkin.lan/health" + - "https://replay.iamworkin.lan/healthz" + - "https://updatecenter-internal.iamworkin.lan/api/v1/manifests/_schema" + - "https://worldbuilder.iamworkin.lan/healthz" + # Coverage gaps logged Q-MR-129/Q-MR-130: devices.iamworkin.lan + # returns 503 and e2e-test-pma/wpdemo only return 404. # NOTE: services intentionally NOT in this probe surface # - grafana.iamworkin.lan: every endpoint (incl. /api/health # and /login) returns 401 behind Traefik basic-auth. @@ -907,11 +915,14 @@ data: # for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min # of idle and SNMP times out, so 5m for: would page nightly. A # genuine printer outage (jam, disconnected) lasts well over 30m. + # Use a range-window expression: instant up{} can go stale/absent + # after repeated snmp-exporter 500s. - alert: EpsonPrinterDown - expr: up{job="snmp-printer"} == 0 + expr: (max_over_time(up{job="snmp-printer"}[35m]) == bool 0) == 1 and (hour() >= 13 or hour() < 1) for: 30m labels: severity: warning + alert_channel: irc annotations: summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)" @@ -1020,7 +1031,9 @@ data: - name: kubernetes-state rules: - alert: KubeContainerRestartingFrequently - expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 + # Exclude github-runner: ephemeral runners register, run one job, + # exit cleanly, then restart by design. + expr: increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[1h]) > 5 for: 15m labels: severity: warning @@ -1029,7 +1042,9 @@ data: description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason." - alert: KubeContainerCrashLooping - expr: increase(kube_pod_container_status_restarts_total[15m]) > 3 + # Exclude github-runner: ephemeral runners register, run one job, + # exit cleanly, then restart by design. + expr: increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[15m]) > 3 for: 5m labels: severity: critical @@ -1057,7 +1072,8 @@ data: description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan." - alert: KubeDeploymentReplicasMismatch - expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available + # Exclude github-runner: ephemeral runner deployments flap 0/1 between jobs by design. + expr: kube_deployment_spec_replicas{namespace!="github-runner"} != kube_deployment_status_replicas_available{namespace!="github-runner"} for: 15m labels: severity: warning @@ -3636,6 +3652,38 @@ data: relativeTimeRange: {from: 120, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C} + - orgId: 1 + name: SNMP Devices + folder: Infrastructure Alerts + interval: 1m + rules: + - uid: epson-printer-down-stale-window + title: EpsonPrinterDown + condition: C + for: 30m + noDataState: OK + execErrState: OK + annotations: + summary: Epson ET-3750 SNMP unreachable + description: The Epson ET-3750 snmp-printer target has reported only failed scrapes for at least 35 minutes. + runbook: "1. Check if printer is intentionally powered off 2. If printing needed: press power button on printer 3. Ping 10.0.58.107 after wake-up 4. Check WiFi on printer LCD if still unreachable" + labels: + severity: info + service: printer + alert_channel: irc + data: + - refId: A + relativeTimeRange: {from: 2100, to: 0} + datasourceUid: prometheus + model: {expr: '(max_over_time(up{job="snmp-printer"}[35m]) == bool 0) == 1 and (hour() >= 13 or hour() < 1)', instant: true, refId: A} + - refId: B + relativeTimeRange: {from: 2100, to: 0} + datasourceUid: __expr__ + model: {type: reduce, expression: A, reducer: last, refId: B} + - refId: C + relativeTimeRange: {from: 2100, to: 0} + datasourceUid: __expr__ + model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C} - orgId: 1 name: CI Runners folder: CI Alerts diff --git a/tests/bluejay-infra-lint/MonitoringCoverageLintTests.cs b/tests/bluejay-infra-lint/MonitoringCoverageLintTests.cs new file mode 100644 index 0000000..ca16fc0 --- /dev/null +++ b/tests/bluejay-infra-lint/MonitoringCoverageLintTests.cs @@ -0,0 +1,124 @@ +using FluentAssertions; +using System.Text.RegularExpressions; +using Xunit; + +namespace BluejayInfraLint.Tests; + +[Trait("Category", "Unit")] +public sealed class MonitoringCoverageLintTests +{ + private static readonly ManifestInventory Inventory = ManifestInventory.Load(); + + private static readonly string[] Sprint57ProbeTargets = + { + "https://dns.iamworkin.lan/", + "https://flowercore.iamworkin.lan/healthz", + "https://replay.iamworkin.lan/healthz", + "https://signalcontrol.iamworkin.lan/health", + "https://updatecenter-internal.iamworkin.lan/api/v1/manifests/_schema", + "https://updates.iamworkin.lan/api/v1/manifests/_schema", + "https://worldbuilder.iamworkin.lan/healthz", + }; + + [Fact] + public void PrometheusScrape_MustNotTargetDeadPiManagerPort() + { + var monitoring = ReadMonitoringMirror(); + + monitoring.Should().NotContain("10.0.58.113:5100"); + monitoring.Should().Contain("10.0.58.113:5200"); + } + + [Fact] + public void ProbeJobs_MustKeepEnvironmentSpecificBlackboxRelabels() + { + var monitoring = ReadMonitoringMirror(); + var probeJobs = FindProbeJobs(monitoring); + + probeJobs.Should().NotBeEmpty(); + probeJobs.Should().OnlyContain( + job => job.Contains("replacement: blackbox-exporter.monitoring.svc:9115", StringComparison.Ordinal), + "the bluejay-infra mirror runs Prometheus in-cluster and should use the blackbox service DNS"); + + var livePodmanPrometheus = TryReadNotesMonitoringFile("prometheus.yml"); + if (livePodmanPrometheus is not null) + { + FindProbeJobs(livePodmanPrometheus).Should().OnlyContain( + job => job.Contains("replacement: localhost:9115", StringComparison.Ordinal), + "live Podman monitoring uses host networking, so blackbox probes must relabel to localhost:9115"); + } + } + + [Fact] + public void TraefikServiceProbes_MustCoverSprint57LiveFlowerCoreHosts() + { + var monitoring = ReadMonitoringMirror(); + + foreach (var target in Sprint57ProbeTargets) + { + monitoring.Should().Contain(target); + } + } + + [Fact] + public void EpsonPrinterDown_MustUseRangeWindowForStaleScrapeCoverage() + { + var alerts = ReadMonitoringMirror(); + + alerts.Should().Contain("- alert: EpsonPrinterDown"); + alerts.Should().Contain("max_over_time(up{job=\"snmp-printer\"}[35m]) == bool 0"); + alerts.Should().NotContain("expr: up{job=\"snmp-printer\"} == 0"); + } + + [Fact] + public void MonitoringMirror_MustCarryRunnerExclusionsAndEpsonGrafanaDelivery() + { + var mirror = ReadMonitoringMirror(); + + GetAlertBlock(mirror, "KubeContainerRestartingFrequently") + .Should() + .Contain("kube_pod_container_status_restarts_total{namespace!=\"github-runner\"}[1h]"); + GetAlertBlock(mirror, "KubeContainerCrashLooping") + .Should() + .Contain("kube_pod_container_status_restarts_total{namespace!=\"github-runner\"}[15m]"); + GetAlertBlock(mirror, "KubeDeploymentReplicasMismatch") + .Should() + .Contain("kube_deployment_spec_replicas{namespace!=\"github-runner\"} != kube_deployment_status_replicas_available{namespace!=\"github-runner\"}"); + mirror.Should().Contain("uid: epson-printer-down-stale-window"); + mirror.Should().Contain("title: EpsonPrinterDown"); + mirror.Should().Contain("alert_channel: irc"); + } + + private static string ReadMonitoringMirror() => + File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml")); + + private static string? TryReadNotesMonitoringFile(string fileName) + { + var overrideRoot = Environment.GetEnvironmentVariable("FLOWERCORE_NOTES_ROOT"); + if (string.IsNullOrWhiteSpace(overrideRoot)) + { + return null; + } + + var path = Path.Combine(overrideRoot, "scripts", "monitoring", fileName); + return File.ReadAllText(path); + } + + private static IReadOnlyList FindProbeJobs(string yaml) => + Regex.Matches( + yaml, + "(?ms)^\\s+- job_name: \"probe-[^\"]+\".*?(?=^\\s+- job_name:|\\z)") + .Cast() + .Select(match => match.Value) + .ToList(); + + private static string GetAlertBlock(string yaml, string alertName) + { + var match = Regex.Match( + yaml, + $"(?ms)^\\s+- alert: {Regex.Escape(alertName)}\\s*$.*?(?=^\\s+- alert:|\\z)"); + + match.Success.Should().BeTrue($"alert {alertName} should be present in noc-monitoring.yaml"); + return match.Value; + } +}