From 13d8ca8c1a335bee147ec02ca06172c57a7added Mon Sep 17 00:00:00 2001 From: Andrew Stoltz Date: Wed, 10 Jun 2026 16:36:18 -0500 Subject: [PATCH] infra: export appset and mirror alert polish --- README.md | 16 ++++ apps/monitoring/noc-monitoring.yaml | 52 +++++++++++ argocd/applicationset-bluejay-infra.yaml | 74 +++++++++++++++ .../FleetManifestLintTests.cs | 93 +++++++++++++++++++ 4 files changed, 235 insertions(+) create mode 100644 argocd/applicationset-bluejay-infra.yaml diff --git a/README.md b/README.md index 6ef4c02..75df4e8 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,22 @@ Infrastructure manifests for ArgoCD. An `ApplicationSet` in `argocd` namespace watches the `apps/*` directories in this repo and creates one `Application` per subdir (prefixed `infra-`). +## Root GitOps ApplicationSet + +`argocd/applicationset-bluejay-infra.yaml` is the root of this GitOps tree, but +it is **NOT self-managed** by ArgoCD. Apply it manually when the root generator +or sync policy changes: + +```bash +kubectl -n argocd apply -f argocd/applicationset-bluejay-infra.yaml +``` + +Keep the per-StatefulSet `ignoreDifferences` entries in that file synced with +the live ApplicationSet. They intentionally cover `zabbix-postgres`, +`guac-mysql`, `matrix-postgres`, and `authentik-postgres` so ArgoCD does not +loop forever on server-side-apply `volumeClaimTemplates` status drift. Every new +StatefulSet with `volumeClaimTemplates` needs its own entry appended. + ## Adding a new service to the cluster Follow these steps in order. **Step 1 must run before step 3** — if you skip it, cert-manager HTTP-01 will silently fail for ~2h per cert (exponential backoff) until someone diagnoses the DNS. diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index c4a5ff3..15b8c9a 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -1244,6 +1244,58 @@ data: summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})" description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug." + # ============================================================ + # Update Center public-edge probes + # Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml. + # This K8s ConfigMap is the future migration target; live Prometheus + # still reads the canonical Notes file from noc1 Podman. + # ============================================================ + - name: update_center + rules: + # Critical only when the edge is genuinely unreachable. A Cloudflare + # HTTP 429 means the prober hit a rate-limit, not that real clients + # are down, so the warning rule below owns that signal. + - alert: UpdateCenterPublicEdgeDown + expr: | + (probe_success{job="probe-update-center-public-edge"} == 0) + unless on(instance) + (probe_http_status_code{job="probe-update-center-public-edge"} == 429) + for: 10m + labels: + severity: critical + service: update-center + alert_channel: irc + annotations: + summary: "Update Center public edge probe failed for {{ $labels.instance }}" + description: >- + The external probe for {{ $labels.instance }} failed for 10 minutes with a + non-2xx status that is not a rate-limit. Public Update Center clients may be + unable to fetch manifest schema metadata through Cloudflare. + runbook: >- + 1. curl -sk https://{{ $labels.instance }}/api/v1/manifests/_schema + 2. Verify Cloudflare DNS record is proxied and targets the current public edge IP + 3. kubectl -n fc-updater get ingressroute updatecenter-web-public secret cf-origin-flowercore-io + 4. Check Traefik logs for Method() or TLS secret errors + + - alert: UpdateCenterPublicEdgeRateLimited + expr: probe_http_status_code{job="probe-update-center-public-edge"} == 429 + for: 15m + labels: + severity: warning + service: update-center + alert_channel: irc + annotations: + summary: "Cloudflare is rate-limiting (HTTP 429) the public-edge probe for {{ $labels.instance }}" + description: >- + The blackbox prober receives HTTP 429 from Cloudflare for {{ $labels.instance }} + while the origin is healthy. This is a Cloudflare rate-limit / WAF condition on + the public hostname, not an outage. + runbook: >- + 1. curl -sk https://{{ $labels.instance }}/api/v1/manifests/_schema (expect 200 from a normal client) + 2. Review Cloudflare rate-limit / WAF rules for the hostname; the 5m-cadence prober is tripping a 429 + 3. Add a Cloudflare rate-limit exception for the prober source IP or the /api/v1/manifests/_schema path + 4. Confirm whether the singular host update.flowercore.io is still required, or only updates.flowercore.io + # ============================================================================= # ConfigMap: Blackbox Exporter Configuration # ============================================================================= diff --git a/argocd/applicationset-bluejay-infra.yaml b/argocd/applicationset-bluejay-infra.yaml new file mode 100644 index 0000000..0e6963f --- /dev/null +++ b/argocd/applicationset-bluejay-infra.yaml @@ -0,0 +1,74 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + annotations: + argocd.argoproj.io/refresh: "true" + name: bluejay-infra + namespace: argocd +spec: + generators: + - git: + directories: + - path: apps/* + repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git + revision: main + template: + metadata: {} + spec: + destination: {} + project: "" + goTemplate: true + goTemplateOptions: + - missingkey=error + template: + metadata: + name: infra-{{.path.basename}} + spec: + destination: + server: https://kubernetes.default.svc + ignoreDifferences: + - group: apps + jqPathExpressions: + - .spec.volumeClaimTemplates[]?.status + jsonPointers: + - /spec/volumeClaimTemplates + kind: StatefulSet + name: zabbix-postgres + namespace: zabbix + - group: apps + jqPathExpressions: + - .spec.volumeClaimTemplates[]?.status + jsonPointers: + - /spec/volumeClaimTemplates + kind: StatefulSet + name: guac-mysql + namespace: guacamole + - group: apps + jqPathExpressions: + - .spec.volumeClaimTemplates[]?.status + jsonPointers: + - /spec/volumeClaimTemplates + kind: StatefulSet + name: matrix-postgres + namespace: matrix + - group: apps + jqPathExpressions: + - .spec.volumeClaimTemplates[]?.status + jsonPointers: + - /spec/volumeClaimTemplates + kind: StatefulSet + name: authentik-postgres + namespace: authentik + project: default + source: + path: '{{.path.path}}' + repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git + targetRevision: main + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + - RespectIgnoreDifferences=true diff --git a/tests/bluejay-infra-lint/FleetManifestLintTests.cs b/tests/bluejay-infra-lint/FleetManifestLintTests.cs index edb2d0c..a4e06d1 100644 --- a/tests/bluejay-infra-lint/FleetManifestLintTests.cs +++ b/tests/bluejay-infra-lint/FleetManifestLintTests.cs @@ -468,6 +468,99 @@ public sealed class FleetManifestLintTests monitoring.Should().Contain("dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts"); } + [Fact] + public void GithubRunnerReadme_DocumentsAcceptedEphemeralExitChurn() + { + var readme = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "github-runner", "README.md")); + + readme.Should().Contain("Ephemeral runner pods"); + readme.Should().Contain("exit-1/restart churn"); + readme.Should().Contain("accepted operational noise"); + readme.Should().Contain("repo-scoped runner-offline alerts stay quiet"); + } + + [Fact] + public void Monitoring_PiManagerDownDelayAndUpdateCenterRateLimit_MatchCanonicalAlerts() + { + var notesAlerts = File.ReadAllText(Path.Combine( + Inventory.WorkspaceRoot, + "FlowerCore.Notes", + "scripts", + "monitoring", + "alerts.yml")); + var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml")); + + notesAlerts.Should().Contain("# Sprint 67: keep this warning behind NodeDown's 5m critical page"); + notesAlerts.Should().Contain("- alert: PiManagerDown"); + notesAlerts.Should().Contain("for: 8m"); + monitoring.Should().Contain("# Sprint 67: delayed behind NodeDown's critical page"); + monitoring.Should().Contain("- alert: PiManagerDown"); + monitoring.Should().Contain("for: 8m"); + + notesAlerts.Should().Contain("- alert: UpdateCenterPublicEdgeRateLimited"); + notesAlerts.Should().Contain("expr: probe_http_status_code{job=\"probe-update-center-public-edge\"} == 429"); + notesAlerts.Should().Contain("for: 15m"); + monitoring.Should().Contain("- alert: UpdateCenterPublicEdgeRateLimited"); + monitoring.Should().Contain("expr: probe_http_status_code{job=\"probe-update-center-public-edge\"} == 429"); + monitoring.Should().Contain("for: 15m"); + monitoring.Should().Contain("severity: warning"); + } + + [Fact] + public void ApplicationSetExport_MustRemainManualRootOfGitOpsTree() + { + var readme = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "README.md")); + var appsetPath = Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml"); + + File.Exists(appsetPath).Should().BeTrue(); + var appset = File.ReadAllText(appsetPath); + + appset.Should().Contain("kind: ApplicationSet"); + appset.Should().Contain("name: bluejay-infra"); + appset.Should().NotContain("\nstatus:"); + appset.Should().NotContain("managedFields:"); + readme.Should().Contain("root of this GitOps tree"); + readme.Should().Contain("NOT self-managed"); + readme.Should().Contain("kubectl -n argocd apply -f argocd/applicationset-bluejay-infra.yaml"); + } + + [Fact] + public void ApplicationSetExport_MustDiscoverAppsDirectoryOnMain() + { + var appset = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml")); + + appset.Should().Contain("path: apps/*"); + appset.Should().Contain("revision: main"); + appset.Should().Contain("repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git"); + appset.Should().Contain("path: '{{.path.path}}'"); + appset.Should().Contain("targetRevision: main"); + appset.Should().Contain("ServerSideApply=true"); + appset.Should().Contain("RespectIgnoreDifferences=true"); + } + + [Fact] + public void ApplicationSetExport_MustPreserveStatefulSetIgnoreDifferences() + { + var appset = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml")); + + appset.Should().Contain("jsonPointers:"); + appset.Should().Contain("- /spec/volumeClaimTemplates"); + appset.Should().Contain(".spec.volumeClaimTemplates[]?.status"); + Regex.Matches(appset, "kind: StatefulSet").Should().HaveCount(4); + + foreach (var (name, ns) in new[] + { + ("zabbix-postgres", "zabbix"), + ("guac-mysql", "guacamole"), + ("matrix-postgres", "matrix"), + ("authentik-postgres", "authentik"), + }) + { + appset.Should().Contain($"name: {name}"); + appset.Should().Contain($"namespace: {ns}"); + } + } + [Fact] public void Monitoring_BlackboxTargetsForOidcSensitiveServices_MustUseAnonymousHealthRoutesWhenAvailable() {