infra: export appset and mirror alert polish

This commit is contained in:
Andrew Stoltz
2026-06-10 16:36:18 -05:00
parent b0a3ef7448
commit 13d8ca8c1a
4 changed files with 235 additions and 0 deletions

View File

@@ -2,6 +2,22 @@
Infrastructure manifests for ArgoCD. An `ApplicationSet` in the `argocd` namespace watches the `apps/*` directories in this repo and creates one `Application` per subdirectory (named `infra-<name>`).
## Root GitOps ApplicationSet
`argocd/applicationset-bluejay-infra.yaml` is the root of this GitOps tree, but
it is **NOT self-managed** by ArgoCD. Apply it manually when the root generator
or sync policy changes:
```bash
kubectl -n argocd apply -f argocd/applicationset-bluejay-infra.yaml
```
Keep the per-StatefulSet `ignoreDifferences` entries in that file synced with
the live ApplicationSet. They intentionally cover `zabbix-postgres`,
`guac-mysql`, `matrix-postgres`, and `authentik-postgres` so ArgoCD does not
loop forever on server-side-apply `volumeClaimTemplates` status drift. Every new
StatefulSet with `volumeClaimTemplates` needs its own entry appended.
## Adding a new service to the cluster
Follow these steps in order. **Step 1 must run before step 3** — if you skip it, cert-manager HTTP-01 will silently fail for ~2h per cert (exponential backoff) until someone diagnoses the DNS.

View File

@@ -1244,6 +1244,58 @@ data:
summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
# ============================================================
# Update Center public-edge probes
# Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
# This K8s ConfigMap is the future migration target; live Prometheus
# still reads the canonical Notes file from noc1 Podman.
# ============================================================
- name: update_center
rules:
# Critical only when the edge is genuinely unreachable. A Cloudflare
# HTTP 429 means the prober hit a rate-limit, not that real clients
# are down, so the warning rule below owns that signal.
# NOTE(review): runbook step 3 uses kubectl TYPE/NAME form so that both the
# IngressRoute and the Secret are fetched; apply the same fix to the canonical
# Notes alerts.yml so the mirror does not drift.
- alert: UpdateCenterPublicEdgeDown
expr: |
(probe_success{job="probe-update-center-public-edge"} == 0)
unless on(instance)
(probe_http_status_code{job="probe-update-center-public-edge"} == 429)
for: 10m
labels:
severity: critical
service: update-center
alert_channel: irc
annotations:
summary: "Update Center public edge probe failed for {{ $labels.instance }}"
description: >-
The external probe for {{ $labels.instance }} failed for 10 minutes with a
non-2xx status that is not a rate-limit. Public Update Center clients may be
unable to fetch manifest schema metadata through Cloudflare.
runbook: >-
1. curl -sk https://{{ $labels.instance }}/api/v1/manifests/_schema
2. Verify Cloudflare DNS record is proxied and targets the current public edge IP
3. kubectl -n fc-updater get ingressroute/updatecenter-web-public secret/cf-origin-flowercore-io
4. Check Traefik logs for Method() or TLS secret errors
# Warning-level counterpart of the rule above: fires once the probe has
# reported HTTP 429 for 15 minutes.
- alert: UpdateCenterPublicEdgeRateLimited
expr: probe_http_status_code{job="probe-update-center-public-edge"} == 429
for: 15m
labels:
severity: warning
service: update-center
alert_channel: irc
annotations:
summary: "Cloudflare is rate-limiting (HTTP 429) the public-edge probe for {{ $labels.instance }}"
description: >-
The blackbox prober receives HTTP 429 from Cloudflare for {{ $labels.instance }}
while the origin is healthy. This is a Cloudflare rate-limit / WAF condition on
the public hostname, not an outage.
runbook: >-
1. curl -sk https://{{ $labels.instance }}/api/v1/manifests/_schema (expect 200 from a normal client)
2. Review Cloudflare rate-limit / WAF rules for the hostname; the 5m-cadence prober is tripping a 429
3. Add a Cloudflare rate-limit exception for the prober source IP or the /api/v1/manifests/_schema path
4. Confirm whether the singular host update.flowercore.io is still required, or only updates.flowercore.io
# =============================================================================
# ConfigMap: Blackbox Exporter Configuration
# =============================================================================

View File

@@ -0,0 +1,74 @@
# Root ApplicationSet of the bluejay-infra GitOps tree.
# It is applied manually with kubectl and is not self-managed by ArgoCD
# (see the "Root GitOps ApplicationSet" section of the repository README).
apiVersion: argoproj.io/v1alpha1
kind: ApplicationSet
metadata:
annotations:
argocd.argoproj.io/refresh: "true"
name: bluejay-infra
namespace: argocd
spec:
# Git directory generator: each directory matching apps/* on the main branch
# of the in-cluster Gitea repository yields one generated Application.
generators:
- git:
directories:
- path: apps/*
repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git
revision: main
# NOTE(review): this generator-level template is empty and looks like residue
# of the live export; the top-level template below carries the real values.
# Confirm before removing it.
template:
metadata: {}
spec:
destination: {}
project: ""
goTemplate: true
# missingkey=error: an undefined template key should fail rendering instead of
# silently producing an empty value.
goTemplateOptions:
- missingkey=error
template:
metadata:
# Generated Application name: "infra-" followed by the directory basename.
name: infra-{{.path.basename}}
spec:
destination:
server: https://kubernetes.default.svc
# One ignoreDifferences entry per StatefulSet that declares volumeClaimTemplates,
# so ArgoCD does not loop on server-side-apply drift of that field. Append a new
# entry for every additional such StatefulSet and keep this list in step with
# the live ApplicationSet (see README).
ignoreDifferences:
- group: apps
jqPathExpressions:
- .spec.volumeClaimTemplates[]?.status
jsonPointers:
- /spec/volumeClaimTemplates
kind: StatefulSet
name: zabbix-postgres
namespace: zabbix
- group: apps
jqPathExpressions:
- .spec.volumeClaimTemplates[]?.status
jsonPointers:
- /spec/volumeClaimTemplates
kind: StatefulSet
name: guac-mysql
namespace: guacamole
- group: apps
jqPathExpressions:
- .spec.volumeClaimTemplates[]?.status
jsonPointers:
- /spec/volumeClaimTemplates
kind: StatefulSet
name: matrix-postgres
namespace: matrix
- group: apps
jqPathExpressions:
- .spec.volumeClaimTemplates[]?.status
jsonPointers:
- /spec/volumeClaimTemplates
kind: StatefulSet
name: authentik-postgres
namespace: authentik
project: default
# Each Application syncs the matched directory itself from the main branch.
source:
path: '{{.path.path}}'
repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git
targetRevision: main
syncPolicy:
automated:
prune: true
selfHeal: true
# RespectIgnoreDifferences is what lets the ignoreDifferences entries above
# apply during sync as well as during diffing.
syncOptions:
- CreateNamespace=true
- ServerSideApply=true
- RespectIgnoreDifferences=true

View File

@@ -468,6 +468,99 @@ public sealed class FleetManifestLintTests
monitoring.Should().Contain("dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts");
}
[Fact]
public void GithubRunnerReadme_DocumentsAcceptedEphemeralExitChurn()
{
    // Phrases the github-runner README must keep verbatim.
    var requiredPhrases = new[]
    {
        "Ephemeral runner pods",
        "exit-1/restart churn",
        "accepted operational noise",
        "repo-scoped runner-offline alerts stay quiet",
    };

    var readmePath = Path.Combine(Inventory.BluejayRoot, "apps", "github-runner", "README.md");
    var readme = File.ReadAllText(readmePath);

    foreach (var phrase in requiredPhrases)
    {
        readme.Should().Contain(phrase);
    }
}
[Fact]
public void Monitoring_PiManagerDownDelayAndUpdateCenterRateLimit_MatchCanonicalAlerts()
{
    // Returns the text of a single alert rule: from its "- alert: <name>" line up to
    // the next "- alert:" item, the next "- name:" item, or the end of the file.
    // Yields "" when the alert is missing, which makes the per-rule assertions fail.
    // Scoping matters: "for: 15m" or "severity: warning" anywhere else in the file
    // must not be able to satisfy an assertion about this rule.
    string RuleBlock(string yaml, string alertName) =>
        Regex.Match(
            yaml,
            $@"- alert: {Regex.Escape(alertName)}\b.*?(?=- alert:|- name:|\z)",
            RegexOptions.Singleline).Value;

    const string rateLimitExpr = "expr: probe_http_status_code{job=\"probe-update-center-public-edge\"} == 429";

    var notesAlerts = File.ReadAllText(Path.Combine(
        Inventory.WorkspaceRoot,
        "FlowerCore.Notes",
        "scripts",
        "monitoring",
        "alerts.yml"));
    var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml"));

    // The explanatory comments differ between the canonical file and the mirror.
    notesAlerts.Should().Contain("# Sprint 67: keep this warning behind NodeDown's 5m critical page");
    monitoring.Should().Contain("# Sprint 67: delayed behind NodeDown's critical page");

    // Both files must agree on the rule definitions themselves.
    foreach (var alerts in new[] { notesAlerts, monitoring })
    {
        var piManagerDown = RuleBlock(alerts, "PiManagerDown");
        piManagerDown.Should().Contain("- alert: PiManagerDown");
        piManagerDown.Should().Contain("for: 8m");

        var rateLimited = RuleBlock(alerts, "UpdateCenterPublicEdgeRateLimited");
        rateLimited.Should().Contain("- alert: UpdateCenterPublicEdgeRateLimited");
        rateLimited.Should().Contain(rateLimitExpr);
        rateLimited.Should().Contain("for: 15m");
    }

    // Severity is only pinned for the mirrored K8s ConfigMap, as before.
    RuleBlock(monitoring, "UpdateCenterPublicEdgeRateLimited").Should().Contain("severity: warning");
}
[Fact]
public void ApplicationSetExport_MustRemainManualRootOfGitOpsTree()
{
    var readmePath = Path.Combine(Inventory.BluejayRoot, "README.md");
    var readme = File.ReadAllText(readmePath);

    // The exported manifest has to exist before its content is inspected.
    var appsetPath = Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml");
    File.Exists(appsetPath).Should().BeTrue();
    var appset = File.ReadAllText(appsetPath);

    foreach (var required in new[] { "kind: ApplicationSet", "name: bluejay-infra" })
    {
        appset.Should().Contain(required);
    }

    // Live-only fields must not be present in the committed export.
    foreach (var forbidden in new[] { "\nstatus:", "managedFields:" })
    {
        appset.Should().NotContain(forbidden);
    }

    // The README must document that the root is applied by hand, and how.
    foreach (var documented in new[]
    {
        "root of this GitOps tree",
        "NOT self-managed",
        "kubectl -n argocd apply -f argocd/applicationset-bluejay-infra.yaml",
    })
    {
        readme.Should().Contain(documented);
    }
}
[Fact]
public void ApplicationSetExport_MustDiscoverAppsDirectoryOnMain()
{
    // Generator, source and sync-option snippets the exported manifest must contain.
    var requiredSnippets = new[]
    {
        "path: apps/*",
        "revision: main",
        "repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git",
        "path: '{{.path.path}}'",
        "targetRevision: main",
        "ServerSideApply=true",
        "RespectIgnoreDifferences=true",
    };

    var appsetPath = Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml");
    var appset = File.ReadAllText(appsetPath);

    foreach (var snippet in requiredSnippets)
    {
        appset.Should().Contain(snippet);
    }
}
[Fact]
public void ApplicationSetExport_MustPreserveStatefulSetIgnoreDifferences()
{
    // Single source of truth for the ignoreDifferences entries; the expected entry
    // count below is derived from it so the two cannot drift apart.
    var expectedStatefulSets = new (string Name, string Namespace)[]
    {
        ("zabbix-postgres", "zabbix"),
        ("guac-mysql", "guacamole"),
        ("matrix-postgres", "matrix"),
        ("authentik-postgres", "authentik"),
    };

    var appset = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml"));

    appset.Should().Contain("jsonPointers:");
    appset.Should().Contain("- /spec/volumeClaimTemplates");
    appset.Should().Contain(".spec.volumeClaimTemplates[]?.status");
    Regex.Matches(appset, "kind: StatefulSet").Should().HaveCount(expectedStatefulSets.Length);

    foreach (var (name, ns) in expectedStatefulSets)
    {
        appset.Should().Contain($"name: {name}");
        appset.Should().Contain($"namespace: {ns}");

        // Name and namespace must belong to the same entry: in the export the
        // "namespace:" line directly follows the "name:" line. Checking them only
        // independently would accept a name paired with the wrong namespace.
        Regex.IsMatch(
                appset,
                $@"name: {Regex.Escape(name)}\s+namespace: {Regex.Escape(ns)}\s*$",
                RegexOptions.Multiline)
            .Should().BeTrue($"the ignoreDifferences entry for {name} must target namespace {ns}");
    }
}
[Fact]
public void Monitoring_BlackboxTargetsForOidcSensitiveServices_MustUseAnonymousHealthRoutesWhenAvailable()
{