Compare commits
6 Commits
codex/s67-
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a7e7c1ae72 | ||
|
|
c8df788d72 | ||
|
|
b1a4d7120e | ||
|
|
4b57b8e939 | ||
|
|
70f36c546b | ||
|
|
cdbddd71af |
16
README.md
16
README.md
@@ -2,22 +2,6 @@
|
|||||||
|
|
||||||
Infrastructure manifests for ArgoCD. An `ApplicationSet` in `argocd` namespace watches the `apps/*` directories in this repo and creates one `Application` per subdir (prefixed `infra-<name>`).
|
Infrastructure manifests for ArgoCD. An `ApplicationSet` in `argocd` namespace watches the `apps/*` directories in this repo and creates one `Application` per subdir (prefixed `infra-<name>`).
|
||||||
|
|
||||||
## Root GitOps ApplicationSet
|
|
||||||
|
|
||||||
`argocd/applicationset-bluejay-infra.yaml` is the root of this GitOps tree, but
|
|
||||||
it is **NOT self-managed** by ArgoCD. Apply it manually when the root generator
|
|
||||||
or sync policy changes:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
kubectl -n argocd apply -f argocd/applicationset-bluejay-infra.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
Keep the per-StatefulSet `ignoreDifferences` entries in that file synced with
|
|
||||||
the live ApplicationSet. They intentionally cover `zabbix-postgres`,
|
|
||||||
`guac-mysql`, `matrix-postgres`, and `authentik-postgres` so ArgoCD does not
|
|
||||||
loop forever on server-side-apply `volumeClaimTemplates` status drift. Every new
|
|
||||||
StatefulSet with `volumeClaimTemplates` needs its own entry appended.
|
|
||||||
|
|
||||||
## Adding a new service to the cluster
|
## Adding a new service to the cluster
|
||||||
|
|
||||||
Follow these steps in order. **Step 1 must run before step 3** — if you skip it, cert-manager HTTP-01 will silently fail for ~2h per cert (exponential backoff) until someone diagnoses the DNS.
|
Follow these steps in order. **Step 1 must run before step 3** — if you skip it, cert-manager HTTP-01 will silently fail for ~2h per cert (exponential backoff) until someone diagnoses the DNS.
|
||||||
|
|||||||
@@ -17,9 +17,15 @@
|
|||||||
# mtls-chain.pem) — see apps/fc-devicemgmt/1password-item.yaml. Mint mTLS
|
# mtls-chain.pem) — see apps/fc-devicemgmt/1password-item.yaml. Mint mTLS
|
||||||
# from step-ca-agent ClusterIssuer per ADR-126; DB-Password must match the
|
# from step-ca-agent ClusterIssuer per ADR-126; DB-Password must match the
|
||||||
# password configured for the MySQL user.
|
# password configured for the MySQL user.
|
||||||
# Re-enable: change replicas back to 2 after both gaps close. The image tag
|
# Re-enable: change replicas back to 2 after both gaps close.
|
||||||
# in this file (v20260512-cx5) MAY also need a refresh — it predates the
|
#
|
||||||
# Sprint 34 Cl-3 operator fix; Web may have an analogous bug.
|
# 2026-06-10 morning-routine refresh: image tag bumped to v20260610-bluejay —
|
||||||
|
# built from master @ 1614fce (June 9 network/BT command plane PRs #30/#33/#34,
|
||||||
|
# Shared.Data 1.0.1, and the Blue Jay UI.Components restyle). Imported on
|
||||||
|
# rke2-server + rke2-agent1. Gap 1 is wider than noted above: the fc-mysql
|
||||||
|
# OPERATOR deployment itself is absent from the cluster (only mysql-web runs),
|
||||||
|
# so MySqlInstanceCrds would not reconcile — deploy the operator first.
|
||||||
|
# Gap 2 (1P runtime item) also remains open; replicas stays 0.
|
||||||
apiVersion: apps/v1
|
apiVersion: apps/v1
|
||||||
kind: Deployment
|
kind: Deployment
|
||||||
metadata:
|
metadata:
|
||||||
@@ -64,7 +70,7 @@ spec:
|
|||||||
fsGroupChangePolicy: OnRootMismatch
|
fsGroupChangePolicy: OnRootMismatch
|
||||||
containers:
|
containers:
|
||||||
- name: web
|
- name: web
|
||||||
image: localhost/fc-devicemgmt-web:v20260512-cx5
|
image: localhost/fc-devicemgmt-web:v20260610-bluejay
|
||||||
imagePullPolicy: Never
|
imagePullPolicy: Never
|
||||||
ports:
|
ports:
|
||||||
- name: http
|
- name: http
|
||||||
|
|||||||
@@ -24,12 +24,6 @@ original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
|
|||||||
two replicas with per-pod `emptyDir` caches. That is the safe backlog-drain
|
two replicas with per-pod `emptyDir` caches. That is the safe backlog-drain
|
||||||
strategy: no two pods share one RWO PVC.
|
strategy: no two pods share one RWO PVC.
|
||||||
|
|
||||||
Ephemeral runner pods are expected to register, run one job, deregister, and
|
|
||||||
exit so the Deployment starts a fresh pod for the next registration token. A
|
|
||||||
small amount of exit-1/restart churn from token-expiry or no-work windows is
|
|
||||||
accepted operational noise as long as jobs are not stuck queued and the
|
|
||||||
repo-scoped runner-offline alerts stay quiet.
|
|
||||||
|
|
||||||
Sprint 32 final long-tail wave adds 16 two-replica Deployments:
|
Sprint 32 final long-tail wave adds 16 two-replica Deployments:
|
||||||
`FlowerCore.Knowledge`, `FlowerCore.LlmBridge`, `FlowerCore.Media`,
|
`FlowerCore.Knowledge`, `FlowerCore.LlmBridge`, `FlowerCore.Media`,
|
||||||
`FlowerCore.Presentations`, `FlowerCore.RemoteDesktop`, `FlowerCore.DNS`,
|
`FlowerCore.Presentations`, `FlowerCore.RemoteDesktop`, `FlowerCore.DNS`,
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ spec:
|
|||||||
spec:
|
spec:
|
||||||
containers:
|
containers:
|
||||||
- name: intranet-web
|
- name: intranet-web
|
||||||
image: localhost/fc-intranet-web:v20260531-ttsreader-bridge
|
image: localhost/fc-intranet-web:v20260611-content-quality
|
||||||
imagePullPolicy: Never
|
imagePullPolicy: Never
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 5300
|
- containerPort: 5300
|
||||||
@@ -60,7 +60,12 @@ spec:
|
|||||||
# ≈ 9 hours. BLUEJAY-WS GPU (R9700, 32GB VRAM) does the same work
|
# ≈ 9 hours. BLUEJAY-WS GPU (R9700, 32GB VRAM) does the same work
|
||||||
# in minutes. Memory: feedback_pi5_nomic_embed_slow.
|
# in minutes. Memory: feedback_pi5_nomic_embed_slow.
|
||||||
- name: IntranetSearch__OllamaBaseUrl
|
- name: IntranetSearch__OllamaBaseUrl
|
||||||
value: "http://10.0.56.20:11434"
|
value: "http://edge1.iamworkin.lan:11434"
|
||||||
|
# External Notes corpus roots are not mounted in the live pod today.
|
||||||
|
# Keep the curated/workflow docs directory active without logging
|
||||||
|
# repeated /srv/flowercore-notes missing-root warnings.
|
||||||
|
- name: IntranetSearch__Enabled
|
||||||
|
value: "false"
|
||||||
# Sprint E Phase 2α — JSON-file-backed PageReadingOverride persistence
|
# Sprint E Phase 2α — JSON-file-backed PageReadingOverride persistence
|
||||||
# on the writable PVC at /data. Without this env var the
|
# on the writable PVC at /data. Without this env var the
|
||||||
# intranet falls back to the in-memory store (loses state on
|
# intranet falls back to the in-memory store (loses state on
|
||||||
|
|||||||
@@ -843,9 +843,7 @@ data:
|
|||||||
rules:
|
rules:
|
||||||
- alert: PiManagerDown
|
- alert: PiManagerDown
|
||||||
expr: up{job="pimanager-app"} == 0
|
expr: up{job="pimanager-app"} == 0
|
||||||
# Sprint 67: delayed behind NodeDown's critical page so a powered-off
|
for: 3m
|
||||||
# Pi does not create the first duplicate page for the same host.
|
|
||||||
for: 8m
|
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
@@ -1244,58 +1242,6 @@ data:
|
|||||||
summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
|
summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
|
||||||
description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
|
description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
|
||||||
|
|
||||||
# ============================================================
|
|
||||||
# Update Center public-edge probes
|
|
||||||
# Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
|
|
||||||
# This K8s ConfigMap is the future migration target; live Prometheus
|
|
||||||
# still reads the canonical Notes file from noc1 Podman.
|
|
||||||
# ============================================================
|
|
||||||
- name: update_center
|
|
||||||
rules:
|
|
||||||
# Critical only when the edge is genuinely unreachable. A Cloudflare
|
|
||||||
# HTTP 429 means the prober hit a rate-limit, not that real clients
|
|
||||||
# are down, so the warning rule below owns that signal.
|
|
||||||
- alert: UpdateCenterPublicEdgeDown
|
|
||||||
expr: |
|
|
||||||
(probe_success{job="probe-update-center-public-edge"} == 0)
|
|
||||||
unless on(instance)
|
|
||||||
(probe_http_status_code{job="probe-update-center-public-edge"} == 429)
|
|
||||||
for: 10m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
service: update-center
|
|
||||||
alert_channel: irc
|
|
||||||
annotations:
|
|
||||||
summary: "Update Center public edge probe failed for {{ $labels.instance }}"
|
|
||||||
description: >-
|
|
||||||
The external probe for {{ $labels.instance }} failed for 10 minutes with a
|
|
||||||
non-2xx status that is not a rate-limit. Public Update Center clients may be
|
|
||||||
unable to fetch manifest schema metadata through Cloudflare.
|
|
||||||
runbook: >-
|
|
||||||
1. curl -sk https://{{ $labels.instance }}/api/v1/manifests/_schema
|
|
||||||
2. Verify Cloudflare DNS record is proxied and targets the current public edge IP
|
|
||||||
3. kubectl -n fc-updater get ingressroute/updatecenter-web-public secret/cf-origin-flowercore-io
|
|
||||||
4. Check Traefik logs for Method() or TLS secret errors
|
|
||||||
|
|
||||||
- alert: UpdateCenterPublicEdgeRateLimited
|
|
||||||
expr: probe_http_status_code{job="probe-update-center-public-edge"} == 429
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
service: update-center
|
|
||||||
alert_channel: irc
|
|
||||||
annotations:
|
|
||||||
summary: "Cloudflare is rate-limiting (HTTP 429) the public-edge probe for {{ $labels.instance }}"
|
|
||||||
description: >-
|
|
||||||
The blackbox prober receives HTTP 429 from Cloudflare for {{ $labels.instance }}
|
|
||||||
while the origin is healthy. This is a Cloudflare rate-limit / WAF condition on
|
|
||||||
the public hostname, not an outage.
|
|
||||||
runbook: >-
|
|
||||||
1. curl -sk https://{{ $labels.instance }}/api/v1/manifests/_schema (expect 200 from a normal client)
|
|
||||||
2. Review Cloudflare rate-limit / WAF rules for the hostname; the 5m-cadence prober is tripping a 429
|
|
||||||
3. Add a Cloudflare rate-limit exception for the prober source IP or the /api/v1/manifests/_schema path
|
|
||||||
4. Confirm whether the singular host update.flowercore.io is still required, or only updates.flowercore.io
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# ConfigMap: Blackbox Exporter Configuration
|
# ConfigMap: Blackbox Exporter Configuration
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -1,74 +0,0 @@
|
|||||||
apiVersion: argoproj.io/v1alpha1
|
|
||||||
kind: ApplicationSet
|
|
||||||
metadata:
|
|
||||||
annotations:
|
|
||||||
argocd.argoproj.io/refresh: "true"
|
|
||||||
name: bluejay-infra
|
|
||||||
namespace: argocd
|
|
||||||
spec:
|
|
||||||
generators:
|
|
||||||
- git:
|
|
||||||
directories:
|
|
||||||
- path: apps/*
|
|
||||||
repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git
|
|
||||||
revision: main
|
|
||||||
template:
|
|
||||||
metadata: {}
|
|
||||||
spec:
|
|
||||||
destination: {}
|
|
||||||
project: ""
|
|
||||||
goTemplate: true
|
|
||||||
goTemplateOptions:
|
|
||||||
- missingkey=error
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
name: infra-{{.path.basename}}
|
|
||||||
spec:
|
|
||||||
destination:
|
|
||||||
server: https://kubernetes.default.svc
|
|
||||||
ignoreDifferences:
|
|
||||||
- group: apps
|
|
||||||
jqPathExpressions:
|
|
||||||
- .spec.volumeClaimTemplates[]?.status
|
|
||||||
jsonPointers:
|
|
||||||
- /spec/volumeClaimTemplates
|
|
||||||
kind: StatefulSet
|
|
||||||
name: zabbix-postgres
|
|
||||||
namespace: zabbix
|
|
||||||
- group: apps
|
|
||||||
jqPathExpressions:
|
|
||||||
- .spec.volumeClaimTemplates[]?.status
|
|
||||||
jsonPointers:
|
|
||||||
- /spec/volumeClaimTemplates
|
|
||||||
kind: StatefulSet
|
|
||||||
name: guac-mysql
|
|
||||||
namespace: guacamole
|
|
||||||
- group: apps
|
|
||||||
jqPathExpressions:
|
|
||||||
- .spec.volumeClaimTemplates[]?.status
|
|
||||||
jsonPointers:
|
|
||||||
- /spec/volumeClaimTemplates
|
|
||||||
kind: StatefulSet
|
|
||||||
name: matrix-postgres
|
|
||||||
namespace: matrix
|
|
||||||
- group: apps
|
|
||||||
jqPathExpressions:
|
|
||||||
- .spec.volumeClaimTemplates[]?.status
|
|
||||||
jsonPointers:
|
|
||||||
- /spec/volumeClaimTemplates
|
|
||||||
kind: StatefulSet
|
|
||||||
name: authentik-postgres
|
|
||||||
namespace: authentik
|
|
||||||
project: default
|
|
||||||
source:
|
|
||||||
path: '{{.path.path}}'
|
|
||||||
repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git
|
|
||||||
targetRevision: main
|
|
||||||
syncPolicy:
|
|
||||||
automated:
|
|
||||||
prune: true
|
|
||||||
selfHeal: true
|
|
||||||
syncOptions:
|
|
||||||
- CreateNamespace=true
|
|
||||||
- ServerSideApply=true
|
|
||||||
- RespectIgnoreDifferences=true
|
|
||||||
@@ -468,99 +468,6 @@ public sealed class FleetManifestLintTests
|
|||||||
monitoring.Should().Contain("dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts");
|
monitoring.Should().Contain("dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts");
|
||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
|
||||||
public void GithubRunnerReadme_DocumentsAcceptedEphemeralExitChurn()
|
|
||||||
{
|
|
||||||
var readme = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "github-runner", "README.md"));
|
|
||||||
|
|
||||||
readme.Should().Contain("Ephemeral runner pods");
|
|
||||||
readme.Should().Contain("exit-1/restart churn");
|
|
||||||
readme.Should().Contain("accepted operational noise");
|
|
||||||
readme.Should().Contain("repo-scoped runner-offline alerts stay quiet");
|
|
||||||
}
|
|
||||||
|
|
||||||
[Fact]
|
|
||||||
public void Monitoring_PiManagerDownDelayAndUpdateCenterRateLimit_MatchCanonicalAlerts()
|
|
||||||
{
|
|
||||||
var notesAlerts = File.ReadAllText(Path.Combine(
|
|
||||||
Inventory.WorkspaceRoot,
|
|
||||||
"FlowerCore.Notes",
|
|
||||||
"scripts",
|
|
||||||
"monitoring",
|
|
||||||
"alerts.yml"));
|
|
||||||
var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml"));
|
|
||||||
|
|
||||||
notesAlerts.Should().Contain("# Sprint 67: keep this warning behind NodeDown's 5m critical page");
|
|
||||||
notesAlerts.Should().Contain("- alert: PiManagerDown");
|
|
||||||
notesAlerts.Should().Contain("for: 8m");
|
|
||||||
monitoring.Should().Contain("# Sprint 67: delayed behind NodeDown's critical page");
|
|
||||||
monitoring.Should().Contain("- alert: PiManagerDown");
|
|
||||||
monitoring.Should().Contain("for: 8m");
|
|
||||||
|
|
||||||
notesAlerts.Should().Contain("- alert: UpdateCenterPublicEdgeRateLimited");
|
|
||||||
notesAlerts.Should().Contain("expr: probe_http_status_code{job=\"probe-update-center-public-edge\"} == 429");
|
|
||||||
notesAlerts.Should().Contain("for: 15m");
|
|
||||||
monitoring.Should().Contain("- alert: UpdateCenterPublicEdgeRateLimited");
|
|
||||||
monitoring.Should().Contain("expr: probe_http_status_code{job=\"probe-update-center-public-edge\"} == 429");
|
|
||||||
monitoring.Should().Contain("for: 15m");
|
|
||||||
monitoring.Should().Contain("severity: warning");
|
|
||||||
}
|
|
||||||
|
|
||||||
[Fact]
|
|
||||||
public void ApplicationSetExport_MustRemainManualRootOfGitOpsTree()
|
|
||||||
{
|
|
||||||
var readme = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "README.md"));
|
|
||||||
var appsetPath = Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml");
|
|
||||||
|
|
||||||
File.Exists(appsetPath).Should().BeTrue();
|
|
||||||
var appset = File.ReadAllText(appsetPath);
|
|
||||||
|
|
||||||
appset.Should().Contain("kind: ApplicationSet");
|
|
||||||
appset.Should().Contain("name: bluejay-infra");
|
|
||||||
appset.Should().NotContain("\nstatus:");
|
|
||||||
appset.Should().NotContain("managedFields:");
|
|
||||||
readme.Should().Contain("root of this GitOps tree");
|
|
||||||
readme.Should().Contain("NOT self-managed");
|
|
||||||
readme.Should().Contain("kubectl -n argocd apply -f argocd/applicationset-bluejay-infra.yaml");
|
|
||||||
}
|
|
||||||
|
|
||||||
[Fact]
|
|
||||||
public void ApplicationSetExport_MustDiscoverAppsDirectoryOnMain()
|
|
||||||
{
|
|
||||||
var appset = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml"));
|
|
||||||
|
|
||||||
appset.Should().Contain("path: apps/*");
|
|
||||||
appset.Should().Contain("revision: main");
|
|
||||||
appset.Should().Contain("repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git");
|
|
||||||
appset.Should().Contain("path: '{{.path.path}}'");
|
|
||||||
appset.Should().Contain("targetRevision: main");
|
|
||||||
appset.Should().Contain("ServerSideApply=true");
|
|
||||||
appset.Should().Contain("RespectIgnoreDifferences=true");
|
|
||||||
}
|
|
||||||
|
|
||||||
[Fact]
|
|
||||||
public void ApplicationSetExport_MustPreserveStatefulSetIgnoreDifferences()
|
|
||||||
{
|
|
||||||
var appset = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml"));
|
|
||||||
|
|
||||||
appset.Should().Contain("jsonPointers:");
|
|
||||||
appset.Should().Contain("- /spec/volumeClaimTemplates");
|
|
||||||
appset.Should().Contain(".spec.volumeClaimTemplates[]?.status");
|
|
||||||
Regex.Matches(appset, "kind: StatefulSet").Should().HaveCount(4);
|
|
||||||
|
|
||||||
foreach (var (name, ns) in new[]
|
|
||||||
{
|
|
||||||
("zabbix-postgres", "zabbix"),
|
|
||||||
("guac-mysql", "guacamole"),
|
|
||||||
("matrix-postgres", "matrix"),
|
|
||||||
("authentik-postgres", "authentik"),
|
|
||||||
})
|
|
||||||
{
|
|
||||||
appset.Should().Contain($"name: {name}");
|
|
||||||
appset.Should().Contain($"namespace: {ns}");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void Monitoring_BlackboxTargetsForOidcSensitiveServices_MustUseAnonymousHealthRoutesWhenAvailable()
|
public void Monitoring_BlackboxTargetsForOidcSensitiveServices_MustUseAnonymousHealthRoutesWhenAvailable()
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user