Compare commits

..

50 Commits

Author SHA1 Message Date
Andrew Stoltz
0ed9b989fa monitoring: mirror Sprint 57 coverage rules 2026-06-03 22:46:33 -05:00
Andrew Stoltz
404d884863 Adopt live Library Retail AiStation web apps 2026-06-03 20:24:32 -05:00
f4bd90f805 Merge pull request #33 from codex/s56-monitoring-coverage
fix(monitoring): repoint pirelay scrape to signalcontrol
2026-06-04 01:22:49 +00:00
Andrew Stoltz
67d67ab73d fix(monitoring): repoint pirelay scrape to signalcontrol 2026-06-03 20:20:36 -05:00
Andrew Stoltz
f7d41cdc60 revert: drop fc-library manifest — Library.Web already deployed live (41h)
Library.Web is already running + serving at library.iamworkin.lan (root=200,
healthz=200), deployed manually 41h ago (image fc-library-web:v20260602-...,
PVC library-web-data holding the live SQLite DB). My from-scratch manifest used
a different PVC name (library-data) which ArgoCD would attach as a fresh empty
volume, orphaning the live DB. Adopting the live deploy into GitOps is a
separate careful task. Not disturbing a working deployment.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-03 19:30:23 -05:00
Andrew Stoltz
2c0afc28e4 deploy(fc-library): add Library.Web internal-host deployment
From-scratch .Web deploy at library.iamworkin.lan (operator-authorized 2026-06-03).
Cloned from the worldbuilder pattern: Deployment + Service + Longhorn RWO PVC +
step-ca cert + Traefik IngressRoute. SQLite at /data/library.db, no OIDC, both
/health + /healthz probes. Image localhost/fc-library:v202606031925 imported to
both RKE2 nodes. DNS library.iamworkin.lan -> 10.0.56.200 already in pfSense.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-03 19:28:22 -05:00
Robot
ba5f5dd0fb deploy(knowledge): roll audit backfill fix 2026-06-03 18:24:22 -05:00
Robot
dc699da7b3 fix(knowledge): persist federation database on PVC 2026-06-03 18:17:31 -05:00
Robot
1e8bf54c6e deploy: roll Chat and Knowledge OIDC images 2026-06-03 18:13:09 -05:00
Andrew Stoltz
e2e93d482c Deploy TtsReader schema repair image
Co-Authored-By: Codex <codex@openai.com>
2026-06-02 22:00:15 -05:00
4319cc2b51 Merge PR #32: divoom pi deploy artifact manifests
Lands Divoom-as-DM-device and Divoom-TV Pi HDMI deploy artifacts for Cx-6.
2026-06-03 02:47:36 +00:00
Andrew Stoltz
2bf339ce51 Deploy TtsReader PR29 live proof image
Co-Authored-By: Codex <codex@openai.com>
2026-06-02 21:47:04 -05:00
Andrew Stoltz
5bdedfc5ae divoom: add pi deploy artifact manifests
Add source-controlled Puppet/Hiera contracts for edge2 Divoom-as-DM-device without replacing the live flowercore-divoom systemd deployment.

Add Divoom TV Pi HDMI systemd/Puppet deployment artifacts, LF shell-script guardrails, and focused lint coverage for the additive non-K8s deploy shape.

Co-Authored-By: Codex <codex@openai.com>
2026-06-02 21:45:27 -05:00
Andrew Stoltz
0307ae16ae monitoring(probe): signage/mysql/php blackbox probe / -> /healthz (K8s-target mirror)
Mirrors the live noc1 podman fix + Notes scripts/monitoring/prometheus.yml.
These services enforce OIDC bearer auth (FlowerCore__Auth__Enabled=true), so an
anonymous probe of / returns 401 -> false TraefikServiceDown. All three expose
anonymous /healthz=200. This noc-monitoring.yaml is the forward K8s-migration
target (not live); brings it in sync with the live config.
2026-06-02 01:09:57 -05:00
Andrew Stoltz
6c18f69cf2 mail: remove cert-manager Certificate (manage mail-tls via step-ca JWK + noc1 renew timer)
step-ca-acme only has an HTTP-01 (Traefik) solver, but mail.iamworkin.lan must resolve
to the dedicated MetalLB IP 10.0.56.202 (SMTP/IMAP), so HTTP-01 cannot validate (order
stuck pending since 2026-05-06; cert expired 2026-05-24). mail-tls is now issued from
step-ca's JWK 'admin' provisioner and auto-renewed by a systemd timer on noc1 that writes
the mail-tls secret directly. The secret + Deployment mount + webmail IngressRoute are
unchanged. Re-add a Certificate only if a DNS-01 solver is deployed for step-ca-acme.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-01 15:55:38 -05:00
Andrew Stoltz
47e2256556 Deploy TtsReader correction bridge images 2026-05-31 12:35:45 -05:00
Andrew Stoltz
9d77f8ba0e fc-updater: disable loki audit sink 2026-05-31 11:34:12 -05:00
Andrew Stoltz
2f4be19c85 fc-updater: bump signing diagnostics image 2026-05-31 00:32:48 -05:00
Andrew Stoltz
2a62c40990 fc-updater: bump image to MSI installer surface 2026-05-30 23:31:48 -05:00
Andrew Stoltz
7be98e5efc Bump UpdateCenter image to hosted-service fix 2026-05-30 20:22:13 -05:00
Andrew Stoltz
a65b356c9d deploy(fc-updater): roll UC to v202605301823-a6c3354 (Phase 3 SQLite fixes)
Durable image bump for FlowerCore.Updater main a6c3354 (PRs #63-#66): hosted-service
+ request-path SQLite DateTimeOffset fixes, StopHost restored + per-tick resilience,
Shared.Settings 1.0.1. Image built + imported to rke2-server. Un-degrades the Phase-9
provenance verifier + settings poll (were stopped under the removed global Ignore mask).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-30 18:27:45 -05:00
Andrew Stoltz
08c17ef1b4 fc-updater: bump to v202605301703-296f350-fix2 (BackgroundServiceExceptionBehavior=Ignore so a hosted-service SQLite query crash can't stop the host)
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-30 17:04:54 -05:00
Andrew Stoltz
06f2f002b7 fc-updater: bump image to v202605301657-296f350-fix1 (Shared.Settings SQLite poll fix)
The v202605301642-296f350-rework image crash-looped: FlowerCore.Shared.Settings SettingsDbPollHostedService
ran a DateTimeOffset Where/OrderBy on SettingsRecordChanges that SQLite can't
translate, and as a BackgroundService it stopped the host. Shared.Settings 1.0.1
materializes the change-log then filters/orders in memory; Updater Web bumped to 1.0.1.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-30 16:59:37 -05:00
Andrew Stoltz
7ac4a8b4b7 fc-updater: bump image to v202605301642-296f350-rework (ADR-179 rework live)
Deploy the current FlowerCore.Updater main (PRs #52-#61) to prod: MSI-first
packaging, beta gating + per-install tokens, interactive+bearer Authentik OIDC,
native installer apply, and the .fcsetup.exe retirement (DropReleaseInstallers
migration runs on the now-empty DB). Image pre-imported to rke2-server + agent1.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-30 16:47:28 -05:00
Andrew Stoltz
90f2a86819 ops: trim load for degraded 2-node cluster (agent2 PSU dead)
Scale all github-runner deployments to 1 replica and halt the ci1
KubeVirt VM. With agent2 down (failed PSU) the cluster runs on two
passively-cooled NUCs; the ci1 8-vCPU VM drove agent1 to ~100C. Keep
total load trimmed until replacement hardware is in place.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 13:47:13 -05:00
Andrew Stoltz
cbdefb2b23 Revert "ci1: expose WinRM/RDP/SSH ports on masquerade interface for Phase 2 bootstrap"
The port additions caused the new VMI to stick at phase=Scheduled with
reason=GuestNotRunning. The guest-console-log sidecar exited 1 and
qemu never started. Reverting to the working 9-day-stable shape until
the port-add path is verified in a non-production VM.

Phase 2 (Windows runner install + registration) needs an operator-
interactive virtctl-vnc session against the rebuilt VM, OR a separate
investigation of why this port-add tipped over the VM.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 11:35:10 -05:00
Andrew Stoltz
1c36fe3a0a ci1: expose WinRM/RDP/SSH ports on masquerade interface for Phase 2 bootstrap
The Phase 1 VM has been Running for 9 days but Phase 2 (Puppet bootstrap +
runner registration) was deferred because the operator-interactive
virtctl-vnc path was the only way in. The masquerade interface listed
no exposed ports, so virtctl ssh and kubectl port-forward both hit
'no route to host' — qemu user-mode NAT does not forward inbound by
default.

Adding 5985 (WinRM HTTP) lets a kubectl port-forward + PowerShell
remoting path drive runner registration entirely from outside the VM.
3389 + 22 are reserved for desktop access via Guacamole or virtctl ssh
once OpenSSH Server is installed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 11:24:34 -05:00
Andrew Stoltz
2b420ce8a4 runners: fleet-wide right-size CPU requests from 500m to 100m
All 33 runner Deployments now request 100m CPU instead of 500m,
freeing roughly 50 idle pods × 400m = ~20 cores back to the cluster.
Observed CPU usage on idle runners is ~1m via kubectl top; the 500m
request was a 500× over-provision that was eating allocatable CPU
and blocking new workload scheduling — WorldBuilder runner could not
be scheduled even at the new 100m request because the pre-existing
fleet held the cluster at 99% requested.

Burst headroom preserved by limits.cpu: 2000m unchanged. TtsReader
keeps its 8Gi memory limit from the 2026-05-25 OOMKill fix; only
the CPU request line moves.

Recreate strategy on each deployment means a brief offline window
per runner during rollout; in-flight CI jobs complete on the
existing container before the new spec takes effect.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 10:09:24 -05:00
Andrew Stoltz
5cbc1a06b1 runners: scale DM/AiStation.Linux/WorldBuilder down to 1 replica until cluster relieved
After cutting requests to 100m, 4 of 6 new pods scheduled and 2 stayed
Pending — cluster CPU REQUEST utilization is 49.6 of 48 allocatable cores
because the existing fleet of ~50 idle runners reserves 25.6 cores
(500m × ~50) for ~50m actual use. Single-replica per new repo gets the
service online without competing with in-flight CI from the rest of the
fleet.

When the broader fleet-wide request right-sizing pass lands
(500m → 100m on all idle runners would free ~20 cores), these can be
bumped back to 2 replicas if PR-CI backlog warrants it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 10:03:30 -05:00
Andrew Stoltz
9e7ee39b3a runners: drop CPU request 500m→100m on DM/AiStation.Linux/WorldBuilder
All 3 fleet nodes were at 99% CPU REQUEST allocation; the 6 new pods
from the previous commit (3 deployments × 2 replicas × 500m) couldn't
schedule. Idle runners actually use ~1m CPU per `kubectl top pods`;
the 500m request was significantly over-provisioned. Burst headroom
preserved by limits.cpu: 2000m unchanged.

Follow-up: similar request right-sizing pass across the rest of the
runner fleet is queued for a future morning-routine sweep — 25 cores
reserved for ~50m actual use is a large slack we can reclaim cluster-
wide.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 10:00:23 -05:00
Andrew Stoltz
ae030a5f33 runners: add github-runner Deployments for DeviceManagement + AiStation.Linux + WorldBuilder
Morning-routine 2026-05-26 — these three repos had ZERO online Linux PR-CI
capacity, blocking the Sprint 37 Cx-1 Linux-CI-migration PRs (DM #20/#21/
#22, AiStation.Linux #13, WorldBuilder #3/#4). Chicken-and-egg: the
migration PRs need Linux runners that the migration creates.

Each Deployment uses the same canonical emptyDir-only pattern as the
fresh-2026-05-26 updater deployment that lives just above:
  - replicas: 2 (room for parallel PR-CI without head-of-line blocking)
  - per-pod emptyDir caches (no RWO PVC contention)
  - shared github-runner-token secret (existing ACCESS_TOKEN PAT has
    org-wide read access)
  - LABELS: self-hosted,linux,fc-build-linux
  - DOTNET_INSTALL_DIR pinned per ADR-170 family

For AiStation.Linux specifically: Linux job will now pick up; the
Windows job in #13 remains queued indefinitely until the Windows runner
host substrate lands per Sprint 36 v2 Cl-2 / ADR-174 — that's a separate
arc, not this PR.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 09:55:31 -05:00
bc8c35896f tests: add bluejay-ws runner-exclusion lint + fix 3 stale runner-fleet assertions (#30)
BLUEJAY-WS must never be a fleet GHA runner (operator directive 2026-05-26). Build-side analog of Sprint 9 safe-account exclusion. Also fixes 3 stale runner-fleet assertions broken by initContainer addition + replica tuning.
2026-05-26 03:42:01 +00:00
Andrew Stoltz
2cc91b6df0 runners: bump tts-reader memory limit 4Gi -> 8Gi
The github-runner-tts-reader pod was being OOMKilled (exit 137)
mid-`dotnet test` on the TtsReader 1000+ test suite. PR #21 CI
(the Windows -> Linux runner migration) flapped twice with the
"self-hosted runner lost communication" annotation before the
K8s-side symptoms surfaced via kubectl describe pod.

Requests bumped 1Gi -> 2Gi, limits 4Gi -> 8Gi. Comment added
inline so future fleet runs don't trip the same wall.

Unblocks PR #21 + the 9 other open TtsReader PRs that all rebase
through it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 22:31:48 -05:00
0d2090fe81 runners: add github-runner-updater Deployment (#29)
Close runner-fleet gap for FlowerCore.Updater. Matches Sprint 32 long-tail pattern; registers entry in fleet-lint required-set.
2026-05-26 03:24:13 +00:00
Andrew Stoltz
bc3548e715 runners: add github-runner-pimanager Deployment
FlowerCore.PiManager build run 26417714843 sat queued 5h with zero
self-hosted runners registered to the repo. PiManager was missed in
the Sprint 32 long-tail sweep — every other FC repo got a dedicated
repo-scoped Deployment with its own ACCESS_TOKEN registration, but
PiManager fell through the cracks.

Adds a 2-replica ephemeral runner Deployment matching the Signage /
DMS / Print.Web pattern (per-pod emptyDir caches, no shared PVC,
labels `self-hosted,linux,fc-build-linux`, shared github-runner-token
PAT). Once ArgoCD syncs, the queued job will pick up automatically.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 20:33:44 -05:00
74333cc26b selenium: right-size hub + chrome + edge memory limits (#28) 2026-05-26 01:12:15 +00:00
Andrew Stoltz
7310fb88c2 selenium: right-size hub + chrome + edge memory limits
Edge node has been OOMKilled 51 times in 5 days (~1 every 2.4h) on a
1Gi memory limit. Chrome runs maxSessions=2 on the same 1Gi cap and
was idling at 684Mi — first concurrent session pushing the node to
~900Mi+ would be the next OOM. Hub was running at 766Mi against a 1Gi
limit (75%); no recent restarts but no headroom either.

Firefox node has been running at 2Gi memory limit for 9 days with
zero restarts — that is the right size for a Selenium 4.27 browser
node under our session profile (screen recording sidecar + 1080p
rendering + page captures). Match it.

Changes:
- Hub:    limit 1Gi -> 1.5Gi, request 512Mi -> 1Gi
- Chrome: limit 1Gi -> 2Gi,   request 512Mi -> 1Gi
- Edge:   limit 1Gi -> 2Gi,   request 512Mi -> 1Gi

CPU left alone on all three — observed utilization is well under the
existing limits (hub 54m / 500m, chrome 185m / 1, edge 11m / 1).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 20:11:41 -05:00
148bc87b9a runners: bake step-ca root CA into image (v20260525-stepca) (#27) 2026-05-26 01:04:14 +00:00
Andrew Stoltz
2a1e842100 runners: bake step-ca root CA into image (v20260525-stepca)
Without the IAmWorkin step-ca root CA in the runner image's system
trust store, .NET HttpClient calls from CI tests against
`*.iamworkin.lan` (e.g. `https://selenium.iamworkin.lan/session`) fail
with `The remote certificate is invalid because of errors in the
certificate chain: PartialChain`. FlowerCore.Print.Web's
`WebScreenshotService` unit tests hit this on every build.

Drop the step-ca root PEM into `/usr/local/share/ca-certificates/`,
run `update-ca-certificates` once during apt install, and let OpenSSL +
.NET-on-Linux read the regenerated `/etc/ssl/certs/ca-certificates.crt`
automatically — no `SSL_CERT_FILE` env var, no per-Deployment volume
mount.

Image rebuilt + saved + imported on all 3 schedulable RKE2 nodes
(rke2-server, rke2-agent1, rke2-agent2) before this PR — verified with
`ctr images list -q | grep stepca` on each node.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 19:55:38 -05:00
bc28430d24 selenium: allow github-runner namespace ingress on 4444 (#26) 2026-05-26 00:44:23 +00:00
Andrew Stoltz
cc92272217 selenium: allow github-runner namespace ingress on 4444
Unblocks CI jobs running in github-runner pods (e.g. FlowerCore.Print.Web
`help-screenshots`) from reaching selenium-hub. Previously the session
POST was DNAT'd to the hub pod IP then dropped at the Calico ingress
hook, surfacing as a 60s timeout against
http://selenium-hub.selenium.svc.cluster.local:4444 while the Selenium
UI showed 0/4 sessions.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 19:43:12 -05:00
d6f4468a9c selenium: migrate hub + 3 nodes into ArgoCD-managed manifests (#25) 2026-05-26 00:09:35 +00:00
Andrew Stoltz
2f796a2ebd selenium: migrate hub + 3 nodes + service + ingressroute into ArgoCD
Previously orphan kubectl-applied since the Selenium Grid was first set
up. The `infra-selenium` ArgoCD app existed but only managed
`network-policy.yaml` — the deployments themselves drifted whenever
anyone `kubectl set env`'d or `kubectl scale`'d.

This commit captures the live state (with the 2026-05-25 maxSessions
bump for chrome already baked in) as canonical git source. ArgoCD's
ServerSideApply syncPolicy + selfHeal will now keep the grid in lock
step with this file.

Resources captured:
  - Service selenium-hub (ClusterIP, internal traffic on 4444)
  - Service selenium-hub-external (LoadBalancer, MetalLB 10.0.56.208)
  - Deployment selenium-hub
  - Deployment selenium-node-chrome (replicas=1, SE_NODE_MAX_SESSIONS=2)
  - Deployment selenium-node-firefox (replicas=1, maxSessions=1)
  - Deployment selenium-node-edge (replicas=1, maxSessions=1)
  - IngressRoute selenium-hub (Traefik, selenium.iamworkin.lan)

No live behavior change — server-side dry-run confirms unchanged for
hub/firefox/ingressroute, "configured" for hub-external + 3 deploys
(default-field reordering only; SSA + field managers handle the diff).

Refs: Sprint 33 morning-routine 2026-05-25 follow-up Q-MR.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 19:08:55 -05:00
1f1f6823db runners: right-size replica counts per 14d CI activity (#24) 2026-05-26 00:01:47 +00:00
Andrew Stoltz
b92f74b63a runners: right-size replica counts per 14d CI activity data
Drop 2 → 1 for 10 deploys based on trailing-14d run counts:
  - LlmBridge, Media, Knowledge, Intranet.Web, DNS  (0 runs each)
  - Presentations (6), Redis (3), Provisioning (3),
    MessageBoard (3), MenuBoard (3)

Bump 2 → 3 for Print.Web: 12 runs in trailing 5d, and the
help-screenshots AAT job holds a runner 30+ min, creating
head-of-line blocking for parallel PRs.

Net change: -9 replicas (≈ -9 GiB committed memory).
Aligns with Sprint 33 morning-routine capacity audit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 18:55:47 -05:00
Andrew Stoltz
cb7f7dbc4d authentik: generous startup/liveness probes for first-boot migration
The server pod was getting killed by liveness probe at 60s while still
waiting on migration DB lock (worker pod also running migrations against
same DB). Add startupProbe with 10.5 min budget so liveness doesn't fire
until migrations finish.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 16:03:03 -05:00
Andrew Stoltz
03126d5584 authentik: add fsGroup:1000 to server + worker so non-root uid can write /media
PermissionError: [Errno 13] Permission denied: '/media/public' in tenant_files
migration because Authentik container runs as uid 1000 but Longhorn PVC mounts
root:root by default. fsGroup on Pod securityContext recursively chgrps the
PVC mount to gid 1000 + chmods g+rwx.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 15:58:35 -05:00
Andrew Stoltz
495e884c41 authentik: initial deployment at id.iamworkin.lan
Stack:
  - PostgreSQL 16 StatefulSet (Longhorn RWO 5Gi)
  - Redis 7 Deployment (no persistence)
  - Authentik server + worker (ghcr.io/goauthentik/server:2024.12.3)
  - Shared media PVC (Longhorn RWO 2Gi) between server+worker
  - Certificate via step-ca-acme ClusterIssuer
  - Traefik IngressRoute at id.iamworkin.lan

Secrets sourced from 1Password item 'authentik-credentials' (IAmWorkin
vault, id y6i74ch22q5wvm7znquq4nhhcu) via OnePasswordItem CRD. Fields:
AUTHENTIK_SECRET_KEY, POSTGRES_PASSWORD, REDIS_PASSWORD,
BOOTSTRAP_ADMIN_PASSWORD, BOOTSTRAP_ADMIN_TOKEN, BOOTSTRAP_ADMIN_EMAIL.

DNS A record id.iamworkin.lan -> 10.0.56.200 added via
scripts/pfsense-add-id-host.py (FlowerCore.DNS service was 502'ing on
pfSense diag_command.php response parsing).

Closes the immediate gap from PiManager OIDC Cohort 3 wire-up: PiManager
(a87cd6f) configures id.iamworkin.lan as JWT authority but the backend
was never deployed. Pirelay specifically is on Mode:apikey until this
backend is bootstrapped and a pimanager service-account exists.

Post-deploy bootstrap (manual once pods Ready):
  1. Login at https://id.iamworkin.lan/if/admin/ as akadmin
     using BOOTSTRAP_ADMIN_PASSWORD from 1Password.
  2. Create OAuth2/OpenID Provider for pimanager (issuer
     https://id.iamworkin.lan/application/o/pimanager/, audience 'pimanager').
  3. Create Application binding the provider.
  4. Create service account user 'pimanager-service-account', generate
     long-lived token, store in 1Password as 'pimanager-service-account'.
  5. Re-enable jwt mode on pirelay + un-mask puppet.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 15:50:10 -05:00
Andrew Stoltz
65aa1e6104 fix(monitoring): point probe-printweb at /health (Q-MR-90)
Root path requires API key auth — `/` returned 401 to the blackbox
probe, firing PrintWebDown despite `/health` reporting Healthy.
Pattern: feedback_k8s_probes_behind_auth_middleware.

Mirrors FlowerCore.Notes scripts/monitoring/prometheus.yml.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 14:52:02 -05:00
Andrew Stoltz
7f2a3b76b4 feat(github-runner): bake Ruby 3.3 into Linux self-hosted runner image (Q-MR-81) 2026-05-20 11:45:43 -05:00
42 changed files with 3958 additions and 590 deletions

4
.gitattributes vendored Normal file
View File

@@ -0,0 +1,4 @@
# Treat this file, YAML manifests, and shell scripts as text with LF line endings.
/.gitattributes text eol=lf
*.yaml text eol=lf
*.yml text eol=lf
*.sh text eol=lf

View File

@@ -116,6 +116,16 @@ dotnet test tests/bluejay-infra-lint/BluejayInfraLint.Tests.csproj -c Release
That test project sweeps `bluejay-infra/apps/**` plus the canonical sibling `FlowerCore.*\\k8s` manifests that share the same workspace. Matching `conftest.dev` policy files live under `tests/bluejay-infra-lint/conftest.dev/` for environments that also have `conftest` or `opa`.
## Non-K8s Pi Artifacts
Some `apps/*` directories are deployment artifact bundles consumed by Puppet
instead of Kubernetes workloads. `apps/fc-signage-pi-player/` carries the
Chromium signage Pi player, `apps/fc-divoom-dm-pi-device/` carries the additive
edge2 Divoom-as-DeviceManagement-device profile/Hiera contract, and
`apps/fc-divoom-tv-pi/` carries the Divoom TV Pi HDMI systemd/Puppet shape.
These bundles intentionally avoid Deployment, IngressRoute, Certificate, and
OnePasswordItem resources.
## References
- OpenVox noc1 durability runbook: `docs/runbooks/openvoxserver-quadlet-durability.md`

View File

@@ -0,0 +1,448 @@
# Authentik OIDC backend
# ArgoCD-managed. BlueJay Lab.
#
# Stack:
# - PostgreSQL 16 StatefulSet (single replica, Longhorn RWO 5Gi)
# - Redis 7 Deployment (no persistence — session/cache only)
# - Authentik server + worker Deployments (image ghcr.io/goauthentik/server:2024.12.3)
# - Media PVC shared between server + worker (Longhorn RWO 2Gi)
# - Certificate via step-ca-acme ClusterIssuer
# - Traefik IngressRoute at id.iamworkin.lan
#
# Secrets come from 1Password item "authentik-credentials" (IAmWorkin vault, id y6i74ch22q5wvm7znquq4nhhcu)
# via the OnePasswordItem CRD, materialized into k8s Secret authentik/authentik-credentials.
#
# The discovery URL is /application/o/pimanager/ because Authentik issues per-application OIDC providers.
# The pimanager OIDC application/provider is created after the cluster pods are healthy (manual or
# via API once the bootstrap token is available — see Notes substrate).
---
# Namespace that contains every Authentik resource defined in this manifest.
apiVersion: v1
kind: Namespace
metadata:
  labels:
    app.kubernetes.io/part-of: bluejay-infra
  name: authentik
---
# The 1Password operator materialises the referenced item as a Kubernetes
# Secret with the same name (authentik-credentials). Each field label in
# 1Password becomes one Secret key:
#   AUTHENTIK_SECRET_KEY, POSTGRES_PASSWORD, REDIS_PASSWORD,
#   BOOTSTRAP_ADMIN_PASSWORD, BOOTSTRAP_ADMIN_TOKEN, BOOTSTRAP_ADMIN_EMAIL
apiVersion: onepassword.com/v1
kind: OnePasswordItem
metadata:
  namespace: authentik
  name: authentik-credentials
spec:
  itemPath: 'vaults/IAmWorkin/items/authentik-credentials'
---
# 2Gi Longhorn claim mounted at /media by both the server and the worker pods.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: authentik
  name: authentik-media
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      storage: 2Gi
---
# PostgreSQL 16 — Authentik's primary store, run as a single-replica StatefulSet.
# Data lives on a 5Gi Longhorn volume created from the claim template below;
# the retention policy keeps that claim when the set is deleted or scaled.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  namespace: authentik
  name: authentik-postgres
  labels:
    argocd.argoproj.io/instance: infra-authentik
    app: authentik-postgres
spec:
  serviceName: authentik-postgres
  replicas: 1
  revisionHistoryLimit: 10
  podManagementPolicy: OrderedReady
  persistentVolumeClaimRetentionPolicy:
    whenScaled: Retain
    whenDeleted: Retain
  selector:
    matchLabels:
      app: authentik-postgres
  template:
    metadata:
      labels:
        app: authentik-postgres
    spec:
      containers:
        - name: postgres
          image: postgres:16-alpine
          ports:
            - name: postgres
              containerPort: 5432
          env:
            - name: POSTGRES_USER
              value: authentik
            # Database password comes from the operator-managed Secret.
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  key: POSTGRES_PASSWORD
                  name: authentik-credentials
            - name: POSTGRES_DB
              value: authentik
            - name: POSTGRES_INITDB_ARGS
              value: '--encoding=UTF-8 --lc-collate=C --lc-ctype=C'
            # PGDATA is a subdirectory of the volume mount point below.
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
          # Both probes run pg_isready as the authentik user; readiness polls
          # every 5s, liveness every 30s after a 30s initial delay.
          readinessProbe:
            initialDelaySeconds: 5
            periodSeconds: 5
            exec:
              command:
                - pg_isready
                - "-U"
                - authentik
          livenessProbe:
            initialDelaySeconds: 30
            periodSeconds: 30
            exec:
              command:
                - pg_isready
                - "-U"
                - authentik
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 1000m
              memory: 1Gi
          volumeMounts:
            - mountPath: /var/lib/postgresql/data
              name: pgdata
  volumeClaimTemplates:
    - metadata:
        name: pgdata
      spec:
        accessModes:
          - ReadWriteOnce
        storageClassName: longhorn
        volumeMode: Filesystem
        resources:
          requests:
            storage: 5Gi
---
# Headless Service (clusterIP: None) in front of the PostgreSQL StatefulSet;
# it is the serviceName the StatefulSet refers to.
apiVersion: v1
kind: Service
metadata:
  namespace: authentik
  name: authentik-postgres
spec:
  clusterIP: None
  ports:
    - name: postgres
      targetPort: 5432
      port: 5432
  selector:
    app: authentik-postgres
---
# Redis 7 — session storage and Celery broker for Authentik.
# Persistence is switched off (no RDB snapshots, no AOF) because the data is
# cache-only; no volume is mounted.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: authentik
  name: authentik-redis
  labels:
    argocd.argoproj.io/instance: infra-authentik
    app: authentik-redis
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: authentik-redis
  template:
    metadata:
      labels:
        app: authentik-redis
    spec:
      containers:
        - name: redis
          image: redis:7-alpine
          env:
            - name: REDIS_PASSWORD
              valueFrom:
                secretKeyRef:
                  key: REDIS_PASSWORD
                  name: authentik-credentials
          # `--save ""` disables snapshots, `--appendonly no` disables the AOF,
          # and $(REDIS_PASSWORD) is expanded by Kubernetes from the env var above.
          args: ["--save", "", "--appendonly", "no", "--requirepass", "$(REDIS_PASSWORD)"]
          ports:
            - name: redis
              containerPort: 6379
          readinessProbe:
            initialDelaySeconds: 5
            periodSeconds: 5
            tcpSocket:
              port: 6379
          livenessProbe:
            initialDelaySeconds: 30
            periodSeconds: 30
            tcpSocket:
              port: 6379
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 500m
              memory: 256Mi
---
# Service exposing the Redis pod on port 6379 inside the cluster.
apiVersion: v1
kind: Service
metadata:
  namespace: authentik
  name: authentik-redis
spec:
  ports:
    - name: redis
      targetPort: 6379
      port: 6379
  selector:
    app: authentik-redis
---
# Authentik server Deployment — HTTP frontend on :9000 (HTTPS on :9443).
# Mounts the authentik-media claim at /media; that claim is ReadWriteOnce and
# is also mounted by the authentik-worker Deployment.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: authentik-server
  namespace: authentik
  labels:
    app: authentik-server
    argocd.argoproj.io/instance: infra-authentik
spec:
  replicas: 1
  strategy:
    type: Recreate # shares /media RWO PVC with worker
  selector:
    matchLabels:
      app: authentik-server
  template:
    metadata:
      labels:
        app: authentik-server
    spec:
      # A ReadWriteOnce volume can only be used by several pods when they run
      # on the same node. Prefer the node that already hosts the worker pod so
      # a rescheduled server does not stall on volume attach. This is a soft
      # preference on purpose: a hard rule here could leave the server
      # unschedulable when no worker pod exists yet.
      affinity:
        podAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                topologyKey: kubernetes.io/hostname
                labelSelector:
                  matchLabels:
                    app: authentik-worker
      securityContext:
        # Authentik image runs as uid 1000 "authentik" but the Longhorn PVC mounts
        # root:root by default. fsGroup recursively chgrp + chmod g+rwx so the
        # non-root container can mkdir /media/public during the tenant_files migration.
        fsGroup: 1000
      containers:
        - name: server
          image: ghcr.io/goauthentik/server:2024.12.3
          args: ["server"]
          ports:
            - containerPort: 9000
              name: http
            - containerPort: 9443
              name: https
          env:
            - name: AUTHENTIK_SECRET_KEY
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: AUTHENTIK_SECRET_KEY
            - name: AUTHENTIK_REDIS__HOST
              value: authentik-redis
            - name: AUTHENTIK_REDIS__PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: REDIS_PASSWORD
            - name: AUTHENTIK_POSTGRESQL__HOST
              value: authentik-postgres
            - name: AUTHENTIK_POSTGRESQL__NAME
              value: authentik
            - name: AUTHENTIK_POSTGRESQL__USER
              value: authentik
            - name: AUTHENTIK_POSTGRESQL__PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: POSTGRES_PASSWORD
            # Bootstrap admin credentials are only set on the server container.
            - name: AUTHENTIK_BOOTSTRAP_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: BOOTSTRAP_ADMIN_PASSWORD
            - name: AUTHENTIK_BOOTSTRAP_TOKEN
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: BOOTSTRAP_ADMIN_TOKEN
            - name: AUTHENTIK_BOOTSTRAP_EMAIL
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: BOOTSTRAP_ADMIN_EMAIL
            - name: AUTHENTIK_DISABLE_UPDATE_CHECK
              value: "true"
            - name: AUTHENTIK_ERROR_REPORTING__ENABLED
              value: "false"
            - name: AUTHENTIK_LOG_LEVEL
              value: info
          # First-boot Authentik can take 3+ min on the migration phase
          # (waiting on DB lock while worker also runs migrations). The
          # startupProbe holds off liveness/readiness checks until /-/health/live/
          # answers, with a 10.5 min budget; the liveness and readiness initial
          # delays are generous as well so kubelet doesn't kill the pod
          # mid-migration, while periodSeconds keeps post-startup probing responsive.
          readinessProbe:
            httpGet:
              path: /-/health/ready/
              port: 9000
            initialDelaySeconds: 60
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 12
          livenessProbe:
            httpGet:
              path: /-/health/live/
              port: 9000
            initialDelaySeconds: 300
            periodSeconds: 30
            timeoutSeconds: 10
            failureThreshold: 3
          startupProbe:
            httpGet:
              path: /-/health/live/
              port: 9000
            initialDelaySeconds: 30
            periodSeconds: 15
            timeoutSeconds: 10
            failureThreshold: 40 # 30s + 40*15s = 10.5 min budget
          resources:
            requests: { cpu: 150m, memory: 512Mi }
            limits: { cpu: 1500m, memory: 1Gi }
          volumeMounts:
            - name: media
              mountPath: /media
      volumes:
        - name: media
          persistentVolumeClaim:
            claimName: authentik-media
---
# Authentik worker Deployment — runs Celery background tasks.
# Uses the same image as the server with args ["worker"], and mounts the same
# ReadWriteOnce authentik-media claim at /media.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: authentik-worker
  namespace: authentik
  labels:
    app: authentik-worker
    argocd.argoproj.io/instance: infra-authentik
spec:
  replicas: 1
  strategy:
    type: Recreate # shares /media RWO PVC with server
  selector:
    matchLabels:
      app: authentik-worker
  template:
    metadata:
      labels:
        app: authentik-worker
    spec:
      # The media claim is ReadWriteOnce, which only permits access from
      # multiple pods when they run on the same node. Require the worker to be
      # scheduled onto the node that hosts the authentik-server pod; otherwise
      # the scheduler may place it elsewhere and the pod hangs on volume attach.
      # The rule is evaluated at scheduling time only (IgnoredDuringExecution).
      affinity:
        podAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - topologyKey: kubernetes.io/hostname
              labelSelector:
                matchLabels:
                  app: authentik-server
      securityContext:
        # Same as server pod — non-root uid 1000 needs PVC group write.
        fsGroup: 1000
      containers:
        - name: worker
          image: ghcr.io/goauthentik/server:2024.12.3
          args: ["worker"]
          # Same connection settings as the server container, minus the
          # AUTHENTIK_BOOTSTRAP_* variables.
          env:
            - name: AUTHENTIK_SECRET_KEY
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: AUTHENTIK_SECRET_KEY
            - name: AUTHENTIK_REDIS__HOST
              value: authentik-redis
            - name: AUTHENTIK_REDIS__PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: REDIS_PASSWORD
            - name: AUTHENTIK_POSTGRESQL__HOST
              value: authentik-postgres
            - name: AUTHENTIK_POSTGRESQL__NAME
              value: authentik
            - name: AUTHENTIK_POSTGRESQL__USER
              value: authentik
            - name: AUTHENTIK_POSTGRESQL__PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: POSTGRES_PASSWORD
            - name: AUTHENTIK_DISABLE_UPDATE_CHECK
              value: "true"
            - name: AUTHENTIK_ERROR_REPORTING__ENABLED
              value: "false"
            - name: AUTHENTIK_LOG_LEVEL
              value: info
          resources:
            requests: { cpu: 100m, memory: 256Mi }
            limits: { cpu: 1000m, memory: 768Mi }
          volumeMounts:
            - name: media
              mountPath: /media
      volumes:
        - name: media
          persistentVolumeClaim:
            claimName: authentik-media
---
# Service in front of the authentik-server pods; exposes the HTTP (9000) and
# HTTPS (9443) ports under the same numbers.
apiVersion: v1
kind: Service
metadata:
  name: authentik-server
  namespace: authentik
spec:
  selector:
    app: authentik-server
  ports:
    - { name: http, port: 9000, targetPort: 9000 }
    - { name: https, port: 9443, targetPort: 9443 }
---
# Leaf certificate for id.iamworkin.lan, issued by step-ca.
# Ordering requirement: the step-ca container resolves names through pfSense
# Unbound, so the public A record for id.iamworkin.lan MUST exist before this
# Certificate is applied. Without it cert-manager's HTTP-01 attempt silently
# falls into a 2h backoff. The record was added 2026-05-25 via
# scripts/pfsense-add-id-host.py.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: authentik-tls
  namespace: authentik
spec:
  issuerRef:
    kind: ClusterIssuer
    name: step-ca-acme
  secretName: authentik-tls
  dnsNames:
    - id.iamworkin.lan
---
# Traefik route: id.iamworkin.lan on the websecure entry point is forwarded to
# the authentik-server Service on port 9000, with TLS served from the
# authentik-tls secret.
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: authentik
  namespace: authentik
spec:
  entryPoints:
    - websecure
  tls:
    secretName: authentik-tls
  routes:
    - kind: Rule
      match: Host(`id.iamworkin.lan`)
      services:
        - name: authentik-server
          port: 9000

View File

@@ -0,0 +1,169 @@
# FlowerCore.AiStation.Web GitOps adoption manifest.
#
# Authored from the already-live fc-aistation resources on 2026-06-04.
# Keep the live image tag, Service ClusterIP, and PVC volumeName unchanged so
# ArgoCD adopts in place instead of replacing the workload or data volume.
---
# Claim that backs the /data mount of the aistation-web Deployment.
# volumeName names the PersistentVolume recorded from the live claim; keeping it
# unchanged is what lets a sync adopt the existing data volume in place rather
# than provisioning a new, empty one.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: aistation-web-data
namespace: fc-aistation
labels:
app.kubernetes.io/name: aistation-web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-aistation
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
storageClassName: longhorn
volumeMode: Filesystem
volumeName: pvc-27448d6f-6e66-42a7-a293-73dd8bbd6b3e
---
# aistation-web workload: one replica serving HTTP on container port 5000, with
# the aistation-web-data claim mounted at /data. The field set mirrors the live
# object (including server-defaulted fields) so adoption produces no rollout.
apiVersion: apps/v1
kind: Deployment
metadata:
name: aistation-web
namespace: fc-aistation
labels:
app.kubernetes.io/name: aistation-web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-aistation
spec:
progressDeadlineSeconds: 600
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app.kubernetes.io/name: aistation-web
# Recreate stops the old pod before starting the new one; the data claim is
# ReadWriteOnce and there is a single replica.
strategy:
type: Recreate
template:
metadata:
annotations:
prometheus.io/path: /metrics/prometheus
prometheus.io/port: "5000"
prometheus.io/scrape: "true"
labels:
app.kubernetes.io/name: aistation-web
app.kubernetes.io/part-of: flowercore
spec:
containers:
# NOTE(review): the aistation-web-config ConfigMap is not declared in this
# manifest; it has to exist in the namespace already or the container cannot
# be created. Consider bringing it under GitOps as well.
- envFrom:
- configMapRef:
name: aistation-web-config
image: localhost/fc-aistation-web:v20260602-aistation-owned-deploy-fix2
# Never pull: the image must already be present in the node's local image store.
imagePullPolicy: Never
# Liveness and readiness both probe GET /healthz on port 5000.
livenessProbe:
failureThreshold: 3
httpGet:
path: /healthz
port: 5000
scheme: HTTP
initialDelaySeconds: 30
periodSeconds: 30
successThreshold: 1
timeoutSeconds: 5
name: aistation-web
ports:
- containerPort: 5000
name: http
protocol: TCP
readinessProbe:
failureThreshold: 6
httpGet:
path: /healthz
port: 5000
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 5
# NOTE(review): no CPU/memory requests or limits are set (mirrors the live
# object); size these in a follow-up once adoption is complete.
resources: {}
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /data
name: data
dnsPolicy: ClusterFirst
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
terminationGracePeriodSeconds: 30
volumes:
- name: data
persistentVolumeClaim:
claimName: aistation-web-data
---
# ClusterIP Service mapping port 80 to container port 5000 of aistation-web.
# clusterIP/clusterIPs are pinned to the address recorded from the live Service
# so that adoption does not require the Service to be replaced.
apiVersion: v1
kind: Service
metadata:
name: aistation-web
namespace: fc-aistation
labels:
app.kubernetes.io/name: aistation-web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-aistation
spec:
clusterIP: 10.43.211.127
clusterIPs:
- 10.43.211.127
internalTrafficPolicy: Cluster
ipFamilies:
- IPv4
ipFamilyPolicy: SingleStack
ports:
- name: http
port: 80
protocol: TCP
targetPort: 5000
selector:
app.kubernetes.io/name: aistation-web
sessionAffinity: None
type: ClusterIP
---
# Certificate for aistation.iamworkin.lan from the step-ca-acme ClusterIssuer;
# the key pair is stored in the aistation-web-tls secret used by the IngressRoute.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: aistation-web-tls
namespace: fc-aistation
labels:
app.kubernetes.io/name: aistation-web-tls
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-aistation
spec:
dnsNames:
- aistation.iamworkin.lan
issuerRef:
kind: ClusterIssuer
name: step-ca-acme
secretName: aistation-web-tls
---
# Traefik route: aistation.iamworkin.lan on the websecure entry point is sent to
# the aistation-web Service on port 80; TLS uses the aistation-web-tls secret.
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: aistation-web
namespace: fc-aistation
labels:
app.kubernetes.io/name: aistation-web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-aistation
spec:
entryPoints:
- websecure
routes:
- kind: Rule
match: Host(`aistation.iamworkin.lan`)
services:
- name: aistation-web
port: 80
tls:
secretName: aistation-web-tls

View File

@@ -1,263 +0,0 @@
# fc-build-windows runner gate
Status: OPEN-WITH-OPERATOR-ACTION as of 2026-05-20.
This directory is intentionally not a live runner deployment. It records the
exact gate for bringing up the Windows self-hosted runner fleet without faking
capacity in GitHub or Kubernetes.
## Lane evidence
- `D:\git\FlowerCore\FlowerCore.Notes\docs\dashboards\decisions-waiting.html`
lines 15078-15085: Q-MR-82 says the Updater Windows Sandbox E2E run is
queued and `bluejay-ws-sandbox-1` is offline.
- `D:\git\FlowerCore\FlowerCore.Notes\memory\project_morning_routine_8_2026_05_20.md`:
Morning Routine #8 carries Q-MR-82 as the fleet-wide Windows runner gap.
- `D:\git\FlowerCore\FlowerCore.Notes\docs\standards\sprint-37-codex-dispatch-log-2026-05-19.md`
lines 76, 84-85, and 97: keep BLUEJAY-WS out of runner plans, merge Linux
runner expansion separately, and keep true Windows-only workflows parked on
the Windows runner host substrate path.
- `D:\git\FlowerCore\FlowerCore.Notes\docs\ai-agents\codex-prompts\2026-05-20-xxxxl-sprint-42-orchestrator-briefs.md`
lane Cx-5: land a deployment only if a Windows runner image/substrate is
ready; otherwise commit an operator-action gate.
- `D:\git\FlowerCore\FlowerCore.Notes\memory\feedback_bluejay_ws_never_a_github_runner.md`:
BLUEJAY-WS is operator-only territory; Windows runners belong on a dedicated
KubeVirt Windows VM such as `ci1` or a sibling VM.
## Live probe summary
Commands run on 2026-05-20 from `D:\git\FlowerCore\bluejay-infra`:
```powershell
$env:KUBECONFIG="$env:USERPROFILE\.kube\rke2.yaml"
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.kubernetes\.io/os}{"\n"}{end}'
```
Result: `rke2-agent1`, `rke2-agent2`, and `rke2-server` all report
`kubernetes.io/os=linux`. There is no Windows Kubernetes node, so Windows
containers on RKE2 cannot satisfy `fc-build-windows`.
```powershell
kubectl -n kubevirt-vms get vm,vmi,pods -o wide
```
Result: KubeVirt is healthy and `ci1` is `Running` / `Ready=True` on
`rke2-agent1` with VMI IP `10.42.103.35`.
```powershell
virtctl --kubeconfig $env:USERPROFILE\.kube\rke2.yaml port-forward vm/ci1.kubevirt-vms 15985:5985
```
Result during port tests: `dial tcp 10.42.103.35:5985: connect: no route to
host`. The same result was seen for RDP 3389 and SSH 22. The VM exists, but it
is not remotely reachable for runner bootstrap from this lane.
```powershell
gh api /repos/astoltz/FlowerCore.Updater/actions/runners `
--jq '.runners[]? | {name,status,busy,labels:[.labels[].name]}'
gh run list --repo astoltz/FlowerCore.Updater `
--workflow "Updater Windows Sandbox E2E" --limit 5
```
Result: GitHub has one Updater runner, `bluejay-ws-sandbox-1`, with
`status=offline`; run `26150689447` is still `queued`.
## Feasibility classification
### Option A: Windows containers on RKE2
Not feasible without operator-physical infrastructure work. Kubernetes Windows
containers require a Windows node. The current cluster has Linux-only RKE2
nodes.
### Option B: KubeVirt Windows VM
Partially present, not deployable from this lane.
`apps/kubevirt-vms/ci1.yaml` already defines a Windows Server 2025 KubeVirt VM
using `localhost/fc-win-server-2025:v1`, and the live VM is running. However:
- the guest is not reachable over RDP, WinRM, or SSH through `virtctl
port-forward`;
- the current root disk is a `containerDisk`, so runner installation inside the
running guest is not a durable fleet state unless the first-boot automation
re-registers on every boot or the VM is moved to a persistent PVC-backed
disk;
- FC.Updater `Updater Windows Sandbox E2E` uses
`[self-hosted, windows, windows-sandbox]`, while `fc-build-windows` build jobs
use `[self-hosted, windows, fc-build-windows]`. Do not advertise
`windows-sandbox` until Windows Sandbox has been proven in the guest.
### Option C: bluejay-ws-sandbox-1
Operator-only emergency fallback. GitHub shows it registered but offline. The
current memory says BLUEJAY-WS must not be a fleet runner host, so this lane
does not start or re-register it. If the operator deliberately overrides the
policy to drain an emergency queue, start the existing visible runner console
from the BLUEJAY-WS desktop and treat that as temporary break-glass, not the
permanent Q-MR-82 closure.
## Operator action plan
### 1. Pick the Windows host class
Use `ci1` or a sibling Windows Server 2025 VM for WPF build/test jobs that need
`fc-build-windows`.
Use a Windows 11 Pro/Enterprise KubeVirt VM for Updater or WorldBuilder
Windows Sandbox gates, unless Windows Sandbox support is explicitly proven on
the selected guest. The workflow labels must match the real capability:
- WPF build runner: `self-hosted,windows,fc-build-windows,ci1`
- Sandbox runner: `self-hosted,windows,windows-sandbox,ci-sandbox1`
### 2. Make the VM reachable and durable
From BLUEJAY-WS:
```powershell
$env:KUBECONFIG="$env:USERPROFILE\.kube\rke2.yaml"
kubectl -n kubevirt-vms get vm,vmi,pods -o wide
virtctl --kubeconfig $env:KUBECONFIG vnc ci1 -n kubevirt-vms
virtctl --kubeconfig $env:KUBECONFIG port-forward vm/ci1.kubevirt-vms 13389:3389
virtctl --kubeconfig $env:KUBECONFIG port-forward vm/ci1.kubevirt-vms 15985:5985
```
Before runner registration, fix the current port-forward failure. The expected
state is that RDP or WinRM accepts a connection through the control plane.
For durability, either:
- move the runner VM to a persistent PVC-backed root disk; or
- keep `containerDisk` and bake first-boot runner registration into the sysprep
flow using a non-expiring credential lookup path.
Do not install a runner by hand into a transient VM and call Q-MR-82 closed.
### 3. Install runner prerequisites inside the VM
Run in an elevated PowerShell session in the Windows runner guest:
```powershell
winget install Microsoft.DotNet.SDK.10 --silent
winget install Microsoft.DotNet.DesktopRuntime.8 --silent
winget install Microsoft.PowerShell --silent
winget install Git.Git --silent
winget install Microsoft.VisualStudio.2022.BuildTools --silent
winget install Google.Chrome --silent
```
For a Sandbox-capable runner only:
```powershell
Enable-WindowsOptionalFeature -Online -FeatureName Containers-DisposableClientVM -All
Restart-Computer -Force
```
After reboot:
```powershell
Get-CimInstance -ClassName Win32_OptionalFeature -Filter "Name='Containers-DisposableClientVM'"
Test-Path C:\Windows\System32\WindowsSandbox.exe
```
### 4. Register repo-scoped GitHub runners
The `astoltz` account uses repo-scoped runners. Generate a fresh one-hour
registration token per repo immediately before `config.cmd`.
From a trusted operator shell with `gh` authenticated:
```powershell
$repos = @(
"FlowerCore.Updater",
"FlowerCore.WorldBuilder",
"FlowerCore.DeviceManagement"
)
foreach ($repo in $repos) {
$token = gh api -X POST "/repos/astoltz/$repo/actions/runners/registration-token" --jq .token
$repoSlug = $repo.ToLowerInvariant().Replace("flowercore.", "").Replace(".", "-")
$runnerDir = "C:\fc-ghr\$repoSlug-fc-build-windows"
New-Item -ItemType Directory -Force -Path $runnerDir | Out-Null
Set-Location $runnerDir
if (-not (Test-Path ".\config.cmd")) {
Invoke-WebRequest `
-Uri "https://github.com/actions/runner/releases/download/v2.323.0/actions-runner-win-x64-2.323.0.zip" `
-OutFile "actions-runner.zip"
Add-Type -AssemblyName System.IO.Compression.FileSystem
[System.IO.Compression.ZipFile]::ExtractToDirectory((Resolve-Path actions-runner.zip), $runnerDir)
}
.\config.cmd `
--url "https://github.com/astoltz/$repo" `
--token $token `
--name "ci1-$repoSlug-fc-build-windows" `
--labels "self-hosted,windows,fc-build-windows,ci1" `
--work "_work" `
--unattended `
--replace
.\svc.ps1 install
.\svc.ps1 start
}
```
For Updater Sandbox E2E, register only after the guest proves Sandbox support,
and use `windows-sandbox` labels:
```powershell
$token = gh api -X POST "/repos/astoltz/FlowerCore.Updater/actions/runners/registration-token" --jq .token
.\config.cmd `
--url "https://github.com/astoltz/FlowerCore.Updater" `
--token $token `
--name "ci-sandbox1-updater" `
--labels "self-hosted,windows,windows-sandbox,ci-sandbox1" `
--work "_work" `
--unattended `
--replace
```
Keep registration tokens out of Git and logs. The durable credential source for
automation should be the existing 1Password item named `GitHub PAT (Runner
Registration)`, used only to mint short-lived repo registration tokens.
### 5. Verify GitHub and workflow pickup
```powershell
gh api /repos/astoltz/FlowerCore.Updater/actions/runners `
--jq '.runners[] | select(.labels[].name == "windows-sandbox") | {name,status,busy,labels:[.labels[].name]}'
gh api /repos/astoltz/FlowerCore.DeviceManagement/actions/runners `
--jq '.runners[] | select(.labels[].name == "fc-build-windows") | {name,status,busy,labels:[.labels[].name]}'
gh run list --repo astoltz/FlowerCore.Updater `
--workflow "Updater Windows Sandbox E2E" --limit 3
```
Q-MR-82 can be marked resolved only after the Updater run moves from `queued` to
`in_progress` or `completed` on an online runner, or after the affected WPF
build repos show online `fc-build-windows` repo-scoped runners and their queued
jobs start.
## Break-glass BLUEJAY-WS command
Only if the operator explicitly overrides the "BLUEJAY-WS is not a runner"
policy to drain a queue:
```powershell
Set-Location C:\fc-ghr\updater-sandbox
.\run.cmd
```
If a Windows service exists:
```powershell
Get-Service 'actions.runner.*'
Start-Service 'actions.runner.*'
```
This does not close Q-MR-82 permanently. It is a temporary queue drain until a
dedicated VM runner is online.

View File

@@ -1,4 +0,0 @@
# Kustomize entry point for this directory; the only resource it includes is
# the operator-gate ConfigMap.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- operator-gate-configmap.yaml

View File

@@ -1,61 +0,0 @@
# Operator-gate record for the fc-build-windows runner (Q-MR-82).
# This ConfigMap carries documentation only: the gate notes (gate.md) and two
# PowerShell snippets (runner registration pattern, verification commands).
# The live-runner annotation is "false" — it does not represent runner capacity.
apiVersion: v1
kind: ConfigMap
metadata:
name: fc-build-windows-operator-gate
namespace: kubevirt-vms
labels:
app.kubernetes.io/name: fc-build-windows
app.kubernetes.io/component: operator-gate
app.kubernetes.io/part-of: github-runner
flowercore.io/q-card: Q-MR-82
annotations:
flowercore.io/outcome: OPEN-WITH-OPERATOR-ACTION
flowercore.io/live-runner: "false"
data:
outcome: OPEN-WITH-OPERATOR-ACTION
gate.md: |
Do not treat this ConfigMap as runner capacity.
Current probe, 2026-05-20:
- RKE2 nodes are linux-only; Windows containers require a Windows node.
- KubeVirt `ci1` is Running/Ready, but RDP 3389, WinRM 5985, and SSH 22
through `virtctl port-forward` return `connect: no route to host`.
- GitHub Updater runner list has only `bluejay-ws-sandbox-1`, status
offline. Updater Windows Sandbox E2E run 26150689447 remains queued.
Required operator action:
1. Make a dedicated Windows VM reachable and durable.
2. Install .NET 10 SDK, .NET 8 Desktop Runtime, Git, VS Build Tools, and
PowerShell 7.
3. Register repo-scoped runners with short-lived GitHub registration tokens.
4. Add `fc-build-windows` labels only to WPF build-capable guests.
5. Add `windows-sandbox` labels only after Sandbox support is proven.
registration-token-pattern.ps1: |
$repo = "FlowerCore.Updater"
$token = gh api -X POST "/repos/astoltz/$repo/actions/runners/registration-token" --jq .token
$runnerDir = "C:\fc-ghr\updater-fc-build-windows"
New-Item -ItemType Directory -Force -Path $runnerDir | Out-Null
Set-Location $runnerDir
# Install the Actions runner package here if config.cmd is absent.
.\config.cmd `
--url "https://github.com/astoltz/$repo" `
--token $token `
--name "ci1-updater-fc-build-windows" `
--labels "self-hosted,windows,fc-build-windows,ci1" `
--work "_work" `
--unattended `
--replace
.\svc.ps1 install
.\svc.ps1 start
verification.ps1: |
gh api /repos/astoltz/FlowerCore.Updater/actions/runners `
--jq '.runners[] | {name,status,busy,labels:[.labels[].name]}'
gh run list --repo astoltz/FlowerCore.Updater `
--workflow "Updater Windows Sandbox E2E" --limit 3
$env:KUBECONFIG="$env:USERPROFILE\.kube\rke2.yaml"
kubectl -n kubevirt-vms get vm,vmi,pods -o wide

View File

@@ -1,5 +1,206 @@
# FlowerCore Chat — TLS + Ingress # FlowerCore Chat
# Deployment and Service managed by deploy script (not ArgoCD) #
# ArgoCD-managed workload plus TLS/Ingress. The chat-web-secret remains an
# out-of-band Secret until the values are moved into a 1Password-backed item;
# the Deployment references it as optional so GitOps can own the workload
# without storing secret material in this repo.
---
# Namespace that holds every FlowerCore Chat resource in this manifest.
apiVersion: v1
kind: Namespace
metadata:
  name: fc-chat
  labels: { app.kubernetes.io/part-of: flowercore }
---
# Non-secret settings for chat-web, injected into the container via envFrom.
apiVersion: v1
kind: ConfigMap
metadata:
  name: chat-web-config
  namespace: fc-chat
  labels:
    app.kubernetes.io/name: chat-web
    app.kubernetes.io/part-of: flowercore
data:
  # --- ASP.NET Core hosting ---
  ASPNETCORE_ENVIRONMENT: "Production"
  ASPNETCORE_URLS: "http://+:8080"
  ASPNETCORE_FORWARDEDHEADERS_ENABLED: "true"

  # --- Authentication / OIDC ---
  FlowerCore__Auth__Enabled: "false"
  FlowerCore__Auth__Oidc__Enabled: "true"
  FlowerCore__Auth__Oidc__Authority: "https://id.iamworkin.lan/application/o/chat/"
  FlowerCore__Auth__Oidc__Audience: "chat"
  FlowerCore__Auth__Oidc__ClientId: "chat"

  # --- Storage ---
  FlowerCore__Database__ConnectionStrings__Sqlite: "Data Source=/data/chat.db"

  # --- Ollama ---
  # Target moved on 2026-04-25 from the edge1 Pi5 (10.0.57.17) to the BLUEJAY-WS
  # workstation (10.0.56.20, RX 9070 XT 16GB, OLLAMA_HOST=0.0.0.0:11434, Vulkan
  # backend per feedback_rdna4_vulkan_broken). On the Pi5 every team-round
  # speaker hit the 300s per-turn cap (live-proven 2026-04-25 03:53 UTC, see
  # feedback_chat_team_round_edge1_too_slow). The workstation serves gemma3:4b
  # for the Cheap tier and gemma3:27b/phi4:14b/qwen3:14b for
  # Default/Balanced/Deep. Piper TTS (further down) stays on edge1: it is a
  # different service and the Pi handles TTS fine.
  FlowerCore__AI__OllamaBaseUrl: "http://10.0.56.20:11434"
  FlowerCore__AI__DefaultModelName: "phi4:14b"
  ChatOptions__BehaviorRuleEngine__OllamaBaseUrl: "http://10.0.56.20:11434"
  ChatOptions__BehaviorRuleEngine__FallbackOllamaBaseUrl: "http://10.0.57.17:11434"
  ChatOptions__BehaviorRuleEngine__ModelName: "gemma3:12b"

  # --- Memory / shared indexing ---
  FlowerCore__AI__Memory__UseSharedIndexingAdapter: "true"
  FlowerCore__AI__Memory__UseOllamaEmbeddings: "true"
  FlowerCore__AI__Memory__EmbeddingModel: "nomic-embed-text"
  FlowerCore__AI__Memory__EnableSharedIndexingBackfill: "true"
  FlowerCore__AI__Memory__SharedIndexingDatabasePath: "/data/chat-memory-index.db"

  # --- Skill endpoints ---
  FlowerCore__AI__Skills__Library__LibraryApiUrl: "http://library-web.fc-library.svc.cluster.local"
  FlowerCore__AI__Skills__Retail__RetailApiUrl: "http://retail-web.fc-retail.svc.cluster.local"
  FlowerCore__AI__Skills__Intranet__IntranetBaseUrl: "http://intranet-web.intranet.svc.cluster.local"
  FlowerCore__AI__Skills__Print__PrintMcpBaseUrl: "http://10.0.57.16:5200"

  # --- IRC bridge ---
  FlowerCore__AI__IrcBridge__Enabled: "true"
  FlowerCore__AI__IrcBridge__DefaultProfileSlug: "it-helpdesk"
  FlowerCore__AI__IrcBridge__MentionProfileSlug: "it-helpdesk"
  FlowerCore__AI__IrcBridge__MentionReactiveMode: "mentions-only"
  FlowerCore__AI__IrcBridge__AllowActionExecution: "false"

  # --- Voice (Piper TTS on edge1) ---
  FlowerCore__AI__Voice__Piper__Host: "10.0.57.17"
  FlowerCore__AI__Voice__Piper__Port: "10400"
  FlowerCore__AI__Voice__OutputRoot: "/data/audio"
  FlowerCore__AI__Voice__RetentionDays: "30"

  # --- LLM provider abstraction (ADR-088) ---
  # Anthropic stays disabled here. To enable Claude an operator flips
  # Enabled=true and mounts FlowerCore__Anthropic__ApiKey from the
  # onepassword-synced Secret (see docs/ai-agents/anthropic-integration.md).
  FlowerCore__Anthropic__Enabled: "false"
  FlowerCore__Anthropic__BaseUrl: "https://api.anthropic.com"
  FlowerCore__Anthropic__DefaultModel: "claude-sonnet-4-6"
  FlowerCore__Anthropic__CheapModel: "claude-haiku-4-5-20251001"
  FlowerCore__Anthropic__DeepModel: "claude-opus-4-7"
  FlowerCore__Budget__ResponseCacheEnabled: "true"

  # --- OpenTelemetry export ---
  OTEL_SERVICE_NAME: "FlowerCore.Chat"
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://otel-collector.monitoring.svc.cluster.local:4317"
  OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
---
# 1Gi Longhorn claim mounted at /data by chat-web; the ConfigMap points the
# SQLite databases and audio output at paths under /data.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: chat-web-data
  namespace: fc-chat
  labels:
    app.kubernetes.io/name: chat-web
    app.kubernetes.io/part-of: flowercore
spec:
  storageClassName: longhorn
  volumeMode: Filesystem
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 1Gi
---
# chat-web workload: a single replica pinned to rke2-server, listening on 8080,
# with the chat-web-data claim mounted at /data.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: chat-web
  namespace: fc-chat
  labels:
    app.kubernetes.io/name: chat-web
    app.kubernetes.io/part-of: flowercore
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: chat-web
  strategy:
    type: Recreate
  template:
    metadata:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: /metrics/prometheus
      labels:
        app.kubernetes.io/name: chat-web
        app.kubernetes.io/part-of: flowercore
    spec:
      nodeSelector:
        kubernetes.io/hostname: rke2-server
      securityContext:
        fsGroup: 1654
        fsGroupChangePolicy: OnRootMismatch
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: chat-web-data
      containers:
        - name: chat-web
          image: localhost/fc-chat-web:v20260603-oidc-authentik
          # Never pull: the image must already exist in the node's local image store.
          imagePullPolicy: Never
          ports:
            - name: http
              containerPort: 8080
          # Bulk configuration. chat-web-secret is optional so the pod can start
          # before that out-of-band Secret exists.
          envFrom:
            - configMapRef:
                name: chat-web-config
            - secretRef:
                name: chat-web-secret
                optional: true
          # OIDC settings read from the chat-oidc-client Secret. Explicit env
          # entries take precedence over envFrom values; every reference is
          # optional so a missing Secret or key does not block startup.
          env:
            - name: FlowerCore__Auth__Oidc__Authority
              valueFrom:
                secretKeyRef:
                  name: chat-oidc-client
                  key: issuer_url
                  optional: true
            - name: FlowerCore__Auth__Oidc__ClientId
              valueFrom:
                secretKeyRef:
                  name: chat-oidc-client
                  key: client_id
                  optional: true
            - name: FlowerCore__Auth__Oidc__ClientSecret
              valueFrom:
                secretKeyRef:
                  name: chat-oidc-client
                  key: client_secret
                  optional: true
          # Both probes call GET /healthz on the HTTP port.
          readinessProbe:
            httpGet: { path: /healthz, port: 8080 }
            initialDelaySeconds: 10
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 6
          livenessProbe:
            httpGet: { path: /healthz, port: 8080 }
            initialDelaySeconds: 30
            periodSeconds: 30
            timeoutSeconds: 5
            failureThreshold: 3
          resources:
            requests: { cpu: "100m", memory: "128Mi" }
            limits: { cpu: "500m", memory: "512Mi" }
          volumeMounts:
            - name: data
              mountPath: /data
---
# ClusterIP Service: port 80 forwards to container port 8080 on chat-web pods.
apiVersion: v1
kind: Service
metadata:
  name: chat-web
  namespace: fc-chat
  labels:
    app.kubernetes.io/name: chat-web
    app.kubernetes.io/part-of: flowercore
spec:
  type: ClusterIP
  ports:
    - { name: http, protocol: TCP, port: 80, targetPort: 8080 }
  selector:
    app.kubernetes.io/name: chat-web
--- ---
apiVersion: cert-manager.io/v1 apiVersion: cert-manager.io/v1
kind: Certificate kind: Certificate

View File

@@ -0,0 +1,45 @@
# FlowerCore Divoom DM Pi Device
Source-controlled Puppet/Hiera deployment contract for registering the edge2
Divoom MiniToo panel as a FlowerCore DeviceManagement-managed Pi device.
This is not a Kubernetes application. The live panel remains the existing
edge2 `flowercore-divoom.service` managed by `FlowerCore.Puppet`
`profile::pi::service::divoom`, with the .NET payload deployed out of band
and `/opt/flowercore/divoom/data` plus the Bluetooth shell wrappers preserved.
Because edge2 is already Hiera-driven through `profile::pi::service::apps`,
the deploy home is additive `profile::pi::service` data/profile source, not
`profile::edge::service::apps` and not an ArgoCD/K8s app.
## Scope
- Stage DeviceManagement registration metadata for the edge2 Divoom MiniToo.
- Stage a separate, disabled-by-default DM Agent executor unit for privileged
Bluetooth operations once the DM-RPC lane lands.
- Keep `flowercore-divoom.service` and `flowercore-divoom-bt.service`
untouched: no service replacement, no restart subscription, no K8s surface.
- Preserve the current wrapper contract:
`/opt/flowercore/divoom/bt-link.sh`,
`/opt/flowercore/divoom/bt-reset.sh`, and
`/opt/flowercore/divoom/audio-link.sh`.
- Keep FM radio disabled and require visible render proof; device-info echo is
not render proof.
## Artifact Map
| Path | Use |
| --- | --- |
| `hiera/edge2-divoom-dm-device.overlay.yaml` | Additive Hiera overlay for edge2. Merge into the existing node YAML without removing `fc-pimanager` or `fc-divoom`. |
| `puppet/profile/pi/service/divoom_dm_device.pp` | Puppet profile shape to vendor into `FlowerCore.Puppet` after the DM-RPC executor binary exists. |
| `puppet/templates/divoom-device-registration.json.epp` | DM device registration metadata rendered on edge2. |
| `puppet/templates/flowercore-divoom-dm-agent.service.epp` | Separate DM Agent systemd unit. Defaults are stopped and disabled until a later cutover. |
## Rollout Notes
1. Land these artifacts in bluejay-infra as the deploy contract.
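2. Vendor the Puppet profile and EPP templates into `FlowerCore.Puppet`. The profile loads the templates as `profile/pi/fc_divoom_dm/<name>.epp`, so they belong under the `profile` module's `templates/pi/fc_divoom_dm/` directory.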
3. Merge the Hiera overlay into `data/nodes/edge2.iamworkin.lan.yaml`.
4. Run Puppet in noop first, preferably with a node-local validation directory
under `~/.fcv` rather than `/tmp`.
5. Only enable the DM Agent service after the DeviceManagement BT executor has
landed and passed operator-eyeball render proof.

View File

@@ -0,0 +1,32 @@
---
# Hiera overlay for edge2. Merge into FlowerCore.Puppet
# data/nodes/edge2.iamworkin.lan.yaml.
# Strictly additive: the existing fc-pimanager version/tarball entry stays,
# fc-divoom stays enabled, and Divoom is not moved into Kubernetes.
# NOTE(review): the fc-pimanager values below are a snapshot — when merging,
# keep whatever version the node file currently carries.
profile::pi::service::apps:
  fc-pimanager:
    binary: 'FlowerCore.PiManager.Web'
    install_dir: '/opt/fc-pimanager'
    port: 5000
    environment: 'edge2'
    version: '2026.05.28.1646'
    tarball_source: 'puppet:///modules/profile/pi/builds/fc-pimanager.tar.gz'
  fc-divoom:
    enabled: true

# Resource state: files are staged, but the agent service is stopped and disabled.
profile::pi::service::divoom_dm_device::ensure: 'present'
profile::pi::service::divoom_dm_device::service_enabled: false
profile::pi::service::divoom_dm_device::service_ensure: 'stopped'

# Device identity and endpoints.
profile::pi::service::divoom_dm_device::device_id: 'edge2-divoom-minitoo'
profile::pi::service::divoom_dm_device::display_name: 'edge2 Divoom MiniToo'
profile::pi::service::divoom_dm_device::host_fqdn: 'edge2.iamworkin.lan'
profile::pi::service::divoom_dm_device::dm_web_url: 'https://devicemgmt.iamworkin.lan'

# Install locations.
profile::pi::service::divoom_dm_device::divoom_install_dir: '/opt/flowercore/divoom'
profile::pi::service::divoom_dm_device::agent_install_dir: '/opt/flowercore/devicemanagement-agent'

# Bluetooth / audio defaults (channels are strings, matching Array[String]).
profile::pi::service::divoom_dm_device::bt_candidate_channels: ['1', '10']
profile::pi::service::divoom_dm_device::default_bt_channel: '1'
profile::pi::service::divoom_dm_device::a2dp_default_state: 'off'

# Safety switches.
profile::pi::service::divoom_dm_device::fm_radio_enabled: false
profile::pi::service::divoom_dm_device::visible_render_proof_required: true

View File

@@ -0,0 +1,140 @@
# @summary Stages DeviceManagement registration metadata and a DM Agent systemd
#   unit for the Divoom MiniToo panel.
#
# Drop into FlowerCore.Puppet site-modules/profile/manifests/pi/service/divoom_dm_device.pp.
# This profile is additive to profile::pi::service::divoom. It must not manage,
# restart, replace, or subscribe the existing flowercore-divoom.service.
#
# Behaviour by branch:
#   * safe-account host: emits a notify and, on non-Windows hosts, an audit
#     marker file; nothing else is managed.
#   * ensure => 'absent': stops and disables the agent service, removes its unit
#     file and the registration JSON, then reloads systemd.
#   * ensure => 'present': Debian-family only. Creates the config/state/log
#     directories, renders the registration JSON and the systemd unit, and
#     manages the service according to $service_ensure / $service_enabled.
#
# NOTE(review): Puppet file resources do not create missing parent directories.
# /etc/flowercore/device-management, /var/lib/flowercore and /var/log/flowercore
# are assumed to be managed elsewhere — confirm before the first non-noop run.
#
# @param ensure Whether the registration file and agent unit are present or removed.
# @param service_enabled Value passed to the agent service's `enable` attribute.
# @param service_ensure Value passed to the agent service's `ensure` attribute.
# @param service_name systemd unit name (without the `.service` suffix).
# @param device_id Device identifier; also the basename of the registration JSON.
# @param display_name Display name written into the registration JSON.
# @param host_fqdn Host FQDN written into the registration JSON.
# @param dm_web_url DeviceManagement URL handed to the unit template.
# @param divoom_install_dir Install directory of the existing Divoom service and wrappers.
# @param agent_install_dir Install directory of the DM Agent.
# @param agent_binary Agent executable name, joined onto $agent_install_dir.
# @param bt_candidate_channels Bluetooth channels rendered as a JSON array of strings.
# @param default_bt_channel Default Bluetooth channel written into the registration JSON.
# @param a2dp_default_state Default A2DP state written into the registration JSON.
# @param fm_radio_enabled FM radio flag written into the registration JSON.
# @param visible_render_proof_required Render-proof flag written into the registration JSON.
class profile::pi::service::divoom_dm_device (
  Enum['present', 'absent']  $ensure                        = 'present',
  Boolean                    $service_enabled               = false,
  Enum['running', 'stopped'] $service_ensure                = 'stopped',
  String                     $service_name                  = 'flowercore-divoom-dm-agent',
  String                     $device_id                     = 'edge2-divoom-minitoo',
  String                     $display_name                  = 'edge2 Divoom MiniToo',
  String                     $host_fqdn                     = 'edge2.iamworkin.lan',
  String                     $dm_web_url                    = 'https://devicemgmt.iamworkin.lan',
  String                     $divoom_install_dir            = '/opt/flowercore/divoom',
  String                     $agent_install_dir             = '/opt/flowercore/devicemanagement-agent',
  String                     $agent_binary                  = 'FlowerCore.DeviceManagement.Agent',
  Array[String]              $bt_candidate_channels         = ['1', '10'],
  String                     $default_bt_channel            = '1',
  Enum['on', 'off']          $a2dp_default_state            = 'off',
  Boolean                    $fm_radio_enabled              = false,
  Boolean                    $visible_render_proof_required = true,
) {
  include profile::workstation::safe_account_exclusion

  $safe_account      = $profile::workstation::safe_account_exclusion::safe_account
  $config_dir        = '/etc/flowercore/device-management/devices'
  $state_dir         = '/var/lib/flowercore/divoom-dm-agent'
  $log_dir           = '/var/log/flowercore/divoom-dm-agent'
  $registration_path = "${config_dir}/${device_id}.json"
  $agent_binary_path = "${agent_install_dir}/${agent_binary}"
  $unit_path         = "/etc/systemd/system/${service_name}.service"
  # Pre-render the channel list as a JSON array of quoted strings for the template.
  $bt_channels_json  = inline_template('[<%= @bt_candidate_channels.map { |c| "\"#{c}\"" }.join(", ") %>]')

  if $safe_account {
    # Refuse to apply on a safe-account host; leave an audit trail only.
    notify { 'fc-divoom-dm-device safe-account exclusion':
      message => 'SAFE-ACCOUNT-EXCLUSION: Divoom DM Pi device profile refused to apply on operator workstation',
    }

    if $facts['os']['family'] != 'windows' {
      ensure_resource('file', '/var/log/flowercore-audit', {
        'ensure' => 'directory',
        'owner'  => 'root',
        'group'  => 'root',
        'mode'   => '0755',
      })

      file { '/var/log/flowercore-audit/safe-account-noop-fc-divoom-dm-device.log':
        ensure  => file,
        owner   => 'root',
        group   => 'root',
        mode    => '0644',
        content => "noop: divoom dm pi device profile refused to apply on safe-account host\n",
        require => File['/var/log/flowercore-audit'],
      }
    }
  } elsif $ensure == 'absent' {
    service { $service_name:
      ensure => stopped,
      enable => false,
    }

    # Remove the unit only after the service has been stopped, and notify the
    # refresh-only daemon-reload so systemd drops the deleted unit. Without the
    # notify the exec below would never run.
    file { $unit_path:
      ensure  => absent,
      require => Service[$service_name],
      notify  => Exec['fc-divoom-dm-agent-systemd-reload'],
    }

    file { $registration_path:
      ensure  => absent,
      require => Service[$service_name],
    }

    exec { 'fc-divoom-dm-agent-systemd-reload':
      command     => '/usr/bin/systemctl daemon-reload',
      refreshonly => true,
      path        => ['/usr/bin', '/bin'],
    }
  } else {
    case $facts['os']['family'] {
      'Debian': {}
      default: { fail("profile::pi::service::divoom_dm_device only supports Debian-family OS, got ${facts['os']['family']}") }
    }

    file { [$config_dir, $state_dir, $log_dir]:
      ensure => directory,
      owner  => 'root',
      group  => 'root',
      mode   => '0755',
    }

    file { $registration_path:
      ensure  => file,
      owner   => 'root',
      group   => 'root',
      mode    => '0644',
      content => epp('profile/pi/fc_divoom_dm/divoom-device-registration.json.epp', {
        'device_id'                     => $device_id,
        'display_name'                  => $display_name,
        'host_fqdn'                     => $host_fqdn,
        'divoom_install_dir'            => $divoom_install_dir,
        'bt_channels_json'              => $bt_channels_json,
        'default_bt_channel'            => $default_bt_channel,
        'a2dp_default_state'            => $a2dp_default_state,
        'fm_radio_enabled'              => $fm_radio_enabled,
        'visible_render_proof_required' => $visible_render_proof_required,
      }),
      require => File[$config_dir],
    }

    # A unit change triggers daemon-reload only; the service is not subscribed
    # to it, so its state is driven solely by $service_ensure.
    file { $unit_path:
      ensure  => file,
      owner   => 'root',
      group   => 'root',
      mode    => '0644',
      content => epp('profile/pi/fc_divoom_dm/flowercore-divoom-dm-agent.service.epp', {
        'service_name'       => $service_name,
        'device_id'          => $device_id,
        'dm_web_url'         => $dm_web_url,
        'registration_path'  => $registration_path,
        'divoom_install_dir' => $divoom_install_dir,
        'agent_install_dir'  => $agent_install_dir,
        'agent_binary_path'  => $agent_binary_path,
        'state_dir'          => $state_dir,
        'log_dir'            => $log_dir,
      }),
      notify  => Exec['fc-divoom-dm-agent-systemd-reload'],
      require => File[$registration_path],
    }

    exec { 'fc-divoom-dm-agent-systemd-reload':
      command     => '/usr/bin/systemctl daemon-reload',
      refreshonly => true,
      path        => ['/usr/bin', '/bin'],
    }

    service { $service_name:
      ensure  => $service_ensure,
      enable  => $service_enabled,
      require => [
        File[$unit_path],
        File[$registration_path],
        Exec['fc-divoom-dm-agent-systemd-reload'],
      ],
    }
  }
}

View File

@@ -0,0 +1,34 @@
{
"deviceId": "<%= $device_id %>",
"displayName": "<%= $display_name %>",
"hostFqdn": "<%= $host_fqdn %>",
"kind": "DivoomMiniToo",
"managedBy": "FlowerCore.DeviceManagement",
"executionMode": "Pi",
"transport": {
"kind": "BluetoothSerial",
"candidateChannels": <%= $bt_channels_json %>,
"defaultChannel": "<%= $default_bt_channel %>",
"deviceInfoIsRenderProof": false,
"visibleRenderProofRequired": <%= $visible_render_proof_required %>
},
"paths": {
"divoomInstallDir": "<%= $divoom_install_dir %>",
"btLink": "<%= $divoom_install_dir %>/bt-link.sh",
"btReset": "<%= $divoom_install_dir %>/bt-reset.sh",
"audioLink": "<%= $divoom_install_dir %>/audio-link.sh"
},
"capabilities": {
"supportsBluetoothSerial": true,
"supportsBtChannelRedetect": true,
"supportsBtHardReset": true,
"supportsBtAudioProfileSwitch": true,
"a2dpDefaultState": "<%= $a2dp_default_state %>",
"fmRadioEnabled": <%= $fm_radio_enabled %>
},
"safety": {
"preserveExistingService": "flowercore-divoom.service",
"preserveDataDirectory": "<%= $divoom_install_dir %>/data",
"doNotEnableFmRadio": true
}
}

View File

@@ -0,0 +1,36 @@
# systemd unit template for the FlowerCore Divoom DM Agent (Bluetooth executor).
# Rendered by Puppet; every <%= ... %> value is supplied by the calling profile.
[Unit]
Description=FlowerCore Divoom DM Agent Bluetooth executor
Documentation=https://github.com/astoltz/FlowerCore.Notes/blob/master/docs/standards/divoom-tv-hdmi-multitarget-render-substrate.md
Wants=network-online.target
After=network-online.target bluetooth.service
Requires=bluetooth.service
# Start rate limiting is a [Unit] setting. systemd does not recognise
# StartLimitIntervalSec= inside [Service], so it was ignored there and the
# 3-starts-per-300s budget never took effect.
StartLimitIntervalSec=300s
StartLimitBurst=3
# The unit is skipped (not failed) until the agent binary, the registration
# document, and the existing Divoom helper scripts are all present.
ConditionPathExists=<%= $agent_binary_path %>
ConditionPathExists=<%= $registration_path %>
ConditionPathExists=<%= $divoom_install_dir %>/bt-link.sh
ConditionPathExists=<%= $divoom_install_dir %>/bt-reset.sh
ConditionPathExists=<%= $divoom_install_dir %>/audio-link.sh

[Service]
Type=simple
User=stoltz
Group=stoltz
WorkingDirectory=<%= $agent_install_dir %>
Environment=DOTNET_CLI_TELEMETRY_OPTOUT=1
Environment=FLOWERCORE_DM_DEVICE_REGISTRATION=<%= $registration_path %>
Environment=Divoom__Bluetooth__DeviceInfoIsRenderProof=false
Environment=Divoom__Bluetooth__VisibleRenderProofRequired=true
Environment=Divoom__Bluetooth__A2dpDefaultState=off
ExecStart=<%= $agent_binary_path %> --mode=Pi --device-id=<%= $device_id %> --dm-web-url=<%= $dm_web_url %> --registration=<%= $registration_path %>
Restart=on-failure
RestartSec=10s
SupplementaryGroups=bluetooth audio dialout
# Sandboxing: the file system is read-only for the agent except the two
# directories listed in ReadWritePaths.
# NOTE(review): those directories must be writable by User=stoltz; confirm the
# ownership the Puppet profile assigns to them.
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=<%= $state_dir %> <%= $log_dir %>

[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,44 @@
# FlowerCore Divoom TV Pi HDMI
Source-controlled deploy shape for the native `FlowerCore.Divoom.Tv`
Avalonia HDMI renderer on a Raspberry Pi connected to a TV.
This is a Puppet/systemd appliance bundle, not a Kubernetes application. It
mirrors the existing `fc-signage-pi-player` pattern: bluejay-infra carries the
systemd units, scripts, Hiera shape, and Puppet profile source that
`FlowerCore.Puppet` vendors and installs.
## Scope
- Launch the future `FlowerCore.Divoom.Tv` linux-arm64 self-contained payload
from `/opt/flowercore/divoom-tv/FlowerCore.Divoom.Tv`.
- Prefer `cage` as the Wayland fullscreen compositor, with direct app launch as
a fallback for development images.
- Restart the app after HDMI hotplug with a 2 second DRM settle delay.
- Keep all runtime state local: `/var/lib/fc-divoom-tv` and
`/var/log/fc-divoom-tv`.
- Avoid CDN/runtime fetches; the app renders the in-house Divoom scene catalog
locally.
## Artifact Map
| Path | Use |
| --- | --- |
| `systemd/flowercore-divoom-tv.service` | Fullscreen Avalonia HDMI app service. |
| `systemd/flowercore-divoom-tv-hdmi.service` | HDMI hotplug responder service. |
| `systemd/99-flowercore-divoom-tv-hdmi.rules` | DRM udev hotplug rule. |
| `scripts/flowercore-divoom-tv-prelaunch.sh` | Preflight checks and local directory creation. |
| `scripts/flowercore-divoom-tv-launch.sh` | Cage-first fullscreen launcher. |
| `scripts/flowercore-divoom-tv-hdmi-respond.sh` | Hotplug settle and restart script. |
| `puppet/profile/pi/service/divoom_tv.pp` | Puppet profile shape to vendor into `FlowerCore.Puppet`. |
| `hiera/example-divoom-tv-pi.iamworkin.lan.yaml` | Example node Hiera for a Divoom TV Pi. |
## Rollout Notes
1. Build `FlowerCore.Divoom.Tv` with `dotnet.exe publish -c Release -r linux-arm64 --self-contained`.
2. Stage the payload to `/opt/flowercore/divoom-tv/` through the standard noc1
jump path and avoid `/tmp` for unprivileged Pi scratch.
3. Vendor the profile and static files into `FlowerCore.Puppet`.
4. Run Puppet noop, then apply on the target Pi.
5. Prove deployment with `systemctl is-active flowercore-divoom-tv.service`,
journal lines showing frames presented, and a visible HDMI display check.

View File

@@ -0,0 +1,19 @@
---
# Example node data for a dedicated Pi -> HDMI -> TV Divoom renderer.
# Copy into FlowerCore.Puppet data/nodes/<hostname>.iamworkin.lan.yaml only
# after the Pi has a static DHCP/DNS entry and the linux-arm64 payload exists.
facts:
role: pi_prototype
profile::motd::role: 'Divoom TV HDMI Renderer'
profile::pi::service::divoom_tv::ensure: 'present'
profile::pi::service::divoom_tv::service_enabled: true
profile::pi::service::divoom_tv::service_ensure: 'running'
profile::pi::service::divoom_tv::install_dir: '/opt/flowercore/divoom-tv'
profile::pi::service::divoom_tv::state_dir: '/var/lib/fc-divoom-tv'
profile::pi::service::divoom_tv::log_dir: '/var/log/fc-divoom-tv'
profile::pi::service::divoom_tv::presentation_mode: 'PillarboxSquare'
profile::pi::service::divoom_tv::startup_scene: 'bluejay-clock'
profile::pi::service::divoom_tv::reduced_motion: false

View File

@@ -0,0 +1,149 @@
# Drop into FlowerCore.Puppet site-modules/profile/manifests/pi/service/divoom_tv.pp.
# Static files come from profile/pi/fc_divoom_tv/ after this bluejay-infra
# bundle is vendored into the Puppet control repo.
#
# @summary Installs or removes the FlowerCore Divoom TV HDMI renderer appliance.
#
# Manages the service account, runtime directories, environment file, launcher
# scripts, systemd units, and the DRM hotplug udev rule. Refuses to apply on an
# operator workstation (safe-account exclusion).
#
# @param ensure 'present' installs the appliance; 'absent' stops the service and
#   removes the managed unit, rule, script, and env files.
# @param service_enabled Whether the renderer service is enabled at boot.
# @param service_ensure Desired run state of the renderer service.
# @param service_name systemd unit base name (without the .service suffix).
# @param user System account that owns the runtime directories.
# @param group Primary group of the system account.
# @param install_dir Directory that holds the published renderer payload.
# @param state_dir Local state directory (also the account's home).
# @param log_dir Local log directory.
# @param presentation_mode Written to FC_DIVOOM_TV_PRESENTATION_MODE.
# @param startup_scene Written to FC_DIVOOM_TV_START_SCENE.
# @param reduced_motion Written to FC_DIVOOM_TV_REDUCED_MOTION.
#
# NOTE(review): the vendored unit files are static and hard-code the account
# name and the default paths, so overriding $user, $install_dir, $state_dir or
# $log_dir also requires matching edits to those unit files.
class profile::pi::service::divoom_tv (
  Enum['present', 'absent']  $ensure            = 'present',
  Boolean                    $service_enabled   = false,
  Enum['running', 'stopped'] $service_ensure    = 'stopped',
  String                     $service_name      = 'flowercore-divoom-tv',
  String                     $user              = 'fc-divoom-tv',
  String                     $group             = 'fc-divoom-tv',
  String                     $install_dir       = '/opt/flowercore/divoom-tv',
  String                     $state_dir         = '/var/lib/fc-divoom-tv',
  String                     $log_dir           = '/var/log/fc-divoom-tv',
  String                     $presentation_mode = 'PillarboxSquare',
  String                     $startup_scene     = 'bluejay-clock',
  Boolean                    $reduced_motion    = false,
) {
  include profile::workstation::safe_account_exclusion

  $safe_account = $profile::workstation::safe_account_exclusion::safe_account

  if $safe_account {
    notify { 'fc-divoom-tv safe-account exclusion':
      message => 'SAFE-ACCOUNT-EXCLUSION: Divoom TV Pi profile refused to apply on operator workstation',
    }
  } elsif $ensure == 'absent' {
    # Stop and disable first so nothing is running from files about to vanish.
    service { $service_name:
      ensure => stopped,
      enable => false,
    }

    # Removing unit files must be followed by a daemon-reload, otherwise systemd
    # keeps the stale unit definitions loaded.
    file { [
        "/etc/systemd/system/${service_name}.service",
        "/etc/systemd/system/${service_name}-hdmi.service",
      ]:
        ensure  => absent,
        require => Service[$service_name],
        notify  => Exec['fc-divoom-tv-systemd-reload'],
    }

    # Likewise udev must re-read its rules once the hotplug rule is gone.
    file { '/etc/udev/rules.d/99-flowercore-divoom-tv-hdmi.rules':
      ensure => absent,
      notify => Exec['fc-divoom-tv-udev-reload'],
    }

    file { [
        '/usr/local/bin/flowercore-divoom-tv-prelaunch.sh',
        '/usr/local/bin/flowercore-divoom-tv-launch.sh',
        '/usr/local/bin/flowercore-divoom-tv-hdmi-respond.sh',
        '/etc/flowercore/divoom-tv.env',
      ]:
        ensure  => absent,
        require => Service[$service_name],
    }

    exec { 'fc-divoom-tv-systemd-reload':
      command     => '/usr/bin/systemctl daemon-reload',
      refreshonly => true,
      path        => ['/usr/bin', '/bin'],
    }

    exec { 'fc-divoom-tv-udev-reload':
      command     => '/usr/bin/udevadm control --reload-rules',
      refreshonly => true,
      path        => ['/usr/bin', '/bin'],
    }
  } else {
    case $facts['os']['family'] {
      'Debian': {}
      default: { fail("profile::pi::service::divoom_tv only supports Debian-family OS, got ${facts['os']['family']}") }
    }

    package { ['cage', 'libgbm1', 'libdrm2', 'libxkbcommon0', 'fonts-dejavu-core']:
      ensure => installed,
    }

    group { $group:
      ensure => present,
      system => true,
    }

    user { $user:
      ensure     => present,
      system     => true,
      gid        => $group,
      home       => $state_dir,
      managehome => false,
      shell      => '/usr/sbin/nologin',
      require    => Group[$group],
    }

    file { [$install_dir, $state_dir, $log_dir, '/etc/flowercore']:
      ensure => directory,
      owner  => $user,
      group  => $group,
      mode   => '0755',
    }

    # Environment consumed by the launcher script through the unit's
    # EnvironmentFile= directive.
    file { '/etc/flowercore/divoom-tv.env':
      ensure  => file,
      owner   => 'root',
      group   => 'root',
      mode    => '0644',
      content => "FC_DIVOOM_TV_PRESENTATION_MODE=${presentation_mode}\nFC_DIVOOM_TV_START_SCENE=${startup_scene}\nFC_DIVOOM_TV_REDUCED_MOTION=${reduced_motion}\n",
      require => File['/etc/flowercore'],
    }

    $script_map = {
      '/usr/local/bin/flowercore-divoom-tv-prelaunch.sh'    => 'profile/pi/fc_divoom_tv/flowercore-divoom-tv-prelaunch.sh',
      '/usr/local/bin/flowercore-divoom-tv-launch.sh'       => 'profile/pi/fc_divoom_tv/flowercore-divoom-tv-launch.sh',
      '/usr/local/bin/flowercore-divoom-tv-hdmi-respond.sh' => 'profile/pi/fc_divoom_tv/flowercore-divoom-tv-hdmi-respond.sh',
    }

    $script_map.each |$dest, $src| {
      file { $dest:
        ensure => file,
        owner  => 'root',
        group  => 'root',
        mode   => '0755',
        source => "puppet:///modules/${src}",
      }
    }

    $unit_map = {
      "/etc/systemd/system/${service_name}.service"      => 'profile/pi/fc_divoom_tv/flowercore-divoom-tv.service',
      "/etc/systemd/system/${service_name}-hdmi.service" => 'profile/pi/fc_divoom_tv/flowercore-divoom-tv-hdmi.service',
    }

    $unit_map.each |$dest, $src| {
      file { $dest:
        ensure => file,
        owner  => 'root',
        group  => 'root',
        mode   => '0644',
        source => "puppet:///modules/${src}",
        notify => Exec['fc-divoom-tv-systemd-reload'],
      }
    }

    file { '/etc/udev/rules.d/99-flowercore-divoom-tv-hdmi.rules':
      ensure => file,
      owner  => 'root',
      group  => 'root',
      mode   => '0644',
      source => 'puppet:///modules/profile/pi/fc_divoom_tv/99-flowercore-divoom-tv-hdmi.rules',
      notify => Exec['fc-divoom-tv-udev-reload'],
    }

    exec { 'fc-divoom-tv-systemd-reload':
      command     => '/usr/bin/systemctl daemon-reload',
      refreshonly => true,
      path        => ['/usr/bin', '/bin'],
    }

    exec { 'fc-divoom-tv-udev-reload':
      command     => '/usr/bin/udevadm control --reload-rules',
      refreshonly => true,
      path        => ['/usr/bin', '/bin'],
    }

    # subscribe (rather than require) on the unit and env file so a running
    # renderer is restarted when either changes; Puppet skips the restart when
    # the service is not running. The daemon-reload exec is still ordered first.
    service { $service_name:
      ensure    => $service_ensure,
      enable    => $service_enabled,
      require   => [
        File['/usr/local/bin/flowercore-divoom-tv-prelaunch.sh'],
        File['/usr/local/bin/flowercore-divoom-tv-launch.sh'],
        Exec['fc-divoom-tv-systemd-reload'],
      ],
      subscribe => [
        File["/etc/systemd/system/${service_name}.service"],
        File['/etc/flowercore/divoom-tv.env'],
      ],
    }
  }
}

View File

@@ -0,0 +1,5 @@
#!/usr/bin/env bash
# HDMI hotplug responder: give the display pipeline a moment to settle, then
# restart the fullscreen renderer service.
set -euo pipefail

readonly settle_seconds=2
readonly renderer_unit='flowercore-divoom-tv.service'

sleep "${settle_seconds}"
systemctl restart "${renderer_unit}"

View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# Fullscreen launcher for FlowerCore.Divoom.Tv.
# Runs the renderer inside the cage compositor when cage is installed and falls
# back to launching the binary directly otherwise. Extra command-line arguments
# are appended after the standard renderer arguments.
set -euo pipefail

# Settings come from the environment, with built-in defaults.
app_bin="${FC_DIVOOM_TV_BIN:-/opt/flowercore/divoom-tv/FlowerCore.Divoom.Tv}"
state_dir="${FC_DIVOOM_TV_STATE_DIR:-/var/lib/fc-divoom-tv}"
log_dir="${FC_DIVOOM_TV_LOG_DIR:-/var/log/fc-divoom-tv}"
presentation_mode="${FC_DIVOOM_TV_PRESENTATION_MODE:-PillarboxSquare}"
start_scene="${FC_DIVOOM_TV_START_SCENE:-bluejay-clock}"
reduced_motion="${FC_DIVOOM_TV_REDUCED_MOTION:-false}"

renderer_args=(
  "--target=hdmi"
  "--presentation-mode=${presentation_mode}"
  "--startup-scene=${start_scene}"
  "--reduced-motion=${reduced_motion}"
  "--state-dir=${state_dir}"
  "--log-dir=${log_dir}"
)

# Fallback path first: no compositor available, run the app on its own.
if ! command -v cage >/dev/null 2>&1; then
  echo "[$(date -Is)] cage not found; launching FlowerCore.Divoom.Tv directly" >&2
  exec "${app_bin}" "${renderer_args[@]}" "$@"
fi

exec cage -- "${app_bin}" "${renderer_args[@]}" "$@"

View File

@@ -0,0 +1,23 @@
#!/usr/bin/env bash
# Preflight for the Divoom TV renderer: create the local state/log directories,
# fail if the renderer binary is missing, and report (without failing) whether
# an HDMI connector and the cage compositor are present.
set -euo pipefail

app_bin="${FC_DIVOOM_TV_BIN:-/opt/flowercore/divoom-tv/FlowerCore.Divoom.Tv}"
state_dir="${FC_DIVOOM_TV_STATE_DIR:-/var/lib/fc-divoom-tv}"
log_dir="${FC_DIVOOM_TV_LOG_DIR:-/var/log/fc-divoom-tv}"

# Print one timestamped line; callers redirect to stderr where needed.
stamp() {
  printf '[%s] %s\n' "$(date -Is)" "$1"
}

mkdir -p "${state_dir}" "${log_dir}"

# Hard requirement: the renderer payload must be executable.
[[ -x "${app_bin}" ]] || {
  stamp "missing executable ${app_bin}" >&2
  exit 1
}

# Soft check: warn when no HDMI connector entry is listed yet.
if [[ -d /sys/class/drm ]]; then
  if ! find /sys/class/drm -maxdepth 1 -name 'card*-HDMI-A-*' -print -quit | grep -q .; then
    stamp "no HDMI connector visible yet; continuing so the app can wait for display" >&2
  fi
fi

# Informational: tell the journal which launch path will be taken.
if ! command -v cage >/dev/null 2>&1; then
  stamp "cage not installed; direct launch fallback will be used" >&2
else
  stamp "cage available for fullscreen Wayland launch"
fi

View File

@@ -0,0 +1,2 @@
# Start the HDMI hotplug responder when the DRM core reports a display change.
# The kernel emits hotplug "change" uevents on the card device (card0, card1,
# ...) with HOTPLUG=1, not on the per-connector card?-HDMI-A-? sysfs entries,
# so the rule matches the card. The 2s settle delay and the renderer restart
# happen in the responder script, not here. --no-block queues the start job and
# returns immediately so the udev worker is not held for the settle delay.
SUBSYSTEM=="drm", KERNEL=="card[0-9]*", ACTION=="change", ENV{HOTPLUG}=="1", RUN+="/usr/bin/systemctl --no-block start flowercore-divoom-tv-hdmi.service"

View File

@@ -0,0 +1,7 @@
# One-shot unit started by the 99-flowercore-divoom-tv-hdmi udev rule on a DRM
# change event. It has no [Install] section: it is only ever started on demand.
[Unit]
Description=FlowerCore Divoom TV HDMI hotplug responder
# Opt out of the implicit dependencies systemd adds to service units.
DefaultDependencies=no
[Service]
# oneshot: the unit counts as started only after the responder script exits.
Type=oneshot
# The script waits for the display to settle, then restarts the renderer.
ExecStart=/usr/local/bin/flowercore-divoom-tv-hdmi-respond.sh

View File

@@ -0,0 +1,40 @@
# systemd unit for the fullscreen FlowerCore.Divoom.Tv HDMI renderer.
[Unit]
Description=FlowerCore Divoom TV HDMI Renderer (Avalonia fullscreen)
Documentation=https://github.com/astoltz/FlowerCore.Notes/blob/master/docs/standards/divoom-tv-hdmi-multitarget-render-substrate.md
Wants=network-online.target
After=network-online.target systemd-user-sessions.service
# Skipped (not failed) until the renderer payload has been staged.
ConditionPathExists=/opt/flowercore/divoom-tv/FlowerCore.Divoom.Tv
# Start rate limiting is a [Unit] setting. systemd does not recognise
# StartLimitIntervalSec= inside [Service], so it was ignored there and the
# 5-starts-per-300s budget never took effect.
StartLimitIntervalSec=300s
StartLimitBurst=5

[Service]
Type=simple
User=fc-divoom-tv
Group=fc-divoom-tv
WorkingDirectory=/opt/flowercore/divoom-tv
# Leading "-" makes the env file optional; the launcher has built-in defaults.
EnvironmentFile=-/etc/flowercore/divoom-tv.env
Environment=DOTNET_CLI_TELEMETRY_OPTOUT=1
# Private runtime directory, created by systemd under /run for this service.
Environment=XDG_RUNTIME_DIR=/run/fc-divoom-tv
RuntimeDirectory=fc-divoom-tv
RuntimeDirectoryMode=0700
ExecStartPre=/usr/local/bin/flowercore-divoom-tv-prelaunch.sh
ExecStart=/usr/local/bin/flowercore-divoom-tv-launch.sh
Restart=always
RestartSec=10s
MemoryMax=2G
MemoryHigh=1500M
# Sandboxing: the file system is read-only for the renderer except the state,
# log, and runtime directories.
PrivateTmp=true
NoNewPrivileges=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=/var/lib/fc-divoom-tv /var/log/fc-divoom-tv /run/fc-divoom-tv
# The renderer takes over the first virtual terminal; output goes to the journal.
TTYPath=/dev/tty1
StandardInput=tty
StandardOutput=journal
StandardError=journal
TTYReset=yes
TTYVHangup=yes
TTYVTDisallocate=yes

[Install]
WantedBy=graphical.target

View File

@@ -0,0 +1,169 @@
# FlowerCore.Library.Web GitOps adoption manifest.
#
# Authored from the already-live fc-library resources on 2026-06-04.
# Keep the live image tag, Service ClusterIP, and PVC volumeName unchanged so
# ArgoCD adopts in place instead of replacing the workload or data volume.
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: library-web-data
namespace: fc-library
labels:
app.kubernetes.io/name: library-web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-library
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
storageClassName: longhorn
volumeMode: Filesystem
volumeName: pvc-2690bae2-4ee0-417a-b95f-50ec5c632b63
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: library-web
namespace: fc-library
labels:
app.kubernetes.io/name: library-web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-library
spec:
progressDeadlineSeconds: 600
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app.kubernetes.io/name: library-web
strategy:
type: Recreate
template:
metadata:
annotations:
prometheus.io/path: /metrics/prometheus
prometheus.io/port: "5000"
prometheus.io/scrape: "true"
labels:
app.kubernetes.io/name: library-web
app.kubernetes.io/part-of: flowercore
spec:
containers:
- envFrom:
- configMapRef:
name: library-web-config
image: localhost/fc-library-web:v20260602-library-owned-deploy-fix1
imagePullPolicy: Never
livenessProbe:
failureThreshold: 3
httpGet:
path: /health
port: 5000
scheme: HTTP
initialDelaySeconds: 30
periodSeconds: 30
successThreshold: 1
timeoutSeconds: 5
name: library-web
ports:
- containerPort: 5000
name: http
protocol: TCP
readinessProbe:
failureThreshold: 6
httpGet:
path: /health
port: 5000
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 5
resources: {}
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /data
name: data
dnsPolicy: ClusterFirst
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
terminationGracePeriodSeconds: 30
volumes:
- name: data
persistentVolumeClaim:
claimName: library-web-data
---
apiVersion: v1
kind: Service
metadata:
name: library-web
namespace: fc-library
labels:
app.kubernetes.io/name: library-web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-library
spec:
clusterIP: 10.43.179.63
clusterIPs:
- 10.43.179.63
internalTrafficPolicy: Cluster
ipFamilies:
- IPv4
ipFamilyPolicy: SingleStack
ports:
- name: http
port: 80
protocol: TCP
targetPort: 5000
selector:
app.kubernetes.io/name: library-web
sessionAffinity: None
type: ClusterIP
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: library-web-tls
namespace: fc-library
labels:
app.kubernetes.io/name: library-web-tls
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-library
spec:
dnsNames:
- library.iamworkin.lan
issuerRef:
kind: ClusterIssuer
name: step-ca-acme
secretName: library-web-tls
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: library-web
namespace: fc-library
labels:
app.kubernetes.io/name: library-web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-library
spec:
entryPoints:
- websecure
routes:
- kind: Rule
match: Host(`library.iamworkin.lan`)
services:
- name: library-web
port: 80
tls:
secretName: library-web-tls

View File

@@ -0,0 +1,170 @@
# FlowerCore.Retail.Web GitOps adoption manifest.
#
# Authored from the already-live fc-retail resources on 2026-06-04.
# Keep the live image tag, Service ClusterIP, and PVC volumeName unchanged so
# ArgoCD adopts in place instead of replacing the workload or data volume.
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: retail-web-data
namespace: fc-retail
labels:
app.kubernetes.io/name: retail-web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-retail
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
storageClassName: longhorn
volumeMode: Filesystem
volumeName: pvc-3d40b336-eab4-41b3-812c-d5e9413ce0ab
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: retail-web
namespace: fc-retail
labels:
app.kubernetes.io/name: retail-web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-retail
spec:
progressDeadlineSeconds: 600
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app.kubernetes.io/name: retail-web
strategy:
type: Recreate
template:
metadata:
annotations:
kubectl.kubernetes.io/restartedAt: "2026-06-02T01:34:08-05:00"
prometheus.io/path: /metrics/prometheus
prometheus.io/port: "5000"
prometheus.io/scrape: "true"
labels:
app.kubernetes.io/name: retail-web
app.kubernetes.io/part-of: flowercore
spec:
containers:
- envFrom:
- configMapRef:
name: retail-web-config
image: localhost/fc-retail-web:v20260602-retail-owned-deploy-fix5
imagePullPolicy: Never
livenessProbe:
failureThreshold: 3
httpGet:
path: /health
port: 5000
scheme: HTTP
initialDelaySeconds: 30
periodSeconds: 30
successThreshold: 1
timeoutSeconds: 5
name: retail-web
ports:
- containerPort: 5000
name: http
protocol: TCP
readinessProbe:
failureThreshold: 6
httpGet:
path: /health
port: 5000
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 5
resources: {}
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /data
name: data
dnsPolicy: ClusterFirst
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
terminationGracePeriodSeconds: 30
volumes:
- name: data
persistentVolumeClaim:
claimName: retail-web-data
---
apiVersion: v1
kind: Service
metadata:
name: retail-web
namespace: fc-retail
labels:
app.kubernetes.io/name: retail-web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-retail
spec:
clusterIP: 10.43.239.8
clusterIPs:
- 10.43.239.8
internalTrafficPolicy: Cluster
ipFamilies:
- IPv4
ipFamilyPolicy: SingleStack
ports:
- name: http
port: 80
protocol: TCP
targetPort: 5000
selector:
app.kubernetes.io/name: retail-web
sessionAffinity: None
type: ClusterIP
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: retail-web-tls
namespace: fc-retail
labels:
app.kubernetes.io/name: retail-web-tls
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-retail
spec:
dnsNames:
- retail.iamworkin.lan
issuerRef:
kind: ClusterIssuer
name: step-ca-acme
secretName: retail-web-tls
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: retail-web
namespace: fc-retail
labels:
app.kubernetes.io/name: retail-web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
argocd.argoproj.io/instance: infra-fc-retail
spec:
entryPoints:
- websecure
routes:
- kind: Rule
match: Host(`retail.iamworkin.lan`)
services:
- name: retail-web
port: 80
tls:
secretName: retail-web-tls

View File

@@ -532,7 +532,7 @@ spec:
fsGroupChangePolicy: OnRootMismatch fsGroupChangePolicy: OnRootMismatch
containers: containers:
- name: web - name: web
image: localhost/fc-ttsreader-web:v20260518-sprint36-demo-finish-b132cbf image: localhost/fc-ttsreader-web:v20260603-s54cx14-pr29-schema
imagePullPolicy: Never imagePullPolicy: Never
ports: ports:
- containerPort: 5217 - containerPort: 5217
@@ -554,6 +554,8 @@ spec:
value: "/data/chapter-context.db" value: "/data/chapter-context.db"
- name: TtsReader__Jobs__Root - name: TtsReader__Jobs__Root
value: "/data/jobs" value: "/data/jobs"
- name: TtsReader__Export__LocalCasRoot
value: "/data/bundles/cas"
- name: TtsReader__Piper__Host - name: TtsReader__Piper__Host
value: "10.0.57.17" value: "10.0.57.17"
- name: TtsReader__Piper__Port - name: TtsReader__Piper__Port

View File

@@ -58,7 +58,7 @@ spec:
nodeName: rke2-server nodeName: rke2-server
containers: containers:
- name: web - name: web
image: localhost/fc-updater-web:v20260509-4162dca-authgate image: localhost/fc-updater-web:v202605310029-7974fc4
imagePullPolicy: Never imagePullPolicy: Never
ports: ports:
- containerPort: 8080 - containerPort: 8080
@@ -88,6 +88,8 @@ spec:
value: Faith AI Mike Edition value: Faith AI Mike Edition
- name: FlowerCore__Updater__PublicShares__Links__0__Description - name: FlowerCore__Updater__PublicShares__Links__0__Description
value: Private release link for Mike's Faith AI bundle. value: Private release link for Mike's Faith AI bundle.
- name: FlowerCore__Audit__Sinks__Loki__Enabled
value: "false"
- name: FlowerCore__Updater__Auth__Bootstrap__Enabled - name: FlowerCore__Updater__Auth__Bootstrap__Enabled
value: "true" value: "true"
- name: FlowerCore__Updater__Auth__Bootstrap__Username - name: FlowerCore__Updater__Auth__Bootstrap__Username

2
apps/github-runner/.gitattributes vendored Normal file
View File

@@ -0,0 +1,2 @@
*.sh text eol=lf
Dockerfile text eol=lf

View File

@@ -0,0 +1,54 @@
FROM myoung34/github-runner:latest
ARG RUBY_VERSION=3.3.11
ARG RUBY_MINOR=3.3
ARG RUBY_BUILD_VERSION=v20260326
ARG RUNNER_UID=1001
ARG RUNNER_GID=1001
ENV RUNNER_TOOL_CACHE=/home/runner/_tool
ENV RUNNER_RUBY_TOOLCACHE=/opt/runner-toolcache
ENV PATH="/home/runner/_tool/Ruby/${RUBY_MINOR}/x64/bin:/opt/runner-toolcache/Ruby/${RUBY_MINOR}/x64/bin:${PATH}"
USER root
# Bake the IAmWorkin step-ca root CA into the system trust store. Without
# this, .NET HttpClient calls from CI tests against *.iamworkin.lan
# (e.g. https://selenium.iamworkin.lan/session) fail with `PartialChain`
# because the runner image's default Ubuntu trust bundle doesn't include
# our internal Root CA. update-ca-certificates regenerates
# /etc/ssl/certs/ca-certificates.crt, which OpenSSL + .NET on Linux read
# automatically — no SSL_CERT_FILE env var needed.
COPY step-ca-root.crt /usr/local/share/ca-certificates/iamworkin-step-ca-root.crt
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
autoconf \
bison \
build-essential \
ca-certificates \
curl \
libdb-dev \
libffi-dev \
libgdbm-dev \
libgmp-dev \
libncurses-dev \
libreadline-dev \
libssl-dev \
libyaml-dev \
patch \
pkg-config \
uuid-dev \
zlib1g-dev \
&& update-ca-certificates \
&& curl -fsSL "https://github.com/rbenv/ruby-build/archive/refs/tags/${RUBY_BUILD_VERSION}.tar.gz" -o /tmp/ruby-build.tar.gz \
&& mkdir -p /tmp/ruby-build \
&& tar -xzf /tmp/ruby-build.tar.gz --strip-components=1 -C /tmp/ruby-build \
&& /tmp/ruby-build/install.sh \
&& rm -rf /tmp/ruby-build /tmp/ruby-build.tar.gz /var/lib/apt/lists/*
COPY install-ruby-toolcache.sh /usr/local/bin/install-ruby-toolcache.sh
RUN chmod +x /usr/local/bin/install-ruby-toolcache.sh \
&& RUBY_VERSION="${RUBY_VERSION}" RUBY_MINOR="${RUBY_MINOR}" TOOLCACHE_ROOT="${RUNNER_RUBY_TOOLCACHE}" RUNNER_UID="${RUNNER_UID}" RUNNER_GID="${RUNNER_GID}" /usr/local/bin/install-ruby-toolcache.sh \
&& ruby -v

View File

@@ -7,12 +7,17 @@ Deployments with `kubectl`; update this manifest and let ArgoCD reconcile.
All repo-scoped Linux runners use: All repo-scoped Linux runners use:
- `localhost/fc-github-runner:v20260525-ruby3.3.11-stepca`, derived from
`myoung34/github-runner:latest`
- `ACCESS_TOKEN` from the `github-runner-token` Secret - `ACCESS_TOKEN` from the `github-runner-token` Secret
- `RUN_AS_ROOT=false` - `RUN_AS_ROOT=false`
- `EPHEMERAL=true` - `EPHEMERAL=true`
- `LABELS=self-hosted,linux,fc-build-linux` - `LABELS=self-hosted,linux,fc-build-linux`
- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and - writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
Actions tool cache Actions tool cache
- Ruby 3.3.11 seeded into `/home/runner/_tool/Ruby/3.3/x64` from the baked
`/opt/runner-toolcache` copy so `ruby/setup-ruby@v1` can discover it on
self-hosted `ubuntu-20.04-x64` runners
`github-runner` for `FlowerCore.Common` is single-replica because it retains the `github-runner` for `FlowerCore.Common` is single-replica because it retains the
original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
@@ -28,6 +33,46 @@ Sprint 32 final long-tail wave adds 16 two-replica Deployments:
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and `FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
`FlowerCore.MenuBoard`. `FlowerCore.MenuBoard`.
## Image Build
Ruby is baked with a pinned `ruby-build` release and Ruby patch version. The pod
still mounts an `emptyDir` over `/home/runner`, so the `setup-runner-home` init
container copies the baked toolcache from `/opt/runner-toolcache/Ruby` into
`/home/runner/_tool/Ruby` before the runner container starts.
The IAmWorkin step-ca root CA is also baked into the system trust store
(`/usr/local/share/ca-certificates/iamworkin-step-ca-root.crt`, registered by
`update-ca-certificates`). Without it, .NET HttpClient calls from CI tests
against `*.iamworkin.lan` (e.g. `https://selenium.iamworkin.lan/session`)
fail with `PartialChain`. To refresh the bundled cert when the root rotates,
re-extract from the cluster and overwrite `step-ca-root.crt`:
```bash
kubectl get secret -n cert-manager step-ca-root \
-o jsonpath='{.data.ca\.crt}' | base64 -d > step-ca-root.crt
```
```bash
cd apps/github-runner
podman build -t localhost/fc-github-runner:v20260525-ruby3.3.11-stepca .
podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca ruby -v
podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
test -f /opt/runner-toolcache/Ruby/3.3/x64.complete
podman save localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
-o fc-github-runner-v20260525-ruby3.3.11-stepca.tar
```
Import the saved image on every schedulable RKE2 node before ArgoCD rolls the
Deployments:
```bash
for node in rke2-server rke2-agent1 rke2-agent2; do
scp fc-github-runner-v20260525-ruby3.3.11-stepca.tar "$node:/tmp/"
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca || true'
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-github-runner-v20260525-ruby3.3.11-stepca.tar'
done
```
## Post-Merge Proof ## Post-Merge Proof
After the PR is merged and ArgoCD syncs, verify the runner fleet: After the PR is merged and ArgoCD syncs, verify the runner fleet:
@@ -36,6 +81,14 @@ After the PR is merged and ArgoCD syncs, verify the runner fleet:
kubectl -n github-runner get deploy,pods,pvc kubectl -n github-runner get deploy,pods,pvc
``` ```
Verify the Ruby toolcache in a fresh pod:
```bash
kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- ruby -v
kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- sh -c \
'echo "$RUNNER_TOOL_CACHE" && test -f "$RUNNER_TOOL_CACHE/Ruby/3.3/x64.complete"'
```
Verify GitHub registration for the repo-scoped runners: Verify GitHub registration for the repo-scoped runners:
```bash ```bash
@@ -69,6 +122,10 @@ from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that - `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
`DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are `DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
present on the runner pod. present on the runner pod.
- `ruby/setup-ruby@v1` says self-hosted runners must install Ruby in
`$RUNNER_TOOL_CACHE`: check that the init container copied
`/opt/runner-toolcache/Ruby` into `/home/runner/_tool/Ruby` and that
`/home/runner/_tool/Ruby/3.3/x64.complete` exists.
- `404` during runner registration: the fine-grained PAT is valid but missing - `404` during runner registration: the fine-grained PAT is valid but missing
repository access for that repo. Add the repo to the PAT access list; the PAT repository access for that repo. Add the repo to the PAT access list; the PAT
value does not change. value does not change.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Builds a pinned Ruby with ruby-build into an Actions-style tool cache layout:
#   <root>/Ruby/<version>/x64            the install prefix
#   <root>/Ruby/<version>/x64.complete   marker file next to the prefix
#   <root>/Ruby/<minor> -> <version>     convenience symlink
# and hands the tree to the runner account.
set -euo pipefail

# Fill in defaults for anything the caller left unset or empty.
: "${RUBY_VERSION:=3.3.11}"
: "${RUBY_MINOR:=3.3}"
: "${TOOLCACHE_ROOT:=/opt/runner-toolcache}"
: "${RUNNER_UID:=1001}"
: "${RUNNER_GID:=1001}"

ruby_root="${TOOLCACHE_ROOT}/Ruby"
RUBY_PREFIX="${ruby_root}/${RUBY_VERSION}/x64"
configure_opts="${RUBY_CONFIGURE_OPTS:---disable-install-doc --disable-yjit}"

mkdir -p "${ruby_root}"

# Configure options are passed to ruby-build only, not exported to later commands.
RUBY_CONFIGURE_OPTS="${configure_opts}" ruby-build "${RUBY_VERSION}" "${RUBY_PREFIX}"

touch "${ruby_root}/${RUBY_VERSION}/x64.complete"
ln -sfn "${RUBY_VERSION}" "${ruby_root}/${RUBY_MINOR}"

# Smoke-test the freshly built interpreter before handing the tree over.
"${RUBY_PREFIX}/bin/ruby" -v

chown -R "${RUNNER_UID}:${RUNNER_GID}" "${TOOLCACHE_ROOT}"
chmod -R a+rX "${TOOLCACHE_ROOT}"

View File

@@ -0,0 +1,12 @@
-----BEGIN CERTIFICATE-----
MIIBxDCCAWqgAwIBAgIRAPY357G6ow6zMAL5+4bS2kkwCgYIKoZIzj0EAwIwQDEa
MBgGA1UEChMRSUFtV29ya2luIEFDTUUgQ0ExIjAgBgNVBAMTGUlBbVdvcmtpbiBB
Q01FIENBIFJvb3QgQ0EwHhcNMjYwMzA4MTgwNzExWhcNMzYwMzA1MTgwNzExWjBA
MRowGAYDVQQKExFJQW1Xb3JraW4gQUNNRSBDQTEiMCAGA1UEAxMZSUFtV29ya2lu
IEFDTUUgQ0EgUm9vdCBDQTBZMBMGByqGSM49AgEGCCqGSM49AwEHA0IABJ2n04X1
JZo5Zdq/i1Idv8+fqwZyAzBh7whbqj0SWsJL8UWRabCMqYCs7+dXO0xRSzqkwFDL
x+vooOai8RgRNhajRTBDMA4GA1UdDwEB/wQEAwIBBjASBgNVHRMBAf8ECDAGAQH/
AgEBMB0GA1UdDgQWBBRnuPPQR6iM/H6vOluiU3Sygayz8jAKBggqhkjOPQQDAgNI
ADBFAiEArQK9dYPGmAZsdYnjziuFVVE5NKZUcceYvGfGC+tLXUsCIAudF2zJrCRq
3mK50ZZET/fwTkJwiEF4824mjP8p1CKM
-----END CERTIFICATE-----

View File

@@ -46,7 +46,7 @@ spec:
spec: spec:
containers: containers:
- name: intranet-web - name: intranet-web
image: localhost/fc-intranet-web:v20260508-brochure-w1 image: localhost/fc-intranet-web:v20260531-ttsreader-bridge
imagePullPolicy: Never imagePullPolicy: Never
ports: ports:
- containerPort: 5300 - containerPort: 5300

View File

@@ -102,7 +102,7 @@ spec:
- name: web - name: web
# Placeholder tag — bump to the image you built + imported to ALL # Placeholder tag — bump to the image you built + imported to ALL
# RKE2 nodes via scripts/deploy-knowledge.sh before applying. # RKE2 nodes via scripts/deploy-knowledge.sh before applying.
image: localhost/fc-knowledge-web:v20260429232635 image: localhost/fc-knowledge-web:v20260603-oidc-authentik-auditfix
imagePullPolicy: Never imagePullPolicy: Never
command: command:
- /bin/sh - /bin/sh
@@ -123,6 +123,25 @@ spec:
value: "Production" value: "Production"
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT - name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
value: "false" value: "false"
# AuthentiK/OIDC is wired but not enforced until the
# knowledge-oidc-client Secret is provisioned and
# FlowerCore__Auth__Enabled is flipped to true.
- name: FlowerCore__Auth__Enabled
value: "false"
- name: FlowerCore__Auth__Oidc__Enabled
value: "true"
- name: FlowerCore__Auth__Oidc__Authority
value: "https://id.iamworkin.lan/application/o/knowledge/"
- name: FlowerCore__Auth__Oidc__Audience
value: "knowledge"
- name: FlowerCore__Auth__Oidc__ClientId
value: "knowledge"
- name: FlowerCore__Auth__Oidc__ClientSecret
valueFrom:
secretKeyRef:
name: knowledge-oidc-client
key: client_secret
optional: true
# Vector-store directory + embedding model + edition profile dir. # Vector-store directory + embedding model + edition profile dir.
# Profile JSON is baked into the image at /home/app/editions via the # Profile JSON is baked into the image at /home/app/editions via the
# csproj Content-link from FlowerCore.Common/editions/. # csproj Content-link from FlowerCore.Common/editions/.
@@ -134,6 +153,8 @@ spec:
value: "5" value: "5"
- name: Knowledge__MaxLimit - name: Knowledge__MaxLimit
value: "50" value: "50"
- name: Knowledge__Federation__DatabasePath
value: "/data/vector-stores/knowledge-federation.db"
- name: FlowerCore__Editions__ProfileDirectory - name: FlowerCore__Editions__ProfileDirectory
value: "/home/app/editions" value: "/home/app/editions"
# Embed via edge1 Pi 5 + AI HAT+ (10.0.57.17:11434). Cluster # Embed via edge1 Pi 5 + AI HAT+ (10.0.57.17:11434). Cluster

View File

@@ -25,7 +25,7 @@ metadata:
role: github-actions-runner role: github-actions-runner
flowercore.io/managed-by: bluejay-infra flowercore.io/managed-by: bluejay-infra
spec: spec:
runStrategy: Always runStrategy: Halted
template: template:
metadata: metadata:
labels: labels:

View File

@@ -207,20 +207,13 @@ spec:
- port: 993 - port: 993
targetPort: 993 targetPort: 993
name: imaps name: imaps
--- # --- mail-tls Certificate REMOVED 2026-06-01 ---
# TLS Certificate via cert-manager # mail-tls is now managed OUTSIDE cert-manager: issued from step-ca's JWK 'admin'
apiVersion: cert-manager.io/v1 # provisioner and auto-renewed by a systemd timer on noc1 (step ca renew), which
kind: Certificate # writes the mail-tls secret directly. step-ca-acme only has an HTTP-01 (Traefik)
metadata: # solver, but mail.iamworkin.lan must resolve to the dedicated MetalLB IP 10.0.56.202
name: mail-tls # (SMTP/IMAP), so HTTP-01 cannot validate. Do NOT re-add a cert-manager Certificate
namespace: mail # here unless a DNS-01 solver is deployed for step-ca-acme.
spec:
secretName: mail-tls
issuerRef:
name: step-ca-acme
kind: ClusterIssuer
dnsNames:
- mail.iamworkin.lan
--- ---
# Traefik IngressRoute - Webmail placeholder # Traefik IngressRoute - Webmail placeholder
apiVersion: traefik.io/v1alpha1 apiVersion: traefik.io/v1alpha1

View File

@@ -223,7 +223,7 @@ data:
service: "pimanager" service: "pimanager"
vlan: "home" vlan: "home"
device: "pi4-ezconnect" device: "pi4-ezconnect"
- targets: ["10.0.58.113:5100"] - targets: ["10.0.58.113:5200"]
labels: labels:
instance: "pirelay" instance: "pirelay"
service: "pimanager" service: "pimanager"
@@ -280,13 +280,14 @@ data:
printer_model: "NuPrint 210" printer_model: "NuPrint 210"
# Print.Web health (Blazor app on edge2:5200) # Print.Web health (Blazor app on edge2:5200)
# Target `/health` (anonymous) — root path requires API key auth and returns 401.
- job_name: "probe-printweb" - job_name: "probe-printweb"
metrics_path: /probe metrics_path: /probe
params: params:
module: [http_2xx] module: [http_2xx]
scrape_interval: 30s scrape_interval: 30s
static_configs: static_configs:
- targets: ["http://10.0.57.16:5200/"] - targets: ["http://10.0.57.16:5200/health"]
labels: labels:
instance: "print-web" instance: "print-web"
service: "print-web" service: "print-web"
@@ -478,15 +479,17 @@ data:
- "https://gitea.iamworkin.lan/" - "https://gitea.iamworkin.lan/"
- "https://argocd.iamworkin.lan/" - "https://argocd.iamworkin.lan/"
- "https://intranet.iamworkin.lan/" - "https://intranet.iamworkin.lan/"
- "https://signage.iamworkin.lan/" - "https://signage.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
- "https://signalcontrol.iamworkin.lan/health" # FlowerCore.SignalControl explicit health route
- "https://kiosk.iamworkin.lan/" - "https://kiosk.iamworkin.lan/"
- "https://media.iamworkin.lan/" - "https://media.iamworkin.lan/"
- "https://mysql.iamworkin.lan/" - "https://mysql.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
- "https://php.iamworkin.lan/" - "https://php.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
- "https://dns.iamworkin.lan/"
- "https://zabbix.iamworkin.lan/" - "https://zabbix.iamworkin.lan/"
- "https://flowercore.iamworkin.lan/healthz"
- "https://desktop.iamworkin.lan/" - "https://desktop.iamworkin.lan/"
- "https://print.iamworkin.lan/" - "https://print.iamworkin.lan/"
- "https://dns.iamworkin.lan/"
- "https://chat.iamworkin.lan/" - "https://chat.iamworkin.lan/"
- "https://dist.iamworkin.lan/" - "https://dist.iamworkin.lan/"
- "https://dms.iamworkin.lan/" - "https://dms.iamworkin.lan/"
@@ -495,9 +498,15 @@ data:
- "https://presentations.iamworkin.lan/" - "https://presentations.iamworkin.lan/"
- "https://retail.iamworkin.lan/" - "https://retail.iamworkin.lan/"
- "https://ttsreader.iamworkin.lan/" - "https://ttsreader.iamworkin.lan/"
- "https://updates.iamworkin.lan/api/v1/manifests/_schema"
# Explicit healthcheck paths # Explicit healthcheck paths
- "https://fc-llm-bridge.iamworkin.lan/healthz" - "https://fc-llm-bridge.iamworkin.lan/healthz"
- "https://acme.iamworkin.lan/health" - "https://acme.iamworkin.lan/health"
- "https://replay.iamworkin.lan/healthz"
- "https://updatecenter-internal.iamworkin.lan/api/v1/manifests/_schema"
- "https://worldbuilder.iamworkin.lan/healthz"
# Coverage gaps logged Q-MR-129/Q-MR-130: devices.iamworkin.lan
# returns 503 and e2e-test-pma/wpdemo only return 404.
# NOTE: services intentionally NOT in this probe surface # NOTE: services intentionally NOT in this probe surface
# - grafana.iamworkin.lan: every endpoint (incl. /api/health # - grafana.iamworkin.lan: every endpoint (incl. /api/health
# and /login) returns 401 behind Traefik basic-auth. # and /login) returns 401 behind Traefik basic-auth.
@@ -906,11 +915,14 @@ data:
# for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min # for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min
# of idle and SNMP times out, so 5m for: would page nightly. A # of idle and SNMP times out, so 5m for: would page nightly. A
# genuine printer outage (jam, disconnected) lasts well over 30m. # genuine printer outage (jam, disconnected) lasts well over 30m.
# Use a range-window expression: instant up{} can go stale/absent
# after repeated snmp-exporter 500s.
- alert: EpsonPrinterDown - alert: EpsonPrinterDown
expr: up{job="snmp-printer"} == 0 expr: (max_over_time(up{job="snmp-printer"}[35m]) == bool 0) == 1 and (hour() >= 13 or hour() < 1)
for: 30m for: 30m
labels: labels:
severity: warning severity: warning
alert_channel: irc
annotations: annotations:
summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)" summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)"
@@ -1019,7 +1031,9 @@ data:
- name: kubernetes-state - name: kubernetes-state
rules: rules:
- alert: KubeContainerRestartingFrequently - alert: KubeContainerRestartingFrequently
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 # Exclude github-runner: ephemeral runners register, run one job,
# exit cleanly, then restart by design.
expr: increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[1h]) > 5
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@@ -1028,7 +1042,9 @@ data:
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason." description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
- alert: KubeContainerCrashLooping - alert: KubeContainerCrashLooping
expr: increase(kube_pod_container_status_restarts_total[15m]) > 3 # Exclude github-runner: ephemeral runners register, run one job,
# exit cleanly, then restart by design.
expr: increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[15m]) > 3
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
@@ -1056,7 +1072,8 @@ data:
description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan." description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
- alert: KubeDeploymentReplicasMismatch - alert: KubeDeploymentReplicasMismatch
expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available # Exclude github-runner: ephemeral runner deployments flap 0/1 between jobs by design.
expr: kube_deployment_spec_replicas{namespace!="github-runner"} != kube_deployment_status_replicas_available{namespace!="github-runner"}
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@@ -3635,6 +3652,38 @@ data:
relativeTimeRange: {from: 120, to: 0} relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__ datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C} model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
# Grafana-managed mirror of the Prometheus EpsonPrinterDown rule.
#
# Query A returns a sample only when every scrape of the snmp-printer job in
# the last 35 minutes failed (max_over_time(up) == 0) AND the current hour is
# inside the 13:00-00:59 window. hour() is evaluated in UTC.
#
# `and on()` is required: hour() yields a single sample with NO labels, while
# the left-hand side keeps the target's job/instance labels. A plain `and`
# only keeps left-hand samples whose label set exactly matches a right-hand
# sample, so without `on()` the expression is always empty and the alert can
# never fire.
#
# B reduces A to its last value and C fires when that value is > 0. An empty
# result (printer healthy, or outside the window) is NoData, which
# noDataState maps to OK.
- orgId: 1
  name: SNMP Devices
  folder: Infrastructure Alerts
  interval: 1m
  rules:
    - uid: epson-printer-down-stale-window
      title: EpsonPrinterDown
      condition: C
      for: 30m
      noDataState: OK
      execErrState: OK
      annotations:
        summary: Epson ET-3750 SNMP unreachable
        description: The Epson ET-3750 snmp-printer target has reported only failed scrapes for at least 35 minutes.
        runbook: "1. Check if printer is intentionally powered off 2. If printing needed: press power button on printer 3. Ping 10.0.58.107 after wake-up 4. Check WiFi on printer LCD if still unreachable"
      labels:
        severity: info
        service: printer
        alert_channel: irc
      data:
        - refId: A
          # 2100s = 35m, matching the max_over_time range in the query.
          relativeTimeRange: {from: 2100, to: 0}
          datasourceUid: prometheus
          model: {expr: '(max_over_time(up{job="snmp-printer"}[35m]) == bool 0) == 1 and on() (hour() >= 13 or hour() < 1)', instant: true, refId: A}
        - refId: B
          relativeTimeRange: {from: 2100, to: 0}
          datasourceUid: __expr__
          model: {type: reduce, expression: A, reducer: last, refId: B}
        - refId: C
          relativeTimeRange: {from: 2100, to: 0}
          datasourceUid: __expr__
          model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
- orgId: 1 - orgId: 1
name: CI Runners name: CI Runners
folder: CI Alerts folder: CI Alerts

View File

@@ -24,7 +24,16 @@
# (10.0.57.16:5200), public internet 80/443 (excluding RFC1918), and # (10.0.57.16:5200), public internet 80/443 (excluding RFC1918), and
# fc-signage:5190 for the signage AAT lane. # fc-signage:5190 for the signage AAT lane.
# - Ingress: Traefik (4444 + 8089 ACME-solver-style), intra-pod, # - Ingress: Traefik (4444 + 8089 ACME-solver-style), intra-pod,
# telephony / gitea / fc-system / fc-signage namespaces on 4444. # telephony / gitea / fc-system / fc-signage / github-runner namespaces
# on 4444.
#
# 2026-05-25: added github-runner ingress on 4444 so CI jobs running in
# self-hosted runner pods (e.g. FlowerCore.Print.Web `help-screenshots`)
# can reach the grid. Without this allow, the session POST to
# `selenium-hub.selenium.svc.cluster.local:4444` was DNAT'd to the hub
# pod IP and then dropped at the Calico ingress hook — Selenium UI showed
# 0/4 sessions while the .NET HTTP client timed out at 60s. Same family
# as `feedback_netpol_dnat_backend_port`, wrong-source-namespace flavor.
apiVersion: networking.k8s.io/v1 apiVersion: networking.k8s.io/v1
kind: NetworkPolicy kind: NetworkPolicy
metadata: metadata:
@@ -203,6 +212,13 @@ spec:
ports: ports:
- port: 4444 - port: 4444
protocol: TCP protocol: TCP
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: github-runner
ports:
- port: 4444
protocol: TCP
podSelector: {} podSelector: {}
policyTypes: policyTypes:
- Ingress - Ingress

View File

@@ -0,0 +1,427 @@
# Selenium Grid 4 — RKE2 deployment
#
# Hub + chrome + firefox + edge browser nodes serving fleet-wide AAT runs from
# the GitHub Actions self-hosted runners. ArgoCD owns this namespace from
# 2026-05-25 (`infra-selenium` Application; previously these resources were
# orphan kubectl-applied since 2026-03-15).
#
# Endpoints:
# - Internal cluster: http://selenium-hub.selenium.svc.cluster.local:4444
# - LAN LoadBalancer (MetalLB): http://10.0.56.208:4444
# - Traefik public: https://selenium.iamworkin.lan
#
# Browser maxSessions:
# - chrome 2 (bumped from 1 on 2026-05-25 morning-routine — AAT-heavy
# Print.Web help-screenshots was the global bottleneck;
# see commit history for ops/runner-replica-rightsize)
# - firefox 1
# - edge 1
#
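# Video recording is captured by the chrome pod's `video` sidecar into the
# `selenium-videos` emptyDir declared in this manifest (pod-local, 5Gi cap).
# NOTE(review): no NFS volume is mounted in this manifest — confirm how
# screenshots + video recordings reach NFS.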
# See: CLAUDE.md "Selenium Grid & Visual AAT Testing" + bluejay-infra ADR notes.
---
# In-cluster ClusterIP Service for the Selenium hub. Exposes the web/API port
# (4444) plus the event-bus publish (4442) and subscribe (4443) ports of pods
# labelled app=selenium-hub. The browser-node Deployments in this manifest
# address the event bus through this Service name
# (SE_EVENT_BUS_HOST=selenium-hub), and the IngressRoute routes to port 4444.
apiVersion: v1
kind: Service
metadata:
labels:
app: selenium-hub
app.kubernetes.io/name: selenium-hub
app.kubernetes.io/part-of: selenium-grid
name: selenium-hub
namespace: selenium
spec:
ports:
# WebDriver / Grid API and UI.
- name: web
port: 4444
targetPort: 4444
# Event bus: publish side.
- name: publish
port: 4442
targetPort: 4442
# Event bus: subscribe side.
- name: subscribe
port: 4443
targetPort: 4443
selector:
app: selenium-hub
type: ClusterIP
---
# LAN-facing LoadBalancer Service for the hub, requesting 10.0.56.208 from
# MetalLB. Selects the same app=selenium-hub pods as the ClusterIP Service and
# exposes the same three ports (4444 web, 4442 publish, 4443 subscribe).
apiVersion: v1
kind: Service
metadata:
annotations:
# NOTE(review): this annotation looks like allocation status written back by
# the MetalLB controller rather than operator input — confirm it belongs in
# the Git-managed manifest.
metallb.io/ip-allocated-from-pool: bluejay-pool
# Requests the fixed LAN address listed in the file header.
metallb.universe.tf/loadBalancerIPs: 10.0.56.208
labels:
app: selenium-hub
component: external-access
name: selenium-hub-external
namespace: selenium
spec:
# NOTE(review): clusterIP/clusterIPs, healthCheckNodePort and the nodePort
# values below are pinned to specific numbers, presumably carried over from
# the pre-existing live object when it was adopted into GitOps. Confirm the
# pinning is intentional; these are normally allocated by the API server.
clusterIP: 10.43.90.147
clusterIPs:
- 10.43.90.147
externalTrafficPolicy: Local
healthCheckNodePort: 32213
ports:
- name: web
nodePort: 32411
port: 4444
targetPort: 4444
- name: publish
nodePort: 32068
port: 4442
targetPort: 4442
- name: subscribe
nodePort: 31000
port: 4443
targetPort: 4443
selector:
app: selenium-hub
type: LoadBalancer
---
# Selenium Grid hub: a single replica of selenium/hub:4.27.0 listening on
# 4444 (web), 4442 (event-bus publish) and 4443 (event-bus subscribe).
# Liveness and readiness both poll /wd/hub/status on 4444.
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: selenium-hub
app.kubernetes.io/name: selenium-hub
app.kubernetes.io/part-of: selenium-grid
name: selenium-hub
namespace: selenium
spec:
replicas: 1
selector:
matchLabels:
app: selenium-hub
template:
metadata:
labels:
app: selenium-hub
app.kubernetes.io/name: selenium-hub
app.kubernetes.io/part-of: selenium-grid
spec:
containers:
- env:
# NOTE(review): SE_NODE_SESSION_TIMEOUT is also set on every browser node
# below; confirm the hub image actually consumes it.
- name: SE_NODE_SESSION_TIMEOUT
value: '300'
- name: SE_SESSION_REQUEST_TIMEOUT
value: '300'
- name: SE_SESSION_RETRY_INTERVAL
value: '5'
# JVM heap cap (512m); the container memory limit below is 1536Mi.
- name: JAVA_OPTS
value: -Xmx512m
image: selenium/hub:4.27.0
livenessProbe:
httpGet:
path: /wd/hub/status
port: 4444
initialDelaySeconds: 30
periodSeconds: 15
timeoutSeconds: 5
name: selenium-hub
ports:
- containerPort: 4444
name: web
- containerPort: 4442
name: publish
- containerPort: 4443
name: subscribe
readinessProbe:
httpGet:
path: /wd/hub/status
port: 4444
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 5
# Hub baseline working set ~766Mi on 2026-05-25 (75% of prior 1Gi
# limit). Bump to 1.5Gi / 1Gi to keep ~50% headroom; matches the
# stampede-buffer pattern documented for multus
# (feedback_k8s_cni_multus_sizing). CPU left alone — observed 54m
# against a 500m limit, no contention.
resources:
limits:
cpu: 500m
memory: 1536Mi
requests:
cpu: 250m
memory: 1Gi
---
# Chrome browser node plus a video-recording sidecar.
#
# The node registers with the hub over the event bus (selenium-hub:4442/4443)
# and serves WebDriver sessions on 5555. The `video` sidecar records the
# node's display (DISPLAY_CONTAINER_NAME=localhost, i.e. the same pod) into
# the pod-local `selenium-videos` emptyDir.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: selenium-node-chrome
    app.kubernetes.io/name: selenium-node-chrome
    app.kubernetes.io/part-of: selenium-grid
  name: selenium-node-chrome
  namespace: selenium
spec:
  replicas: 1
  selector:
    matchLabels:
      app: selenium-node-chrome
  template:
    metadata:
      labels:
        app: selenium-node-chrome
        app.kubernetes.io/name: selenium-node-chrome
        app.kubernetes.io/part-of: selenium-grid
    spec:
      containers:
        - env:
            - name: SE_EVENT_BUS_HOST
              value: selenium-hub
            - name: SE_EVENT_BUS_PUBLISH_PORT
              value: '4442'
            - name: SE_EVENT_BUS_SUBSCRIBE_PORT
              value: '4443'
            - name: SE_NODE_MAX_SESSIONS
              value: '2'
            # Must be 'true' for max sessions = 2 to take effect: without the
            # override, Grid caps a node's sessions at the number of
            # processors the JVM sees, and this container is limited to 1 CPU.
            # Mirrors the firefox and edge nodes, which already set 'true'.
            - name: SE_NODE_OVERRIDE_MAX_SESSIONS
              value: 'true'
            - name: SE_VNC_NO_PASSWORD
              value: '1'
            - name: SE_SCREEN_WIDTH
              value: '1920'
            - name: SE_SCREEN_HEIGHT
              value: '1080'
            - name: SE_NODE_SESSION_TIMEOUT
              value: '300'
          image: selenium/node-chrome:4.27.0
          livenessProbe:
            httpGet:
              path: /status
              port: 5555
            initialDelaySeconds: 30
            periodSeconds: 15
          name: selenium-chrome
          ports:
            - containerPort: 5555
              name: node
          readinessProbe:
            httpGet:
              path: /status
              port: 5555
            initialDelaySeconds: 15
            periodSeconds: 5
          # Chromium-based browser node. Bumped from 1Gi -> 2Gi (req 512Mi
          # -> 1Gi) on 2026-05-25 — Edge had 51 OOMKills in 5d on the
          # original 1Gi cap (~1 OOM every 2.4h), and Chrome at maxSessions=2
          # was running 684Mi idle on the same cap. Matches the Firefox node's
          # tested-stable 2Gi limit. CPU unchanged.
          resources:
            limits:
              cpu: '1'
              memory: 2Gi
            requests:
              cpu: 500m
              memory: 1Gi
          volumeMounts:
            # Memory-backed /dev/shm for the browser.
            - mountPath: /dev/shm
              name: dshm
        - env:
            - name: DISPLAY_CONTAINER_NAME
              value: localhost
            - name: SE_SCREEN_WIDTH
              value: '1920'
            - name: SE_SCREEN_HEIGHT
              value: '1080'
            - name: SE_VIDEO_FILE_NAME
              value: auto
            - name: SE_VIDEO_UPLOAD_ENABLED
              value: 'false'
          image: selenium/video:ffmpeg-7.1-20250101
          name: video
          resources:
            limits:
              cpu: 500m
              memory: 768Mi
            requests:
              cpu: 250m
              memory: 384Mi
          volumeMounts:
            - mountPath: /videos
              name: selenium-videos
      volumes:
        - emptyDir:
            medium: Memory
            sizeLimit: 2Gi
          name: dshm
        # Pod-local scratch space for recordings; contents do not outlive the pod.
        - emptyDir:
            sizeLimit: 5Gi
          name: selenium-videos
---
# Firefox browser node: one replica of selenium/node-firefox:4.27.0 that
# registers with the hub over the event bus (selenium-hub:4442/4443) and
# serves sessions on 5555. Limited to a single concurrent session.
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: selenium-node-firefox
app.kubernetes.io/name: selenium-node-firefox
app.kubernetes.io/part-of: selenium-grid
name: selenium-node-firefox
namespace: selenium
spec:
replicas: 1
selector:
matchLabels:
app: selenium-node-firefox
template:
metadata:
labels:
app: selenium-node-firefox
app.kubernetes.io/name: selenium-node-firefox
app.kubernetes.io/part-of: selenium-grid
spec:
containers:
- env:
- name: SE_EVENT_BUS_HOST
value: selenium-hub
- name: SE_EVENT_BUS_PUBLISH_PORT
value: '4442'
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
value: '4443'
- name: SE_NODE_MAX_SESSIONS
value: '1'
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
value: 'true'
- name: SE_VNC_NO_PASSWORD
value: '1'
# Unlike the chrome and edge nodes, this node sets SE_START_VNC to 'false'.
- name: SE_START_VNC
value: 'false'
- name: SE_SCREEN_WIDTH
value: '1920'
- name: SE_SCREEN_HEIGHT
value: '1080'
- name: SE_NODE_SESSION_TIMEOUT
value: '300'
image: selenium/node-firefox:4.27.0
# Probes here carry an explicit failureThreshold (5) and timeoutSeconds (5);
# the chrome and edge nodes rely on the Kubernetes defaults for both.
livenessProbe:
failureThreshold: 5
httpGet:
path: /status
port: 5555
initialDelaySeconds: 30
periodSeconds: 15
timeoutSeconds: 5
name: selenium-firefox
ports:
- containerPort: 5555
name: node
readinessProbe:
failureThreshold: 5
httpGet:
path: /status
port: 5555
initialDelaySeconds: 15
periodSeconds: 5
timeoutSeconds: 5
resources:
limits:
cpu: '1'
memory: 2Gi
requests:
cpu: 500m
memory: 1Gi
volumeMounts:
# Memory-backed /dev/shm for the browser.
- mountPath: /dev/shm
name: dshm
volumes:
- emptyDir:
medium: Memory
sizeLimit: 2Gi
name: dshm
---
# Edge browser node: one replica of selenium/node-edge:4.27.0 that registers
# with the hub over the event bus (selenium-hub:4442/4443) and serves
# sessions on 5555. Limited to a single concurrent session.
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: selenium-node-edge
app.kubernetes.io/name: selenium-node-edge
app.kubernetes.io/part-of: selenium-grid
name: selenium-node-edge
namespace: selenium
spec:
replicas: 1
selector:
matchLabels:
app: selenium-node-edge
template:
metadata:
labels:
app: selenium-node-edge
app.kubernetes.io/name: selenium-node-edge
app.kubernetes.io/part-of: selenium-grid
spec:
containers:
- env:
- name: SE_EVENT_BUS_HOST
value: selenium-hub
- name: SE_EVENT_BUS_PUBLISH_PORT
value: '4442'
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
value: '4443'
- name: SE_NODE_MAX_SESSIONS
value: '1'
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
value: 'true'
- name: SE_VNC_NO_PASSWORD
value: '1'
- name: SE_SCREEN_WIDTH
value: '1920'
- name: SE_SCREEN_HEIGHT
value: '1080'
- name: SE_NODE_SESSION_TIMEOUT
value: '300'
image: selenium/node-edge:4.27.0
livenessProbe:
httpGet:
path: /status
port: 5555
initialDelaySeconds: 30
periodSeconds: 15
name: selenium-edge
ports:
- containerPort: 5555
name: node
readinessProbe:
httpGet:
path: /status
port: 5555
initialDelaySeconds: 15
periodSeconds: 5
# Chromium-based browser node. Bumped from 1Gi -> 2Gi (req 512Mi
# -> 1Gi) on 2026-05-25 — Edge had 51 OOMKills in 5d on the
# original 1Gi cap (~1 OOM every 2.4h), and Chrome at maxSessions=2
# was running 684Mi idle on the same cap. Matches the Firefox node's
# tested-stable 2Gi limit. CPU unchanged.
resources:
limits:
cpu: '1'
memory: 2Gi
requests:
cpu: 500m
memory: 1Gi
volumeMounts:
# Memory-backed /dev/shm for the browser.
- mountPath: /dev/shm
name: dshm
volumes:
- emptyDir:
medium: Memory
sizeLimit: 2Gi
name: dshm
---
# Traefik route publishing the hub at https://selenium.iamworkin.lan: requests
# for that host on the `websecure` entry point are forwarded to the
# selenium-hub Service on port 4444.
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: selenium-hub
namespace: selenium
spec:
entryPoints:
- websecure
routes:
- kind: Rule
match: Host(`selenium.iamworkin.lan`)
services:
- name: selenium-hub
port: 4444
tls:
# NOTE(review): no resource in this manifest creates the selenium-tls
# Secret — confirm it is provisioned elsewhere.
secretName: selenium-tls

View File

@@ -0,0 +1,206 @@
using FluentAssertions;
using Xunit;
namespace BluejayInfraLint.Tests;
/// <summary>
/// Lint tests for the Divoom Pi deployment artifacts under
/// <c>apps/fc-divoom-dm-pi-device</c> and <c>apps/fc-divoom-tv-pi</c>.
/// Each test reads files from the repository checkout and asserts that the
/// expected artifacts exist and contain (or omit) specific text fragments.
/// </summary>
[Trait("Category", "Unit")]
public sealed class DivoomPiDeployArtifactTests
{
    // Root must be initialised before DmRoot/TvRoot: static field initialisers
    // run in textual order.
    private static readonly string Root = FindRepoRoot();
    private static readonly string DmRoot = Path.Combine(Root, "apps", "fc-divoom-dm-pi-device");
    private static readonly string TvRoot = Path.Combine(Root, "apps", "fc-divoom-tv-pi");

    /// <summary>Repo-relative (forward-slash) paths that must exist under the DM device app.</summary>
    public static TheoryData<string> DmRequiredArtifacts => new()
    {
        "README.md",
        "hiera/edge2-divoom-dm-device.overlay.yaml",
        "puppet/profile/pi/service/divoom_dm_device.pp",
        "puppet/templates/divoom-device-registration.json.epp",
        "puppet/templates/flowercore-divoom-dm-agent.service.epp",
    };

    /// <summary>Repo-relative (forward-slash) paths that must exist under the TV Pi app.</summary>
    public static TheoryData<string> TvRequiredArtifacts => new()
    {
        "README.md",
        "hiera/example-divoom-tv-pi.iamworkin.lan.yaml",
        "puppet/profile/pi/service/divoom_tv.pp",
        "systemd/flowercore-divoom-tv.service",
        "systemd/flowercore-divoom-tv-hdmi.service",
        "systemd/99-flowercore-divoom-tv-hdmi.rules",
        "scripts/flowercore-divoom-tv-prelaunch.sh",
        "scripts/flowercore-divoom-tv-launch.sh",
        "scripts/flowercore-divoom-tv-hdmi-respond.sh",
    };

    [Theory]
    [MemberData(nameof(DmRequiredArtifacts))]
    public void DmDeviceArtifacts_ArePresent(string relativePath)
    {
        File.Exists(ResolvePath(DmRoot, relativePath))
            .Should().BeTrue("{0} is a required artifact under {1}", relativePath, DmRoot);
    }

    [Theory]
    [MemberData(nameof(TvRequiredArtifacts))]
    public void TvPiArtifacts_ArePresent(string relativePath)
    {
        File.Exists(ResolvePath(TvRoot, relativePath))
            .Should().BeTrue("{0} is a required artifact under {1}", relativePath, TvRoot);
    }

    [Fact]
    public void DmDeviceReadme_DeclaresPuppetSystemdNotKubernetes()
    {
        var readme = ReadDm("README.md");

        readme.Should().Contain("not a Kubernetes application");
        readme.Should().Contain("profile::pi::service::divoom");
        readme.Should().Contain("no K8s surface");
    }

    [Fact]
    public void DmHieraOverlay_PreservesExistingEdge2DivoomService()
    {
        var hiera = ReadDm("hiera/edge2-divoom-dm-device.overlay.yaml");

        hiera.Should().Contain("fc-pimanager:");
        hiera.Should().Contain("fc-divoom:");
        hiera.Should().Contain("enabled: true");
        hiera.Should().Contain("profile::pi::service::divoom_dm_device::service_enabled: false");
        hiera.Should().Contain("profile::pi::service::divoom_dm_device::service_ensure: 'stopped'");
    }

    [Fact]
    public void DmPuppetProfile_DefaultsToStoppedDisabledService()
    {
        var profile = ReadDm("puppet/profile/pi/service/divoom_dm_device.pp");

        profile.Should().Contain("Boolean $service_enabled = false");
        profile.Should().Contain("Enum['running', 'stopped'] $service_ensure = 'stopped'");
        profile.Should().Contain("service { $service_name:");
        profile.Should().Contain("ensure => $service_ensure");
        profile.Should().Contain("enable => $service_enabled");
    }

    [Fact]
    public void DmPuppetProfile_DoesNotManageLiveDivoomWebUnit()
    {
        var profile = ReadDm("puppet/profile/pi/service/divoom_dm_device.pp");

        profile.Should().NotContain("Service['flowercore-divoom.service']");
        profile.Should().NotContain("service { 'flowercore-divoom.service'");
        profile.Should().NotContain("notify => Service");
    }

    [Fact]
    public void DmAgentUnit_IsSeparateAndGatedByExistingWrappers()
    {
        var unit = ReadDm("puppet/templates/flowercore-divoom-dm-agent.service.epp");

        unit.Should().Contain("ConditionPathExists=<%= $divoom_install_dir %>/bt-link.sh");
        unit.Should().Contain("ConditionPathExists=<%= $divoom_install_dir %>/bt-reset.sh");
        unit.Should().Contain("ConditionPathExists=<%= $divoom_install_dir %>/audio-link.sh");
        unit.Should().Contain("ExecStart=<%= $agent_binary_path %> --mode=Pi");
        unit.Should().NotContain("flowercore-divoom.service");
    }

    [Fact]
    public void DmRegistration_CarriesRenderProofAndSafetyPolicy()
    {
        var registration = ReadDm("puppet/templates/divoom-device-registration.json.epp");

        registration.Should().Contain("\"candidateChannels\": <%= $bt_channels_json %>");
        registration.Should().Contain("\"deviceInfoIsRenderProof\": false");
        registration.Should().Contain("\"visibleRenderProofRequired\": <%= $visible_render_proof_required %>");
        registration.Should().Contain("\"preserveExistingService\": \"flowercore-divoom.service\"");
        registration.Should().Contain("\"doNotEnableFmRadio\": true");
    }

    [Fact]
    public void TvService_UsesAvaloniaHdmiSafetyGates()
    {
        var unit = ReadTv("systemd/flowercore-divoom-tv.service");

        unit.Should().Contain("ConditionPathExists=/opt/flowercore/divoom-tv/FlowerCore.Divoom.Tv");
        unit.Should().Contain("Environment=XDG_RUNTIME_DIR=/run/fc-divoom-tv");
        unit.Should().Contain("RuntimeDirectoryMode=0700");
        unit.Should().Contain("ExecStartPre=/usr/local/bin/flowercore-divoom-tv-prelaunch.sh");
        unit.Should().Contain("ExecStart=/usr/local/bin/flowercore-divoom-tv-launch.sh");
        unit.Should().Contain("MemoryMax=2G");
        unit.Should().Contain("PrivateTmp=true");
        // The unit must not reference any literal /tmp path at all.
        unit.Should().NotContain("/tmp");
    }

    [Fact]
    public void TvLauncher_PrefersCageAndFallsBackToDirectLaunch()
    {
        var script = ReadTv("scripts/flowercore-divoom-tv-launch.sh");

        script.Should().Contain("command -v cage");
        script.Should().Contain("exec cage --");
        script.Should().Contain("launching FlowerCore.Divoom.Tv directly");
        script.Should().Contain("--target=hdmi");
        script.Should().Contain("--presentation-mode=${PRESENTATION_MODE}");
    }

    [Fact]
    public void TvHotplugRule_SettlesAndRestartsRenderer()
    {
        var rule = ReadTv("systemd/99-flowercore-divoom-tv-hdmi.rules");
        var responder = ReadTv("scripts/flowercore-divoom-tv-hdmi-respond.sh");

        rule.Should().Contain("KERNEL==\"card?-HDMI-A-?\"");
        rule.Should().Contain("start flowercore-divoom-tv-hdmi.service");
        responder.Should().Contain("sleep 2");
        responder.Should().Contain("systemctl restart flowercore-divoom-tv.service");
    }

    [Fact]
    public void TvPuppetProfile_InstallsCageAndStaticArtifacts()
    {
        var profile = ReadTv("puppet/profile/pi/service/divoom_tv.pp");

        profile.Should().Contain("package { ['cage', 'libgbm1', 'libdrm2', 'libxkbcommon0', 'fonts-dejavu-core']");
        profile.Should().Contain("'profile/pi/fc_divoom_tv/flowercore-divoom-tv.service'");
        profile.Should().Contain("'profile/pi/fc_divoom_tv/flowercore-divoom-tv-launch.sh'");
        profile.Should().Contain("profile/pi/fc_divoom_tv/99-flowercore-divoom-tv-hdmi.rules");
        profile.Should().Contain("Boolean $service_enabled = false");
    }

    [Fact]
    public void DivoomArtifacts_DoNotAddKubernetesWorkloads()
    {
        var forbiddenKinds = new[]
        {
            "kind: Deployment",
            "kind: IngressRoute",
            "kind: Certificate",
            "kind: OnePasswordItem",
        };

        var artifactPaths = Directory.GetFiles(DmRoot, "*", SearchOption.AllDirectories)
            .Concat(Directory.GetFiles(TvRoot, "*", SearchOption.AllDirectories));

        foreach (var artifactPath in artifactPaths)
        {
            var text = File.ReadAllText(artifactPath);

            foreach (var forbiddenKind in forbiddenKinds)
            {
                // Carry the path into the failure reason so a violation names
                // the offending file instead of only dumping its content.
                text.Should().NotContain(
                    forbiddenKind,
                    "{0} must not declare Kubernetes workloads",
                    artifactPath);
            }
        }
    }

    /// <summary>Reads a file from the DM device app, given its forward-slash relative path.</summary>
    private static string ReadDm(string relativePath)
        => File.ReadAllText(ResolvePath(DmRoot, relativePath));

    /// <summary>Reads a file from the TV Pi app, given its forward-slash relative path.</summary>
    private static string ReadTv(string relativePath)
        => File.ReadAllText(ResolvePath(TvRoot, relativePath));

    /// <summary>
    /// Joins <paramref name="root"/> with a forward-slash relative path,
    /// converting the separators to the platform's directory separator.
    /// </summary>
    private static string ResolvePath(string root, string relativePath)
        => Path.Combine(root, relativePath.Replace('/', Path.DirectorySeparatorChar));

    /// <summary>
    /// Walks upward from the test binary's base directory until it finds a
    /// directory containing both an <c>apps</c> folder and a <c>README.md</c>.
    /// </summary>
    /// <exception cref="DirectoryNotFoundException">
    /// No ancestor directory matches.
    /// </exception>
    private static string FindRepoRoot()
    {
        var current = new DirectoryInfo(AppContext.BaseDirectory);
        while (current is not null)
        {
            if (Directory.Exists(Path.Combine(current.FullName, "apps"))
                && File.Exists(Path.Combine(current.FullName, "README.md")))
            {
                return current.FullName;
            }

            current = current.Parent;
        }

        throw new DirectoryNotFoundException("Could not find bluejay-infra root.");
    }
}

View File

@@ -67,6 +67,7 @@ public sealed class FleetManifestLintTests
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat", ["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL", ["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux", ["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
["github-runner-updater"] = "https://github.com/astoltz/FlowerCore.Updater",
}; };
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal) private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
@@ -80,6 +81,7 @@ public sealed class FleetManifestLintTests
"github-runner-chat", "github-runner-chat",
"github-runner-mysql", "github-runner-mysql",
"github-runner-kiosk-linux", "github-runner-kiosk-linux",
"github-runner-updater",
}; };
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal) private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
@@ -234,7 +236,7 @@ public sealed class FleetManifestLintTests
{ {
deployments.Should().ContainKey(expectedRunner.Key); deployments.Should().ContainKey(expectedRunner.Key);
var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject; var container = deployments[expectedRunner.Key].MainContainerMappings().Should().ContainSingle().Subject;
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value); EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
EnvValue(container, "EPHEMERAL").Should().Be("true"); EnvValue(container, "EPHEMERAL").Should().Be("true");
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux"); EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
@@ -250,7 +252,7 @@ public sealed class FleetManifestLintTests
{ {
foreach (var deployment in GitHubRunnerDeployments().Values) foreach (var deployment in GitHubRunnerDeployments().Values)
{ {
var container = deployment.ContainerMappings().Should().ContainSingle().Subject; var container = deployment.MainContainerMappings().Should().ContainSingle().Subject;
foreach (var expectedEnv in WritableRunnerEnv) foreach (var expectedEnv in WritableRunnerEnv)
{ {
@@ -277,7 +279,10 @@ public sealed class FleetManifestLintTests
foreach (var deploymentName in ScaledLinuxRunnerDeployments) foreach (var deploymentName in ScaledLinuxRunnerDeployments)
{ {
var deployment = deployments[deploymentName]; var deployment = deployments[deploymentName];
ReplicaCount(deployment).Should().Be(2); // Scaled runners must have >= 2 replicas (avoid single-pod bottleneck).
// Individual deployments may be tuned upward per CI activity — see
// "runners: right-size replica counts per 14d CI activity (#24)".
ReplicaCount(deployment).Should().BeGreaterOrEqualTo(2, $"{deploymentName} is in the scaled set and must run with at least 2 replicas");
var volumes = deployment.MappingSequence("spec", "template", "spec", "volumes"); var volumes = deployment.MappingSequence("spec", "template", "spec", "volumes");
var claimNames = volumes var claimNames = volumes
@@ -303,6 +308,108 @@ public sealed class FleetManifestLintTests
.Be("github-runner-nuget-cache"); .Be("github-runner-nuget-cache");
} }
[Fact]
public void Runners_MustNotPinToOperatorWorkstationHosts()
{
    // CRITICAL SAFETY (operator directive 2026-05-26): BLUEJAY-WS is the
    // operator's primary workstation — host of the 1Password Connect
    // bearer token, fcadmin SSH keys to noc1, signing CA private keys,
    // and source for every FC repo. A self-hosted GitHub Actions runner
    // there would execute arbitrary PR code with that local access.
    // Build-side analog of the Sprint 9 NEW safe-account exclusion gate
    // (Puppet GPO/AppLocker/WDAC/audit-forwarder modules refuse to apply
    // on BLUEJAY-WS). This lint asserts no GitHub-runner Deployment in
    // apps/github-runner/ pins to a forbidden operator-workstation host
    // via nodeName, nodeSelector, nodeAffinity (required or preferred,
    // matchExpressions or matchFields), or tolerations.
    // Existing legacy `bluejay-ws-sandbox-1` GitHub-registered runner is
    // out of scope here (it's a runtime registration, not a K8s
    // Deployment) — see CLAUDE.md "Common Mistakes" entry and
    // feedback_bluejay_ws_never_public_runner.md.

    // Matching below is a case-insensitive substring test, so the upper-case
    // and fully-qualified spellings are already covered by "bluejay-ws";
    // they are listed explicitly to document the names being guarded.
    var forbiddenHostPatterns = new[]
    {
        "bluejay-ws",
        "BLUEJAY-WS",
        "bluejay-ws.iamworkin.lan",
        "iamworkin-ws",
    };

    // True when the value is non-blank and contains any forbidden host name.
    bool ContainsForbidden(string? value) =>
        !string.IsNullOrWhiteSpace(value)
        && forbiddenHostPatterns.Any(pattern => value!.Contains(pattern, StringComparison.OrdinalIgnoreCase));

    // Both requirement lists of a node selector term are scanned:
    // matchExpressions (node labels) and matchFields (node fields such as
    // metadata.name, i.e. the node's own name).
    var termRequirementKinds = new[]
    {
        (Field: "matchExpressions", Label: "matchExpression"),
        (Field: "matchFields", Label: "matchField"),
    };

    var violations = GitHubRunnerDeployments().Values.SelectMany(deployment =>
    {
        var local = new List<string>();
        var podSpec = ManifestNodeExtensions.Mapping(deployment.Root, "spec", "template", "spec");
        if (podSpec is null)
        {
            return local;
        }

        // Records a violation for every forbidden value listed in one
        // node selector term.
        // NOTE(review): the requirement operator is not inspected, so a
        // NotIn/exclusion rule naming the workstation is also reported —
        // confirm that strictness is intended.
        void AddNodeSelectorTermViolations(YamlMappingNode term)
        {
            foreach (var (field, label) in termRequirementKinds)
            {
                foreach (var expr in ManifestNodeExtensions.MappingSequence(term, field))
                {
                    var key = ManifestNodeExtensions.Scalar(expr, "key");
                    foreach (var valueNode in ManifestNodeExtensions.ScalarSequence(expr, "values"))
                    {
                        if (ContainsForbidden(valueNode))
                        {
                            local.Add($"{deployment.Name} has nodeAffinity {label} '{key}' value '{valueNode}' which targets a forbidden operator-workstation host.");
                        }
                    }
                }
            }
        }

        // nodeName: pins the pod to a specific node by name.
        var nodeName = ManifestNodeExtensions.Scalar(podSpec, "nodeName");
        if (ContainsForbidden(nodeName))
        {
            local.Add($"{deployment.Name} sets nodeName='{nodeName}' which targets a forbidden operator-workstation host.");
        }

        // nodeSelector: dict of label → value pinning the pod to nodes
        // carrying matching labels. Examples that would trip this:
        //   kubernetes.io/hostname: bluejay-ws
        //   flowercore.io/host: bluejay-ws.iamworkin.lan
        var nodeSelector = ManifestNodeExtensions.Mapping(podSpec, "nodeSelector");
        if (nodeSelector is not null)
        {
            foreach (var entry in nodeSelector.Children)
            {
                var key = entry.Key is YamlScalarNode keyScalar ? keyScalar.Value : null;
                var value = entry.Value is YamlScalarNode valueScalar ? valueScalar.Value : null;
                if (ContainsForbidden(value))
                {
                    local.Add($"{deployment.Name} has nodeSelector entry '{key}: {value}' which targets a forbidden operator-workstation host.");
                }
            }
        }

        // nodeAffinity (required): hard scheduling constraints.
        foreach (var term in ManifestNodeExtensions.MappingSequence(podSpec, "affinity", "nodeAffinity", "requiredDuringSchedulingIgnoredDuringExecution", "nodeSelectorTerms"))
        {
            AddNodeSelectorTermViolations(term);
        }

        // nodeAffinity (preferred): soft constraints still steer the pod
        // toward the named node, so they are forbidden as well. Each list
        // entry wraps its node selector term under "preference".
        foreach (var weightedTerm in ManifestNodeExtensions.MappingSequence(podSpec, "affinity", "nodeAffinity", "preferredDuringSchedulingIgnoredDuringExecution"))
        {
            var preference = ManifestNodeExtensions.Mapping(weightedTerm, "preference");
            if (preference is not null)
            {
                AddNodeSelectorTermViolations(preference);
            }
        }

        // tolerations: scheduling onto a tainted operator-workstation
        // node would let the runner run there. Forbid any toleration
        // key or value that names the workstation.
        foreach (var toleration in ManifestNodeExtensions.MappingSequence(podSpec, "tolerations"))
        {
            var key = ManifestNodeExtensions.Scalar(toleration, "key");
            var value = ManifestNodeExtensions.Scalar(toleration, "value");
            if (ContainsForbidden(key))
            {
                local.Add($"{deployment.Name} has toleration key '{key}' which targets a forbidden operator-workstation host.");
            }

            if (ContainsForbidden(value))
            {
                local.Add($"{deployment.Name} has toleration value '{value}' which targets a forbidden operator-workstation host.");
            }
        }

        return local;
    }).ToList();

    violations.Should().BeEmpty("BLUEJAY-WS / iamworkin-ws must never host a fleet GitHub Actions runner; see CLAUDE.md 'Registering BLUEJAY-WS as a fleet GitHub Actions runner' and feedback_bluejay_ws_never_public_runner.md");
}
[Fact] [Fact]
public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable() public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable()
{ {
@@ -890,6 +997,22 @@ internal sealed record ManifestDocument(
.ToList(); .ToList();
} }
// Returns only the entries of the pod spec's `containers` list; the
// `initContainers` list is never read. Prefer this over ContainerMappings
// when an assertion targets the primary container (env, image,
// volumeMounts) and an initContainer would be a false-positive match —
// e.g. the GitHub runner image's `setup-runner-home` initContainer must
// not count toward the single-container assertions on the runner
// deployments. Yields an empty list when the document has no pod spec.
public IReadOnlyList<YamlMappingNode> MainContainerMappings()
{
    if (PodSpec() is { } spec)
    {
        return ManifestNodeExtensions.MappingSequence(spec, "containers").ToList();
    }

    return Array.Empty<YamlMappingNode>();
}
public IReadOnlyList<ContainerSpec> ContainerSpecs() public IReadOnlyList<ContainerSpec> ContainerSpecs()
{ {
return ContainerMappings() return ContainerMappings()

View File

@@ -0,0 +1,124 @@
using FluentAssertions;
using System.Text.RegularExpressions;
using Xunit;
namespace BluejayInfraLint.Tests;
/// <summary>
/// Text-level lint over the monitoring mirror manifest
/// (<c>apps/monitoring/noc-monitoring.yaml</c> under the bluejay root).
/// The checks use substring and regex matching on the raw file text rather
/// than parsing the YAML, so they are sensitive to the exact spelling of the
/// asserted fragments.
/// </summary>
[Trait("Category", "Unit")]
public sealed class MonitoringCoverageLintTests
{
    private static readonly ManifestInventory Inventory = ManifestInventory.Load();

    // Probe URLs that must each appear verbatim in the monitoring mirror.
    private static readonly string[] Sprint57ProbeTargets =
    {
        "https://dns.iamworkin.lan/",
        "https://flowercore.iamworkin.lan/healthz",
        "https://replay.iamworkin.lan/healthz",
        "https://signalcontrol.iamworkin.lan/health",
        "https://updatecenter-internal.iamworkin.lan/api/v1/manifests/_schema",
        "https://updates.iamworkin.lan/api/v1/manifests/_schema",
        "https://worldbuilder.iamworkin.lan/healthz",
    };

    /// <summary>
    /// The mirror must reference 10.0.58.113 on port 5200 and must no longer
    /// mention port 5100 on that host.
    /// </summary>
    [Fact]
    public void PrometheusScrape_MustNotTargetDeadPiManagerPort()
    {
        var monitoring = ReadMonitoringMirror();

        monitoring.Should().NotContain("10.0.58.113:5100");
        monitoring.Should().Contain("10.0.58.113:5200");
    }

    /// <summary>
    /// Every <c>probe-*</c> scrape job must relabel to the blackbox exporter
    /// address that fits its environment: the in-cluster service DNS name in
    /// the mirror, and <c>localhost:9115</c> in the live Podman config. The
    /// live config is only checked when it can be located through
    /// <c>FLOWERCORE_NOTES_ROOT</c>.
    /// </summary>
    [Fact]
    public void ProbeJobs_MustKeepEnvironmentSpecificBlackboxRelabels()
    {
        var monitoring = ReadMonitoringMirror();
        var probeJobs = FindProbeJobs(monitoring);

        probeJobs.Should().NotBeEmpty();
        probeJobs.Should().OnlyContain(
            job => job.Contains("replacement: blackbox-exporter.monitoring.svc:9115", StringComparison.Ordinal),
            "the bluejay-infra mirror runs Prometheus in-cluster and should use the blackbox service DNS");

        var livePodmanPrometheus = TryReadNotesMonitoringFile("prometheus.yml");
        if (livePodmanPrometheus is not null)
        {
            var liveProbeJobs = FindProbeJobs(livePodmanPrometheus);

            // Guard against a vacuous pass: without this, a config in which
            // the probe-job regex matches nothing could satisfy OnlyContain.
            liveProbeJobs.Should().NotBeEmpty(
                "the live Podman prometheus.yml is expected to define probe-* jobs");
            liveProbeJobs.Should().OnlyContain(
                job => job.Contains("replacement: localhost:9115", StringComparison.Ordinal),
                "live Podman monitoring uses host networking, so blackbox probes must relabel to localhost:9115");
        }
    }

    /// <summary>
    /// Every URL in <see cref="Sprint57ProbeTargets"/> must appear in the
    /// mirror. All missing targets are reported in a single failure.
    /// </summary>
    [Fact]
    public void TraefikServiceProbes_MustCoverSprint57LiveFlowerCoreHosts()
    {
        var monitoring = ReadMonitoringMirror();

        var missingTargets = Sprint57ProbeTargets
            .Where(target => !monitoring.Contains(target, StringComparison.Ordinal))
            .ToList();

        missingTargets.Should().BeEmpty(
            "every Sprint 57 probe target must appear in noc-monitoring.yaml");
    }

    /// <summary>
    /// The EpsonPrinterDown alert must use the 35-minute
    /// <c>max_over_time</c> form and must not use the instantaneous
    /// <c>up == 0</c> expression.
    /// </summary>
    [Fact]
    public void EpsonPrinterDown_MustUseRangeWindowForStaleScrapeCoverage()
    {
        var alerts = ReadMonitoringMirror();

        alerts.Should().Contain("- alert: EpsonPrinterDown");
        alerts.Should().Contain("max_over_time(up{job=\"snmp-printer\"}[35m]) == bool 0");
        alerts.Should().NotContain("expr: up{job=\"snmp-printer\"} == 0");
    }

    /// <summary>
    /// The Kube restart, crash-loop and replica-mismatch alerts must exclude
    /// the <c>github-runner</c> namespace, and the mirror must carry the
    /// Epson Grafana alert identifiers and the IRC alert channel label.
    /// </summary>
    [Fact]
    public void MonitoringMirror_MustCarryRunnerExclusionsAndEpsonGrafanaDelivery()
    {
        var mirror = ReadMonitoringMirror();

        GetAlertBlock(mirror, "KubeContainerRestartingFrequently")
            .Should()
            .Contain("kube_pod_container_status_restarts_total{namespace!=\"github-runner\"}[1h]");
        GetAlertBlock(mirror, "KubeContainerCrashLooping")
            .Should()
            .Contain("kube_pod_container_status_restarts_total{namespace!=\"github-runner\"}[15m]");
        GetAlertBlock(mirror, "KubeDeploymentReplicasMismatch")
            .Should()
            .Contain("kube_deployment_spec_replicas{namespace!=\"github-runner\"} != kube_deployment_status_replicas_available{namespace!=\"github-runner\"}");

        mirror.Should().Contain("uid: epson-printer-down-stale-window");
        mirror.Should().Contain("title: EpsonPrinterDown");
        mirror.Should().Contain("alert_channel: irc");
    }

    // Reads the whole monitoring mirror manifest as text.
    private static string ReadMonitoringMirror() =>
        File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml"));

    // Reads <FLOWERCORE_NOTES_ROOT>/scripts/monitoring/<fileName>.
    // Returns null when the environment variable is unset or blank, or when
    // the file does not exist there, so callers can treat the cross-repo
    // check as optional instead of failing with an I/O exception.
    private static string? TryReadNotesMonitoringFile(string fileName)
    {
        var overrideRoot = Environment.GetEnvironmentVariable("FLOWERCORE_NOTES_ROOT");
        if (string.IsNullOrWhiteSpace(overrideRoot))
        {
            return null;
        }

        var path = Path.Combine(overrideRoot, "scripts", "monitoring", fileName);
        return File.Exists(path) ? File.ReadAllText(path) : null;
    }

    // Extracts each scrape job whose job_name is a double-quoted "probe-…"
    // value. A job's text runs lazily from its `- job_name:` line up to the
    // next `- job_name:` line or the end of the input.
    private static IReadOnlyList<string> FindProbeJobs(string yaml) =>
        Regex.Matches(
                yaml,
                "(?ms)^\\s+- job_name: \"probe-[^\"]+\".*?(?=^\\s+- job_name:|\\z)")
            .Cast<Match>()
            .Select(match => match.Value)
            .ToList();

    // Returns the text of the named alert rule, from its `- alert:` line up
    // to the next `- alert:` line or the end of the input. Fails the calling
    // test when the alert is not present.
    private static string GetAlertBlock(string yaml, string alertName)
    {
        var match = Regex.Match(
            yaml,
            $"(?ms)^\\s+- alert: {Regex.Escape(alertName)}\\s*$.*?(?=^\\s+- alert:|\\z)");

        match.Success.Should().BeTrue($"alert {alertName} should be present in noc-monitoring.yaml");
        return match.Value;
    }
}