feat(fc-devicemgmt): add Kubernetes deployment manifests

2026-05-17 21:55:33 -05:00
23 changed files with 177 additions and 4176 deletions
--- a/README.md
+++ b/README.md
@@ -118,7 +118,6 @@ That test project sweeps `bluejay-infra/apps/**` plus the canonical sibling `Flo
 ## References
 - OpenVox noc1 durability runbook: `docs/runbooks/openvoxserver-quadlet-durability.md`
 - Cert-manager recovery playbook: `FlowerCore.Notes/memory/project_cert_manager_recovery_2026_04_22.md`
 - Why pfSense DNS is required: `FlowerCore.Notes/memory/feedback_pfsense_dns_required_for_acme.md`
 - Public DNS operator host: `https://dns.iamworkin.lan`
--- a/apps/authentik/authentik.yaml
+++ b/apps/authentik/authentik.yaml
@@ -1,448 +0,0 @@
 # Authentik OIDC backend
 # ArgoCD-managed. BlueJay Lab.
 #
 # Stack:
 #   - PostgreSQL 16 StatefulSet (single replica, Longhorn RWO 5Gi)
 #   - Redis 7 Deployment (no persistence — session/cache only)
 #   - Authentik server + worker Deployments (image ghcr.io/goauthentik/server:2024.12.3)
 #   - Media PVC shared between server + worker (Longhorn RWO 2Gi)
 #   - Certificate via step-ca-acme ClusterIssuer
 #   - Traefik IngressRoute at id.iamworkin.lan
 #
 # Secrets come from 1Password item "authentik-credentials" (IAmWorkin vault, id y6i74ch22q5wvm7znquq4nhhcu)
 # via the OnePasswordItem CRD, materialized into k8s Secret authentik/authentik-credentials.
 #
 # Why the discovery URL is /application/o/pimanager/ : Authentik issues per-application OIDC providers.
 # The pimanager OIDC application/provider is created after the cluster pods are healthy (manual or
 # via API once the bootstrap token is available — see Notes substrate).
 ---
 apiVersion: v1
 kind: Namespace
 metadata:
  name: authentik
  labels:
    app.kubernetes.io/part-of: bluejay-infra
 ---
 # 1Password operator pulls the authentik-credentials item into a k8s Secret of the same name.
 # Field labels in 1P become Secret keys: AUTHENTIK_SECRET_KEY, POSTGRES_PASSWORD, REDIS_PASSWORD,
 # BOOTSTRAP_ADMIN_PASSWORD, BOOTSTRAP_ADMIN_TOKEN, BOOTSTRAP_ADMIN_EMAIL.
 apiVersion: onepassword.com/v1
 kind: OnePasswordItem
 metadata:
  name: authentik-credentials
  namespace: authentik
 spec:
  itemPath: "vaults/IAmWorkin/items/authentik-credentials"
 ---
 # Shared media volume for server + worker pods.
 # NOTE(review): the claim is ReadWriteOnce, which Kubernetes scopes to a single node.
 # Both the authentik-server and authentik-worker Deployments in this file mount it, and
 # neither pod spec declares node affinity here, so the two pods must end up on the same
 # node for both mounts to succeed — confirm how co-location is guaranteed.
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
  name: authentik-media
  namespace: authentik
 spec:
  storageClassName: longhorn
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 2Gi
 ---
 # PostgreSQL 16 StatefulSet — Authentik's primary store.
 apiVersion: apps/v1
 kind: StatefulSet
 metadata:
  name: authentik-postgres
  namespace: authentik
  labels:
    app: authentik-postgres
    argocd.argoproj.io/instance: infra-authentik
 spec:
  persistentVolumeClaimRetentionPolicy:
    whenDeleted: Retain
    whenScaled: Retain
  podManagementPolicy: OrderedReady
  serviceName: authentik-postgres
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: authentik-postgres
  template:
    metadata:
      labels:
        app: authentik-postgres
    spec:
      containers:
        - name: postgres
          image: postgres:16-alpine
          ports:
            - containerPort: 5432
              name: postgres
          env:
            - name: POSTGRES_USER
              value: authentik
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: POSTGRES_PASSWORD
            - name: POSTGRES_DB
              value: authentik
            - name: POSTGRES_INITDB_ARGS
              value: "--encoding=UTF-8 --lc-collate=C --lc-ctype=C"
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
          readinessProbe:
            exec:
              command: ["pg_isready", "-U", "authentik"]
            initialDelaySeconds: 5
            periodSeconds: 5
          livenessProbe:
            exec:
              command: ["pg_isready", "-U", "authentik"]
            initialDelaySeconds: 30
            periodSeconds: 30
          resources:
            requests: { cpu: 100m, memory: 256Mi }
            limits: { cpu: 1000m, memory: 1Gi }
          volumeMounts:
            - name: pgdata
              mountPath: /var/lib/postgresql/data
  volumeClaimTemplates:
    - metadata:
        name: pgdata
      spec:
        storageClassName: longhorn
        accessModes: [ReadWriteOnce]
        volumeMode: Filesystem
        resources:
          requests:
            storage: 5Gi
 ---
 apiVersion: v1
 kind: Service
 metadata:
  name: authentik-postgres
  namespace: authentik
 spec:
  clusterIP: None
  selector:
    app: authentik-postgres
  ports:
    - name: postgres
      port: 5432
      targetPort: 5432
 ---
 # Redis 7 — session storage + Celery broker. No persistence needed (cache).
 # Single replica with no volume: everything held in Redis is lost whenever the pod restarts.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: authentik-redis
  namespace: authentik
  labels:
    app: authentik-redis
    argocd.argoproj.io/instance: infra-authentik
 spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: authentik-redis
  template:
    metadata:
      labels:
        app: authentik-redis
    spec:
      containers:
        - name: redis
          image: redis:7-alpine
          # --save "" turns off RDB snapshots and --appendonly no turns off the AOF,
          # matching the "no persistence" intent above.
          # $(REDIS_PASSWORD) is expanded by Kubernetes from the env entry below (no shell
          # is involved), so the password ends up in the redis-server command line.
          args:
            - "--save"
            - ""
            - "--appendonly"
            - "no"
            - "--requirepass"
            - "$(REDIS_PASSWORD)"
          env:
            # Same secret key the Authentik server and worker read as AUTHENTIK_REDIS__PASSWORD.
            - name: REDIS_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: REDIS_PASSWORD
          ports:
            - containerPort: 6379
              name: redis
          # Both probes only check that the port accepts TCP connections; they do not
          # authenticate or issue a Redis command.
          readinessProbe:
            tcpSocket: { port: 6379 }
            initialDelaySeconds: 5
            periodSeconds: 5
          livenessProbe:
            tcpSocket: { port: 6379 }
            initialDelaySeconds: 30
            periodSeconds: 30
          resources:
            requests: { cpu: 50m, memory: 64Mi }
            limits: { cpu: 500m, memory: 256Mi }
 ---
 apiVersion: v1
 kind: Service
 metadata:
  name: authentik-redis
  namespace: authentik
 spec:
  selector:
    app: authentik-redis
  ports:
    - name: redis
      port: 6379
      targetPort: 6379
 ---
 # Authentik server Deployment — HTTP frontend on :9000.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: authentik-server
  namespace: authentik
  labels:
    app: authentik-server
    argocd.argoproj.io/instance: infra-authentik
 spec:
  replicas: 1
  strategy:
    type: Recreate  # shares /media RWO PVC with worker
  selector:
    matchLabels:
      app: authentik-server
  template:
    metadata:
      labels:
        app: authentik-server
    spec:
      securityContext:
        # The Authentik image runs as uid 1000 ("authentik"), but the Longhorn PVC is
        # mounted root:root by default. Setting fsGroup applies a recursive chgrp +
        # chmod g+rwx to the volume so the non-root container can mkdir /media/public
        # during the tenant_files migration.
        fsGroup: 1000
      containers:
        - name: server
          image: ghcr.io/goauthentik/server:2024.12.3
          args: ["server"]
          ports:
            - containerPort: 9000
              name: http
            - containerPort: 9443
              name: https
          env:
            - name: AUTHENTIK_SECRET_KEY
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: AUTHENTIK_SECRET_KEY
            - name: AUTHENTIK_REDIS__HOST
              value: authentik-redis
            - name: AUTHENTIK_REDIS__PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: REDIS_PASSWORD
            - name: AUTHENTIK_POSTGRESQL__HOST
              value: authentik-postgres
            - name: AUTHENTIK_POSTGRESQL__NAME
              value: authentik
            - name: AUTHENTIK_POSTGRESQL__USER
              value: authentik
            - name: AUTHENTIK_POSTGRESQL__PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: POSTGRES_PASSWORD
            - name: AUTHENTIK_BOOTSTRAP_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: BOOTSTRAP_ADMIN_PASSWORD
            - name: AUTHENTIK_BOOTSTRAP_TOKEN
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: BOOTSTRAP_ADMIN_TOKEN
            - name: AUTHENTIK_BOOTSTRAP_EMAIL
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: BOOTSTRAP_ADMIN_EMAIL
            - name: AUTHENTIK_DISABLE_UPDATE_CHECK
              value: "true"
            - name: AUTHENTIK_ERROR_REPORTING__ENABLED
              value: "false"
            - name: AUTHENTIK_LOG_LEVEL
              value: info
          # First-boot Authentik can take 3+ min on the migration phase
          # (waiting on DB lock while worker also runs migrations). Initial
          # delays are generous so kubelet doesn't kill the pod mid-migration;
          # periodSeconds keeps post-startup probing responsive.
          readinessProbe:
            httpGet:
              path: /-/health/ready/
              port: 9000
            initialDelaySeconds: 60
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 12
          livenessProbe:
            httpGet:
              path: /-/health/live/
              port: 9000
            initialDelaySeconds: 300
            periodSeconds: 30
            timeoutSeconds: 10
            failureThreshold: 3
          startupProbe:
            httpGet:
              path: /-/health/live/
              port: 9000
            initialDelaySeconds: 30
            periodSeconds: 15
            timeoutSeconds: 10
            failureThreshold: 40  # 30s + 40*15s = 10.5 min budget
          resources:
            requests: { cpu: 150m, memory: 512Mi }
            limits: { cpu: 1500m, memory: 1Gi }
          volumeMounts:
            - name: media
              mountPath: /media
      volumes:
        - name: media
          persistentVolumeClaim:
            claimName: authentik-media
 ---
 # Authentik worker Deployment — runs Celery background tasks.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: authentik-worker
  namespace: authentik
  labels:
    app: authentik-worker
    argocd.argoproj.io/instance: infra-authentik
 spec:
  replicas: 1
  strategy:
    type: Recreate  # shares /media RWO PVC with server
  selector:
    matchLabels:
      app: authentik-worker
  template:
    metadata:
      labels:
        app: authentik-worker
    spec:
      securityContext:
        # Same as server pod — non-root uid 1000 needs PVC group write.
        fsGroup: 1000
      containers:
        - name: worker
          image: ghcr.io/goauthentik/server:2024.12.3
          args: ["worker"]
          env:
            - name: AUTHENTIK_SECRET_KEY
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: AUTHENTIK_SECRET_KEY
            - name: AUTHENTIK_REDIS__HOST
              value: authentik-redis
            - name: AUTHENTIK_REDIS__PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: REDIS_PASSWORD
            - name: AUTHENTIK_POSTGRESQL__HOST
              value: authentik-postgres
            - name: AUTHENTIK_POSTGRESQL__NAME
              value: authentik
            - name: AUTHENTIK_POSTGRESQL__USER
              value: authentik
            - name: AUTHENTIK_POSTGRESQL__PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: POSTGRES_PASSWORD
            - name: AUTHENTIK_DISABLE_UPDATE_CHECK
              value: "true"
            - name: AUTHENTIK_ERROR_REPORTING__ENABLED
              value: "false"
            - name: AUTHENTIK_LOG_LEVEL
              value: info
          resources:
            requests: { cpu: 100m, memory: 256Mi }
            limits: { cpu: 1000m, memory: 768Mi }
          volumeMounts:
            - name: media
              mountPath: /media
      volumes:
        - name: media
          persistentVolumeClaim:
            claimName: authentik-media
 ---
 apiVersion: v1
 kind: Service
 metadata:
  name: authentik-server
  namespace: authentik
 spec:
  selector:
    app: authentik-server
  ports:
    - name: http
      port: 9000
      targetPort: 9000
    - name: https
      port: 9443
      targetPort: 9443
 ---
 # step-ca leaf certificate for id.iamworkin.lan.
 # The step-ca container resolves names through pfSense Unbound, so the public A record for
 # id.iamworkin.lan MUST exist before this Certificate is applied; otherwise the cert-manager
 # HTTP-01 challenge fails silently and backs off for 2h.
 # The record was added 2026-05-25 via scripts/pfsense-add-id-host.py.
 apiVersion: cert-manager.io/v1
 kind: Certificate
 metadata:
  name: authentik-tls
  namespace: authentik
 spec:
  secretName: authentik-tls
  dnsNames:
    - id.iamworkin.lan
  issuerRef:
    name: step-ca-acme
    kind: ClusterIssuer
 ---
 apiVersion: traefik.io/v1alpha1
 kind: IngressRoute
 metadata:
  name: authentik
  namespace: authentik
 spec:
  entryPoints: [websecure]
  routes:
    - match: Host(`id.iamworkin.lan`)
      kind: Rule
      services:
        - name: authentik-server
          port: 9000
  tls:
    secretName: authentik-tls
--- a/apps/fc-chat/fc-chat.yaml
+++ b/apps/fc-chat/fc-chat.yaml
@@ -30,41 +30,3 @@ spec:
          port: 80
  tls:
    secretName: chat-web-tls
 ---
 # Public host profile marker. The app treats this header as authoritative for
 # the public twin, while the internal chat.iamworkin.lan route does not attach
 # it and keeps the operator-oriented UI.
 apiVersion: traefik.io/v1alpha1
 kind: Middleware
 metadata:
  name: chat-public-profile-header
  namespace: fc-chat
 spec:
  headers:
    customRequestHeaders:
      X-FC-Chat-Host-Profile: "public"
 ---
 # Public Cloudflare-fronted twin for the anonymous chat surface. Operator
 # paths are intentionally absent from the allowlist below, so /admin,
 # /operator, /console, /ops, /api/operator, and /operatorhub miss this route
 # and return Traefik 404 before reaching the pod. Operator action still needed:
 # create/verify Cloudflare DNS chat.flowercore.io -> public Traefik endpoint
 # and mirror the cf-origin-flowercore-io TLS secret into namespace fc-chat.
 apiVersion: traefik.io/v1alpha1
 kind: IngressRoute
 metadata:
  name: chat-web-public
  namespace: fc-chat
 spec:
  entryPoints:
    - websecure
  routes:
    - match: Host(`chat.flowercore.io`) && (Path(`/`) || Path(`/chat`) || PathPrefix(`/_blazor`) || PathPrefix(`/_framework`) || PathPrefix(`/_content`) || PathPrefix(`/avatars`) || PathPrefix(`/css`) || PathPrefix(`/js`) || PathPrefix(`/favicon`) || PathPrefix(`/chathub`)) && (Method(`GET`) || Method(`HEAD`) || Method(`POST`) || Method(`OPTIONS`))
      kind: Rule
      middlewares:
        - name: chat-public-profile-header
      services:
        - name: chat-web
          port: 80
  tls:
    secretName: cf-origin-flowercore-io
--- a/apps/fc-devicemgmt/argocd-application.yaml
+++ b/apps/fc-devicemgmt/argocd-application.yaml
@@ -0,0 +1,33 @@
 # Explicit ArgoCD Application shape for bootstrap/review.
 #
 # The live bluejay-infra ApplicationSet already discovers apps/* directories
 # and creates this same Application name (`infra-fc-devicemgmt`) automatically.
 # Keep repoURL on the internal Gitea ClusterIP URL; ArgoCD does not trust the
 # external step-ca HTTPS endpoint.
 apiVersion: argoproj.io/v1alpha1
 kind: Application
 metadata:
  name: infra-fc-devicemgmt
  namespace: argocd
  labels:
    app.kubernetes.io/name: fc-devicemgmt
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/tenant-id: system
    flowercore.io/created-by: bluejay-infra
 spec:
  project: default
  source:
    repoURL: http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git
    targetRevision: main
    path: apps/fc-devicemgmt
  destination:
    server: https://kubernetes.default.svc
    namespace: fc-devicemgmt
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
    syncOptions:
      - CreateNamespace=true
      - ServerSideApply=true
--- a/apps/fc-devicemgmt/deployment-operator.yaml
+++ b/apps/fc-devicemgmt/deployment-operator.yaml
@@ -47,7 +47,7 @@ spec:
        fsGroupChangePolicy: OnRootMismatch
      containers:
        - name: operator
-          image: localhost/fc-devicemgmt-operator:v20260519-sp34cl3-fix
+          image: localhost/fc-devicemgmt-operator:v20260512-cx5
          imagePullPolicy: Never
          ports:
            - name: metrics
--- a/apps/fc-devicemgmt/deployment-web.yaml
+++ b/apps/fc-devicemgmt/deployment-web.yaml
@@ -4,22 +4,6 @@
 # Sprint 9+ lane. This manifest is static-valid without requiring the image to
 # exist yet; import localhost/fc-devicemgmt-web:<tag> to all schedulable RKE2
 # nodes before letting ArgoCD sync a live rollout.
 #
 # SCALED TO 0 — 2026-05-19 morning-routine cleanup.
 # The Web pod cannot start until TWO upstream gaps close:
 #   1. MySQL DB instance `flowercore_devicemgmt` (user `fc_devicemgmt`) is
 #      provisioned via fc-mysql Manager. The cluster currently has ZERO
 #      MySqlInstanceCrds and no `mysql.fc-mysql.svc:3306` Service, so the
 #      deployment-web container env `FlowerCore__Database__Host=mysql.fc-mysql.svc`
 #      points at nothing. Provision via the fc-mysql Manager UI/REST/MCP.
 #   2. 1Password vault item `IAmWorkin/FlowerCore DeviceManagement Runtime`
 #      with 5 fields (DB-Password, mtls-ca.pem, mtls-client.crt, mtls-client.key,
 #      mtls-chain.pem) — see apps/fc-devicemgmt/1password-item.yaml. Mint mTLS
 #      from step-ca-agent ClusterIssuer per ADR-126; DB-Password must match the
 #      password configured for the MySQL user.
 # Re-enable: change replicas back to 2 after both gaps close. The image tag
 # in this file (v20260512-cx5) MAY also need a refresh — it predates the
 # Sprint 34 Cl-3 operator fix; Web may have an analogous bug.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -36,7 +20,7 @@ metadata:
  annotations:
    flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
 spec:
-  replicas: 0
+  replicas: 2
  revisionHistoryLimit: 3
  selector:
    matchLabels:
--- a/apps/fc-ttsreader/fc-ttsreader.yaml
+++ b/apps/fc-ttsreader/fc-ttsreader.yaml
@@ -532,7 +532,7 @@ spec:
        fsGroupChangePolicy: OnRootMismatch
      containers:
        - name: web
-          image: localhost/fc-ttsreader-web:v20260518-sprint36-demo-finish-b132cbf
+          image: localhost/fc-ttsreader-web:v20260506-phase6
          imagePullPolicy: Never
          ports:
            - containerPort: 5217
@@ -555,13 +555,9 @@ spec:
            - name: TtsReader__Jobs__Root
              value: "/data/jobs"
            - name: TtsReader__Piper__Host
-              value: "10.0.57.17"
+              value: "ttsreader-piper.fc-ttsreader.svc.cluster.local."
            - name: TtsReader__Piper__Port
-              value: "8500"
+              value: "10200"
            - name: TtsReader__Piper__Transport
              value: "http"
            - name: TtsReader__Piper__HttpPath
              value: "/tts"
            - name: TtsReader__Kokoro__Enabled
              value: "true"
            - name: TtsReader__Kokoro__BaseUrl
--- a/apps/github-runner/.gitattributes
+++ b/apps/github-runner/.gitattributes
@@ -1,2 +0,0 @@
 *.sh text eol=lf
 Dockerfile text eol=lf
--- a/apps/github-runner/Dockerfile
+++ b/apps/github-runner/Dockerfile
@@ -1,54 +0,0 @@
 # Self-hosted GitHub Actions runner image: the upstream myoung34/github-runner image plus
 # the IAmWorkin step-ca root CA and a pre-built Ruby toolcache.
 # NOTE(review): `latest` is a floating tag, so rebuilding the same image tag later can
 # pick up a different base — consider pinning a version or digest.
 FROM myoung34/github-runner:latest
 # Build-time pins. RUBY_MINOR names the directory used in PATH below; RUNNER_UID/RUNNER_GID
 # are handed to install-ruby-toolcache.sh, which chowns the toolcache to them.
 ARG RUBY_VERSION=3.3.11
 ARG RUBY_MINOR=3.3
 ARG RUBY_BUILD_VERSION=v20260326
 ARG RUNNER_UID=1001
 ARG RUNNER_GID=1001
 # RUNNER_TOOL_CACHE is the runtime toolcache under /home/runner; RUNNER_RUBY_TOOLCACHE is
 # the copy baked into the image and is passed to the install script as TOOLCACHE_ROOT.
 ENV RUNNER_TOOL_CACHE=/home/runner/_tool
 ENV RUNNER_RUBY_TOOLCACHE=/opt/runner-toolcache
 # PATH lists the /home/runner Ruby first and the baked /opt copy second, so `ruby`
 # resolves from whichever of the two locations exists.
 ENV PATH="/home/runner/_tool/Ruby/${RUBY_MINOR}/x64/bin:/opt/runner-toolcache/Ruby/${RUBY_MINOR}/x64/bin:${PATH}"
 # NOTE(review): no later USER instruction appears in this file, so the image's configured
 # user stays root; presumably the base image entrypoint drops privileges — confirm.
 USER root
 # Bake the IAmWorkin step-ca root CA into the system trust store. Without
 # this, .NET HttpClient calls from CI tests against *.iamworkin.lan
 # (e.g. https://selenium.iamworkin.lan/session) fail with `PartialChain`
 # because the runner image's default Ubuntu trust bundle doesn't include
 # our internal Root CA. update-ca-certificates regenerates
 # /etc/ssl/certs/ca-certificates.crt, which OpenSSL + .NET on Linux read
 # automatically — no SSL_CERT_FILE env var needed.
 COPY step-ca-root.crt /usr/local/share/ca-certificates/iamworkin-step-ca-root.crt
 # One layer: install the Ruby build dependencies, register the CA copied above, install
 # the pinned ruby-build release from its GitHub tarball, then remove the temp files and
 # apt lists.
 RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        autoconf \
        bison \
        build-essential \
        ca-certificates \
        curl \
        libdb-dev \
        libffi-dev \
        libgdbm-dev \
        libgmp-dev \
        libncurses-dev \
        libreadline-dev \
        libssl-dev \
        libyaml-dev \
        patch \
        pkg-config \
        uuid-dev \
        zlib1g-dev \
    && update-ca-certificates \
    && curl -fsSL "https://github.com/rbenv/ruby-build/archive/refs/tags/${RUBY_BUILD_VERSION}.tar.gz" -o /tmp/ruby-build.tar.gz \
    && mkdir -p /tmp/ruby-build \
    && tar -xzf /tmp/ruby-build.tar.gz --strip-components=1 -C /tmp/ruby-build \
    && /tmp/ruby-build/install.sh \
    && rm -rf /tmp/ruby-build /tmp/ruby-build.tar.gz /var/lib/apt/lists/*
 # Build Ruby into the baked toolcache. The closing `ruby -v` resolves through the PATH
 # entries set above, so the build fails if neither toolcache location provides `ruby`.
 COPY install-ruby-toolcache.sh /usr/local/bin/install-ruby-toolcache.sh
 RUN chmod +x /usr/local/bin/install-ruby-toolcache.sh \
    && RUBY_VERSION="${RUBY_VERSION}" RUBY_MINOR="${RUBY_MINOR}" TOOLCACHE_ROOT="${RUNNER_RUBY_TOOLCACHE}" RUNNER_UID="${RUNNER_UID}" RUNNER_GID="${RUNNER_GID}" /usr/local/bin/install-ruby-toolcache.sh \
    && ruby -v
--- a/apps/github-runner/README.md
+++ b/apps/github-runner/README.md
@@ -7,71 +7,17 @@ Deployments with `kubectl`; update this manifest and let ArgoCD reconcile.
 All repo-scoped Linux runners use:
 - `localhost/fc-github-runner:v20260525-ruby3.3.11-stepca`, derived from
  `myoung34/github-runner:latest`
 - `ACCESS_TOKEN` from the `github-runner-token` Secret
 - `RUN_AS_ROOT=false`
 - `EPHEMERAL=true`
 - `LABELS=self-hosted,linux,fc-build-linux`
 - writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
  Actions tool cache
 - Ruby 3.3.11 seeded into `/home/runner/_tool/Ruby/3.3/x64` from the baked
  `/opt/runner-toolcache` copy so `ruby/setup-ruby@v1` can discover it on
  self-hosted `ubuntu-20.04-x64` runners
 `github-runner` for `FlowerCore.Common` is single-replica because it retains the
-original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
+original Longhorn ReadWriteOnce NuGet PVC. `github-runner-sharedpos` and the top
-two replicas with per-pod `emptyDir` caches. That is the safe backlog-drain
+Linux-cost repo runners use two replicas with per-pod `emptyDir` caches. That is
-strategy: no two pods share one RWO PVC.
+the safe backlog-drain strategy: no two pods share one RWO PVC.
 Sprint 32 final long-tail wave adds 16 two-replica Deployments:
 `FlowerCore.Knowledge`, `FlowerCore.LlmBridge`, `FlowerCore.Media`,
 `FlowerCore.Presentations`, `FlowerCore.RemoteDesktop`, `FlowerCore.DNS`,
 `FlowerCore.Distribution`, `FlowerCore.Scoreboard`,
 `FlowerCore.SegmentDisplay`, `FlowerCore.Signage.Contracts`,
 `FlowerCore.SignalControl`, `FlowerCore.Intranet.Web`,
 `FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
 `FlowerCore.MenuBoard`.
 ## Image Build
 Ruby is baked with a pinned `ruby-build` release and Ruby patch version. The pod
 still mounts an `emptyDir` over `/home/runner`, so the `setup-runner-home` init
 container copies the baked toolcache from `/opt/runner-toolcache/Ruby` into
 `/home/runner/_tool/Ruby` before the runner container starts.
 The IAmWorkin step-ca root CA is also baked into the system trust store
 (`/usr/local/share/ca-certificates/iamworkin-step-ca-root.crt`, registered by
 `update-ca-certificates`). Without it, .NET HttpClient calls from CI tests
 against `*.iamworkin.lan` (e.g. `https://selenium.iamworkin.lan/session`)
 fail with `PartialChain`. To refresh the bundled cert when the root rotates,
 re-extract from the cluster and overwrite `step-ca-root.crt`:
 ```bash
 kubectl get secret -n cert-manager step-ca-root \
  -o jsonpath='{.data.ca\.crt}' | base64 -d > step-ca-root.crt
 ```
 ```bash
 cd apps/github-runner
 podman build -t localhost/fc-github-runner:v20260525-ruby3.3.11-stepca .
 podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca ruby -v
 podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
  test -f /opt/runner-toolcache/Ruby/3.3/x64.complete
 podman save localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
  -o fc-github-runner-v20260525-ruby3.3.11-stepca.tar
 ```
 Import the saved image on every schedulable RKE2 node before ArgoCD rolls the
 Deployments:
 ```bash
 for node in rke2-server rke2-agent1 rke2-agent2; do
  scp fc-github-runner-v20260525-ruby3.3.11-stepca.tar "$node:/tmp/"
  ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca || true'
  ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-github-runner-v20260525-ruby3.3.11-stepca.tar'
 done
 ```
 ## Post-Merge Proof
@@ -81,26 +27,12 @@ After the PR is merged and ArgoCD syncs, verify the runner fleet:
 kubectl -n github-runner get deploy,pods,pvc
 ```
 Verify the Ruby toolcache in a fresh pod:
 ```bash
 kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- ruby -v
 kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- sh -c \
  'echo "$RUNNER_TOOL_CACHE" && test -f "$RUNNER_TOOL_CACHE/Ruby/3.3/x64.complete"'
 ```
 Verify GitHub registration for the repo-scoped runners:
 ```bash
 for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore.Signage \
            FlowerCore.DMS FlowerCore.Telephony FlowerCore.Print.Web FlowerCore.Chat \
-            FlowerCore.MySQL FlowerCore.Kiosk.Linux FlowerCore.Marquee FlowerCore.TtsReader \
+            FlowerCore.MySQL FlowerCore.Kiosk.Linux; do
            FlowerCore.Knowledge FlowerCore.LlmBridge FlowerCore.Media \
            FlowerCore.Presentations FlowerCore.RemoteDesktop FlowerCore.DNS \
            FlowerCore.Distribution FlowerCore.Scoreboard FlowerCore.SegmentDisplay \
            FlowerCore.Signage.Contracts FlowerCore.SignalControl FlowerCore.Intranet.Web \
            FlowerCore.Provisioning FlowerCore.Redis FlowerCore.MessageBoard \
            FlowerCore.MenuBoard; do
  echo "=== $repo ==="
  gh api "/repos/astoltz/$repo/actions/runners" \
    --jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}'
@@ -122,10 +54,6 @@ from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
 - `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
  `DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
  present on the runner pod.
 - `ruby/setup-ruby@v1` says self-hosted runners must install Ruby in
  `$RUNNER_TOOL_CACHE`: check that the init container copied
  `/opt/runner-toolcache/Ruby` into `/home/runner/_tool/Ruby` and that
  `/home/runner/_tool/Ruby/3.3/x64.complete` exists.
 - `404` during runner registration: the fine-grained PAT is valid but missing
  repository access for that repo. Add the repo to the PAT access list; the PAT
  value does not change.
--- a/apps/github-runner/github-runner.yaml
+++ b/apps/github-runner/github-runner.yaml
--- a/apps/github-runner/install-ruby-toolcache.sh
+++ b/apps/github-runner/install-ruby-toolcache.sh
@@ -1,19 +0,0 @@
 #!/usr/bin/env bash
 # Builds Ruby with ruby-build into a toolcache tree:
 #   <TOOLCACHE_ROOT>/Ruby/<RUBY_VERSION>/x64            the installed Ruby
 #   <TOOLCACHE_ROOT>/Ruby/<RUBY_VERSION>/x64.complete   completion marker file
 #   <TOOLCACHE_ROOT>/Ruby/<RUBY_MINOR>                  symlink to <RUBY_VERSION>
 # It then hands the whole tree to RUNNER_UID:RUNNER_GID.
 # Requires `ruby-build` on PATH.
 # Abort on any failed command, unset variable, or failed pipeline stage.
 set -euo pipefail
 # Every setting may be overridden from the environment; these are the defaults.
 RUBY_VERSION="${RUBY_VERSION:-3.3.11}"
 RUBY_MINOR="${RUBY_MINOR:-3.3}"
 TOOLCACHE_ROOT="${TOOLCACHE_ROOT:-/opt/runner-toolcache}"
 RUNNER_UID="${RUNNER_UID:-1001}"
 RUNNER_GID="${RUNNER_GID:-1001}"
 RUBY_PREFIX="${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64"
 mkdir -p "${TOOLCACHE_ROOT}/Ruby"
 # Default configure options skip installing docs and disable YJIT; a caller-supplied
 # RUBY_CONFIGURE_OPTS replaces them entirely.
 RUBY_CONFIGURE_OPTS="${RUBY_CONFIGURE_OPTS:---disable-install-doc --disable-yjit}" ruby-build "${RUBY_VERSION}" "${RUBY_PREFIX}"
 # Marker file written next to the x64 directory once the build has succeeded.
 touch "${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64.complete"
 # Relative symlink <minor> -> <full version>; -n replaces an existing link instead of
 # descending into the directory it points at.
 ln -sfn "${RUBY_VERSION}" "${TOOLCACHE_ROOT}/Ruby/${RUBY_MINOR}"
 # Smoke test: under `set -e`, a Ruby that cannot start fails the script here.
 "${RUBY_PREFIX}/bin/ruby" -v
 # Give the tree to the runner user and make it world-readable; capital X adds
 # execute/search only to directories and to files that are already executable.
 chown -R "${RUNNER_UID}:${RUNNER_GID}" "${TOOLCACHE_ROOT}"
 chmod -R a+rX "${TOOLCACHE_ROOT}"
--- a/apps/github-runner/step-ca-root.crt
+++ b/apps/github-runner/step-ca-root.crt
@@ -1,12 +0,0 @@
 -----BEGIN CERTIFICATE-----
 MIIBxDCCAWqgAwIBAgIRAPY357G6ow6zMAL5+4bS2kkwCgYIKoZIzj0EAwIwQDEa
 MBgGA1UEChMRSUFtV29ya2luIEFDTUUgQ0ExIjAgBgNVBAMTGUlBbVdvcmtpbiBB
 Q01FIENBIFJvb3QgQ0EwHhcNMjYwMzA4MTgwNzExWhcNMzYwMzA1MTgwNzExWjBA
 MRowGAYDVQQKExFJQW1Xb3JraW4gQUNNRSBDQTEiMCAGA1UEAxMZSUFtV29ya2lu
 IEFDTUUgQ0EgUm9vdCBDQTBZMBMGByqGSM49AgEGCCqGSM49AwEHA0IABJ2n04X1
 JZo5Zdq/i1Idv8+fqwZyAzBh7whbqj0SWsJL8UWRabCMqYCs7+dXO0xRSzqkwFDL
 x+vooOai8RgRNhajRTBDMA4GA1UdDwEB/wQEAwIBBjASBgNVHRMBAf8ECDAGAQH/
 AgEBMB0GA1UdDgQWBBRnuPPQR6iM/H6vOluiU3Sygayz8jAKBggqhkjOPQQDAgNI
 ADBFAiEArQK9dYPGmAZsdYnjziuFVVE5NKZUcceYvGfGC+tLXUsCIAudF2zJrCRq
 3mK50ZZET/fwTkJwiEF4824mjP8p1CKM
 -----END CERTIFICATE-----
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -280,14 +280,13 @@ data:
              printer_model: "NuPrint 210"
      # Print.Web health (Blazor app on edge2:5200)
      # Target `/health` (anonymous) — root path requires API key auth and returns 401.
      - job_name: "probe-printweb"
        metrics_path: /probe
        params:
          module: [http_2xx]
        scrape_interval: 30s
        static_configs:
-          - targets: ["http://10.0.57.16:5200/health"]
+          - targets: ["http://10.0.57.16:5200/"]
            labels:
              instance: "print-web"
              service: "print-web"
@@ -967,52 +966,6 @@ data:
            annotations:
              summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
      # Puppet agent + service alerts.
      # Mirror of FlowerCore.Notes/scripts/monitoring/alerts.yml `puppet` group
      # so a future migration to in-cluster Prometheus inherits the ruleset.
      # Source-of-truth for the live Podman Prometheus on noc1 is the Notes file.
      # See feedback_monitoring_k8s_target_vs_live_podman.
      - name: puppet
        rules:
          - alert: PuppetAgentReportStale
            expr: puppet_last_run_age_seconds > 7200
            for: 30m
            labels:
              severity: warning
              alert_channel: irc
            annotations:
              summary: "Puppet agent {{ $labels.instance }} hasn't reported in over 2h"
              description: "Last run age: {{ $value | humanizeDuration }}. The puppet agent on {{ $labels.instance }} may be stopped, the node may be powered off, or noc1 may be unreachable from this node."
              runbook: "1. SSH to node (via noc1 jumpbox if needed) 2. sudo systemctl status puppet 3. sudo puppet agent -t --noop to force a run 4. Check r10k: ssh fcadmin@10.0.56.10 'sudo podman logs openvoxserver --tail 50' 5. Verify noc1 reachability: ping puppet.iamworkin.lan"
          - alert: PuppetAgentReportCritical
            expr: puppet_last_run_age_seconds > 86400
            for: 1h
            labels:
              severity: critical
              alert_channel: irc
            annotations:
              summary: "Puppet agent {{ $labels.instance }} silent for over 24h — node is unmanaged"
              description: "Last run age: {{ $value | humanizeDuration }}. Node {{ $labels.instance }} has not submitted a Puppet report in over 24 hours. Config drift is accumulating — investigate immediately. If intentional (maintenance), add to the exclusion filter or silence in Grafana."
              runbook: "URGENT: 1. Check node power state 2. SSH via noc1 jumpbox: ssh fcadmin@10.0.56.10 then ssh <node> 3. sudo systemctl status puppet 4. sudo systemctl start puppet + sudo puppet agent -t 5. Check for network partitions (VLAN connectivity to 10.0.56.10) 6. If node was recently reimaged: sudo puppet agent -t to re-register with new SSL cert"
          # Sprint 33 Cx-7 Phase B (2026-05-25 postmortem follow-up):
          # Detects puppet.service in failed state — distinct from PuppetAgentReportStale
          # which catches "agent hasn't run." This catches "systemd gave up restarting it"
          # (CA-verify loop or other fatal exit). Requires node-exporter systemd collector
          # enabled with --collector.systemd. If `node_systemd_unit_state` has no series
          # for a node, the collector is disabled there — flag in postmortem follow-up.
          - alert: PuppetServiceFailed
            expr: node_systemd_unit_state{name="puppet.service",state="failed"} == 1
            for: 5m
            labels:
              severity: warning
              alert_channel: irc
            annotations:
              summary: "Puppet service failed on {{ $labels.instance }}"
              description: "puppet.service on {{ $labels.instance }} has been in failed state for 5+ minutes. systemd has stopped auto-restarting (CA-verify-loop or other exit). Manual `systemctl status puppet` confirms. Run `sudo systemctl start puppet` to recover; investigate journal for root cause."
              runbook_url: "https://github.com/astoltz/FlowerCore.Notes/blob/master/memory/feedback_puppet_service_dead_after_ca_loop_alert_misreads.md"
      # K8s pod-state alerts. Require kube-state-metrics scrape (added
      # 2026-04-26 — see scrape_configs above). Would have surfaced the
      # agent-zero ollama-proxy 172x crash-loop instead of letting it
@@ -1274,55 +1227,24 @@ metadata:
 data:
  notify.py: |
    #!/usr/bin/env python3
-    """HTTP->IRC alert relay with thermal-printer DIGEST forwarding.
+    """HTTP->IRC alert relay with thermal printer forwarding for Grafana webhooks.
-
+    Listens on :9119, posts to #alerts on UnrealIRCd via raw IRC protocol.
-    Listens on :9119, posts to #alerts on UnrealIRCd, forwards to Print.Web
+    Alerts tagged alert_channel=thermal_print also POST to Print.Web /api/print/alert.
    /api/print/alert. Thermal printing is BATCHED into hourly digests by
    default so the printer no longer spam-fires per Grafana webhook.
    Routing (per Grafana webhook alert):
      - IRC: always per-event (operator likes the stream)
      - Thermal printer:
          * severity in {critical,disaster,page} OR
            label alert_channel=thermal_print_immediate -> print NOW
          * label alert_channel=thermal_print -> enqueue into hourly digest
          * everything else -> IRC only
      - RESOLVED webhooks remove the alert from the digest buffer
    Env vars (defaults preserve old behavior on first deploy):
      THERMAL_PRINT_ENABLED  default "true"   - master kill switch
      BATCH_INTERVAL_MIN     default "60"     - minutes between digest prints
      BATCH_MAX_PENDING      default "50"     - force-flush threshold
    HTTP surface:
      POST /         - Grafana webhook entry
      POST /flush    - manual digest flush (idempotent)
      GET  /         - status + config + buffer depth + stats
    """
-    import json, os, socket, sys, threading, time
+    import json, socket, sys, time
    from collections import defaultdict
    from datetime import datetime, timezone
    from http.server import HTTPServer, BaseHTTPRequestHandler
    from urllib.request import Request, urlopen
    from urllib.error import URLError
-    THERMAL_PRINT_ENABLED = os.environ.get("THERMAL_PRINT_ENABLED", "true").lower() == "true"
+    IRC_HOST = "unrealircd.irc.svc"  # short name: CoreDNS ndots:5 + iamworkin.lan template hijacks full .cluster.local (see memory)
-    BATCH_INTERVAL_MIN    = int(os.environ.get("BATCH_INTERVAL_MIN", "60"))
+    IRC_PORT = 6667
-    BATCH_MAX_PENDING     = int(os.environ.get("BATCH_MAX_PENDING", "50"))
+    IRC_NICK = "grafana-bot"
-
+    IRC_CHANNEL = "#alerts"
-    IRC_HOST      = os.environ.get("IRC_HOST", "unrealircd.irc.svc")
+    PRINT_WEB_URL = "http://10.0.57.16:5200/api/print/alert"
-    IRC_PORT      = int(os.environ.get("IRC_PORT", "6667"))
+    PRINT_ENABLED = True
    IRC_NICK      = os.environ.get("IRC_NICK", "grafana-bot")
    IRC_CHANNEL   = os.environ.get("IRC_CHANNEL", "#alerts")
    PRINT_WEB_URL = os.environ.get("PRINT_WEB_URL", "http://10.0.57.16:5200/api/print/alert")
    _buffer_lock = threading.Lock()
    _buffer = {}   # fingerprint -> {"alert": dict, "first_seen": float, "last_seen": float}
    _last_flush_time = time.time()
    _stats = {"webhooks_received": 0, "irc_sent": 0, "print_immediate": 0,
              "digest_flushed": 0, "buffer_dedup": 0, "buffer_added": 0,
              "buffer_resolved": 0, "started_at": time.time()}
    def send_irc(message):
        """Connect, handle PING, join, send, quit."""
        try:
            sock = socket.create_connection((IRC_HOST, IRC_PORT), timeout=15)
            sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
@@ -1355,137 +1277,52 @@ data:
            time.sleep(0.5)
            sock.sendall(b"QUIT :alert delivered\r\n")
            sock.close()
            _stats["irc_sent"] += 1
            return True
        except Exception as e:
            print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
            return False
-    def post_thermal(payload, kind):
+    def send_thermal_print(alert):
-        if not THERMAL_PRINT_ENABLED:
+        if not PRINT_ENABLED: return
-            print(f"[irc-notify] thermal disabled; skip {kind} ({payload.get('title','?')[:40]})", file=sys.stderr)
+        labels = alert.get("labels", {})
-            return False
+        annotations = alert.get("annotations", {})
        status = alert.get("status", "firing").upper()
        summary = annotations.get("summary", "")
        description = annotations.get("description", "")
        runbook = annotations.get("runbook", "")
        # Build a useful message: summary + description + runbook steps
        parts = []
        if summary: parts.append(summary)
        if description and description != summary: parts.append(description)
        if runbook: parts.append("STEPS: " + runbook)
        message = " | ".join(parts) if parts else labels.get("alertname", "Unknown alert")
        payload = {
            "title": labels.get("alertname", "Unknown"),
            "severity": labels.get("severity", "warning").capitalize(),
            "host": labels.get("instance", labels.get("host", "unknown")),
            "message": message,
            "eventId": alert.get("fingerprint", ""),
            "source": "Grafana",
            "status": "RESOLVED" if status == "RESOLVED" else "PROBLEM",
            "acknowledged": False
        }
        try:
            req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
                          headers={"Content-Type": "application/json"}, method="POST")
            resp = urlopen(req, timeout=10)
-            if kind == "immediate": _stats["print_immediate"] += 1
+            print(f"[irc-notify] Thermal print sent: {resp.read().decode()}", file=sys.stderr)
            print(f"[irc-notify] thermal {kind} sent: {payload.get('title','?')[:50]}", file=sys.stderr)
            return True
        except Exception as e:
-            print(f"[irc-notify] thermal {kind} failed: {e}", file=sys.stderr)
+            print(f"[irc-notify] Thermal print failed: {e}", file=sys.stderr)
            return False
-    def fingerprint_of(alert):
+    def should_print(alert):
        fp = alert.get("fingerprint", "")
        if fp: return fp
        labels = alert.get("labels", {})
-        target = labels.get("pod") or labels.get("instance") or labels.get("deployment") or labels.get("statefulset") or labels.get("namespace") or ""
+        if labels.get("alert_channel") == "thermal_print": return True
-        return f"{labels.get('alertname','?')}/{labels.get('namespace','')}/{target}"
+        if labels.get("severity", "").lower() in ("critical", "disaster"): return True
-
+        if alert.get("status", "").upper() == "RESOLVED": return False
    def is_critical(alert):
        """Return True when the alert's ``severity`` label is critical, disaster, or page.

        The comparison is case-insensitive; a missing ``labels`` dict or
        ``severity`` label counts as not critical.
        """
        alert_labels = alert.get("labels", {})
        severity = alert_labels.get("severity", "").lower()
        return severity in ("critical", "disaster", "page")
    def is_immediate_label(alert):
        """Return True when the alert carries ``alert_channel=thermal_print_immediate``.

        The match is exact (case-sensitive); a missing label yields False.
        """
        channel = alert.get("labels", {}).get("alert_channel")
        return channel == "thermal_print_immediate"
    def is_batched_label(alert):
        """Return True when the alert carries ``alert_channel=thermal_print``.

        The match is exact (case-sensitive); a missing label yields False.
        """
        channel = alert.get("labels", {}).get("alert_channel")
        return channel == "thermal_print"
    def add_to_digest(alert):
        """Add an alert to the digest buffer, or drop it once it has resolved.

        Args:
            alert: one Grafana webhook alert dict; its ``status`` field
                (default "firing") decides between add and remove.
        Returns:
            True if the buffer GREW (new fingerprint); False if it was a
            dedup, a resolution, or a no-op because thermal printing is disabled.
        """
        if not THERMAL_PRINT_ENABLED:
            return False
        fp = fingerprint_of(alert)
        status = alert.get("status", "firing").lower()
        with _buffer_lock:
            if status == "resolved":
                # A resolved alert never grows the buffer; forget it if it is
                # still pending. The return must stay inside this branch (and
                # inside the lock) so firing alerts fall through to the
                # dedup/insert logic below.
                if fp in _buffer:
                    del _buffer[fp]
                    _stats["buffer_resolved"] += 1
                return False
            if fp in _buffer:
                # Repeat webhook for a known fingerprint: refresh, don't grow.
                _buffer[fp]["last_seen"] = time.time()
                _buffer[fp]["alert"] = alert
                _stats["buffer_dedup"] += 1
                return False
            # One timestamp for both fields so first_seen == last_seen on insert.
            now = time.time()
            _buffer[fp] = {"alert": alert, "first_seen": now, "last_seen": now}
            _stats["buffer_added"] += 1
            return True
    def build_digest_payload():
        """Render the pending digest buffer into a single alert payload dict.

        Takes a snapshot of the buffer under the lock, groups entries by
        ``alertname``, and emits one summary line per name (severities, count,
        and up to five targets). Returns None when the buffer is empty.
        """
        with _buffer_lock:
            pending = list(_buffer.values())
        if len(pending) == 0:
            return None
        # Bucket buffered entries by alertname so each name gets one line.
        grouped = defaultdict(list)
        for entry in pending:
            alertname = entry["alert"].get("labels", {}).get("alertname", "Unknown")
            grouped[alertname].append(entry)
        # First non-empty label in this order identifies the target.
        target_keys = ("pod", "instance", "deployment", "statefulset", "namespace")
        summary_lines = []
        for alertname in sorted(grouped):
            members = grouped[alertname]
            count = len(members)
            shown = []
            for member in members[:5]:
                member_labels = member["alert"].get("labels", {})
                shown.append(next((v for v in map(member_labels.get, target_keys) if v), "?"))
            overflow = "" if count <= 5 else f" (+{count-5})"
            severities = sorted({m["alert"].get("labels", {}).get("severity", "warning") for m in members})
            summary_lines.append(f"[{'/'.join(severities)}] {alertname} x{count}: {', '.join(shown)}{overflow}")
        stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
        title = f"Alert digest: {len(pending)} firing"
        body_lines = [f"=== {title} ===", f"as of {stamp}", ""]
        body_lines.extend(summary_lines)
        body_lines.append("")
        body_lines.append("Stream: #alerts (IRC)  |  Triage: grafana-noc1.iamworkin.lan")
        body_lines.append("Force-flush: POST irc-notify.monitoring.svc:9119/flush")
        message = "\n".join(body_lines)
        return {
            "title": title,
            "severity": "Warning",
            "host": "monitoring",
            "message": message,
            "eventId": f"digest-{int(time.time())}",
            "source": "Grafana digest",
            "status": "PROBLEM",
            "acknowledged": False,
        }
    def flush_digest():
        """Send the current digest to the thermal printer and empty the buffer.

        Returns False (after logging) when there is nothing buffered; otherwise
        returns whatever ``post_thermal`` reported. The buffer is cleared after
        the send attempt whether or not it succeeded.
        """
        digest = build_digest_payload()
        if digest is not None:
            delivered = post_thermal(digest, "digest")
            with _buffer_lock:
                _buffer.clear()
            if delivered:
                _stats["digest_flushed"] += 1
            return delivered
        print("[irc-notify] flush: buffer empty, no digest sent", file=sys.stderr)
        return False
    def digest_loop():
        """Background loop that flushes the digest on a timer or when the buffer fills.

        Every 15 seconds it checks whether BATCH_INTERVAL_MIN minutes have
        passed since the last flush, or whether the buffer holds at least
        BATCH_MAX_PENDING entries; either condition triggers ``flush_digest``
        and resets the flush clock. Any exception is logged and the loop
        backs off for 60 seconds. Never returns.
        """
        global _last_flush_time
        while True:
            try:
                tick_started = time.time()
                interval_due = (tick_started - _last_flush_time) >= BATCH_INTERVAL_MIN * 60
                # The interval check wins; the size check only runs when it is not due.
                reason = None
                if interval_due:
                    reason = f"[irc-notify] digest tick: interval reached ({BATCH_INTERVAL_MIN}m); buffer={len(_buffer)}"
                elif len(_buffer) >= BATCH_MAX_PENDING:
                    reason = f"[irc-notify] digest tick: buffer full ({len(_buffer)}); force flush"
                if reason is not None:
                    print(reason, file=sys.stderr)
                    flush_digest()
                    _last_flush_time = tick_started
                time.sleep(15)
            except Exception as e:
                print(f"[irc-notify] digest loop error: {e}", file=sys.stderr)
                time.sleep(60)
    class Handler(BaseHTTPRequestHandler):
        def do_POST(self):
            if self.path == "/flush":
                ok = flush_digest()
                self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
                self.wfile.write(json.dumps({"flushed": ok, "buffer_after": len(_buffer)}).encode())
                return
            _stats["webhooks_received"] += 1
            length = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(length)) if length else {}
            for alert in body.get("alerts", []):
@@ -1500,56 +1337,22 @@ data:
                msg = f"{icon}{sev_tag} {name}: {summary}"
                if desc: msg += f"\n  {desc}"
                send_irc(msg)
-                # Thermal routing — EVERYTHING (including criticals) goes into
+                if should_print(alert): send_thermal_print(alert)
-                # the hourly digest. Only the explicit `alert_channel=thermal_print_immediate`
+            self.send_response(200)
-                # label bypasses, and even that flushes-the-current-digest rather
+            self.send_header("Content-Type", "application/json")
-                # than printing a standalone job, so the same fingerprint can't
+            self.end_headers()
                # spam the printer per webhook cycle.
                if status == "RESOLVED":
                    add_to_digest(alert)  # removes from buffer
                    continue
                if is_immediate_label(alert):
                    # Explicit opt-in for "paper this NOW" — first arrival of a
                    # new fingerprint triggers an immediate digest flush; repeat
                    # webhooks for the same fingerprint dedupe in the buffer
                    # until the next interval or until the alert resolves.
                    new_in_buffer = add_to_digest(alert)
                    if new_in_buffer:
                        global _last_flush_time
                        flush_digest()
                        _last_flush_time = time.time()
                elif is_critical(alert) or is_batched_label(alert):
                    add_to_digest(alert)
                # else: IRC-only (warnings without thermal_print label)
            self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
            self.wfile.write(b'{"status":"ok"}')
        def do_GET(self):
-            self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
+            self.send_response(200)
-            with _buffer_lock:
+            self.send_header("Content-Type", "application/json")
-                alertnames = sorted({it["alert"].get("labels", {}).get("alertname", "?") for it in _buffer.values()})
+            self.end_headers()
-                depth = len(_buffer)
+            self.wfile.write(json.dumps({"service":"irc-notify","thermal_print":PRINT_ENABLED}).encode())
            info = {
                "service": "irc-notify",
                "config": {"thermal_print_enabled": THERMAL_PRINT_ENABLED,
                           "batch_interval_min": BATCH_INTERVAL_MIN,
                           "batch_max_pending": BATCH_MAX_PENDING,
                           "irc_target": f"{IRC_HOST}:{IRC_PORT} {IRC_CHANNEL}",
                           "print_web_url": PRINT_WEB_URL},
                "buffer": {"depth": depth, "alertnames": alertnames,
                           "seconds_since_last_flush": int(time.time() - _last_flush_time),
                           "seconds_until_next_flush": max(0, int(BATCH_INTERVAL_MIN*60 - (time.time() - _last_flush_time)))},
                "stats": _stats,
            }
            self.wfile.write(json.dumps(info, indent=2).encode())
        def log_message(self, format, *args):
            """Write a request log line to stderr with the relay's prefix.

            Only the first positional argument is printed; ``format`` and any
            remaining arguments are ignored.
            """
            first_arg = args[0]
            line = f"[irc-notify] {first_arg}"
            print(line, file=sys.stderr)
    if __name__ == "__main__":
        threading.Thread(target=digest_loop, daemon=True).start()
        server = HTTPServer(("0.0.0.0", 9119), Handler)
-        print(f"[irc-notify] :9119 -> IRC {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} | thermal={'ON' if THERMAL_PRINT_ENABLED else 'OFF'} | digest={BATCH_INTERVAL_MIN}m max={BATCH_MAX_PENDING}", file=sys.stderr)
+        print(f"IRC alert relay :9119 -> {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} (thermal: {PRINT_ENABLED})")
        server.serve_forever()
 # =============================================================================
--- a/apps/selenium/network-policy.yaml
+++ b/apps/selenium/network-policy.yaml
@@ -24,16 +24,7 @@
 #     (10.0.57.16:5200), public internet 80/443 (excluding RFC1918), and
 #     fc-signage:5190 for the signage AAT lane.
 #   - Ingress: Traefik (4444 + 8089 ACME-solver-style), intra-pod,
-#     telephony / gitea / fc-system / fc-signage / github-runner namespaces
+#     telephony / gitea / fc-system / fc-signage namespaces on 4444.
 #     on 4444.
 #
 # 2026-05-25: added github-runner ingress on 4444 so CI jobs running in
 # self-hosted runner pods (e.g. FlowerCore.Print.Web `help-screenshots`)
 # can reach the grid. Without this allow, the session POST to
 # `selenium-hub.selenium.svc.cluster.local:4444` was DNAT'd to the hub
 # pod IP and then dropped at the Calico ingress hook — Selenium UI showed
 # 0/4 sessions while the .NET HTTP client timed out at 60s. Same family
 # as `feedback_netpol_dnat_backend_port`, wrong-source-namespace flavor.
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
@@ -212,13 +203,6 @@ spec:
    ports:
    - port: 4444
      protocol: TCP
  - from:
    - namespaceSelector:
        matchLabels:
          kubernetes.io/metadata.name: github-runner
    ports:
    - port: 4444
      protocol: TCP
  podSelector: {}
  policyTypes:
  - Ingress
--- a/apps/selenium/selenium-grid.yaml
+++ b/apps/selenium/selenium-grid.yaml
@@ -1,412 +0,0 @@
 # Selenium Grid 4 — RKE2 deployment
 #
 # Hub + chrome + firefox + edge browser nodes serving fleet-wide AAT runs from
 # the GitHub Actions self-hosted runners. ArgoCD owns this namespace from
 # 2026-05-25 (`infra-selenium` Application; previously these resources were
 # orphan kubectl-applied since 2026-03-15).
 #
 # Endpoints:
 #   - Internal cluster: http://selenium-hub.selenium.svc.cluster.local:4444
 #   - LAN LoadBalancer (MetalLB): http://10.0.56.208:4444
 #   - Traefik public: https://selenium.iamworkin.lan
 #
 # Browser maxSessions:
 #   - chrome 2  (bumped from 1 on 2026-05-25 morning-routine — AAT-heavy
 #                Print.Web help-screenshots was the global bottleneck;
 #                see commit history for ops/runner-replica-rightsize)
 #   - firefox 1
 #   - edge 1
 #
 # Screenshots + video recording write to NFS via the chrome video sidecar.
 # See: CLAUDE.md "Selenium Grid & Visual AAT Testing" + bluejay-infra ADR notes.
 ---
 apiVersion: v1
 kind: Service
 metadata:
  labels:
    app: selenium-hub
    app.kubernetes.io/name: selenium-hub
    app.kubernetes.io/part-of: selenium-grid
  name: selenium-hub
  namespace: selenium
 spec:
  ports:
  - name: web
    port: 4444
    targetPort: 4444
  - name: publish
    port: 4442
    targetPort: 4442
  - name: subscribe
    port: 4443
    targetPort: 4443
  selector:
    app: selenium-hub
  type: ClusterIP
 ---
 apiVersion: v1
 kind: Service
 metadata:
  annotations:
    metallb.io/ip-allocated-from-pool: bluejay-pool
    metallb.universe.tf/loadBalancerIPs: 10.0.56.208
  labels:
    app: selenium-hub
    component: external-access
  name: selenium-hub-external
  namespace: selenium
 spec:
  clusterIP: 10.43.90.147
  clusterIPs:
  - 10.43.90.147
  externalTrafficPolicy: Local
  healthCheckNodePort: 32213
  ports:
  - name: web
    nodePort: 32411
    port: 4444
    targetPort: 4444
  - name: publish
    nodePort: 32068
    port: 4442
    targetPort: 4442
  - name: subscribe
    nodePort: 31000
    port: 4443
    targetPort: 4443
  selector:
    app: selenium-hub
  type: LoadBalancer
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  labels:
    app: selenium-hub
    app.kubernetes.io/name: selenium-hub
    app.kubernetes.io/part-of: selenium-grid
  name: selenium-hub
  namespace: selenium
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: selenium-hub
  template:
    metadata:
      labels:
        app: selenium-hub
        app.kubernetes.io/name: selenium-hub
        app.kubernetes.io/part-of: selenium-grid
    spec:
      containers:
      - env:
        - name: SE_NODE_SESSION_TIMEOUT
          value: '300'
        - name: SE_SESSION_REQUEST_TIMEOUT
          value: '300'
        - name: SE_SESSION_RETRY_INTERVAL
          value: '5'
        - name: JAVA_OPTS
          value: -Xmx512m
        image: selenium/hub:4.27.0
        livenessProbe:
          httpGet:
            path: /wd/hub/status
            port: 4444
          initialDelaySeconds: 30
          periodSeconds: 15
          timeoutSeconds: 5
        name: selenium-hub
        ports:
        - containerPort: 4444
          name: web
        - containerPort: 4442
          name: publish
        - containerPort: 4443
          name: subscribe
        readinessProbe:
          httpGet:
            path: /wd/hub/status
            port: 4444
          initialDelaySeconds: 10
          periodSeconds: 5
          timeoutSeconds: 5
        resources:
          limits:
            cpu: 500m
            memory: 1Gi
          requests:
            cpu: 250m
            memory: 512Mi
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  labels:
    app: selenium-node-chrome
    app.kubernetes.io/name: selenium-node-chrome
    app.kubernetes.io/part-of: selenium-grid
  name: selenium-node-chrome
  namespace: selenium
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: selenium-node-chrome
  template:
    metadata:
      labels:
        app: selenium-node-chrome
        app.kubernetes.io/name: selenium-node-chrome
        app.kubernetes.io/part-of: selenium-grid
    spec:
      containers:
      - env:
        - name: SE_EVENT_BUS_HOST
          value: selenium-hub
        - name: SE_EVENT_BUS_PUBLISH_PORT
          value: '4442'
        - name: SE_EVENT_BUS_SUBSCRIBE_PORT
          value: '4443'
        - name: SE_NODE_MAX_SESSIONS
          value: '2'
        - name: SE_NODE_OVERRIDE_MAX_SESSIONS
          value: 'false'
        - name: SE_VNC_NO_PASSWORD
          value: '1'
        - name: SE_SCREEN_WIDTH
          value: '1920'
        - name: SE_SCREEN_HEIGHT
          value: '1080'
        - name: SE_NODE_SESSION_TIMEOUT
          value: '300'
        image: selenium/node-chrome:4.27.0
        livenessProbe:
          httpGet:
            path: /status
            port: 5555
          initialDelaySeconds: 30
          periodSeconds: 15
        name: selenium-chrome
        ports:
        - containerPort: 5555
          name: node
        readinessProbe:
          httpGet:
            path: /status
            port: 5555
          initialDelaySeconds: 15
          periodSeconds: 5
        resources:
          limits:
            cpu: '1'
            memory: 1Gi
          requests:
            cpu: 500m
            memory: 512Mi
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      - env:
        - name: DISPLAY_CONTAINER_NAME
          value: localhost
        - name: SE_SCREEN_WIDTH
          value: '1920'
        - name: SE_SCREEN_HEIGHT
          value: '1080'
        - name: SE_VIDEO_FILE_NAME
          value: auto
        - name: SE_VIDEO_UPLOAD_ENABLED
          value: 'false'
        image: selenium/video:ffmpeg-7.1-20250101
        name: video
        resources:
          limits:
            cpu: 500m
            memory: 768Mi
          requests:
            cpu: 250m
            memory: 384Mi
        volumeMounts:
        - mountPath: /videos
          name: selenium-videos
      volumes:
      - emptyDir:
          medium: Memory
          sizeLimit: 2Gi
        name: dshm
      - emptyDir:
          sizeLimit: 5Gi
        name: selenium-videos
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  labels:
    app: selenium-node-firefox
    app.kubernetes.io/name: selenium-node-firefox
    app.kubernetes.io/part-of: selenium-grid
  name: selenium-node-firefox
  namespace: selenium
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: selenium-node-firefox
  template:
    metadata:
      labels:
        app: selenium-node-firefox
        app.kubernetes.io/name: selenium-node-firefox
        app.kubernetes.io/part-of: selenium-grid
    spec:
      containers:
      - env:
        - name: SE_EVENT_BUS_HOST
          value: selenium-hub
        - name: SE_EVENT_BUS_PUBLISH_PORT
          value: '4442'
        - name: SE_EVENT_BUS_SUBSCRIBE_PORT
          value: '4443'
        - name: SE_NODE_MAX_SESSIONS
          value: '1'
        - name: SE_NODE_OVERRIDE_MAX_SESSIONS
          value: 'true'
        - name: SE_VNC_NO_PASSWORD
          value: '1'
        - name: SE_START_VNC
          value: 'false'
        - name: SE_SCREEN_WIDTH
          value: '1920'
        - name: SE_SCREEN_HEIGHT
          value: '1080'
        - name: SE_NODE_SESSION_TIMEOUT
          value: '300'
        image: selenium/node-firefox:4.27.0
        livenessProbe:
          failureThreshold: 5
          httpGet:
            path: /status
            port: 5555
          initialDelaySeconds: 30
          periodSeconds: 15
          timeoutSeconds: 5
        name: selenium-firefox
        ports:
        - containerPort: 5555
          name: node
        readinessProbe:
          failureThreshold: 5
          httpGet:
            path: /status
            port: 5555
          initialDelaySeconds: 15
          periodSeconds: 5
          timeoutSeconds: 5
        resources:
          limits:
            cpu: '1'
            memory: 2Gi
          requests:
            cpu: 500m
            memory: 1Gi
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - emptyDir:
          medium: Memory
          sizeLimit: 2Gi
        name: dshm
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  labels:
    app: selenium-node-edge
    app.kubernetes.io/name: selenium-node-edge
    app.kubernetes.io/part-of: selenium-grid
  name: selenium-node-edge
  namespace: selenium
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: selenium-node-edge
  template:
    metadata:
      labels:
        app: selenium-node-edge
        app.kubernetes.io/name: selenium-node-edge
        app.kubernetes.io/part-of: selenium-grid
    spec:
      containers:
      - env:
        - name: SE_EVENT_BUS_HOST
          value: selenium-hub
        - name: SE_EVENT_BUS_PUBLISH_PORT
          value: '4442'
        - name: SE_EVENT_BUS_SUBSCRIBE_PORT
          value: '4443'
        - name: SE_NODE_MAX_SESSIONS
          value: '1'
        - name: SE_NODE_OVERRIDE_MAX_SESSIONS
          value: 'true'
        - name: SE_VNC_NO_PASSWORD
          value: '1'
        - name: SE_SCREEN_WIDTH
          value: '1920'
        - name: SE_SCREEN_HEIGHT
          value: '1080'
        - name: SE_NODE_SESSION_TIMEOUT
          value: '300'
        image: selenium/node-edge:4.27.0
        livenessProbe:
          httpGet:
            path: /status
            port: 5555
          initialDelaySeconds: 30
          periodSeconds: 15
        name: selenium-edge
        ports:
        - containerPort: 5555
          name: node
        readinessProbe:
          httpGet:
            path: /status
            port: 5555
          initialDelaySeconds: 15
          periodSeconds: 5
        resources:
          limits:
            cpu: '1'
            memory: 1Gi
          requests:
            cpu: 500m
            memory: 512Mi
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - emptyDir:
          medium: Memory
          sizeLimit: 2Gi
        name: dshm
 ---
 apiVersion: traefik.io/v1alpha1
 kind: IngressRoute
 metadata:
  name: selenium-hub
  namespace: selenium
 spec:
  entryPoints:
  - websecure
  routes:
  - kind: Rule
    match: Host(`selenium.iamworkin.lan`)
    services:
    - name: selenium-hub
      port: 4444
  tls:
    secretName: selenium-tls
--- a/apps/worldbuilder/README.md
+++ b/apps/worldbuilder/README.md
@@ -28,12 +28,9 @@ Source: `D:\git\FlowerCore\FlowerCore.WorldBuilder` (master)
   Memory: `feedback_rke2_image_import_per_node_scp`.
 3. **Bump image tag** in `worldbuilder.yaml` and git push.
   ArgoCD ApplicationSet picks up within ~3 minutes.
-4. **First production render** — open
+4. **First production render** — open `https://worldbuilder.iamworkin.lan`,
-   `https://worldbuilder.iamworkin.lan/studio/c32e0000-0000-4000-8000-000000000004`
+   create World → Character → Storyboard → ExportJob, confirm artifact
-   and confirm the Cyberpunk Blue Jay demo prompt loads with five seeded fake
+   downloads. ComfyUI lives on BLUEJAY-WS at `http://10.0.56.20:8188`.
   generated images. This Sprint 32 visitor-safe profile uses
   `ClientMode=fake`; switch the image-generation env vars back to ComfyUI only
   for an operator-owned GPU render lane.
 ## Health probes
@@ -56,13 +53,8 @@ Source: `D:\git\FlowerCore\FlowerCore.WorldBuilder` (master)
 ## Image generation backend
-Sprint 32 pins the Kubernetes profile to
+`FlowerCore:WorldBuilder:ImageGeneration:BaseUrl=http://10.0.56.20:8188` —
-`FlowerCore:WorldBuilder:ImageGeneration:ClientMode=fake` with
+ComfyUI runs on BLUEJAY-WS Windows (R9700 / gfx1201 / ROCm 7.2.1). Pod reaches
-`BaseUrl=http://127.0.0.1:1`. That keeps the public/internal visitor demo
+the workstation directly across the 10.0.56.0/24 VLAN (no Podman-style host-
-deterministic, avoids GPU exposure, and still exercises the studio/gallery
+filter issues — K8s pods route via Calico, which is L3-routed across the
-surface with persisted generated-image metadata.
+VLAN).
 The previous ComfyUI backend target was `http://10.0.56.20:8188` on
 BLUEJAY-WS (R9700 / gfx1201 / ROCm 7.2.1). Re-enable it only in an
 operator-owned follow-up that also verifies workstation reachability and image
 import freshness.
--- a/apps/worldbuilder/worldbuilder.yaml
+++ b/apps/worldbuilder/worldbuilder.yaml
@@ -16,11 +16,7 @@ kind: Namespace
 metadata:
  name: fc-worldbuilder
  labels:
    app.kubernetes.io/name: fc-worldbuilder
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/tenant-id: system
    flowercore.io/created-by: bluejay-infra
 ---
 # SQLite DB + generated image gallery + PDF/PNG exports.
 # Longhorn RWO — single replica with `Recreate` rollout strategy keeps it safe.
@@ -29,13 +25,6 @@ kind: PersistentVolumeClaim
 metadata:
  name: worldbuilder-data
  namespace: fc-worldbuilder
  labels:
    app.kubernetes.io/name: worldbuilder-data
    app.kubernetes.io/component: storage
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/tenant-id: system
    flowercore.io/created-by: bluejay-infra
 spec:
  accessModes:
    - ReadWriteOnce
@@ -51,13 +40,7 @@ metadata:
  namespace: fc-worldbuilder
  labels:
    app.kubernetes.io/name: worldbuilder-web
    app.kubernetes.io/component: web
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/tenant-id: system
    flowercore.io/created-by: bluejay-infra
  annotations:
    flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
 spec:
  replicas: 1
  revisionHistoryLimit: 3
@@ -71,16 +54,11 @@ spec:
    metadata:
      labels:
        app.kubernetes.io/name: worldbuilder-web
        app.kubernetes.io/component: web
        app.kubernetes.io/part-of: flowercore
        app.kubernetes.io/managed-by: argocd
        flowercore.io/tenant-id: system
        flowercore.io/created-by: bluejay-infra
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics/prometheus"
        flowercore.io/audit-trace-id: "worldbuilder-runtime-demo"
    spec:
      securityContext:
        fsGroup: 1654
@@ -114,14 +92,11 @@ spec:
              value: "/data/gallery"
            - name: FlowerCore__WorldBuilder__Export__RootPath
              value: "/data/exports"
-            # Visitor-safe Sprint 32 profile: fake backend keeps public demo
+            # ComfyUI on BLUEJAY-WS (R9700 / gfx1201 / ROCm 7.2.1).
            # rendering deterministic and avoids exposing BLUEJAY-WS GPU.
            - name: FlowerCore__WorldBuilder__ImageGeneration__BaseUrl
-              value: "http://127.0.0.1:1"
+              value: "http://10.0.56.20:8188"
            - name: FlowerCore__WorldBuilder__ImageGeneration__ClientMode
-              value: "fake"
+              value: "comfyui"
            - name: FlowerCore__WorldBuilder__ImageGeneration__BackendId
              value: "fake"
          resources:
            # Cluster CPU-request budget runs hot (99% on all 3 nodes at deploy
            # time) while actual CPU usage is well below capacity. Idle Blazor
@@ -190,11 +165,7 @@ metadata:
  namespace: fc-worldbuilder
  labels:
    app.kubernetes.io/name: worldbuilder-web
    app.kubernetes.io/component: web
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/tenant-id: system
    flowercore.io/created-by: bluejay-infra
 spec:
  type: ClusterIP
  selector:
@@ -209,13 +180,6 @@ kind: Certificate
 metadata:
  name: worldbuilder-web-tls
  namespace: fc-worldbuilder
  labels:
    app.kubernetes.io/name: worldbuilder-web-tls
    app.kubernetes.io/component: ingress
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/tenant-id: system
    flowercore.io/created-by: bluejay-infra
 spec:
  secretName: worldbuilder-web-tls
  issuerRef:
@@ -236,13 +200,6 @@ kind: IngressRoute
 metadata:
  name: worldbuilder-web
  namespace: fc-worldbuilder
  labels:
    app.kubernetes.io/name: worldbuilder-web
    app.kubernetes.io/component: ingress
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/tenant-id: system
    flowercore.io/created-by: bluejay-infra
 spec:
  entryPoints:
    - websecure
--- a/docs/runbooks/openvoxserver-quadlet-durability.md
+++ b/docs/runbooks/openvoxserver-quadlet-durability.md
@@ -1,84 +0,0 @@
 # openvoxserver Quadlet Durability
 This runbook documents the noc1 `openvoxserver` durability fix for the Puppet control-repo deploy path. The service is a noc1 host artifact, not an ArgoCD application, so discovery always starts on noc1 rather than in `apps/*`.
 ## Current State
 As of the Sprint 32 Cx-12 apply on 2026-05-17:
 - `/etc/containers/systemd/openvoxserver.container` has a `GIT_SSH_COMMAND` environment entry that points at the persisted serverdata deploy key.
 - `/etc/systemd/system/openvoxserver-safeconfig.service` is enabled and active, and reapplies `git config --global --add safe.directory *` inside the running container.
 - `/opt/puppet/r10k-deploy.sh` self-heals before each fetch by setting `safe.directory`, the repo-local `core.sshCommand`, and the persisted `known_hosts` file when needed.
 - `puppet-deploy.service` exits `0/SUCCESS` after the apply and the control repo reports `HEAD == origin/master`.
 - `systemctl cat openvoxserver` does not currently resolve to a generated unit on noc1. The container is running through Podman with `restart=always`, so destructive recreate smoke must not run until the generated unit is present.
 ## Discovery
 Run every command through noc1 as `fcadmin`; do not assume BLUEJAY-WS can reach container-local surfaces directly.
 ```bash
 ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "hostname && sudo -n true"
 ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo find /etc/containers/systemd /usr/share/containers/systemd /etc/systemd/system -name 'openvoxserver*' 2>/dev/null"
 ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo sed -n '1,220p' /etc/containers/systemd/openvoxserver.container"
 ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo systemctl cat puppet-deploy.service"
 ```
 If a future noc1 profile manages these files, update the Puppet control repo and let `puppet-deploy.service` apply the change. On 2026-05-17, host `puppet` was not installed, so Cx-12 used a direct noc1 host edit.
 ## Durable Fix Shape
 The Quadlet keeps the deploy key as a path reference only:
 ```ini
 Environment=GIT_SSH_COMMAND=ssh -i /opt/puppetlabs/server/data/puppetserver/.puppet-deploy-key -o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o UserKnownHostsFile=/opt/puppetlabs/server/data/puppetserver/.known_hosts
 ```
 The safeconfig service is intentionally independent of `openvoxserver.service` until the generated unit exists. It waits for the `openvoxserver` container name and then runs:
 ```bash
 /usr/bin/podman exec openvoxserver git config --global --add safe.directory "*"
 ```
 The deploy script self-heals inside the container before it fetches the control repo:
 ```bash
 git config --global --add safe.directory "*" 2>/dev/null || true
 DEPLOY_KEY="/opt/puppetlabs/server/data/puppetserver/.puppet-deploy-key"
 KNOWN_HOSTS="/opt/puppetlabs/server/data/puppetserver/.known_hosts"
 REPO="/etc/puppetlabs/code/environments/production"
 export GIT_SSH_COMMAND="ssh -i $DEPLOY_KEY -o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o UserKnownHostsFile=$KNOWN_HOSTS"
 git -C "$REPO" config core.sshCommand "$GIT_SSH_COMMAND" 2>/dev/null || true
 ```
 ## Validation
 Non-destructive validation:
 ```bash
 ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo grep -n 'GIT_SSH_COMMAND' /etc/containers/systemd/openvoxserver.container"
 ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo systemctl status openvoxserver-safeconfig.service --no-pager -l"
 ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo systemctl start puppet-deploy.service && sudo systemctl status puppet-deploy.service --no-pager -l"
 ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo podman exec openvoxserver git -C /etc/puppetlabs/code/environments/production config --get core.sshCommand"
 ```
 Destructive recreate smoke is opt-in only:
 ```bash
 scp -i ~/.ssh/fcadmin_ed25519 scripts/monitoring/openvox-recreate-smoke.sh fcadmin@10.0.56.10:/tmp/openvox-recreate-smoke.sh
 ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "chmod +x /tmp/openvox-recreate-smoke.sh && sudo OPENVOX_RECREATE_SMOKE=1 /tmp/openvox-recreate-smoke.sh"
 ```
 Do not run the smoke during normal sprint work. It stops and removes the production container before starting it again through systemd, and it now refuses to continue unless `systemctl cat openvoxserver` succeeds.
 ## Credential Rotation Note
 When rotating the Puppet deploy key, update the persisted serverdata copy on noc1:
 ```bash
 sudo install -m 0600 -o root -g root <new-deploy-key> /opt/puppet/serverdata/.puppet-deploy-key
 sudo podman exec openvoxserver sh -c "ssh-keyscan github.com > /opt/puppetlabs/server/data/puppetserver/.known_hosts"
 sudo systemctl start openvoxserver-safeconfig.service
 sudo systemctl start puppet-deploy.service
 ```
 Never commit the deploy key or print it in logs.
--- a/scripts/monitoring/openvox-recreate-smoke.sh
+++ b/scripts/monitoring/openvox-recreate-smoke.sh
@@ -1,48 +0,0 @@
 #!/usr/bin/env bash
 # Destructive recreate smoke for the openvoxserver container.
 #
 # Opt-in only: does nothing unless OPENVOX_RECREATE_SMOKE=1 is set. When enabled it
 # stops the systemd unit, removes the container, starts the unit again, runs
 # puppet-deploy.service, and then checks that the control repo checkout and the
 # repo-local core.sshCommand are still in place.
 #
 # Environment:
 #   OPENVOX_RECREATE_SMOKE  must be exactly "1" for the script to do anything.
 #   SUDO                    privilege wrapper prepended to host commands (default: sudo).
 #
 # Exit codes:
 #   64  opt-in variable not set; nothing was touched.
 #   65  no systemd unit resolves for openvoxserver; nothing was touched.
 #   1   a post-recreate check failed.
 # Any other failing command aborts the script with that command's status (set -e).
 set -euo pipefail
 if [ "${OPENVOX_RECREATE_SMOKE:-}" != "1" ]; then
  echo "SKIP: set OPENVOX_RECREATE_SMOKE=1 to run the destructive openvoxserver recreate smoke." >&2
  exit 64
 fi
 # SUDO is expanded unquoted on purpose so a multi-word wrapper (e.g. "sudo -n") splits into words.
 SUDO="${SUDO:-sudo}"
 REPO="/etc/puppetlabs/code/environments/production"
 PROFILE_MANIFESTS_DIR="/opt/puppet/code/environments/production/site-modules/profile/manifests"
 CORE_SSH_COMMAND_FRAGMENT=".puppet-deploy-key"
 # Safety gate: without a unit that systemd can resolve there is no verified way to
 # bring the container back, so bail out before anything destructive happens.
 if ! $SUDO systemctl cat openvoxserver >/dev/null 2>&1; then
  echo "SKIP: systemctl cat openvoxserver failed; refusing to remove a container without a verified systemd recreate path." >&2
  exit 65
 fi
 # Informational only: the commit checked out before the container is recreated.
 before="$($SUDO podman exec openvoxserver git -C "$REPO" rev-parse --short HEAD)"
 echo "Before recreate: $before"
 $SUDO systemctl stop openvoxserver
 # The container may already be gone once the unit has stopped; that is not an error.
 $SUDO podman rm openvoxserver 2>/dev/null || true
 $SUDO systemctl start openvoxserver
 # Fixed grace period for the recreated container to come up before deploying into it.
 sleep 50
 $SUDO systemctl start puppet-deploy.service
 sleep 5
 # `systemctl status` exits non-zero (3) for a unit that is simply inactive, which is
 # what a run that has already finished looks like. Keep the output for the operator,
 # but do not let `set -e` abort on it; the explicit is-failed check below decides.
 $SUDO systemctl status puppet-deploy.service --no-pager -l || true
 if $SUDO systemctl is-failed --quiet puppet-deploy.service; then
  echo "FAIL: puppet-deploy.service is in the failed state after the recreate." >&2
  exit 1
 fi
 # Informational only: the remote-tracking commit visible after the recreate.
 after="$($SUDO podman exec openvoxserver git -C "$REPO" rev-parse --short origin/master)"
 echo "After recreate origin/master: $after"
 # The control repo checkout must still be present on the host side.
 if ! $SUDO test -d "$PROFILE_MANIFESTS_DIR"; then
  echo "FAIL: expected directory $PROFILE_MANIFESTS_DIR is missing after the recreate." >&2
  exit 1
 fi
 # The repo-local ssh command must still point at the persisted deploy key.
 core_ssh="$($SUDO podman exec openvoxserver git -C "$REPO" config --get core.sshCommand)"
 case "$core_ssh" in
  *"$CORE_SSH_COMMAND_FRAGMENT"*) ;;
  *)
    echo "FAIL: core.sshCommand does not reference the persisted deploy key." >&2
    exit 1
    ;;
 esac
 # A git safe.directory problem would make this command fail and abort the script.
 $SUDO podman exec openvoxserver git -C "$REPO" status --short --branch
 echo "PASS: openvoxserver recreate smoke completed without git safety or deploy-key failure."
--- a/tests/bluejay-infra-lint/FleetManifestLintTests.cs
+++ b/tests/bluejay-infra-lint/FleetManifestLintTests.cs
@@ -13,7 +13,6 @@ public sealed class FleetManifestLintTests
    private static readonly HashSet<string> PublicReadOnlyHosts = new(StringComparer.Ordinal)
    {
        "brochure.flowercore.io",
        "dist.flowercore.io",
        "dns.iamworkin.lan",
    };
--- a/tests/bluejay-infra-lint/OpenVoxServerDurabilityTests.cs
+++ b/tests/bluejay-infra-lint/OpenVoxServerDurabilityTests.cs
@@ -1,99 +0,0 @@
 using FluentAssertions;
 using Xunit;
 namespace BluejayInfraLint.Tests;
 /// <summary>
 /// Text-level lint checks over the openvoxserver durability runbook and the recreate
 /// smoke script. The tests only read the two files from the repository; nothing is executed.
 /// </summary>
 [Trait("Category", "Unit")]
 public sealed class OpenVoxServerDurabilityTests
 {
    // Static initializers run in declaration order, so Root is resolved before the paths below.
    private static readonly string Root = FindRepoRoot();
    private static readonly string RunbookPath = Path.Combine(Root, "docs", "runbooks", "openvoxserver-quadlet-durability.md");
    private static readonly string SmokePath = Path.Combine(Root, "scripts", "monitoring", "openvox-recreate-smoke.sh");
    /// <summary>The runbook must state that the service is a noc1 host artifact outside ArgoCD.</summary>
    [Fact]
    public void Runbook_DocumentsHostArtifactAndNonArgoPath()
    {
        var runbook = File.ReadAllText(RunbookPath);
        runbook.Should().Contain("noc1 host artifact");
        runbook.Should().Contain("not an ArgoCD application");
        runbook.Should().Contain("systemctl cat openvoxserver");
        runbook.Should().Contain("/etc/containers/systemd/openvoxserver.container");
    }
    /// <summary>The runbook must record the Sprint 32 Cx-12 apply and the artifacts it names.</summary>
    [Fact]
    public void Runbook_DocumentsCx12LiveApplyState()
    {
        var runbook = File.ReadAllText(RunbookPath);
        runbook.Should().Contain("Sprint 32 Cx-12");
        runbook.Should().Contain("openvoxserver-safeconfig.service");
        runbook.Should().Contain("/opt/puppet/r10k-deploy.sh");
        runbook.Should().Contain("HEAD == origin/master");
    }
    /// <summary>The opt-in variable must be mentioned before the script stops the service.</summary>
    [Fact]
    public void SmokeScript_IsExplicitlyOptIn()
    {
        var smoke = File.ReadAllText(SmokePath);
        smoke.Should().Contain("exit 64");
        AssertAppearsBefore(smoke, "OPENVOX_RECREATE_SMOKE", "systemctl stop openvoxserver");
    }
    /// <summary>The systemd unit lookup must come before the container removal.</summary>
    [Fact]
    public void SmokeScript_RequiresGeneratedSystemdUnitBeforeRemovingContainer()
    {
        var smoke = File.ReadAllText(SmokePath);
        smoke.Should().Contain("refusing to remove a container without a verified systemd recreate path");
        AssertAppearsBefore(smoke, "systemctl cat openvoxserver", "podman rm openvoxserver");
    }
    /// <summary>
    /// Neither artifact may embed PEM private-key material or hosted-runner labels.
    /// Matching is case-insensitive.
    /// </summary>
    [Fact]
    public void Artifacts_DoNotStoreSecretsOrPaidRunnerLabels()
    {
        var forbidden = new[]
        {
            // PEM armor headers for the common private-key encodings.
            "BEGIN OPENSSH PRIVATE KEY",
            "BEGIN RSA PRIVATE KEY",
            "BEGIN EC PRIVATE KEY",
            "BEGIN DSA PRIVATE KEY",
            "BEGIN PRIVATE KEY",
            "BEGIN ENCRYPTED PRIVATE KEY",
            // Hosted runner labels.
            "ubuntu-latest",
            "windows-latest",
            "macos-latest",
        };
        var violations = new[] { RunbookPath, SmokePath }
            .SelectMany(path =>
            {
                var text = File.ReadAllText(path);
                return forbidden
                    .Where(token => text.Contains(token, StringComparison.OrdinalIgnoreCase))
                    .Select(token => $"{Path.GetRelativePath(Root, path)} contains forbidden token {token}");
            })
            .ToList();
        violations.Should().BeEmpty();
    }
    /// <summary>
    /// Asserts that both tokens occur in <paramref name="text"/> and that the first occurrence of
    /// <paramref name="earlier"/> precedes the first occurrence of <paramref name="later"/>.
    /// Presence is asserted first so a missing token fails with a clear message instead of
    /// an index comparison against -1.
    /// </summary>
    private static void AssertAppearsBefore(string text, string earlier, string later)
    {
        text.Should().Contain(earlier);
        text.Should().Contain(later);
        text.IndexOf(earlier, StringComparison.Ordinal)
            .Should().BeLessThan(text.IndexOf(later, StringComparison.Ordinal));
    }
    /// <summary>
    /// Walks up from the test binary directory until it finds a directory that contains
    /// <c>apps</c>, <c>scripts</c> and <c>README.md</c>, and returns its full path.
    /// </summary>
    /// <exception cref="DirectoryNotFoundException">No ancestor directory matches.</exception>
    private static string FindRepoRoot()
    {
        var current = new DirectoryInfo(AppContext.BaseDirectory);
        while (current is not null)
        {
            if (Directory.Exists(Path.Combine(current.FullName, "apps"))
                && Directory.Exists(Path.Combine(current.FullName, "scripts"))
                && File.Exists(Path.Combine(current.FullName, "README.md")))
            {
                return current.FullName;
            }
            current = current.Parent;
        }
        throw new DirectoryNotFoundException("Could not find bluejay-infra root.");
    }
 }
--- a/tests/bluejay-infra-lint/conftest.dev/02_public_method_allowlist.rego
+++ b/tests/bluejay-infra-lint/conftest.dev/02_public_method_allowlist.rego
@@ -1,6 +1,6 @@
 package bluejayinfra.public_method_allowlist
-public_hosts := {"brochure.flowercore.io", "dist.flowercore.io", "dns.iamworkin.lan"}
+public_hosts := {"dist.flowercore.io", "dns.iamworkin.lan"}
 deny[msg] {
  input.kind == "IngressRoute"