fc-desktop: add remotedesktop warm pool intent

2026-05-19 12:27:47 -05:00
14 changed files with 355 additions and 1352 deletions
--- a/apps/authentik/authentik.yaml
+++ b/apps/authentik/authentik.yaml
@@ -1,448 +0,0 @@
 # Authentik OIDC backend
 # ArgoCD-managed. BlueJay Lab.
 #
 # Stack:
 #   - PostgreSQL 16 StatefulSet (single replica, Longhorn RWO 5Gi)
 #   - Redis 7 Deployment (no persistence — session/cache only)
 #   - Authentik server + worker Deployments (image ghcr.io/goauthentik/server:2024.12.3)
 #   - Media PVC shared between server + worker (Longhorn RWO 2Gi)
 #   - Certificate via step-ca-acme ClusterIssuer
 #   - Traefik IngressRoute at id.iamworkin.lan
 #
 # Secrets come from 1Password item "authentik-credentials" (IAmWorkin vault, id y6i74ch22q5wvm7znquq4nhhcu)
 # via the OnePasswordItem CRD, materialized into k8s Secret authentik/authentik-credentials.
 #
 # Why the discovery URL is /application/o/pimanager/ : Authentik issues per-application OIDC providers.
 # The pimanager OIDC application/provider is created after the cluster pods are healthy (manual or
 # via API once the bootstrap token is available — see Notes substrate).
 ---
 # Namespace that holds every Authentik resource declared in this file.
 apiVersion: v1
 kind: Namespace
 metadata:
  name: authentik
  labels:
    # Groups the namespace with the rest of the bluejay-infra stack.
    app.kubernetes.io/part-of: bluejay-infra
 ---
 # 1Password operator pulls the authentik-credentials item into a k8s Secret of the same name.
 # Field labels in 1P become Secret keys: AUTHENTIK_SECRET_KEY, POSTGRES_PASSWORD, REDIS_PASSWORD,
 # BOOTSTRAP_ADMIN_PASSWORD, BOOTSTRAP_ADMIN_TOKEN, BOOTSTRAP_ADMIN_EMAIL.
 apiVersion: onepassword.com/v1
 kind: OnePasswordItem
 metadata:
  # The workloads in this file read their secrets from a Secret with this
  # same name (`authentik-credentials`) via secretKeyRef.
  name: authentik-credentials
  namespace: authentik
 spec:
  # Vault + item to pull: vault "IAmWorkin", item "authentik-credentials".
  itemPath: "vaults/IAmWorkin/items/authentik-credentials"
 ---
 # Shared media volume for server + worker pods.
 # 2Gi Longhorn claim mounted at /media by both the server and worker pods.
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
  namespace: authentik
  name: authentik-media
 spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      storage: 2Gi
 ---
 # PostgreSQL 16 StatefulSet — Authentik's primary store.
 # PostgreSQL 16 for Authentik: one replica whose data directory sits on a
 # 5Gi Longhorn volume. The claim is retained both when the StatefulSet is
 # deleted and when it is scaled down.
 apiVersion: apps/v1
 kind: StatefulSet
 metadata:
  namespace: authentik
  name: authentik-postgres
  labels:
    argocd.argoproj.io/instance: infra-authentik
    app: authentik-postgres
 spec:
  replicas: 1
  serviceName: authentik-postgres
  podManagementPolicy: OrderedReady
  revisionHistoryLimit: 10
  persistentVolumeClaimRetentionPolicy:
    whenScaled: Retain
    whenDeleted: Retain
  selector:
    matchLabels:
      app: authentik-postgres
  template:
    metadata:
      labels:
        app: authentik-postgres
    spec:
      containers:
        - name: postgres
          image: postgres:16-alpine
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 1000m
              memory: 1Gi
          ports:
            - name: postgres
              containerPort: 5432
          env:
            - name: POSTGRES_USER
              value: authentik
            # Password comes from the 1Password-backed Secret.
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: POSTGRES_PASSWORD
            - name: POSTGRES_DB
              value: authentik
            - name: POSTGRES_INITDB_ARGS
              value: "--encoding=UTF-8 --lc-collate=C --lc-ctype=C"
            # Data directory is a subdirectory of the volume mount point.
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
          volumeMounts:
            - name: pgdata
              mountPath: /var/lib/postgresql/data
          # Both probes run pg_isready as the authentik user; readiness polls
          # quickly, liveness starts later and polls less often.
          readinessProbe:
            initialDelaySeconds: 5
            periodSeconds: 5
            exec:
              command:
                - "pg_isready"
                - "-U"
                - "authentik"
          livenessProbe:
            initialDelaySeconds: 30
            periodSeconds: 30
            exec:
              command:
                - "pg_isready"
                - "-U"
                - "authentik"
  volumeClaimTemplates:
    - metadata:
        name: pgdata
      spec:
        storageClassName: longhorn
        volumeMode: Filesystem
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 5Gi
 ---
 # Headless Service (clusterIP: None) for the postgres StatefulSet; its name
 # matches the StatefulSet's serviceName and the AUTHENTIK_POSTGRESQL__HOST
 # value used by the Authentik pods.
 apiVersion: v1
 kind: Service
 metadata:
  name: authentik-postgres
  namespace: authentik
 spec:
  clusterIP: None
  selector:
    app: authentik-postgres
  ports:
    - name: postgres
      port: 5432
      targetPort: 5432
 ---
 # Redis 7 — session storage + Celery broker. No persistence needed (cache).
 # Password-protected Redis 7 with RDB snapshots and AOF both switched off,
 # so nothing is written to disk and no volume is mounted.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  namespace: authentik
  name: authentik-redis
  labels:
    argocd.argoproj.io/instance: infra-authentik
    app: authentik-redis
 spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: authentik-redis
  template:
    metadata:
      labels:
        app: authentik-redis
    spec:
      containers:
        - name: redis
          image: redis:7-alpine
          env:
            - name: REDIS_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: REDIS_PASSWORD
          # `--save ""` disables snapshots, `--appendonly no` disables the AOF;
          # $(REDIS_PASSWORD) is expanded by Kubernetes from the env var above.
          args:
            - "--save"
            - ""
            - "--appendonly"
            - "no"
            - "--requirepass"
            - "$(REDIS_PASSWORD)"
          ports:
            - name: redis
              containerPort: 6379
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 500m
              memory: 256Mi
          # Plain TCP connect checks against the Redis port.
          readinessProbe:
            initialDelaySeconds: 5
            periodSeconds: 5
            tcpSocket:
              port: 6379
          livenessProbe:
            initialDelaySeconds: 30
            periodSeconds: 30
            tcpSocket:
              port: 6379
 ---
 # ClusterIP Service in front of the Redis pod; its name is the value the
 # Authentik pods use for AUTHENTIK_REDIS__HOST.
 apiVersion: v1
 kind: Service
 metadata:
  name: authentik-redis
  namespace: authentik
 spec:
  selector:
    app: authentik-redis
  ports:
    - name: redis
      port: 6379
      targetPort: 6379
 ---
 # Authentik server Deployment — HTTP frontend on :9000.
 # Runs the Authentik image with the `server` argument and exposes container
 # ports 9000 (http) and 9443 (https). Mounts the authentik-media claim at
 # /media.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: authentik-server
  namespace: authentik
  labels:
    app: authentik-server
    argocd.argoproj.io/instance: infra-authentik
 spec:
  replicas: 1
  strategy:
    type: Recreate  # shares /media RWO PVC with worker
  selector:
    matchLabels:
      app: authentik-server
  template:
    metadata:
      labels:
        app: authentik-server
    spec:
      securityContext:
        # The Authentik image runs as uid 1000 ("authentik"), but the Longhorn
        # PVC is mounted root:root by default. With fsGroup set, the volume is
        # recursively chgrp'd and made group-writable so the non-root
        # container can mkdir /media/public during the tenant_files migration.
        fsGroup: 1000
      containers:
        - name: server
          image: ghcr.io/goauthentik/server:2024.12.3
          args: ["server"]
          ports:
            - containerPort: 9000
              name: http
            - containerPort: 9443
              name: https
          env:
            # All secret values below come from the 1Password-backed
            # authentik-credentials Secret.
            - name: AUTHENTIK_SECRET_KEY
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: AUTHENTIK_SECRET_KEY
            # Redis connection (Service name + password).
            - name: AUTHENTIK_REDIS__HOST
              value: authentik-redis
            - name: AUTHENTIK_REDIS__PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: REDIS_PASSWORD
            # PostgreSQL connection (Service name, database, user, password).
            - name: AUTHENTIK_POSTGRESQL__HOST
              value: authentik-postgres
            - name: AUTHENTIK_POSTGRESQL__NAME
              value: authentik
            - name: AUTHENTIK_POSTGRESQL__USER
              value: authentik
            - name: AUTHENTIK_POSTGRESQL__PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: POSTGRES_PASSWORD
            # Bootstrap admin password, API token and e-mail address.
            - name: AUTHENTIK_BOOTSTRAP_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: BOOTSTRAP_ADMIN_PASSWORD
            - name: AUTHENTIK_BOOTSTRAP_TOKEN
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: BOOTSTRAP_ADMIN_TOKEN
            - name: AUTHENTIK_BOOTSTRAP_EMAIL
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: BOOTSTRAP_ADMIN_EMAIL
            # Update check and error reporting are switched off.
            - name: AUTHENTIK_DISABLE_UPDATE_CHECK
              value: "true"
            - name: AUTHENTIK_ERROR_REPORTING__ENABLED
              value: "false"
            - name: AUTHENTIK_LOG_LEVEL
              value: info
          # First-boot Authentik can take 3+ min on the migration phase
          # (waiting on DB lock while worker also runs migrations). Initial
          # delays are generous so kubelet doesn't kill the pod mid-migration;
          # periodSeconds keeps post-startup probing responsive.
          readinessProbe:
            httpGet:
              path: /-/health/ready/
              port: 9000
            initialDelaySeconds: 60
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 12
          livenessProbe:
            httpGet:
              path: /-/health/live/
              port: 9000
            initialDelaySeconds: 300
            periodSeconds: 30
            timeoutSeconds: 10
            failureThreshold: 3
          # Startup probe hits the same endpoint as the liveness probe but
          # with a much larger failure budget.
          startupProbe:
            httpGet:
              path: /-/health/live/
              port: 9000
            initialDelaySeconds: 30
            periodSeconds: 15
            timeoutSeconds: 10
            failureThreshold: 40  # 30s + 40*15s = 10.5 min budget
          resources:
            requests: { cpu: 150m, memory: 512Mi }
            limits: { cpu: 1500m, memory: 1Gi }
          volumeMounts:
            - name: media
              mountPath: /media
      volumes:
        - name: media
          persistentVolumeClaim:
            claimName: authentik-media
 ---
 # Authentik worker Deployment — runs Celery background tasks.
 # Runs the Authentik image with the `worker` argument. Mounts the same
 # authentik-media claim at /media as the server pod.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: authentik-worker
  namespace: authentik
  labels:
    app: authentik-worker
    argocd.argoproj.io/instance: infra-authentik
 spec:
  replicas: 1
  strategy:
    type: Recreate  # shares /media RWO PVC with server
  selector:
    matchLabels:
      app: authentik-worker
  template:
    metadata:
      labels:
        app: authentik-worker
    spec:
      # The media claim is ReadWriteOnce, which can be mounted from a single
      # node only. `Recreate` only serializes pods of this Deployment; it does
      # not stop the scheduler from placing the worker on a different node than
      # the server, where the volume attach would fail. Pin the worker to
      # whichever node runs the server pod.
      affinity:
        podAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchLabels:
                  app: authentik-server
              topologyKey: kubernetes.io/hostname
      securityContext:
        # Non-root uid 1000 needs group write access on the mounted PVC.
        fsGroup: 1000
      containers:
        - name: worker
          image: ghcr.io/goauthentik/server:2024.12.3
          args: ["worker"]
          env:
            # All secret values below come from the 1Password-backed
            # authentik-credentials Secret.
            - name: AUTHENTIK_SECRET_KEY
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: AUTHENTIK_SECRET_KEY
            - name: AUTHENTIK_REDIS__HOST
              value: authentik-redis
            - name: AUTHENTIK_REDIS__PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: REDIS_PASSWORD
            - name: AUTHENTIK_POSTGRESQL__HOST
              value: authentik-postgres
            - name: AUTHENTIK_POSTGRESQL__NAME
              value: authentik
            - name: AUTHENTIK_POSTGRESQL__USER
              value: authentik
            - name: AUTHENTIK_POSTGRESQL__PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: POSTGRES_PASSWORD
            # authentik's automated-install documentation specifies the
            # AUTHENTIK_BOOTSTRAP_* variables for the worker container, so
            # they must be present here for the initial admin password,
            # token and e-mail to be applied on first start.
            - name: AUTHENTIK_BOOTSTRAP_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: BOOTSTRAP_ADMIN_PASSWORD
            - name: AUTHENTIK_BOOTSTRAP_TOKEN
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: BOOTSTRAP_ADMIN_TOKEN
            - name: AUTHENTIK_BOOTSTRAP_EMAIL
              valueFrom:
                secretKeyRef:
                  name: authentik-credentials
                  key: BOOTSTRAP_ADMIN_EMAIL
            - name: AUTHENTIK_DISABLE_UPDATE_CHECK
              value: "true"
            - name: AUTHENTIK_ERROR_REPORTING__ENABLED
              value: "false"
            - name: AUTHENTIK_LOG_LEVEL
              value: info
          resources:
            requests: { cpu: 100m, memory: 256Mi }
            limits: { cpu: 1000m, memory: 768Mi }
          volumeMounts:
            - name: media
              mountPath: /media
      volumes:
        - name: media
          persistentVolumeClaim:
            claimName: authentik-media
 ---
 # ClusterIP Service for the server pods. Exposes both container ports under
 # the same numbers: 9000 (http) and 9443 (https).
 apiVersion: v1
 kind: Service
 metadata:
  name: authentik-server
  namespace: authentik
 spec:
  selector:
    app: authentik-server
  ports:
    - name: http
      port: 9000
      targetPort: 9000
    - name: https
      port: 9443
      targetPort: 9443
 ---
 # step-ca leaf certificate for id.iamworkin.lan.
 # step-ca container resolver uses pfSense Unbound, so the public A record for id.iamworkin.lan
 # MUST exist before this Certificate is applied (cert-manager HTTP-01 will silently 2h-backoff
 # otherwise). Added 2026-05-25 via scripts/pfsense-add-id-host.py.
 apiVersion: cert-manager.io/v1
 kind: Certificate
 metadata:
  name: authentik-tls
  namespace: authentik
 spec:
  # Issued key pair is stored in this Secret, which the Traefik IngressRoute
  # for id.iamworkin.lan references under `tls.secretName`.
  secretName: authentik-tls
  dnsNames:
    - id.iamworkin.lan
  # Cluster-wide issuer backed by step-ca's ACME endpoint.
  issuerRef:
    name: step-ca-acme
    kind: ClusterIssuer
 ---
 # Traefik route for id.iamworkin.lan on the `websecure` entry point. TLS uses
 # the authentik-tls Secret; traffic is forwarded to the authentik-server
 # Service on port 9000 (the port that Service names `http`).
 apiVersion: traefik.io/v1alpha1
 kind: IngressRoute
 metadata:
  name: authentik
  namespace: authentik
 spec:
  entryPoints: [websecure]
  routes:
    - match: Host(`id.iamworkin.lan`)
      kind: Rule
      services:
        - name: authentik-server
          port: 9000
  tls:
    secretName: authentik-tls
--- a/apps/fc-desktop/fc-desktop.yaml
+++ b/apps/fc-desktop/fc-desktop.yaml
@@ -1,10 +1,13 @@
 # FlowerCore Remote Desktop — TLS + Ingress
 #
 # Source-of-truth split:
-#   - bluejay-infra OWNS: Certificate, IngressRoute, all NetworkPolicies
+#   - bluejay-infra OWNS: Certificate, IngressRoute, all NetworkPolicies
 #     (see network-policies.yaml in this directory), and the explicit
 #     RemoteDesktopPoolCrd warm-pool intent in
 #     remotedesktop-pools.yaml.
-#   - FlowerCore.RemoteDesktop scripts/deploy-web.sh OWNS: Deployment +
+#   - FlowerCore.RemoteDesktop OWNS: CRD definition/operator Deployment and
-#     Service. Reason: image refs like `localhost/fc-desktop:linux-xfce`
+#     scripts/deploy-web.sh Deployment + Service. Reason: image refs like
 #     `localhost/fc-desktop:linux-xfce`
 #     only exist on each node's containerd after a manual import, so a
 #     Deployment manifest in bluejay-infra would race the image-import
 #     step and crash-loop.
--- a/apps/fc-desktop/remotedesktop-pools.yaml
+++ b/apps/fc-desktop/remotedesktop-pools.yaml
@@ -0,0 +1,101 @@
 # FlowerCore RemoteDesktop warm-pool intent.
 #
 # These CRDs are deliberately explicit. The RemoteDesktop warmup loop no
 # longer scans template defaults to decide what to warm; every enabled pool
 # here represents operator/GitOps intent and prevents a repeat of the
 # orphan-pool leak from 2026-05-08.
 ---
 # Enabled warm pool for the `browser-only` template.
 apiVersion: flowercore.io/v1
 kind: RemoteDesktopPoolCrd
 metadata:
  name: browser-lab-pool
  namespace: fc-desktop
  labels:
    app.kubernetes.io/name: remotedesktop-pool
    app.kubernetes.io/part-of: flowercore-remotedesktop
    # Marks the object as owned by this GitOps repo.
    app.kubernetes.io/managed-by: bluejay-infra
 spec:
  templateSlug: browser-only
  # presumably the number of warm instances to keep — confirm against the CRD
  desiredSize: 1
  enabled: true
  # NOTE(review): of the pools visible in this file, this is the only one that
  # does not set `userVolumeMode: LateAttach` — confirm the omission is
  # intentional.
  # NOTE(review): `reconcileNow` looks like a one-shot trigger; if the operator
  # ever resets it, a self-healing ArgoCD Application would keep re-applying
  # `true` — confirm.
  reconcileNow: true
 ---
 # Enabled warm pool for the `opensuse-xfce` template.
 apiVersion: flowercore.io/v1
 kind: RemoteDesktopPoolCrd
 metadata:
  name: opensuse-xfce-pool
  namespace: fc-desktop
  labels:
    app.kubernetes.io/name: remotedesktop-pool
    app.kubernetes.io/part-of: flowercore-remotedesktop
    # Marks the object as owned by this GitOps repo.
    app.kubernetes.io/managed-by: bluejay-infra
 spec:
  templateSlug: opensuse-xfce
  # presumably the number of warm instances to keep — confirm against the CRD
  desiredSize: 1
  enabled: true
  # presumably attaches the user's volume only once a warm instance is
  # claimed — confirm against the CRD
  userVolumeMode: LateAttach
  # NOTE(review): looks like a one-shot trigger kept in declarative state —
  # confirm the operator tolerates it staying `true`.
  reconcileNow: true
 ---
 # Enabled warm pool for the `dev-workstation` template.
 apiVersion: flowercore.io/v1
 kind: RemoteDesktopPoolCrd
 metadata:
  name: dev-workstation-pool
  namespace: fc-desktop
  labels:
    app.kubernetes.io/name: remotedesktop-pool
    app.kubernetes.io/part-of: flowercore-remotedesktop
    # Marks the object as owned by this GitOps repo.
    app.kubernetes.io/managed-by: bluejay-infra
 spec:
  templateSlug: dev-workstation
  # presumably the number of warm instances to keep — confirm against the CRD
  desiredSize: 1
  enabled: true
  # presumably attaches the user's volume only once a warm instance is
  # claimed — confirm against the CRD
  userVolumeMode: LateAttach
  # NOTE(review): looks like a one-shot trigger kept in declarative state —
  # confirm the operator tolerates it staying `true`.
  reconcileNow: true
 ---
 # Enabled warm pool for the `ai-station` template.
 apiVersion: flowercore.io/v1
 kind: RemoteDesktopPoolCrd
 metadata:
  name: ai-station-pool
  namespace: fc-desktop
  labels:
    app.kubernetes.io/name: remotedesktop-pool
    app.kubernetes.io/part-of: flowercore-remotedesktop
    # Marks the object as owned by this GitOps repo.
    app.kubernetes.io/managed-by: bluejay-infra
 spec:
  templateSlug: ai-station
  # presumably the number of warm instances to keep — confirm against the CRD
  desiredSize: 1
  enabled: true
  # presumably attaches the user's volume only once a warm instance is
  # claimed — confirm against the CRD
  userVolumeMode: LateAttach
  # NOTE(review): looks like a one-shot trigger kept in declarative state —
  # confirm the operator tolerates it staying `true`.
  reconcileNow: true
 ---
 # Enabled warm pool for the `linux-xfce` template.
 apiVersion: flowercore.io/v1
 kind: RemoteDesktopPoolCrd
 metadata:
  name: linux-xfce-pool
  namespace: fc-desktop
  labels:
    app.kubernetes.io/name: remotedesktop-pool
    app.kubernetes.io/part-of: flowercore-remotedesktop
    # Marks the object as owned by this GitOps repo.
    app.kubernetes.io/managed-by: bluejay-infra
 spec:
  templateSlug: linux-xfce
  # presumably the number of warm instances to keep — confirm against the CRD
  desiredSize: 1
  enabled: true
  # presumably attaches the user's volume only once a warm instance is
  # claimed — confirm against the CRD
  userVolumeMode: LateAttach
  # NOTE(review): looks like a one-shot trigger kept in declarative state —
  # confirm the operator tolerates it staying `true`.
  reconcileNow: true
 ---
 # Enabled warm pool for the `linux-xfce-rdp` template.
 apiVersion: flowercore.io/v1
 kind: RemoteDesktopPoolCrd
 metadata:
  name: linux-xfce-rdp-pool
  namespace: fc-desktop
  labels:
    app.kubernetes.io/name: remotedesktop-pool
    app.kubernetes.io/part-of: flowercore-remotedesktop
    # Marks the object as owned by this GitOps repo.
    app.kubernetes.io/managed-by: bluejay-infra
 spec:
  templateSlug: linux-xfce-rdp
  # presumably the number of warm instances to keep — confirm against the CRD
  desiredSize: 1
  enabled: true
  # presumably attaches the user's volume only once a warm instance is
  # claimed — confirm against the CRD
  userVolumeMode: LateAttach
  # NOTE(review): looks like a one-shot trigger kept in declarative state —
  # confirm the operator tolerates it staying `true`.
  reconcileNow: true
--- a/apps/fc-devicemgmt/argocd-application.yaml
+++ b/apps/fc-devicemgmt/argocd-application.yaml
@@ -0,0 +1,33 @@
 # Explicit ArgoCD Application shape for bootstrap/review.
 #
 # The live bluejay-infra ApplicationSet already discovers apps/* directories
 # and creates this same Application name (`infra-fc-devicemgmt`) automatically.
 # Keep repoURL on the internal Gitea ClusterIP URL; ArgoCD does not trust the
 # external step-ca HTTPS endpoint.
 # ArgoCD Application that syncs apps/fc-devicemgmt from the bluejay-infra
 # repo (branch `main`) into the fc-devicemgmt namespace of the local cluster.
 apiVersion: argoproj.io/v1alpha1
 kind: Application
 metadata:
  name: infra-fc-devicemgmt
  namespace: argocd
  labels:
    app.kubernetes.io/name: fc-devicemgmt
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/tenant-id: system
    flowercore.io/created-by: bluejay-infra
 spec:
  project: default
  source:
    # In-cluster Gitea Service over plain HTTP on port 3000.
    repoURL: http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git
    targetRevision: main
    path: apps/fc-devicemgmt
  destination:
    # The cluster ArgoCD itself runs in.
    server: https://kubernetes.default.svc
    namespace: fc-devicemgmt
  syncPolicy:
    # Sync automatically, delete resources that were removed from Git, and
    # revert manual drift in the cluster.
    automated:
      prune: true
      selfHeal: true
    syncOptions:
      # Create the destination namespace if it is missing.
      - CreateNamespace=true
      # Apply manifests with server-side apply.
      - ServerSideApply=true
--- a/apps/github-runner/.gitattributes
+++ b/apps/github-runner/.gitattributes
@@ -1,2 +0,0 @@
 *.sh text eol=lf
 Dockerfile text eol=lf
--- a/apps/github-runner/Dockerfile
+++ b/apps/github-runner/Dockerfile
@@ -1,54 +0,0 @@
 # Runner image: myoung34/github-runner plus the IAmWorkin step-ca root CA in
 # the system trust store, a pinned ruby-build release, and a Ruby build
 # installed under /opt/runner-toolcache.
 # NOTE(review): the base image is referenced by the floating `latest` tag, so
 # rebuilds are not reproducible — consider pinning a version or digest.
 FROM myoung34/github-runner:latest
 # Ruby patch version to build, and the minor alias used in PATH below.
 ARG RUBY_VERSION=3.3.11
 ARG RUBY_MINOR=3.3
 # Tag of the rbenv/ruby-build release downloaded below.
 ARG RUBY_BUILD_VERSION=v20260326
 # Owner uid/gid handed to install-ruby-toolcache.sh.
 ARG RUNNER_UID=1001
 ARG RUNNER_GID=1001
 ENV RUNNER_TOOL_CACHE=/home/runner/_tool
 ENV RUNNER_RUBY_TOOLCACHE=/opt/runner-toolcache
 # Both Ruby locations are put on PATH, the /home/runner tool cache first and
 # the baked /opt copy second.
 ENV PATH="/home/runner/_tool/Ruby/${RUBY_MINOR}/x64/bin:/opt/runner-toolcache/Ruby/${RUBY_MINOR}/x64/bin:${PATH}"
 USER root
 # Bake the IAmWorkin step-ca root CA into the system trust store. Without
 # this, .NET HttpClient calls from CI tests against *.iamworkin.lan
 # (e.g. https://selenium.iamworkin.lan/session) fail with `PartialChain`
 # because the runner image's default Ubuntu trust bundle doesn't include
 # our internal Root CA. update-ca-certificates regenerates
 # /etc/ssl/certs/ca-certificates.crt, which OpenSSL + .NET on Linux read
 # automatically — no SSL_CERT_FILE env var needed.
 COPY step-ca-root.crt /usr/local/share/ca-certificates/iamworkin-step-ca-root.crt
 # One layer: install the Ruby build dependencies, refresh the CA bundle
 # (picks up the cert copied above), install the pinned ruby-build release,
 # then remove the download and the apt lists.
 RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        autoconf \
        bison \
        build-essential \
        ca-certificates \
        curl \
        libdb-dev \
        libffi-dev \
        libgdbm-dev \
        libgmp-dev \
        libncurses-dev \
        libreadline-dev \
        libssl-dev \
        libyaml-dev \
        patch \
        pkg-config \
        uuid-dev \
        zlib1g-dev \
    && update-ca-certificates \
    && curl -fsSL "https://github.com/rbenv/ruby-build/archive/refs/tags/${RUBY_BUILD_VERSION}.tar.gz" -o /tmp/ruby-build.tar.gz \
    && mkdir -p /tmp/ruby-build \
    && tar -xzf /tmp/ruby-build.tar.gz --strip-components=1 -C /tmp/ruby-build \
    && /tmp/ruby-build/install.sh \
    && rm -rf /tmp/ruby-build /tmp/ruby-build.tar.gz /var/lib/apt/lists/*
 COPY install-ruby-toolcache.sh /usr/local/bin/install-ruby-toolcache.sh
 # Build Ruby into the baked tool cache, then run `ruby -v` so the build fails
 # if the interpreter cannot be found through PATH.
 RUN chmod +x /usr/local/bin/install-ruby-toolcache.sh \
    && RUBY_VERSION="${RUBY_VERSION}" RUBY_MINOR="${RUBY_MINOR}" TOOLCACHE_ROOT="${RUNNER_RUBY_TOOLCACHE}" RUNNER_UID="${RUNNER_UID}" RUNNER_GID="${RUNNER_GID}" /usr/local/bin/install-ruby-toolcache.sh \
    && ruby -v
--- a/apps/github-runner/README.md
+++ b/apps/github-runner/README.md
@@ -7,17 +7,12 @@ Deployments with `kubectl`; update this manifest and let ArgoCD reconcile.
 All repo-scoped Linux runners use:
 - `localhost/fc-github-runner:v20260525-ruby3.3.11-stepca`, derived from
  `myoung34/github-runner:latest`
 - `ACCESS_TOKEN` from the `github-runner-token` Secret
 - `RUN_AS_ROOT=false`
 - `EPHEMERAL=true`
 - `LABELS=self-hosted,linux,fc-build-linux`
 - writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
  Actions tool cache
 - Ruby 3.3.11 seeded into `/home/runner/_tool/Ruby/3.3/x64` from the baked
  `/opt/runner-toolcache` copy so `ruby/setup-ruby@v1` can discover it on
  self-hosted `ubuntu-20.04-x64` runners
 `github-runner` for `FlowerCore.Common` is single-replica because it retains the
 original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
@@ -33,46 +28,6 @@ Sprint 32 final long-tail wave adds 16 two-replica Deployments:
 `FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
 `FlowerCore.MenuBoard`.
 ## Image Build
 Ruby is baked with a pinned `ruby-build` release and Ruby patch version. The pod
 still mounts an `emptyDir` over `/home/runner`, so the `setup-runner-home` init
 container copies the baked toolcache from `/opt/runner-toolcache/Ruby` into
 `/home/runner/_tool/Ruby` before the runner container starts.
 The IAmWorkin step-ca root CA is also baked into the system trust store
 (`/usr/local/share/ca-certificates/iamworkin-step-ca-root.crt`, registered by
 `update-ca-certificates`). Without it, .NET HttpClient calls from CI tests
 against `*.iamworkin.lan` (e.g. `https://selenium.iamworkin.lan/session`)
 fail with `PartialChain`. To refresh the bundled cert when the root rotates,
 re-extract from the cluster and overwrite `step-ca-root.crt`:
 ```bash
 kubectl get secret -n cert-manager step-ca-root \
  -o jsonpath='{.data.ca\.crt}' | base64 -d > step-ca-root.crt
 ```
 Build the image, smoke-test it, and save it to a tarball:
 ```bash
 cd apps/github-runner
 podman build -t localhost/fc-github-runner:v20260525-ruby3.3.11-stepca .
 podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca ruby -v
 podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
  test -f /opt/runner-toolcache/Ruby/3.3/x64.complete
 podman save localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
  -o fc-github-runner-v20260525-ruby3.3.11-stepca.tar
 ```
 Import the saved image on every schedulable RKE2 node before ArgoCD rolls the
 Deployments:
 ```bash
 for node in rke2-server rke2-agent1 rke2-agent2; do
  scp fc-github-runner-v20260525-ruby3.3.11-stepca.tar "$node:/tmp/"
  ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca || true'
  ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-github-runner-v20260525-ruby3.3.11-stepca.tar'
 done
 ```
 ## Post-Merge Proof
 After the PR is merged and ArgoCD syncs, verify the runner fleet:
@@ -81,14 +36,6 @@ After the PR is merged and ArgoCD syncs, verify the runner fleet:
 kubectl -n github-runner get deploy,pods,pvc
 ```
 Verify the Ruby toolcache in a fresh pod:
 ```bash
 kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- ruby -v
 kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- sh -c \
  'echo "$RUNNER_TOOL_CACHE" && test -f "$RUNNER_TOOL_CACHE/Ruby/3.3/x64.complete"'
 ```
 Verify GitHub registration for the repo-scoped runners:
 ```bash
@@ -122,10 +69,6 @@ from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
 - `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
  `DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
  present on the runner pod.
 - `ruby/setup-ruby@v1` says self-hosted runners must install Ruby in
  `$RUNNER_TOOL_CACHE`: check that the init container copied
  `/opt/runner-toolcache/Ruby` into `/home/runner/_tool/Ruby` and that
  `/home/runner/_tool/Ruby/3.3/x64.complete` exists.
 - `404` during runner registration: the fine-grained PAT is valid but missing
  repository access for that repo. Add the repo to the PAT access list; the PAT
  value does not change.
--- a/apps/github-runner/github-runner.yaml
+++ b/apps/github-runner/github-runner.yaml
--- a/apps/github-runner/install-ruby-toolcache.sh
+++ b/apps/github-runner/install-ruby-toolcache.sh
@@ -1,19 +0,0 @@
 #!/usr/bin/env bash
 # Build Ruby with ruby-build into a tool-cache layout:
 #   ${TOOLCACHE_ROOT}/Ruby/<version>/x64            the installation
 #   ${TOOLCACHE_ROOT}/Ruby/<version>/x64.complete   completion marker file
 #   ${TOOLCACHE_ROOT}/Ruby/<minor> -> <version>     relative symlink alias
 #
 # Inputs (environment, all optional):
 #   RUBY_VERSION    full version to build            (default 3.3.11)
 #   RUBY_MINOR      name of the alias symlink        (default: RUBY_VERSION
 #                   without its last dot-separated component)
 #   TOOLCACHE_ROOT  tool cache root directory        (default /opt/runner-toolcache)
 #   RUNNER_UID/GID  owner applied to the whole tree  (default 1001:1001)
 #   RUBY_CONFIGURE_OPTS  passed through to ruby-build
 set -euo pipefail
 RUBY_VERSION="${RUBY_VERSION:-3.3.11}"
 # Derive the alias from RUBY_VERSION (3.3.11 -> 3.3) instead of hard-coding
 # 3.3, so overriding only RUBY_VERSION cannot create a mismatched symlink.
 RUBY_MINOR="${RUBY_MINOR:-${RUBY_VERSION%.*}}"
 TOOLCACHE_ROOT="${TOOLCACHE_ROOT:-/opt/runner-toolcache}"
 RUNNER_UID="${RUNNER_UID:-1001}"
 RUNNER_GID="${RUNNER_GID:-1001}"
 RUBY_PREFIX="${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64"
 mkdir -p "${TOOLCACHE_ROOT}/Ruby"
 RUBY_CONFIGURE_OPTS="${RUBY_CONFIGURE_OPTS:---disable-install-doc --disable-yjit}" ruby-build "${RUBY_VERSION}" "${RUBY_PREFIX}"
 # Marker file next to the x64 directory, written only after a successful build.
 touch "${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64.complete"
 # Relative link, so it stays valid if the Ruby/ directory is copied elsewhere.
 ln -sfn "${RUBY_VERSION}" "${TOOLCACHE_ROOT}/Ruby/${RUBY_MINOR}"
 # Smoke-test the freshly built interpreter.
 "${RUBY_PREFIX}/bin/ruby" -v
 chown -R "${RUNNER_UID}:${RUNNER_GID}" "${TOOLCACHE_ROOT}"
 chmod -R a+rX "${TOOLCACHE_ROOT}"
--- a/apps/github-runner/step-ca-root.crt
+++ b/apps/github-runner/step-ca-root.crt
@@ -1,12 +0,0 @@
 -----BEGIN CERTIFICATE-----
 MIIBxDCCAWqgAwIBAgIRAPY357G6ow6zMAL5+4bS2kkwCgYIKoZIzj0EAwIwQDEa
 MBgGA1UEChMRSUFtV29ya2luIEFDTUUgQ0ExIjAgBgNVBAMTGUlBbVdvcmtpbiBB
 Q01FIENBIFJvb3QgQ0EwHhcNMjYwMzA4MTgwNzExWhcNMzYwMzA1MTgwNzExWjBA
 MRowGAYDVQQKExFJQW1Xb3JraW4gQUNNRSBDQTEiMCAGA1UEAxMZSUFtV29ya2lu
 IEFDTUUgQ0EgUm9vdCBDQTBZMBMGByqGSM49AgEGCCqGSM49AwEHA0IABJ2n04X1
 JZo5Zdq/i1Idv8+fqwZyAzBh7whbqj0SWsJL8UWRabCMqYCs7+dXO0xRSzqkwFDL
 x+vooOai8RgRNhajRTBDMA4GA1UdDwEB/wQEAwIBBjASBgNVHRMBAf8ECDAGAQH/
 AgEBMB0GA1UdDgQWBBRnuPPQR6iM/H6vOluiU3Sygayz8jAKBggqhkjOPQQDAgNI
 ADBFAiEArQK9dYPGmAZsdYnjziuFVVE5NKZUcceYvGfGC+tLXUsCIAudF2zJrCRq
 3mK50ZZET/fwTkJwiEF4824mjP8p1CKM
 -----END CERTIFICATE-----
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -280,14 +280,13 @@ data:
              printer_model: "NuPrint 210"
      # Print.Web health (Blazor app on edge2:5200)
      # Target `/health` (anonymous) — root path requires API key auth and returns 401,
      # which a 2xx-only probe module reports as a failure. Keep the probe on
      # `/health`; pointing it back at `/` would mark print-web as down.
      - job_name: "probe-printweb"
        metrics_path: /probe
        params:
          module: [http_2xx]
        scrape_interval: 30s
        static_configs:
          - targets: ["http://10.0.57.16:5200/health"]
            labels:
              instance: "print-web"
              service: "print-web"
--- a/apps/selenium/network-policy.yaml
+++ b/apps/selenium/network-policy.yaml
@@ -24,16 +24,7 @@
 #     (10.0.57.16:5200), public internet 80/443 (excluding RFC1918), and
 #     fc-signage:5190 for the signage AAT lane.
 #   - Ingress: Traefik (4444 + 8089 ACME-solver-style), intra-pod,
-#     telephony / gitea / fc-system / fc-signage / github-runner namespaces
+#     telephony / gitea / fc-system / fc-signage namespaces on 4444.
 #     on 4444.
 #
 # 2026-05-25: added github-runner ingress on 4444 so CI jobs running in
 # self-hosted runner pods (e.g. FlowerCore.Print.Web `help-screenshots`)
 # can reach the grid. Without this allow, the session POST to
 # `selenium-hub.selenium.svc.cluster.local:4444` was DNAT'd to the hub
 # pod IP and then dropped at the Calico ingress hook — Selenium UI showed
 # 0/4 sessions while the .NET HTTP client timed out at 60s. Same family
 # as `feedback_netpol_dnat_backend_port`, wrong-source-namespace flavor.
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
@@ -212,13 +203,6 @@ spec:
    ports:
    - port: 4444
      protocol: TCP
  - from:
    - namespaceSelector:
        matchLabels:
          kubernetes.io/metadata.name: github-runner
    ports:
    - port: 4444
      protocol: TCP
  podSelector: {}
  policyTypes:
  - Ingress
--- a/apps/selenium/selenium-grid.yaml
+++ b/apps/selenium/selenium-grid.yaml
@@ -1,427 +0,0 @@
 # Selenium Grid 4 — RKE2 deployment
 #
 # Hub + chrome + firefox + edge browser nodes serving fleet-wide AAT runs from
 # the GitHub Actions self-hosted runners. ArgoCD owns this namespace from
 # 2026-05-25 (`infra-selenium` Application; previously these resources were
 # orphan kubectl-applied since 2026-03-15).
 #
 # Endpoints:
 #   - Internal cluster: http://selenium-hub.selenium.svc.cluster.local:4444
 #   - LAN LoadBalancer (MetalLB): http://10.0.56.208:4444
 #   - Traefik public: https://selenium.iamworkin.lan
 #
 # Browser maxSessions:
 #   - chrome 2  (bumped from 1 on 2026-05-25 morning-routine — AAT-heavy
 #                Print.Web help-screenshots was the global bottleneck;
 #                see commit history for ops/runner-replica-rightsize)
 #   - firefox 1
 #   - edge 1
 #
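 # Screenshots + video recording are intended to land on NFS via the chrome
 # video sidecar. NOTE(review): in this manifest the sidecar's /videos mount is
 # the `selenium-videos` emptyDir (5Gi), not an NFS volume — confirm how
 # recordings are exported off the pod.
 # See: CLAUDE.md "Selenium Grid & Visual AAT Testing" + bluejay-infra ADR notes.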
 ---
 # In-cluster Service for the hub pods (selector `app: selenium-hub`).
 # Exposes three ports: 4444 (`web`), plus 4442 (`publish`) and 4443
 # (`subscribe`), which are the ports the browser-node Deployments in this
 # file reference through SE_EVENT_BUS_PUBLISH_PORT / SE_EVENT_BUS_SUBSCRIBE_PORT.
 apiVersion: v1
 kind: Service
 metadata:
  name: selenium-hub
  namespace: selenium
  labels:
    app: selenium-hub
    app.kubernetes.io/name: selenium-hub
    app.kubernetes.io/part-of: selenium-grid
 spec:
  type: ClusterIP
  selector:
    app: selenium-hub
  ports:
  - name: web
    port: 4444
    targetPort: 4444
  - name: publish
    port: 4442
    targetPort: 4442
  - name: subscribe
    port: 4443
    targetPort: 4443
 ---
 # LAN-facing LoadBalancer in front of the hub pods (selector
 # `app: selenium-hub`). Same three ports as the in-cluster Service, with the
 # node ports pinned so anything addressing nodes directly keeps working.
 #
 # Only declarative intent lives here. Values the cluster assigns and owns are
 # left out on purpose:
 #   - spec.clusterIP / spec.clusterIPs: allocated by the API server; an
 #     existing Service keeps its address when the field is omitted, and a
 #     rebuilt cluster (or a different service CIDR) is free to pick a valid one.
 #   - metallb.io/ip-allocated-from-pool: bookkeeping annotation written by the
 #     MetalLB controller after allocation, not an input.
 apiVersion: v1
 kind: Service
 metadata:
  name: selenium-hub-external
  namespace: selenium
  labels:
    app: selenium-hub
    component: external-access
  annotations:
    # Address requested from MetalLB for this Service.
    metallb.universe.tf/loadBalancerIPs: 10.0.56.208
 spec:
  type: LoadBalancer
  # `Local` keeps traffic on nodes that host a hub pod; healthCheckNodePort is
  # the port the load balancer uses to discover those nodes.
  externalTrafficPolicy: Local
  healthCheckNodePort: 32213
  selector:
    app: selenium-hub
  ports:
  - name: web
    port: 4444
    targetPort: 4444
    nodePort: 32411
  - name: publish
    port: 4442
    targetPort: 4442
    nodePort: 32068
  - name: subscribe
    port: 4443
    targetPort: 4443
    nodePort: 31000
 ---
 # Selenium Grid hub, single replica, image selenium/hub:4.27.0.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: selenium-hub
  namespace: selenium
  labels:
    app: selenium-hub
    app.kubernetes.io/name: selenium-hub
    app.kubernetes.io/part-of: selenium-grid
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: selenium-hub
  template:
    metadata:
      labels:
        app: selenium-hub
        app.kubernetes.io/name: selenium-hub
        app.kubernetes.io/part-of: selenium-grid
    spec:
      containers:
      - name: selenium-hub
        image: selenium/hub:4.27.0
        env:
        # Session / queue timing knobs; values are seconds passed as strings.
        - name: SE_NODE_SESSION_TIMEOUT
          value: '300'
        - name: SE_SESSION_REQUEST_TIMEOUT
          value: '300'
        - name: SE_SESSION_RETRY_INTERVAL
          value: '5'
        # JVM heap cap; the container memory limit below is sized separately.
        - name: JAVA_OPTS
          value: -Xmx512m
        ports:
        - name: web
          containerPort: 4444
        - name: publish
          containerPort: 4442
        - name: subscribe
          containerPort: 4443
        # Both probes poll the same status endpoint on the web port; readiness
        # starts sooner and polls faster than liveness.
        readinessProbe:
          httpGet:
            path: /wd/hub/status
            port: 4444
          initialDelaySeconds: 10
          periodSeconds: 5
          timeoutSeconds: 5
        livenessProbe:
          httpGet:
            path: /wd/hub/status
            port: 4444
          initialDelaySeconds: 30
          periodSeconds: 15
          timeoutSeconds: 5
        # Memory sizing rationale (2026-05-25): the hub's baseline working set
        # was ~766Mi, i.e. 75% of the previous 1Gi limit. Limit/request were
        # raised to 1.5Gi / 1Gi for ~50% headroom, following the
        # stampede-buffer pattern documented for multus
        # (feedback_k8s_cni_multus_sizing). CPU was not touched: 54m observed
        # against a 500m limit, no contention.
        resources:
          requests:
            cpu: 250m
            memory: 1Gi
          limits:
            cpu: 500m
            memory: 1536Mi
 ---
 # Chrome browser node plus a `video` recording sidecar, single replica.
 # The node registers with the hub over the event bus (selenium-hub:4442/4443)
 # and serves its own status endpoint on 5555, which both probes poll.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: selenium-node-chrome
  namespace: selenium
  labels:
    app: selenium-node-chrome
    app.kubernetes.io/name: selenium-node-chrome
    app.kubernetes.io/part-of: selenium-grid
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: selenium-node-chrome
  template:
    metadata:
      labels:
        app: selenium-node-chrome
        app.kubernetes.io/name: selenium-node-chrome
        app.kubernetes.io/part-of: selenium-grid
    spec:
      containers:
      - name: selenium-chrome
        image: selenium/node-chrome:4.27.0
        env:
        - name: SE_EVENT_BUS_HOST
          value: selenium-hub
        - name: SE_EVENT_BUS_PUBLISH_PORT
          value: '4442'
        - name: SE_EVENT_BUS_SUBSCRIBE_PORT
          value: '4443'
        # Two concurrent sessions on this node. The override flag must be
        # 'true' for that to take effect: docker-selenium only honours a
        # SE_NODE_MAX_SESSIONS above the detected processor count when
        # SE_NODE_OVERRIDE_MAX_SESSIONS is enabled, and this container is
        # limited to 1 CPU, so with 'false' the node is expected to advertise
        # a single slot. The firefox and edge nodes already set 'true'.
        - name: SE_NODE_MAX_SESSIONS
          value: '2'
        - name: SE_NODE_OVERRIDE_MAX_SESSIONS
          value: 'true'
        - name: SE_VNC_NO_PASSWORD
          value: '1'
        - name: SE_SCREEN_WIDTH
          value: '1920'
        - name: SE_SCREEN_HEIGHT
          value: '1080'
        - name: SE_NODE_SESSION_TIMEOUT
          value: '300'
        ports:
        - name: node
          containerPort: 5555
        readinessProbe:
          httpGet:
            path: /status
            port: 5555
          initialDelaySeconds: 15
          periodSeconds: 5
        livenessProbe:
          httpGet:
            path: /status
            port: 5555
          initialDelaySeconds: 30
          periodSeconds: 15
        # Chromium-based browser node. Memory was raised on 2026-05-25 from a
        # 1Gi limit / 512Mi request to 2Gi / 1Gi: Edge had 51 OOMKills in 5d
        # on the original 1Gi cap (~1 OOM every 2.4h), and Chrome at
        # maxSessions=2 was running 684Mi idle on the same cap. Matches the
        # Firefox node's tested-stable 2Gi limit. CPU unchanged.
        resources:
          requests:
            cpu: 500m
            memory: 1Gi
          limits:
            cpu: '1'
            memory: 2Gi
        volumeMounts:
        - name: dshm
          mountPath: /dev/shm
      # Recording sidecar; DISPLAY_CONTAINER_NAME=localhost points it at the
      # browser container in the same pod. Output goes to /videos.
      - name: video
        image: selenium/video:ffmpeg-7.1-20250101
        env:
        - name: DISPLAY_CONTAINER_NAME
          value: localhost
        - name: SE_SCREEN_WIDTH
          value: '1920'
        - name: SE_SCREEN_HEIGHT
          value: '1080'
        - name: SE_VIDEO_FILE_NAME
          value: auto
        - name: SE_VIDEO_UPLOAD_ENABLED
          value: 'false'
        resources:
          requests:
            cpu: 250m
            memory: 384Mi
          limits:
            cpu: 500m
            memory: 768Mi
        volumeMounts:
        - name: selenium-videos
          mountPath: /videos
      volumes:
      # RAM-backed /dev/shm for the browser, capped at 2Gi.
      - name: dshm
        emptyDir:
          medium: Memory
          sizeLimit: 2Gi
      # Pod-local scratch space for recordings, capped at 5Gi.
      - name: selenium-videos
        emptyDir:
          sizeLimit: 5Gi
 ---
 # Firefox browser node, single replica, one session at a time.
 # Registers with the hub over the event bus (selenium-hub:4442/4443) and
 # serves its status endpoint on 5555, which both probes poll.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: selenium-node-firefox
  namespace: selenium
  labels:
    app: selenium-node-firefox
    app.kubernetes.io/name: selenium-node-firefox
    app.kubernetes.io/part-of: selenium-grid
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: selenium-node-firefox
  template:
    metadata:
      labels:
        app: selenium-node-firefox
        app.kubernetes.io/name: selenium-node-firefox
        app.kubernetes.io/part-of: selenium-grid
    spec:
      containers:
      - name: selenium-firefox
        image: selenium/node-firefox:4.27.0
        env:
        - name: SE_EVENT_BUS_HOST
          value: selenium-hub
        - name: SE_EVENT_BUS_PUBLISH_PORT
          value: '4442'
        - name: SE_EVENT_BUS_SUBSCRIBE_PORT
          value: '4443'
        - name: SE_NODE_MAX_SESSIONS
          value: '1'
        - name: SE_NODE_OVERRIDE_MAX_SESSIONS
          value: 'true'
        - name: SE_VNC_NO_PASSWORD
          value: '1'
        # Unlike the chrome and edge nodes, this one sets SE_START_VNC to 'false'.
        - name: SE_START_VNC
          value: 'false'
        - name: SE_SCREEN_WIDTH
          value: '1920'
        - name: SE_SCREEN_HEIGHT
          value: '1080'
        - name: SE_NODE_SESSION_TIMEOUT
          value: '300'
        ports:
        - name: node
          containerPort: 5555
        # Both probes tolerate 5 consecutive failures with a 5s timeout each.
        readinessProbe:
          httpGet:
            path: /status
            port: 5555
          initialDelaySeconds: 15
          periodSeconds: 5
          timeoutSeconds: 5
          failureThreshold: 5
        livenessProbe:
          httpGet:
            path: /status
            port: 5555
          initialDelaySeconds: 30
          periodSeconds: 15
          timeoutSeconds: 5
          failureThreshold: 5
        resources:
          requests:
            cpu: 500m
            memory: 1Gi
          limits:
            cpu: '1'
            memory: 2Gi
        volumeMounts:
        - name: dshm
          mountPath: /dev/shm
      volumes:
      # RAM-backed /dev/shm for the browser, capped at 2Gi.
      - name: dshm
        emptyDir:
          medium: Memory
          sizeLimit: 2Gi
 ---
 # Edge browser node, single replica, one session at a time.
 # Registers with the hub over the event bus (selenium-hub:4442/4443) and
 # serves its status endpoint on 5555, which both probes poll.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: selenium-node-edge
  namespace: selenium
  labels:
    app: selenium-node-edge
    app.kubernetes.io/name: selenium-node-edge
    app.kubernetes.io/part-of: selenium-grid
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: selenium-node-edge
  template:
    metadata:
      labels:
        app: selenium-node-edge
        app.kubernetes.io/name: selenium-node-edge
        app.kubernetes.io/part-of: selenium-grid
    spec:
      containers:
      - name: selenium-edge
        image: selenium/node-edge:4.27.0
        env:
        - name: SE_EVENT_BUS_HOST
          value: selenium-hub
        - name: SE_EVENT_BUS_PUBLISH_PORT
          value: '4442'
        - name: SE_EVENT_BUS_SUBSCRIBE_PORT
          value: '4443'
        - name: SE_NODE_MAX_SESSIONS
          value: '1'
        - name: SE_NODE_OVERRIDE_MAX_SESSIONS
          value: 'true'
        - name: SE_VNC_NO_PASSWORD
          value: '1'
        - name: SE_SCREEN_WIDTH
          value: '1920'
        - name: SE_SCREEN_HEIGHT
          value: '1080'
        - name: SE_NODE_SESSION_TIMEOUT
          value: '300'
        ports:
        - name: node
          containerPort: 5555
        readinessProbe:
          httpGet:
            path: /status
            port: 5555
          initialDelaySeconds: 15
          periodSeconds: 5
        livenessProbe:
          httpGet:
            path: /status
            port: 5555
          initialDelaySeconds: 30
          periodSeconds: 15
        # Chromium-based browser node. Memory was raised on 2026-05-25 from a
        # 1Gi limit / 512Mi request to 2Gi / 1Gi: Edge had 51 OOMKills in 5d
        # on the original 1Gi cap (~1 OOM every 2.4h), and Chrome at
        # maxSessions=2 was running 684Mi idle on the same cap. Matches the
        # Firefox node's tested-stable 2Gi limit. CPU unchanged.
        resources:
          requests:
            cpu: 500m
            memory: 1Gi
          limits:
            cpu: '1'
            memory: 2Gi
        volumeMounts:
        - name: dshm
          mountPath: /dev/shm
      volumes:
      # RAM-backed /dev/shm for the browser, capped at 2Gi.
      - name: dshm
        emptyDir:
          medium: Memory
          sizeLimit: 2Gi
 ---
 # Traefik route: requests for Host `selenium.iamworkin.lan` on the
 # `websecure` entry point are forwarded to the selenium-hub Service on 4444.
 # TLS material is read from the `selenium-tls` Secret, which is not created
 # anywhere in this file — presumably issued elsewhere; confirm.
 apiVersion: traefik.io/v1alpha1
 kind: IngressRoute
 metadata:
  name: selenium-hub
  namespace: selenium
 spec:
  entryPoints:
  - websecure
  tls:
    secretName: selenium-tls
  routes:
  - kind: Rule
    match: Host(`selenium.iamworkin.lan`)
    services:
    - name: selenium-hub
      port: 4444
--- a/tests/bluejay-infra-lint/FleetManifestLintTests.cs
+++ b/tests/bluejay-infra-lint/FleetManifestLintTests.cs
@@ -387,6 +387,38 @@ public sealed class FleetManifestLintTests
        violations.Should().BeEmpty();
    }
    /// <summary>
    /// Pins the pool manifests in <c>fc-desktop/remotedesktop-pools.yaml</c>.
    /// The set of <c>spec.templateSlug</c> values across all
    /// <c>RemoteDesktopPoolCrd</c> documents in that file must equal the slugs
    /// listed below, and every pool must live in namespace <c>fc-desktop</c>
    /// with <c>desiredSize</c> "1", <c>enabled</c> "true", <c>reconcileNow</c>
    /// "true" and the per-slug <c>userVolumeMode</c> from the table.
    /// </summary>
    [Fact]
    public void RemoteDesktopPoolCrds_MustExplicitlyOptInHookReadyTemplates()
    {
        // templateSlug -> required spec.userVolumeMode. A null entry means the
        // scalar lookup itself must come back null for that pool.
        var userVolumeModeBySlug = new Dictionary<string, string?>(StringComparer.Ordinal)
        {
            ["browser-only"] = null,
            ["opensuse-xfce"] = "LateAttach",
            ["dev-workstation"] = "LateAttach",
            ["ai-station"] = "LateAttach",
            ["linux-xfce"] = "LateAttach",
            ["linux-xfce-rdp"] = "LateAttach",
        };

        // A document without a templateSlug is keyed under "" so it shows up
        // as an unexpected key in the set comparison below.
        var poolsBySlug = Inventory.Documents
            .Where(doc => doc.Kind == "RemoteDesktopPoolCrd"
                && doc.RelativePath == "fc-desktop/remotedesktop-pools.yaml")
            .ToDictionary(
                doc => doc.Scalar("spec", "templateSlug") ?? string.Empty,
                StringComparer.Ordinal);

        // Guard first: this makes the indexer lookups in the loop safe.
        poolsBySlug.Keys.Should().BeEquivalentTo(userVolumeModeBySlug.Keys);

        foreach (var (slug, userVolumeMode) in userVolumeModeBySlug)
        {
            var pool = poolsBySlug[slug];

            pool.Namespace.Should().Be("fc-desktop");
            pool.Scalar("spec", "desiredSize").Should().Be("1");
            pool.Scalar("spec", "enabled").Should().Be("true");
            pool.Scalar("spec", "reconcileNow").Should().Be("true");
            pool.Scalar("spec", "userVolumeMode").Should().Be(userVolumeMode);
        }
    }
    [Fact]
    public void PublicEgressDeployments_MustOptOutOfIamworkinLanSearchSuffixes()
    {