Compare commits

..

1 Commits

Author SHA1 Message Date
Andrew Stoltz
2896b60d3c Tighten RemoteDesktop network policies 2026-05-19 12:04:12 -05:00
24 changed files with 473 additions and 2627 deletions

View File

@@ -103,6 +103,7 @@ curl -sk -X DELETE https://dns.iamworkin.lan/api/v1/servers/<serverId>/zones/iam
- **Public read-only hosts**: if a public host fronts a service that also exposes admin writes internally, add a Traefik route match like `Host(...) && (Method(GET) || Method(HEAD))` on the public edge instead of trusting the app to reject unsafe methods.
- **Public read-write allowlist hosts**: if a public host accepts a tightly bounded write surface (e.g. bootstrap-JWT POST), pin the allowlist as `(Method(GET) || Method(HEAD) || Method(POST) || Method(OPTIONS))`. PUT/PATCH/DELETE must still 404 at the route. Track A's `updatecenter.iamworkin.lan` / `updates.iamworkin.lan` are the canonical example. The lint test enforces this invariant.
- **Traefik VIP netpols**: when a `NetworkPolicy` allows `10.0.56.200`, also allow the post-DNAT backend ports (`8443` for TLS plus `8080` or `8000` for HTTP) or Calico will drop the rewritten flow.
- **RemoteDesktop isolation**: `apps/fc-desktop/network-policies.yaml` intentionally keeps desktop pod egress to named CoreDNS, `intranet-web:5300/TCP`, and noc1 step-ca `10.0.56.10:9000/9443` only. Guacamole display egress is owned separately by `apps/guacamole/guacamole.yaml` through `guacd-desktop-egress` on `5901/TCP`.
- **Auth-safe probes**: services behind API-key or global auth middleware should prefer `tcpSocket` probes unless `/health` is explicitly exempted before the middleware runs.
- **ArgoCD must use internal Gitea URL**: `http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git`, not the external HTTPS URL (step-ca cert isn't trusted by ArgoCD). The `ApplicationSet` and any hand-created `Application` must both use the internal URL.

View File

@@ -1,448 +0,0 @@
# Authentik OIDC backend
# ArgoCD-managed. BlueJay Lab.
#
# Stack:
# - PostgreSQL 16 StatefulSet (single replica, Longhorn RWO 5Gi)
# - Redis 7 Deployment (no persistence — session/cache only)
# - Authentik server + worker Deployments (image ghcr.io/goauthentik/server:2024.12.3)
# - Media PVC shared between server + worker (Longhorn RWO 2Gi)
# - Certificate via step-ca-acme ClusterIssuer
# - Traefik IngressRoute at id.iamworkin.lan
#
# Secrets come from 1Password item "authentik-credentials" (IAmWorkin vault, id y6i74ch22q5wvm7znquq4nhhcu)
# via the OnePasswordItem CRD, materialized into k8s Secret authentik/authentik-credentials.
#
# Why the discovery URL is /application/o/pimanager/ : Authentik issues per-application OIDC providers.
# The pimanager OIDC application/provider is created after the cluster pods are healthy (manual or
# via API once the bootstrap token is available — see Notes substrate).
---
apiVersion: v1
kind: Namespace
metadata:
name: authentik
labels:
app.kubernetes.io/part-of: bluejay-infra
---
# 1Password operator pulls the authentik-credentials item into a k8s Secret of the same name.
# Field labels in 1P become Secret keys: AUTHENTIK_SECRET_KEY, POSTGRES_PASSWORD, REDIS_PASSWORD,
# BOOTSTRAP_ADMIN_PASSWORD, BOOTSTRAP_ADMIN_TOKEN, BOOTSTRAP_ADMIN_EMAIL.
apiVersion: onepassword.com/v1
kind: OnePasswordItem
metadata:
name: authentik-credentials
namespace: authentik
spec:
itemPath: "vaults/IAmWorkin/items/authentik-credentials"
---
# Shared media volume for server + worker pods.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: authentik-media
namespace: authentik
spec:
storageClassName: longhorn
accessModes: [ReadWriteOnce]
resources:
requests:
storage: 2Gi
---
# PostgreSQL 16 StatefulSet — Authentik's primary store.
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: authentik-postgres
namespace: authentik
labels:
app: authentik-postgres
argocd.argoproj.io/instance: infra-authentik
spec:
persistentVolumeClaimRetentionPolicy:
whenDeleted: Retain
whenScaled: Retain
podManagementPolicy: OrderedReady
serviceName: authentik-postgres
replicas: 1
revisionHistoryLimit: 10
selector:
matchLabels:
app: authentik-postgres
template:
metadata:
labels:
app: authentik-postgres
spec:
containers:
- name: postgres
image: postgres:16-alpine
ports:
- containerPort: 5432
name: postgres
env:
- name: POSTGRES_USER
value: authentik
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: authentik-credentials
key: POSTGRES_PASSWORD
- name: POSTGRES_DB
value: authentik
- name: POSTGRES_INITDB_ARGS
value: "--encoding=UTF-8 --lc-collate=C --lc-ctype=C"
- name: PGDATA
value: /var/lib/postgresql/data/pgdata
readinessProbe:
exec:
command: ["pg_isready", "-U", "authentik"]
initialDelaySeconds: 5
periodSeconds: 5
livenessProbe:
exec:
command: ["pg_isready", "-U", "authentik"]
initialDelaySeconds: 30
periodSeconds: 30
resources:
requests: { cpu: 100m, memory: 256Mi }
limits: { cpu: 1000m, memory: 1Gi }
volumeMounts:
- name: pgdata
mountPath: /var/lib/postgresql/data
volumeClaimTemplates:
- metadata:
name: pgdata
spec:
storageClassName: longhorn
accessModes: [ReadWriteOnce]
volumeMode: Filesystem
resources:
requests:
storage: 5Gi
---
apiVersion: v1
kind: Service
metadata:
name: authentik-postgres
namespace: authentik
spec:
clusterIP: None
selector:
app: authentik-postgres
ports:
- name: postgres
port: 5432
targetPort: 5432
---
# Redis 7 — session storage + Celery broker. No persistence needed (cache).
apiVersion: apps/v1
kind: Deployment
metadata:
name: authentik-redis
namespace: authentik
labels:
app: authentik-redis
argocd.argoproj.io/instance: infra-authentik
spec:
replicas: 1
strategy:
type: Recreate
selector:
matchLabels:
app: authentik-redis
template:
metadata:
labels:
app: authentik-redis
spec:
containers:
- name: redis
image: redis:7-alpine
args:
- "--save"
- ""
- "--appendonly"
- "no"
- "--requirepass"
- "$(REDIS_PASSWORD)"
env:
- name: REDIS_PASSWORD
valueFrom:
secretKeyRef:
name: authentik-credentials
key: REDIS_PASSWORD
ports:
- containerPort: 6379
name: redis
readinessProbe:
tcpSocket: { port: 6379 }
initialDelaySeconds: 5
periodSeconds: 5
livenessProbe:
tcpSocket: { port: 6379 }
initialDelaySeconds: 30
periodSeconds: 30
resources:
requests: { cpu: 50m, memory: 64Mi }
limits: { cpu: 500m, memory: 256Mi }
---
apiVersion: v1
kind: Service
metadata:
name: authentik-redis
namespace: authentik
spec:
selector:
app: authentik-redis
ports:
- name: redis
port: 6379
targetPort: 6379
---
# Authentik server Deployment — HTTP frontend on :9000.
apiVersion: apps/v1
kind: Deployment
metadata:
name: authentik-server
namespace: authentik
labels:
app: authentik-server
argocd.argoproj.io/instance: infra-authentik
spec:
replicas: 1
strategy:
type: Recreate # shares /media RWO PVC with worker
selector:
matchLabels:
app: authentik-server
template:
metadata:
labels:
app: authentik-server
spec:
securityContext:
# Authentik image runs as uid 1000 "authentik" but the Longhorn PVC mounts
# root:root by default. fsGroup recursively chgrp + chmod g+rwx so the
# non-root container can mkdir /media/public during the tenant_files migration.
fsGroup: 1000
containers:
- name: server
image: ghcr.io/goauthentik/server:2024.12.3
args: ["server"]
ports:
- containerPort: 9000
name: http
- containerPort: 9443
name: https
env:
- name: AUTHENTIK_SECRET_KEY
valueFrom:
secretKeyRef:
name: authentik-credentials
key: AUTHENTIK_SECRET_KEY
- name: AUTHENTIK_REDIS__HOST
value: authentik-redis
- name: AUTHENTIK_REDIS__PASSWORD
valueFrom:
secretKeyRef:
name: authentik-credentials
key: REDIS_PASSWORD
- name: AUTHENTIK_POSTGRESQL__HOST
value: authentik-postgres
- name: AUTHENTIK_POSTGRESQL__NAME
value: authentik
- name: AUTHENTIK_POSTGRESQL__USER
value: authentik
- name: AUTHENTIK_POSTGRESQL__PASSWORD
valueFrom:
secretKeyRef:
name: authentik-credentials
key: POSTGRES_PASSWORD
- name: AUTHENTIK_BOOTSTRAP_PASSWORD
valueFrom:
secretKeyRef:
name: authentik-credentials
key: BOOTSTRAP_ADMIN_PASSWORD
- name: AUTHENTIK_BOOTSTRAP_TOKEN
valueFrom:
secretKeyRef:
name: authentik-credentials
key: BOOTSTRAP_ADMIN_TOKEN
- name: AUTHENTIK_BOOTSTRAP_EMAIL
valueFrom:
secretKeyRef:
name: authentik-credentials
key: BOOTSTRAP_ADMIN_EMAIL
- name: AUTHENTIK_DISABLE_UPDATE_CHECK
value: "true"
- name: AUTHENTIK_ERROR_REPORTING__ENABLED
value: "false"
- name: AUTHENTIK_LOG_LEVEL
value: info
# First-boot Authentik can take 3+ min on the migration phase
# (waiting on DB lock while worker also runs migrations). Initial
# delays are generous so kubelet doesn't kill the pod mid-migration;
# periodSeconds keeps post-startup probing responsive.
readinessProbe:
httpGet:
path: /-/health/ready/
port: 9000
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 12
livenessProbe:
httpGet:
path: /-/health/live/
port: 9000
initialDelaySeconds: 300
periodSeconds: 30
timeoutSeconds: 10
failureThreshold: 3
startupProbe:
httpGet:
path: /-/health/live/
port: 9000
initialDelaySeconds: 30
periodSeconds: 15
timeoutSeconds: 10
failureThreshold: 40 # 30s + 40*15s = 10.5 min budget
resources:
requests: { cpu: 150m, memory: 512Mi }
limits: { cpu: 1500m, memory: 1Gi }
volumeMounts:
- name: media
mountPath: /media
volumes:
- name: media
persistentVolumeClaim:
claimName: authentik-media
---
# Authentik worker Deployment — runs Celery background tasks.
apiVersion: apps/v1
kind: Deployment
metadata:
name: authentik-worker
namespace: authentik
labels:
app: authentik-worker
argocd.argoproj.io/instance: infra-authentik
spec:
replicas: 1
strategy:
type: Recreate # shares /media RWO PVC with server
selector:
matchLabels:
app: authentik-worker
template:
metadata:
labels:
app: authentik-worker
spec:
securityContext:
# Same as server pod — non-root uid 1000 needs PVC group write.
fsGroup: 1000
containers:
- name: worker
image: ghcr.io/goauthentik/server:2024.12.3
args: ["worker"]
env:
- name: AUTHENTIK_SECRET_KEY
valueFrom:
secretKeyRef:
name: authentik-credentials
key: AUTHENTIK_SECRET_KEY
- name: AUTHENTIK_REDIS__HOST
value: authentik-redis
- name: AUTHENTIK_REDIS__PASSWORD
valueFrom:
secretKeyRef:
name: authentik-credentials
key: REDIS_PASSWORD
- name: AUTHENTIK_POSTGRESQL__HOST
value: authentik-postgres
- name: AUTHENTIK_POSTGRESQL__NAME
value: authentik
- name: AUTHENTIK_POSTGRESQL__USER
value: authentik
- name: AUTHENTIK_POSTGRESQL__PASSWORD
valueFrom:
secretKeyRef:
name: authentik-credentials
key: POSTGRES_PASSWORD
- name: AUTHENTIK_DISABLE_UPDATE_CHECK
value: "true"
- name: AUTHENTIK_ERROR_REPORTING__ENABLED
value: "false"
- name: AUTHENTIK_LOG_LEVEL
value: info
resources:
requests: { cpu: 100m, memory: 256Mi }
limits: { cpu: 1000m, memory: 768Mi }
volumeMounts:
- name: media
mountPath: /media
volumes:
- name: media
persistentVolumeClaim:
claimName: authentik-media
---
apiVersion: v1
kind: Service
metadata:
name: authentik-server
namespace: authentik
spec:
selector:
app: authentik-server
ports:
- name: http
port: 9000
targetPort: 9000
- name: https
port: 9443
targetPort: 9443
---
# step-ca leaf certificate for id.iamworkin.lan.
# step-ca container resolver uses pfSense Unbound, so the public A record for id.iamworkin.lan
# MUST exist before this Certificate is applied (cert-manager HTTP-01 will silently 2h-backoff
# otherwise). Added 2026-05-25 via scripts/pfsense-add-id-host.py.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: authentik-tls
namespace: authentik
spec:
secretName: authentik-tls
dnsNames:
- id.iamworkin.lan
issuerRef:
name: step-ca-acme
kind: ClusterIssuer
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: authentik
namespace: authentik
spec:
entryPoints: [websecure]
routes:
- match: Host(`id.iamworkin.lan`)
kind: Rule
services:
- name: authentik-server
port: 9000
tls:
secretName: authentik-tls

View File

@@ -20,9 +20,12 @@
# 1) desktop-isolation — Browser Lab session pods.
#
# Locks down pods labeled `app.kubernetes.io/name=remote-desktop` (every
# session pod regardless of template). Allows guacd ingress for the VNC/RDP
# display lane and remotedesktop-web's pre-handoff probing. Egress: NFS to
# Synology, DNS, Traefik (cluster + LB VIP), Intranet (Browser Lab home).
# session pod regardless of template). Allows guacd ingress for the display
# lane and remotedesktop-web's pre-handoff probing. Egress is deliberately
# narrow: named CoreDNS, direct Intranet web, and noc1 step-ca only. There is
# no broad Traefik/VIP or internet egress from desktop sessions. If a future
# Browser Lab path needs a public-style host, prefer an explicit Service rule
# or include the post-DNAT backend port per the Traefik VIP lint.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
@@ -65,51 +68,22 @@ spec:
- port: 5901
protocol: TCP
egress:
# NFS to Synology
# CoreDNS only. The old to: [] DNS rule accidentally allowed any DNS
# listener in any namespace or routed network.
- to:
- ipBlock:
cidr: 10.0.58.3/32
ports:
- port: 2049
protocol: TCP
- port: 2049
protocol: UDP
- port: 111
protocol: TCP
- port: 111
protocol: UDP
- to:
- ipBlock:
cidr: 10.0.58.3/32
ports:
- port: 445
protocol: TCP
- to: []
ports:
- port: 53
protocol: UDP
- port: 53
protocol: TCP
- to:
- ipBlock:
cidr: 10.0.56.200/32
- ipBlock:
cidr: 10.43.33.87/32
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: traefik-system
kubernetes.io/metadata.name: kube-system
podSelector:
matchLabels:
app.kubernetes.io/name: traefik
k8s-app: kube-dns
ports:
- port: 80
protocol: TCP
- port: 443
protocol: TCP
- port: 8000
protocol: TCP
- port: 8443
- port: 53
protocol: UDP
- port: 53
protocol: TCP
# Browser Lab home / internal docs target. Use the real service port
# directly rather than public Traefik host aliases.
- to:
- namespaceSelector:
matchLabels:
@@ -120,6 +94,17 @@ spec:
ports:
- port: 5300
protocol: TCP
# noc1 step-ca ACME endpoint. The lane brief called out 9000/TCP; the live
# ACME directory currently answers on 9443/TCP, so both stay pinned to the
# same host rather than reopening Traefik or internet egress.
- to:
- ipBlock:
cidr: 10.0.56.10/32
ports:
- port: 9000
protocol: TCP
- port: 9443
protocol: TCP
---
# 2) fc-desktop-default-deny — namespace-wide catch-all.
#
@@ -330,3 +315,11 @@ spec:
protocol: UDP
- port: 53
protocol: TCP
- to:
- ipBlock:
cidr: 10.0.56.10/32
ports:
- port: 9000
protocol: TCP
- port: 9443
protocol: TCP

View File

@@ -0,0 +1,33 @@
# Explicit ArgoCD Application shape for bootstrap/review.
#
# The live bluejay-infra ApplicationSet already discovers apps/* directories
# and creates this same Application name (`infra-fc-devicemgmt`) automatically.
# Keep repoURL on the internal Gitea ClusterIP URL; ArgoCD does not trust the
# external step-ca HTTPS endpoint.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: infra-fc-devicemgmt
namespace: argocd
labels:
app.kubernetes.io/name: fc-devicemgmt
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
project: default
source:
repoURL: http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git
targetRevision: main
path: apps/fc-devicemgmt
destination:
server: https://kubernetes.default.svc
namespace: fc-devicemgmt
syncPolicy:
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=true
- ServerSideApply=true

View File

@@ -1,33 +0,0 @@
# FlowerCore SignalControl platform notes
This app owns the cluster web manager at `signalcontrol.iamworkin.lan` and documents the physical Pi pilot at `signal-a.iamworkin.lan` / `pirelay`.
## mTLS enrollment pattern
Do not install or restart anything from this repo. The intended pirelay pattern is the Pi-signage step-ca-agent shape:
- stable node identity: `pirelay`
- local private key and CSR generated on the node
- CSR submitted through the approved DeviceManagement/step-ca enrollment path
- client certificate and chain stored node-local under `/etc/flowercore/signalcontrol/mtls/`
- daily renewal timer, renewing only when fewer than 30 days remain
- certificate used for DM-agent to DM-web traffic and future SignalControl inter-service calls
Secrets, enrollment codes, private keys, p12 passphrases, and OIDC client secrets stay out of Git.
## Telemetry
Monitoring manifests add a dedicated Prometheus job:
- `signalcontrol-pi-app`
- target `10.0.58.113:5200`
- path `/metrics/prometheus`
- labels `instance="pirelay"`, `host="signal-a.iamworkin.lan"`, `service="signalcontrol-pi"`
Host metrics continue through the `edge-nodes` node_exporter target at `10.0.58.113:9100`.
## Physical-control audit
The app ships with `FlowerCore:SignalControl:PhysicalAudit:Enabled=false` and `ForwardingEnabled=false`. Enabling local audit creates a SHA-256 hash chain for physical-control mutations. Forwarding to `https://audit.iamworkin.lan/api/v1/audit/signalcontrol` requires flipping the forwarding gate separately.
Telemetry reads and `/metrics` scrapes are not audited.

View File

@@ -532,7 +532,7 @@ spec:
fsGroupChangePolicy: OnRootMismatch
containers:
- name: web
image: localhost/fc-ttsreader-web:v20260531-tts-corrections-r2
image: localhost/fc-ttsreader-web:v20260518-sprint36-demo-finish-b132cbf
imagePullPolicy: Never
ports:
- containerPort: 5217
@@ -554,8 +554,6 @@ spec:
value: "/data/chapter-context.db"
- name: TtsReader__Jobs__Root
value: "/data/jobs"
- name: TtsReader__Export__LocalCasRoot
value: "/data/bundles/cas"
- name: TtsReader__Piper__Host
value: "10.0.57.17"
- name: TtsReader__Piper__Port

View File

@@ -58,7 +58,7 @@ spec:
nodeName: rke2-server
containers:
- name: web
image: localhost/fc-updater-web:v202605310029-7974fc4
image: localhost/fc-updater-web:v20260509-4162dca-authgate
imagePullPolicy: Never
ports:
- containerPort: 8080
@@ -88,8 +88,6 @@ spec:
value: Faith AI Mike Edition
- name: FlowerCore__Updater__PublicShares__Links__0__Description
value: Private release link for Mike's Faith AI bundle.
- name: FlowerCore__Audit__Sinks__Loki__Enabled
value: "false"
- name: FlowerCore__Updater__Auth__Bootstrap__Enabled
value: "true"
- name: FlowerCore__Updater__Auth__Bootstrap__Username

View File

@@ -1,2 +0,0 @@
*.sh text eol=lf
Dockerfile text eol=lf

View File

@@ -1,54 +0,0 @@
FROM myoung34/github-runner:latest
ARG RUBY_VERSION=3.3.11
ARG RUBY_MINOR=3.3
ARG RUBY_BUILD_VERSION=v20260326
ARG RUNNER_UID=1001
ARG RUNNER_GID=1001
ENV RUNNER_TOOL_CACHE=/home/runner/_tool
ENV RUNNER_RUBY_TOOLCACHE=/opt/runner-toolcache
ENV PATH="/home/runner/_tool/Ruby/${RUBY_MINOR}/x64/bin:/opt/runner-toolcache/Ruby/${RUBY_MINOR}/x64/bin:${PATH}"
USER root
# Bake the IAmWorkin step-ca root CA into the system trust store. Without
# this, .NET HttpClient calls from CI tests against *.iamworkin.lan
# (e.g. https://selenium.iamworkin.lan/session) fail with `PartialChain`
# because the runner image's default Ubuntu trust bundle doesn't include
# our internal Root CA. update-ca-certificates regenerates
# /etc/ssl/certs/ca-certificates.crt, which OpenSSL + .NET on Linux read
# automatically — no SSL_CERT_FILE env var needed.
COPY step-ca-root.crt /usr/local/share/ca-certificates/iamworkin-step-ca-root.crt
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
autoconf \
bison \
build-essential \
ca-certificates \
curl \
libdb-dev \
libffi-dev \
libgdbm-dev \
libgmp-dev \
libncurses-dev \
libreadline-dev \
libssl-dev \
libyaml-dev \
patch \
pkg-config \
uuid-dev \
zlib1g-dev \
&& update-ca-certificates \
&& curl -fsSL "https://github.com/rbenv/ruby-build/archive/refs/tags/${RUBY_BUILD_VERSION}.tar.gz" -o /tmp/ruby-build.tar.gz \
&& mkdir -p /tmp/ruby-build \
&& tar -xzf /tmp/ruby-build.tar.gz --strip-components=1 -C /tmp/ruby-build \
&& /tmp/ruby-build/install.sh \
&& rm -rf /tmp/ruby-build /tmp/ruby-build.tar.gz /var/lib/apt/lists/*
COPY install-ruby-toolcache.sh /usr/local/bin/install-ruby-toolcache.sh
RUN chmod +x /usr/local/bin/install-ruby-toolcache.sh \
&& RUBY_VERSION="${RUBY_VERSION}" RUBY_MINOR="${RUBY_MINOR}" TOOLCACHE_ROOT="${RUNNER_RUBY_TOOLCACHE}" RUNNER_UID="${RUNNER_UID}" RUNNER_GID="${RUNNER_GID}" /usr/local/bin/install-ruby-toolcache.sh \
&& ruby -v

View File

@@ -7,17 +7,12 @@ Deployments with `kubectl`; update this manifest and let ArgoCD reconcile.
All repo-scoped Linux runners use:
- `localhost/fc-github-runner:v20260525-ruby3.3.11-stepca`, derived from
`myoung34/github-runner:latest`
- `ACCESS_TOKEN` from the `github-runner-token` Secret
- `RUN_AS_ROOT=false`
- `EPHEMERAL=true`
- `LABELS=self-hosted,linux,fc-build-linux`
- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
Actions tool cache
- Ruby 3.3.11 seeded into `/home/runner/_tool/Ruby/3.3/x64` from the baked
`/opt/runner-toolcache` copy so `ruby/setup-ruby@v1` can discover it on
self-hosted `ubuntu-20.04-x64` runners
`github-runner` for `FlowerCore.Common` is single-replica because it retains the
original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
@@ -33,46 +28,6 @@ Sprint 32 final long-tail wave adds 16 two-replica Deployments:
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
`FlowerCore.MenuBoard`.
## Image Build
Ruby is baked with a pinned `ruby-build` release and Ruby patch version. The pod
still mounts an `emptyDir` over `/home/runner`, so the `setup-runner-home` init
container copies the baked toolcache from `/opt/runner-toolcache/Ruby` into
`/home/runner/_tool/Ruby` before the runner container starts.
The IAmWorkin step-ca root CA is also baked into the system trust store
(`/usr/local/share/ca-certificates/iamworkin-step-ca-root.crt`, registered by
`update-ca-certificates`). Without it, .NET HttpClient calls from CI tests
against `*.iamworkin.lan` (e.g. `https://selenium.iamworkin.lan/session`)
fail with `PartialChain`. To refresh the bundled cert when the root rotates,
re-extract from the cluster and overwrite `step-ca-root.crt`:
```bash
kubectl get secret -n cert-manager step-ca-root \
-o jsonpath='{.data.ca\.crt}' | base64 -d > step-ca-root.crt
```
```bash
cd apps/github-runner
podman build -t localhost/fc-github-runner:v20260525-ruby3.3.11-stepca .
podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca ruby -v
podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
test -f /opt/runner-toolcache/Ruby/3.3/x64.complete
podman save localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
-o fc-github-runner-v20260525-ruby3.3.11-stepca.tar
```
Import the saved image on every schedulable RKE2 node before ArgoCD rolls the
Deployments:
```bash
for node in rke2-server rke2-agent1 rke2-agent2; do
scp fc-github-runner-v20260525-ruby3.3.11-stepca.tar "$node:/tmp/"
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca || true'
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-github-runner-v20260525-ruby3.3.11-stepca.tar'
done
```
## Post-Merge Proof
After the PR is merged and ArgoCD syncs, verify the runner fleet:
@@ -81,14 +36,6 @@ After the PR is merged and ArgoCD syncs, verify the runner fleet:
kubectl -n github-runner get deploy,pods,pvc
```
Verify the Ruby toolcache in a fresh pod:
```bash
kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- ruby -v
kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- sh -c \
'echo "$RUNNER_TOOL_CACHE" && test -f "$RUNNER_TOOL_CACHE/Ruby/3.3/x64.complete"'
```
Verify GitHub registration for the repo-scoped runners:
```bash
@@ -122,10 +69,6 @@ from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
`DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
present on the runner pod.
- `ruby/setup-ruby@v1` says self-hosted runners must install Ruby in
`$RUNNER_TOOL_CACHE`: check that the init container copied
`/opt/runner-toolcache/Ruby` into `/home/runner/_tool/Ruby` and that
`/home/runner/_tool/Ruby/3.3/x64.complete` exists.
- `404` during runner registration: the fine-grained PAT is valid but missing
repository access for that repo. Add the repo to the PAT access list; the PAT
value does not change.

File diff suppressed because it is too large Load Diff

View File

@@ -1,19 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
RUBY_VERSION="${RUBY_VERSION:-3.3.11}"
RUBY_MINOR="${RUBY_MINOR:-3.3}"
TOOLCACHE_ROOT="${TOOLCACHE_ROOT:-/opt/runner-toolcache}"
RUNNER_UID="${RUNNER_UID:-1001}"
RUNNER_GID="${RUNNER_GID:-1001}"
RUBY_PREFIX="${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64"
mkdir -p "${TOOLCACHE_ROOT}/Ruby"
RUBY_CONFIGURE_OPTS="${RUBY_CONFIGURE_OPTS:---disable-install-doc --disable-yjit}" ruby-build "${RUBY_VERSION}" "${RUBY_PREFIX}"
touch "${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64.complete"
ln -sfn "${RUBY_VERSION}" "${TOOLCACHE_ROOT}/Ruby/${RUBY_MINOR}"
"${RUBY_PREFIX}/bin/ruby" -v
chown -R "${RUNNER_UID}:${RUNNER_GID}" "${TOOLCACHE_ROOT}"
chmod -R a+rX "${TOOLCACHE_ROOT}"

View File

@@ -1,12 +0,0 @@
-----BEGIN CERTIFICATE-----
MIIBxDCCAWqgAwIBAgIRAPY357G6ow6zMAL5+4bS2kkwCgYIKoZIzj0EAwIwQDEa
MBgGA1UEChMRSUFtV29ya2luIEFDTUUgQ0ExIjAgBgNVBAMTGUlBbVdvcmtpbiBB
Q01FIENBIFJvb3QgQ0EwHhcNMjYwMzA4MTgwNzExWhcNMzYwMzA1MTgwNzExWjBA
MRowGAYDVQQKExFJQW1Xb3JraW4gQUNNRSBDQTEiMCAGA1UEAxMZSUFtV29ya2lu
IEFDTUUgQ0EgUm9vdCBDQTBZMBMGByqGSM49AgEGCCqGSM49AwEHA0IABJ2n04X1
JZo5Zdq/i1Idv8+fqwZyAzBh7whbqj0SWsJL8UWRabCMqYCs7+dXO0xRSzqkwFDL
x+vooOai8RgRNhajRTBDMA4GA1UdDwEB/wQEAwIBBjASBgNVHRMBAf8ECDAGAQH/
AgEBMB0GA1UdDgQWBBRnuPPQR6iM/H6vOluiU3Sygayz8jAKBggqhkjOPQQDAgNI
ADBFAiEArQK9dYPGmAZsdYnjziuFVVE5NKZUcceYvGfGC+tLXUsCIAudF2zJrCRq
3mK50ZZET/fwTkJwiEF4824mjP8p1CKM
-----END CERTIFICATE-----

View File

@@ -254,6 +254,68 @@ spec:
targetPort: 4822
name: guacd
---
# Guacd display egress isolation.
#
# Guacamole web talks to guacd on TCP/4822. Guacd then opens the desktop
# display connection to the per-session pod. Keep that second hop at raw VNC
# 5901/TCP for the current RemoteDesktop Browser Lab/openSUSE images. Do not
# grant guacd broad fc-desktop namespace egress; desktop-to-desktop lateral
# paths remain blocked by apps/fc-desktop/network-policies.yaml.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: guacd-desktop-egress
namespace: guacamole
labels:
app.kubernetes.io/part-of: remotedesktop
app.kubernetes.io/component: display-isolation
spec:
podSelector:
matchLabels:
app: guacd
policyTypes:
- Ingress
- Egress
ingress:
- from:
- podSelector:
matchLabels:
app: guacamole
ports:
- port: 4822
protocol: TCP
egress:
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
podSelector:
matchLabels:
k8s-app: kube-dns
ports:
- port: 53
protocol: UDP
- port: 53
protocol: TCP
# kubectl-proxy sidecar reaches the Kubernetes API; keep it explicit
# because this NetworkPolicy selects the whole guacd pod.
- to: []
ports:
- port: 443
protocol: TCP
- port: 6443
protocol: TCP
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: fc-desktop
podSelector:
matchLabels:
app.kubernetes.io/name: remote-desktop
ports:
- port: 5901
protocol: TCP
---
# Guacamole Web Application
apiVersion: apps/v1
kind: Deployment

View File

@@ -46,7 +46,7 @@ spec:
spec:
containers:
- name: intranet-web
image: localhost/fc-intranet-web:v20260531-ttsreader-bridge
image: localhost/fc-intranet-web:v20260508-brochure-w1
imagePullPolicy: Never
ports:
- containerPort: 5300

View File

@@ -25,7 +25,7 @@ metadata:
role: github-actions-runner
flowercore.io/managed-by: bluejay-infra
spec:
runStrategy: Halted
runStrategy: Always
template:
metadata:
labels:

View File

@@ -207,13 +207,20 @@ spec:
- port: 993
targetPort: 993
name: imaps
# --- mail-tls Certificate REMOVED 2026-06-01 ---
# mail-tls is now managed OUTSIDE cert-manager: issued from step-ca's JWK 'admin'
# provisioner and auto-renewed by a systemd timer on noc1 (step ca renew), which
# writes the mail-tls secret directly. step-ca-acme only has an HTTP-01 (Traefik)
# solver, but mail.iamworkin.lan must resolve to the dedicated MetalLB IP 10.0.56.202
# (SMTP/IMAP), so HTTP-01 cannot validate. Do NOT re-add a cert-manager Certificate
# here unless a DNS-01 solver is deployed for step-ca-acme.
---
# TLS Certificate via cert-manager
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: mail-tls
namespace: mail
spec:
secretName: mail-tls
issuerRef:
name: step-ca-acme
kind: ClusterIssuer
dnsNames:
- mail.iamworkin.lan
---
# Traefik IngressRoute - Webmail placeholder
apiVersion: traefik.io/v1alpha1

View File

@@ -1,260 +0,0 @@
# Grafana dashboard ConfigMap for FlowerCore.SignalControl on pirelay.
#
# The Grafana Deployment in noc-monitoring.yaml mounts this ConfigMap at
# /var/lib/grafana/dashboards/signalcontrol. The paired Prometheus jobs are:
# - signalcontrol-pi-app: 10.0.58.113:5200 /metrics/prometheus
# - edge-nodes: 10.0.58.113:9100 with instance="pirelay"
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-signalcontrol
namespace: monitoring
data:
signalcontrol.json: |
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 5, "w": 6, "x": 0, "y": 0 },
"id": 1,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
"textMode": "auto"
},
"targets": [
{ "editorMode": "code", "expr": "up{job=\"signalcontrol-pi-app\",instance=\"pirelay\"}", "range": true, "refId": "A" }
],
"title": "SignalControl App Up",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 5, "w": 6, "x": 6, "y": 0 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
"textMode": "auto"
},
"targets": [
{ "editorMode": "code", "expr": "up{job=\"edge-nodes\",instance=\"pirelay\"}", "range": true, "refId": "A" }
],
"title": "pirelay node_exporter Up",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
"gridPos": { "h": 5, "w": 6, "x": 12, "y": 0 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
"textMode": "name"
},
"targets": [
{ "editorMode": "code", "expr": "signalcontrol_active_pattern{job=\"signalcontrol-pi-app\",instance=\"pirelay\"}", "legendFormat": "{{pattern}}", "range": true, "refId": "A" }
],
"title": "Active Pattern",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
"gridPos": { "h": 5, "w": 6, "x": 18, "y": 0 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
"textMode": "name"
},
"targets": [
{ "editorMode": "code", "expr": "signalcontrol_phase{job=\"signalcontrol-pi-app\",instance=\"pirelay\"}", "legendFormat": "{{phase}}", "range": true, "refId": "A" }
],
"title": "Current Phase",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 },
"id": 5,
"options": { "legend": { "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "single" } },
"targets": [
{
"editorMode": "code",
"expr": "sum by (channel, state) (rate(signal_relay_writes_total{job=\"signalcontrol-pi-app\",instance=\"pirelay\"}[$__rate_interval]))",
"legendFormat": "channel {{channel}} {{state}}",
"range": true,
"refId": "A"
}
],
"title": "Relay Activations",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 },
"id": 6,
"options": { "legend": { "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "single" } },
"targets": [
{
"editorMode": "code",
"expr": "sum by (source, to_phase) (rate(signal_transitions_total{job=\"signalcontrol-pi-app\",instance=\"pirelay\"}[$__rate_interval]))",
"legendFormat": "{{source}} -> {{to_phase}}",
"range": true,
"refId": "A"
}
],
"title": "Phase Dwell / Transitions",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 },
"id": 7,
"options": { "legend": { "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "single" } },
"targets": [
{
"editorMode": "code",
"expr": "sum by (action) (increase(signal_schedule_fires_total{job=\"signalcontrol-pi-app\",instance=\"pirelay\"}[24h]))",
"legendFormat": "{{action}}",
"range": true,
"refId": "A"
},
{
"editorMode": "code",
"expr": "sum by (from_pattern, to_pattern) (increase(flowercore_signalcontrol_pattern_switches_total{job=\"signalcontrol-pi-app\",instance=\"pirelay\"}[24h]))",
"legendFormat": "{{from_pattern}} -> {{to_pattern}}",
"range": true,
"refId": "B"
}
],
"title": "Schedule Fires and Pattern Switches",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "unit": "percentunit" }, "overrides": [] },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 },
"id": 8,
"options": { "legend": { "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "single" } },
"targets": [
{
"editorMode": "code",
"expr": "1 - avg by (instance) (rate(node_cpu_seconds_total{job=\"edge-nodes\",instance=\"pirelay\",mode=\"idle\"}[$__rate_interval]))",
"legendFormat": "CPU",
"range": true,
"refId": "A"
},
{
"editorMode": "code",
"expr": "1 - (node_memory_MemAvailable_bytes{job=\"edge-nodes\",instance=\"pirelay\"} / node_memory_MemTotal_bytes{job=\"edge-nodes\",instance=\"pirelay\"})",
"legendFormat": "Memory",
"range": true,
"refId": "B"
}
],
"title": "pirelay Host Utilization",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 21 },
"id": 9,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
"textMode": "auto"
},
"targets": [
{ "editorMode": "code", "expr": "signalcontrol_screen_saver_enabled{job=\"signalcontrol-pi-app\",instance=\"pirelay\"}", "range": true, "refId": "A" }
],
"title": "Screen-saver Enabled",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 21 },
"id": 10,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
"textMode": "name"
},
"targets": [
{ "editorMode": "code", "expr": "signalcontrol_animation_active{job=\"signalcontrol-pi-app\",instance=\"pirelay\"}", "legendFormat": "{{planner}}", "range": true, "refId": "A" }
],
"title": "Screen-saver / Animation Engaged",
"type": "stat"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": [ "flowercore", "signalcontrol", "pirelay" ],
"templating": { "list": [] },
"time": { "from": "now-24h", "to": "now" },
"timezone": "browser",
"title": "FlowerCore SignalControl",
"uid": "flowercore-signalcontrol",
"version": 1
}

View File

@@ -230,19 +230,6 @@ data:
vlan: "home"
device: "pi3-ks0212"
# SignalControl Pi-edition app metrics (pirelay / signal-a)
- job_name: "signalcontrol-pi-app"
scrape_interval: 15s
metrics_path: /metrics/prometheus
static_configs:
- targets: ["10.0.58.113:5200"]
labels:
instance: "pirelay"
host: "signal-a.iamworkin.lan"
service: "signalcontrol-pi"
vlan: "home"
device: "pi3-ks0212"
# Epson ET-3750 EcoTank Printer SNMP
- job_name: "snmp-printer"
scrape_interval: 5m
@@ -293,14 +280,13 @@ data:
printer_model: "NuPrint 210"
# Print.Web health (Blazor app on edge2:5200)
# Target `/health` (anonymous) — root path requires API key auth and returns 401.
- job_name: "probe-printweb"
metrics_path: /probe
params:
module: [http_2xx]
scrape_interval: 30s
static_configs:
- targets: ["http://10.0.57.16:5200/health"]
- targets: ["http://10.0.57.16:5200/"]
labels:
instance: "print-web"
service: "print-web"
@@ -4064,9 +4050,6 @@ spec:
- name: dashboards-remotedesktop
mountPath: /var/lib/grafana/dashboards/remotedesktop
readOnly: true
- name: dashboards-signalcontrol
mountPath: /var/lib/grafana/dashboards/signalcontrol
readOnly: true
- name: datasource-provisioning
mountPath: /etc/grafana/provisioning/datasources
readOnly: true
@@ -4120,9 +4103,6 @@ spec:
- name: dashboards-remotedesktop
configMap:
name: grafana-dashboard-remotedesktop
- name: dashboards-signalcontrol
configMap:
name: grafana-dashboard-signalcontrol
- name: datasource-provisioning
configMap:
name: grafana-datasource-provisioning

View File

@@ -24,16 +24,7 @@
# (10.0.57.16:5200), public internet 80/443 (excluding RFC1918), and
# fc-signage:5190 for the signage AAT lane.
# - Ingress: Traefik (4444 + 8089 ACME-solver-style), intra-pod,
# telephony / gitea / fc-system / fc-signage / github-runner namespaces
# on 4444.
#
# 2026-05-25: added github-runner ingress on 4444 so CI jobs running in
# self-hosted runner pods (e.g. FlowerCore.Print.Web `help-screenshots`)
# can reach the grid. Without this allow, the session POST to
# `selenium-hub.selenium.svc.cluster.local:4444` was DNAT'd to the hub
# pod IP and then dropped at the Calico ingress hook — Selenium UI showed
# 0/4 sessions while the .NET HTTP client timed out at 60s. Same family
# as `feedback_netpol_dnat_backend_port`, wrong-source-namespace flavor.
# telephony / gitea / fc-system / fc-signage namespaces on 4444.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
@@ -212,13 +203,6 @@ spec:
ports:
- port: 4444
protocol: TCP
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: github-runner
ports:
- port: 4444
protocol: TCP
podSelector: {}
policyTypes:
- Ingress

View File

@@ -1,427 +0,0 @@
# Selenium Grid 4 — RKE2 deployment
#
# Hub + chrome + firefox + edge browser nodes serving fleet-wide AAT runs from
# the GitHub Actions self-hosted runners. ArgoCD owns this namespace from
# 2026-05-25 (`infra-selenium` Application; previously these resources were
# orphan kubectl-applied since 2026-03-15).
#
# Endpoints:
# - Internal cluster: http://selenium-hub.selenium.svc.cluster.local:4444
# - LAN LoadBalancer (MetalLB): http://10.0.56.208:4444
# - Traefik public: https://selenium.iamworkin.lan
#
# Browser maxSessions:
# - chrome 2 (bumped from 1 on 2026-05-25 morning-routine — AAT-heavy
# Print.Web help-screenshots was the global bottleneck;
# see commit history for ops/runner-replica-rightsize)
# - firefox 1
# - edge 1
#
# Screenshots + video recording write to NFS via the chrome video sidecar.
# See: CLAUDE.md "Selenium Grid & Visual AAT Testing" + bluejay-infra ADR notes.
---
apiVersion: v1
kind: Service
metadata:
labels:
app: selenium-hub
app.kubernetes.io/name: selenium-hub
app.kubernetes.io/part-of: selenium-grid
name: selenium-hub
namespace: selenium
spec:
ports:
- name: web
port: 4444
targetPort: 4444
- name: publish
port: 4442
targetPort: 4442
- name: subscribe
port: 4443
targetPort: 4443
selector:
app: selenium-hub
type: ClusterIP
---
apiVersion: v1
kind: Service
metadata:
annotations:
metallb.io/ip-allocated-from-pool: bluejay-pool
metallb.universe.tf/loadBalancerIPs: 10.0.56.208
labels:
app: selenium-hub
component: external-access
name: selenium-hub-external
namespace: selenium
spec:
clusterIP: 10.43.90.147
clusterIPs:
- 10.43.90.147
externalTrafficPolicy: Local
healthCheckNodePort: 32213
ports:
- name: web
nodePort: 32411
port: 4444
targetPort: 4444
- name: publish
nodePort: 32068
port: 4442
targetPort: 4442
- name: subscribe
nodePort: 31000
port: 4443
targetPort: 4443
selector:
app: selenium-hub
type: LoadBalancer
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: selenium-hub
app.kubernetes.io/name: selenium-hub
app.kubernetes.io/part-of: selenium-grid
name: selenium-hub
namespace: selenium
spec:
replicas: 1
selector:
matchLabels:
app: selenium-hub
template:
metadata:
labels:
app: selenium-hub
app.kubernetes.io/name: selenium-hub
app.kubernetes.io/part-of: selenium-grid
spec:
containers:
- env:
- name: SE_NODE_SESSION_TIMEOUT
value: '300'
- name: SE_SESSION_REQUEST_TIMEOUT
value: '300'
- name: SE_SESSION_RETRY_INTERVAL
value: '5'
- name: JAVA_OPTS
value: -Xmx512m
image: selenium/hub:4.27.0
livenessProbe:
httpGet:
path: /wd/hub/status
port: 4444
initialDelaySeconds: 30
periodSeconds: 15
timeoutSeconds: 5
name: selenium-hub
ports:
- containerPort: 4444
name: web
- containerPort: 4442
name: publish
- containerPort: 4443
name: subscribe
readinessProbe:
httpGet:
path: /wd/hub/status
port: 4444
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 5
# Hub baseline working set ~766Mi on 2026-05-25 (75% of prior 1Gi
# limit). Bump to 1.5Gi / 1Gi to keep ~50% headroom; matches the
# stampede-buffer pattern documented for multus
# (feedback_k8s_cni_multus_sizing). CPU left alone — observed 54m
# against a 500m limit, no contention.
resources:
limits:
cpu: 500m
memory: 1536Mi
requests:
cpu: 250m
memory: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: selenium-node-chrome
app.kubernetes.io/name: selenium-node-chrome
app.kubernetes.io/part-of: selenium-grid
name: selenium-node-chrome
namespace: selenium
spec:
replicas: 1
selector:
matchLabels:
app: selenium-node-chrome
template:
metadata:
labels:
app: selenium-node-chrome
app.kubernetes.io/name: selenium-node-chrome
app.kubernetes.io/part-of: selenium-grid
spec:
containers:
- env:
- name: SE_EVENT_BUS_HOST
value: selenium-hub
- name: SE_EVENT_BUS_PUBLISH_PORT
value: '4442'
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
value: '4443'
- name: SE_NODE_MAX_SESSIONS
value: '2'
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
value: 'false'
- name: SE_VNC_NO_PASSWORD
value: '1'
- name: SE_SCREEN_WIDTH
value: '1920'
- name: SE_SCREEN_HEIGHT
value: '1080'
- name: SE_NODE_SESSION_TIMEOUT
value: '300'
image: selenium/node-chrome:4.27.0
livenessProbe:
httpGet:
path: /status
port: 5555
initialDelaySeconds: 30
periodSeconds: 15
name: selenium-chrome
ports:
- containerPort: 5555
name: node
readinessProbe:
httpGet:
path: /status
port: 5555
initialDelaySeconds: 15
periodSeconds: 5
# Chromium-based browser node. Bumped from 1Gi -> 2Gi (req 512Mi
# -> 1Gi) on 2026-05-25 — Edge had 51 OOMKills in 5d on the
# original 1Gi cap (~1 OOM every 2.4h), and Chrome at maxSessions=2
# was running 684Mi idle on the same cap. Matches the Firefox node's
# tested-stable 2Gi limit. CPU unchanged.
resources:
limits:
cpu: '1'
memory: 2Gi
requests:
cpu: 500m
memory: 1Gi
volumeMounts:
- mountPath: /dev/shm
name: dshm
- env:
- name: DISPLAY_CONTAINER_NAME
value: localhost
- name: SE_SCREEN_WIDTH
value: '1920'
- name: SE_SCREEN_HEIGHT
value: '1080'
- name: SE_VIDEO_FILE_NAME
value: auto
- name: SE_VIDEO_UPLOAD_ENABLED
value: 'false'
image: selenium/video:ffmpeg-7.1-20250101
name: video
resources:
limits:
cpu: 500m
memory: 768Mi
requests:
cpu: 250m
memory: 384Mi
volumeMounts:
- mountPath: /videos
name: selenium-videos
volumes:
- emptyDir:
medium: Memory
sizeLimit: 2Gi
name: dshm
- emptyDir:
sizeLimit: 5Gi
name: selenium-videos
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: selenium-node-firefox
app.kubernetes.io/name: selenium-node-firefox
app.kubernetes.io/part-of: selenium-grid
name: selenium-node-firefox
namespace: selenium
spec:
replicas: 1
selector:
matchLabels:
app: selenium-node-firefox
template:
metadata:
labels:
app: selenium-node-firefox
app.kubernetes.io/name: selenium-node-firefox
app.kubernetes.io/part-of: selenium-grid
spec:
containers:
- env:
- name: SE_EVENT_BUS_HOST
value: selenium-hub
- name: SE_EVENT_BUS_PUBLISH_PORT
value: '4442'
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
value: '4443'
- name: SE_NODE_MAX_SESSIONS
value: '1'
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
value: 'true'
- name: SE_VNC_NO_PASSWORD
value: '1'
- name: SE_START_VNC
value: 'false'
- name: SE_SCREEN_WIDTH
value: '1920'
- name: SE_SCREEN_HEIGHT
value: '1080'
- name: SE_NODE_SESSION_TIMEOUT
value: '300'
image: selenium/node-firefox:4.27.0
livenessProbe:
failureThreshold: 5
httpGet:
path: /status
port: 5555
initialDelaySeconds: 30
periodSeconds: 15
timeoutSeconds: 5
name: selenium-firefox
ports:
- containerPort: 5555
name: node
readinessProbe:
failureThreshold: 5
httpGet:
path: /status
port: 5555
initialDelaySeconds: 15
periodSeconds: 5
timeoutSeconds: 5
resources:
limits:
cpu: '1'
memory: 2Gi
requests:
cpu: 500m
memory: 1Gi
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- emptyDir:
medium: Memory
sizeLimit: 2Gi
name: dshm
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: selenium-node-edge
app.kubernetes.io/name: selenium-node-edge
app.kubernetes.io/part-of: selenium-grid
name: selenium-node-edge
namespace: selenium
spec:
replicas: 1
selector:
matchLabels:
app: selenium-node-edge
template:
metadata:
labels:
app: selenium-node-edge
app.kubernetes.io/name: selenium-node-edge
app.kubernetes.io/part-of: selenium-grid
spec:
containers:
- env:
- name: SE_EVENT_BUS_HOST
value: selenium-hub
- name: SE_EVENT_BUS_PUBLISH_PORT
value: '4442'
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
value: '4443'
- name: SE_NODE_MAX_SESSIONS
value: '1'
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
value: 'true'
- name: SE_VNC_NO_PASSWORD
value: '1'
- name: SE_SCREEN_WIDTH
value: '1920'
- name: SE_SCREEN_HEIGHT
value: '1080'
- name: SE_NODE_SESSION_TIMEOUT
value: '300'
image: selenium/node-edge:4.27.0
livenessProbe:
httpGet:
path: /status
port: 5555
initialDelaySeconds: 30
periodSeconds: 15
name: selenium-edge
ports:
- containerPort: 5555
name: node
readinessProbe:
httpGet:
path: /status
port: 5555
initialDelaySeconds: 15
periodSeconds: 5
# Chromium-based browser node. Bumped from 1Gi -> 2Gi (req 512Mi
# -> 1Gi) on 2026-05-25 — Edge had 51 OOMKills in 5d on the
# original 1Gi cap (~1 OOM every 2.4h), and Chrome at maxSessions=2
# was running 684Mi idle on the same cap. Matches the Firefox node's
# tested-stable 2Gi limit. CPU unchanged.
resources:
limits:
cpu: '1'
memory: 2Gi
requests:
cpu: 500m
memory: 1Gi
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- emptyDir:
medium: Memory
sizeLimit: 2Gi
name: dshm
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: selenium-hub
namespace: selenium
spec:
entryPoints:
- websecure
routes:
- kind: Rule
match: Host(`selenium.iamworkin.lan`)
services:
- name: selenium-hub
port: 4444
tls:
secretName: selenium-tls

View File

@@ -67,7 +67,6 @@ public sealed class FleetManifestLintTests
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
["github-runner-updater"] = "https://github.com/astoltz/FlowerCore.Updater",
};
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
@@ -81,7 +80,6 @@ public sealed class FleetManifestLintTests
"github-runner-chat",
"github-runner-mysql",
"github-runner-kiosk-linux",
"github-runner-updater",
};
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
@@ -236,7 +234,7 @@ public sealed class FleetManifestLintTests
{
deployments.Should().ContainKey(expectedRunner.Key);
var container = deployments[expectedRunner.Key].MainContainerMappings().Should().ContainSingle().Subject;
var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject;
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
EnvValue(container, "EPHEMERAL").Should().Be("true");
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
@@ -252,7 +250,7 @@ public sealed class FleetManifestLintTests
{
foreach (var deployment in GitHubRunnerDeployments().Values)
{
var container = deployment.MainContainerMappings().Should().ContainSingle().Subject;
var container = deployment.ContainerMappings().Should().ContainSingle().Subject;
foreach (var expectedEnv in WritableRunnerEnv)
{
@@ -279,10 +277,7 @@ public sealed class FleetManifestLintTests
foreach (var deploymentName in ScaledLinuxRunnerDeployments)
{
var deployment = deployments[deploymentName];
// Scaled runners must have >= 2 replicas (avoid single-pod bottleneck).
// Individual deployments may be tuned upward per CI activity — see
// "runners: right-size replica counts per 14d CI activity (#24)".
ReplicaCount(deployment).Should().BeGreaterOrEqualTo(2, $"{deploymentName} is in the scaled set and must run with at least 2 replicas");
ReplicaCount(deployment).Should().Be(2);
var volumes = deployment.MappingSequence("spec", "template", "spec", "volumes");
var claimNames = volumes
@@ -308,108 +303,6 @@ public sealed class FleetManifestLintTests
.Be("github-runner-nuget-cache");
}
[Fact]
public void Runners_MustNotPinToOperatorWorkstationHosts()
{
// CRITICAL SAFETY (operator directive 2026-05-26): BLUEJAY-WS is the
// operator's primary workstation — host of the 1Password Connect
// bearer token, fcadmin SSH keys to noc1, signing CA private keys,
// and source for every FC repo. A self-hosted GitHub Actions runner
// there would execute arbitrary PR code with that local access.
// Build-side analog of the Sprint 9 NEW safe-account exclusion gate
// (Puppet GPO/AppLocker/WDAC/audit-forwarder modules refuse to apply
// on BLUEJAY-WS). This lint asserts no GitHub-runner Deployment in
// apps/github-runner/ pins to a forbidden operator-workstation host
// via nodeName, nodeSelector, nodeAffinity, or tolerations.
// Existing legacy `bluejay-ws-sandbox-1` GitHub-registered runner is
// out of scope here (it's a runtime registration, not a K8s
// Deployment) — see CLAUDE.md "Common Mistakes" entry and
// feedback_bluejay_ws_never_public_runner.md.
var forbiddenHostPatterns = new[]
{
"bluejay-ws",
"BLUEJAY-WS",
"bluejay-ws.iamworkin.lan",
"iamworkin-ws",
};
bool ContainsForbidden(string? value) =>
!string.IsNullOrWhiteSpace(value)
&& forbiddenHostPatterns.Any(pattern => value!.Contains(pattern, StringComparison.OrdinalIgnoreCase));
var violations = GitHubRunnerDeployments().Values.SelectMany(deployment =>
{
var local = new List<string>();
var podSpec = ManifestNodeExtensions.Mapping(deployment.Root, "spec", "template", "spec");
if (podSpec is null)
{
return local;
}
// nodeName: pins the pod to a specific node by name.
var nodeName = ManifestNodeExtensions.Scalar(podSpec, "nodeName");
if (ContainsForbidden(nodeName))
{
local.Add($"{deployment.Name} sets nodeName='{nodeName}' which targets a forbidden operator-workstation host.");
}
// nodeSelector: dict of label → value pinning the pod to nodes
// carrying matching labels. Examples that would trip this:
// kubernetes.io/hostname: bluejay-ws
// flowercore.io/host: bluejay-ws.iamworkin.lan
var nodeSelector = ManifestNodeExtensions.Mapping(podSpec, "nodeSelector");
if (nodeSelector is not null)
{
foreach (var entry in nodeSelector.Children)
{
var key = entry.Key is YamlScalarNode keyScalar ? keyScalar.Value : null;
var value = entry.Value is YamlScalarNode valueScalar ? valueScalar.Value : null;
if (ContainsForbidden(value))
{
local.Add($"{deployment.Name} has nodeSelector entry '{key}: {value}' which targets a forbidden operator-workstation host.");
}
}
}
// nodeAffinity: matchExpressions over node labels.
foreach (var term in ManifestNodeExtensions.MappingSequence(podSpec, "affinity", "nodeAffinity", "requiredDuringSchedulingIgnoredDuringExecution", "nodeSelectorTerms"))
{
foreach (var expr in ManifestNodeExtensions.MappingSequence(term, "matchExpressions"))
{
var key = ManifestNodeExtensions.Scalar(expr, "key");
foreach (var valueNode in ManifestNodeExtensions.ScalarSequence(expr, "values"))
{
if (ContainsForbidden(valueNode))
{
local.Add($"{deployment.Name} has nodeAffinity matchExpression '{key}' value '{valueNode}' which targets a forbidden operator-workstation host.");
}
}
}
}
// tolerations: scheduling onto a tainted operator-workstation
// node would let the runner run there. Forbid any toleration
// value that names the workstation.
foreach (var toleration in ManifestNodeExtensions.MappingSequence(podSpec, "tolerations"))
{
var key = ManifestNodeExtensions.Scalar(toleration, "key");
var value = ManifestNodeExtensions.Scalar(toleration, "value");
if (ContainsForbidden(key))
{
local.Add($"{deployment.Name} has toleration key '{key}' which targets a forbidden operator-workstation host.");
}
if (ContainsForbidden(value))
{
local.Add($"{deployment.Name} has toleration value '{value}' which targets a forbidden operator-workstation host.");
}
}
return local;
}).ToList();
violations.Should().BeEmpty("BLUEJAY-WS / iamworkin-ws must never host a fleet GitHub Actions runner; see CLAUDE.md 'Registering BLUEJAY-WS as a fleet GitHub Actions runner' and feedback_bluejay_ws_never_public_runner.md");
}
[Fact]
public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable()
{
@@ -997,22 +890,6 @@ internal sealed record ManifestDocument(
.ToList();
}
// MainContainerMappings excludes initContainers. Use this when asserting
// properties of the primary container (env, image, volumeMounts) where an
// initContainer would be a false-positive match — e.g. the GitHub runner
// image's `setup-runner-home` initContainer should not count toward the
// single-container assertions on the runner deployments.
public IReadOnlyList<YamlMappingNode> MainContainerMappings()
{
var podSpec = PodSpec();
if (podSpec is null)
{
return Array.Empty<YamlMappingNode>();
}
return ManifestNodeExtensions.MappingSequence(podSpec, "containers").ToList();
}
public IReadOnlyList<ContainerSpec> ContainerSpecs()
{
return ContainerMappings()

View File

@@ -0,0 +1,93 @@
using FluentAssertions;
using Xunit;
namespace BluejayInfraLint.Tests;
[Trait("Category", "Unit")]
public sealed class RemoteDesktopNetworkPolicyTests
{
private static readonly ManifestInventory Inventory = ManifestInventory.Load();
[Fact]
public void LiveDesktopIsolation_AllowsOnlyCoreDnsIntranetAndStepCaEgress()
{
var policy = NetworkPolicy("fc-desktop", "desktop-isolation");
var ports = policy.EgressPorts().ToHashSet(StringComparer.Ordinal);
ports.Should().BeEquivalentTo("53", "5300", "9000", "9443");
policy.AllScalars().Should().Contain(new[]
{
"kube-system",
"kube-dns",
"intranet",
"intranet-web",
"10.0.56.10/32"
});
}
[Fact]
public void LiveDesktopIsolation_RemovesInternetNfsAndTraefikEgress()
{
var policy = NetworkPolicy("fc-desktop", "desktop-isolation");
var scalars = policy.AllScalars().ToList();
var ports = policy.EgressPorts().ToHashSet(StringComparer.Ordinal);
scalars.Should().NotContain(new[] { "10.0.58.3/32", "10.0.56.200/32", "10.43.33.87/32", "traefik-system" });
ports.Should().NotContain(new[] { "80", "443", "445", "111", "2049", "8000", "8080", "8443" });
policy.MappingSequence("spec", "egress")
.Should()
.NotContain(rule => EgressRuleHasEmptyTo(rule), "desktop sessions must not use to: [] internet-style egress");
}
[Fact]
public void LiveGuacdIsolation_AllowsRawVncToDesktopPodsOnly()
{
var policy = NetworkPolicy("guacamole", "guacd-desktop-egress");
var scalars = policy.AllScalars().ToList();
var ports = policy.EgressPorts().ToHashSet(StringComparer.Ordinal);
ports.Should().Contain("5901");
scalars.Should().Contain(new[] { "fc-desktop", "remote-desktop" });
ports.Should().NotContain(new[] { "3000", "3001", "3389", "80", "8080", "8443" });
}
[Fact]
public void LiveGuacdIsolation_KeepsGuacamoleWebIngressOnGuacdPort()
{
var policy = NetworkPolicy("guacamole", "guacd-desktop-egress");
policy.Scalar("spec", "podSelector", "matchLabels", "app").Should().Be("guacd");
policy.AllScalars().Should().Contain(new[] { "guacamole", "4822" });
}
[Fact]
public void HelperSmoke_FindsExpectedRemoteDesktopPolicies()
{
NetworkPolicy("fc-desktop", "desktop-isolation").Name.Should().Be("desktop-isolation");
NetworkPolicy("guacamole", "guacd-desktop-egress").Name.Should().Be("guacd-desktop-egress");
}
[Fact]
public void HelperSmoke_EgressPortExtractionKeepsDistinctPorts()
{
var ports = NetworkPolicy("fc-desktop", "desktop-isolation")
.EgressPorts()
.ToHashSet(StringComparer.Ordinal);
ports.Should().HaveCount(4);
ports.Should().Contain(new[] { "53", "5300", "9000", "9443" });
}
private static ManifestDocument NetworkPolicy(string ns, string name)
=> Inventory.Documents.Single(document =>
document.Kind == "NetworkPolicy"
&& string.Equals(document.Namespace, ns, StringComparison.Ordinal)
&& string.Equals(document.Name, name, StringComparison.Ordinal));
private static bool EgressRuleHasEmptyTo(YamlDotNet.RepresentationModel.YamlMappingNode rule)
=> rule.Children.Any(entry =>
entry.Key is YamlDotNet.RepresentationModel.YamlScalarNode key
&& string.Equals(key.Value, "to", StringComparison.Ordinal)
&& entry.Value is YamlDotNet.RepresentationModel.YamlSequenceNode sequence
&& sequence.Children.Count == 0);
}

View File

@@ -1,51 +0,0 @@
using FluentAssertions;
using Xunit;
namespace BluejayInfraLint.Tests;
[Trait("Category", "Unit")]
public sealed class SignalControlPlatformManifestTests
{
private static readonly string Root = ManifestInventory.Load().BluejayRoot;
[Fact]
public void Monitoring_PrometheusScrapesSignalControlPiAppAndPirelayNodeExporter()
{
var monitoring = File.ReadAllText(Path.Combine(Root, "apps", "monitoring", "noc-monitoring.yaml"));
monitoring.Should().Contain("job_name: \"signalcontrol-pi-app\"");
monitoring.Should().Contain("metrics_path: /metrics/prometheus");
monitoring.Should().Contain("targets: [\"10.0.58.113:5200\"]");
monitoring.Should().Contain("host: \"signal-a.iamworkin.lan\"");
monitoring.Should().Contain("targets: [\"10.0.58.113:9100\"]");
monitoring.Should().Contain("instance: \"pirelay\"");
}
[Fact]
public void Monitoring_GrafanaMountsSignalControlDashboard()
{
var monitoring = File.ReadAllText(Path.Combine(Root, "apps", "monitoring", "noc-monitoring.yaml"));
var dashboard = File.ReadAllText(Path.Combine(Root, "apps", "monitoring", "grafana-dashboard-signalcontrol.yaml"));
monitoring.Should().Contain("name: dashboards-signalcontrol");
monitoring.Should().Contain("mountPath: /var/lib/grafana/dashboards/signalcontrol");
monitoring.Should().Contain("name: grafana-dashboard-signalcontrol");
dashboard.Should().Contain("\"uid\": \"flowercore-signalcontrol\"");
dashboard.Should().Contain("signalcontrol_active_pattern");
dashboard.Should().Contain("signal_relay_writes_total");
dashboard.Should().Contain("node_cpu_seconds_total");
}
[Fact]
public void FcSignalControlReadme_DocumentsMtlsTelemetryAndDefaultOffAudit()
{
var readme = File.ReadAllText(Path.Combine(Root, "apps", "fc-signalcontrol", "README.md"));
readme.Should().Contain("step-ca-agent");
readme.Should().Contain("10.0.58.113:5200");
readme.Should().Contain("10.0.58.113:9100");
readme.Should().Contain("PhysicalAudit:Enabled=false");
readme.Should().Contain("ForwardingEnabled=false");
readme.Should().Contain("Secrets, enrollment codes, private keys");
}
}