Compare commits
1 Commits
runners/bl
...
sprint42/c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6e581d2879 |
@@ -1,448 +0,0 @@
|
|||||||
# Authentik OIDC backend
|
|
||||||
# ArgoCD-managed. BlueJay Lab.
|
|
||||||
#
|
|
||||||
# Stack:
|
|
||||||
# - PostgreSQL 16 StatefulSet (single replica, Longhorn RWO 5Gi)
|
|
||||||
# - Redis 7 Deployment (no persistence — session/cache only)
|
|
||||||
# - Authentik server + worker Deployments (image ghcr.io/goauthentik/server:2024.12.3)
|
|
||||||
# - Media PVC shared between server + worker (Longhorn RWO 2Gi; RWO is per-node, so both pods must schedule onto the same node)
|
|
||||||
# - Certificate via step-ca-acme ClusterIssuer
|
|
||||||
# - Traefik IngressRoute at id.iamworkin.lan
|
|
||||||
#
|
|
||||||
# Secrets come from 1Password item "authentik-credentials" (IAmWorkin vault, id y6i74ch22q5wvm7znquq4nhhcu)
|
|
||||||
# via the OnePasswordItem CRD, materialized into k8s Secret authentik/authentik-credentials.
|
|
||||||
#
|
|
||||||
# Why the discovery URL is /application/o/pimanager/: Authentik issues a separate OIDC provider per application.
|
|
||||||
# The pimanager OIDC application/provider is created after the cluster pods are healthy (manual or
|
|
||||||
# via API once the bootstrap token is available — see Notes substrate).
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Namespace
|
|
||||||
metadata:
|
|
||||||
name: authentik
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/part-of: bluejay-infra
|
|
||||||
|
|
||||||
---
|
|
||||||
# 1Password operator pulls the authentik-credentials item into a k8s Secret of the same name.
|
|
||||||
# Field labels in 1P become Secret keys: AUTHENTIK_SECRET_KEY, POSTGRES_PASSWORD, REDIS_PASSWORD,
|
|
||||||
# BOOTSTRAP_ADMIN_PASSWORD, BOOTSTRAP_ADMIN_TOKEN, BOOTSTRAP_ADMIN_EMAIL.
|
|
||||||
apiVersion: onepassword.com/v1
|
|
||||||
kind: OnePasswordItem
|
|
||||||
metadata:
|
|
||||||
name: authentik-credentials
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
itemPath: "vaults/IAmWorkin/items/authentik-credentials"
|
|
||||||
|
|
||||||
---
|
|
||||||
# Shared media volume for server + worker pods.
|
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolumeClaim
|
|
||||||
metadata:
|
|
||||||
name: authentik-media
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
storageClassName: longhorn
|
|
||||||
accessModes: [ReadWriteOnce]
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: 2Gi
|
|
||||||
|
|
||||||
---
|
|
||||||
# PostgreSQL 16 StatefulSet — Authentik's primary store.
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: StatefulSet
|
|
||||||
metadata:
|
|
||||||
name: authentik-postgres
|
|
||||||
namespace: authentik
|
|
||||||
labels:
|
|
||||||
app: authentik-postgres
|
|
||||||
argocd.argoproj.io/instance: infra-authentik
|
|
||||||
spec:
|
|
||||||
persistentVolumeClaimRetentionPolicy:
|
|
||||||
whenDeleted: Retain
|
|
||||||
whenScaled: Retain
|
|
||||||
podManagementPolicy: OrderedReady
|
|
||||||
serviceName: authentik-postgres
|
|
||||||
replicas: 1
|
|
||||||
revisionHistoryLimit: 10
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: authentik-postgres
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: authentik-postgres
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: postgres
|
|
||||||
image: postgres:16-alpine
|
|
||||||
ports:
|
|
||||||
- containerPort: 5432
|
|
||||||
name: postgres
|
|
||||||
env:
|
|
||||||
- name: POSTGRES_USER
|
|
||||||
value: authentik
|
|
||||||
- name: POSTGRES_PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: POSTGRES_PASSWORD
|
|
||||||
- name: POSTGRES_DB
|
|
||||||
value: authentik
|
|
||||||
- name: POSTGRES_INITDB_ARGS
|
|
||||||
value: "--encoding=UTF-8 --lc-collate=C --lc-ctype=C"
|
|
||||||
- name: PGDATA
|
|
||||||
value: /var/lib/postgresql/data/pgdata
|
|
||||||
readinessProbe:
|
|
||||||
exec:
|
|
||||||
command: ["pg_isready", "-U", "authentik"]
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
livenessProbe:
|
|
||||||
exec:
|
|
||||||
command: ["pg_isready", "-U", "authentik"]
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 30
|
|
||||||
resources:
|
|
||||||
requests: { cpu: 100m, memory: 256Mi }
|
|
||||||
limits: { cpu: 1000m, memory: 1Gi }
|
|
||||||
volumeMounts:
|
|
||||||
- name: pgdata
|
|
||||||
mountPath: /var/lib/postgresql/data
|
|
||||||
volumeClaimTemplates:
|
|
||||||
- metadata:
|
|
||||||
name: pgdata
|
|
||||||
spec:
|
|
||||||
storageClassName: longhorn
|
|
||||||
accessModes: [ReadWriteOnce]
|
|
||||||
volumeMode: Filesystem
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: 5Gi
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: authentik-postgres
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
clusterIP: None
|
|
||||||
selector:
|
|
||||||
app: authentik-postgres
|
|
||||||
ports:
|
|
||||||
- name: postgres
|
|
||||||
port: 5432
|
|
||||||
targetPort: 5432
|
|
||||||
|
|
||||||
---
|
|
||||||
# Redis 7 — session storage + Celery broker. No persistence needed (cache).
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: authentik-redis
|
|
||||||
namespace: authentik
|
|
||||||
labels:
|
|
||||||
app: authentik-redis
|
|
||||||
argocd.argoproj.io/instance: infra-authentik
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
strategy:
|
|
||||||
type: Recreate
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: authentik-redis
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: authentik-redis
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: redis
|
|
||||||
image: redis:7-alpine
|
|
||||||
args:
|
|
||||||
- "--save"
|
|
||||||
- ""
|
|
||||||
- "--appendonly"
|
|
||||||
- "no"
|
|
||||||
- "--requirepass"
|
|
||||||
- "$(REDIS_PASSWORD)"
|
|
||||||
env:
|
|
||||||
- name: REDIS_PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: REDIS_PASSWORD
|
|
||||||
ports:
|
|
||||||
- containerPort: 6379
|
|
||||||
name: redis
|
|
||||||
readinessProbe:
|
|
||||||
tcpSocket: { port: 6379 }
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
livenessProbe:
|
|
||||||
tcpSocket: { port: 6379 }
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 30
|
|
||||||
resources:
|
|
||||||
requests: { cpu: 50m, memory: 64Mi }
|
|
||||||
limits: { cpu: 500m, memory: 256Mi }
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: authentik-redis
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
selector:
|
|
||||||
app: authentik-redis
|
|
||||||
ports:
|
|
||||||
- name: redis
|
|
||||||
port: 6379
|
|
||||||
targetPort: 6379
|
|
||||||
|
|
||||||
---
|
|
||||||
# Authentik server Deployment — HTTP frontend on :9000.
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: authentik-server
|
|
||||||
namespace: authentik
|
|
||||||
labels:
|
|
||||||
app: authentik-server
|
|
||||||
argocd.argoproj.io/instance: infra-authentik
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
strategy:
|
|
||||||
type: Recreate # shares /media RWO PVC with worker
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: authentik-server
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: authentik-server
|
|
||||||
spec:
|
|
||||||
securityContext:
|
|
||||||
# The Authentik image runs as uid 1000 ("authentik"), but the Longhorn PVC mounts
# root:root by default. Setting fsGroup makes kubelet recursively chgrp the volume and
# add group rwx, so the non-root container can mkdir /media/public during the tenant_files migration.
|
|
||||||
fsGroup: 1000
|
|
||||||
containers:
|
|
||||||
- name: server
|
|
||||||
image: ghcr.io/goauthentik/server:2024.12.3
|
|
||||||
args: ["server"]
|
|
||||||
ports:
|
|
||||||
- containerPort: 9000
|
|
||||||
name: http
|
|
||||||
- containerPort: 9443
|
|
||||||
name: https
|
|
||||||
env:
|
|
||||||
- name: AUTHENTIK_SECRET_KEY
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: AUTHENTIK_SECRET_KEY
|
|
||||||
- name: AUTHENTIK_REDIS__HOST
|
|
||||||
value: authentik-redis
|
|
||||||
- name: AUTHENTIK_REDIS__PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: REDIS_PASSWORD
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__HOST
|
|
||||||
value: authentik-postgres
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__NAME
|
|
||||||
value: authentik
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__USER
|
|
||||||
value: authentik
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: POSTGRES_PASSWORD
|
|
||||||
- name: AUTHENTIK_BOOTSTRAP_PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: BOOTSTRAP_ADMIN_PASSWORD
|
|
||||||
- name: AUTHENTIK_BOOTSTRAP_TOKEN
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: BOOTSTRAP_ADMIN_TOKEN
|
|
||||||
- name: AUTHENTIK_BOOTSTRAP_EMAIL
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: BOOTSTRAP_ADMIN_EMAIL
|
|
||||||
- name: AUTHENTIK_DISABLE_UPDATE_CHECK
|
|
||||||
value: "true"
|
|
||||||
- name: AUTHENTIK_ERROR_REPORTING__ENABLED
|
|
||||||
value: "false"
|
|
||||||
- name: AUTHENTIK_LOG_LEVEL
|
|
||||||
value: info
|
|
||||||
# First-boot Authentik can take 3+ min on the migration phase
|
|
||||||
# (waiting on DB lock while worker also runs migrations). Initial
|
|
||||||
# delays are generous so kubelet doesn't kill the pod mid-migration;
|
|
||||||
# periodSeconds keeps post-startup probing responsive.
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /-/health/ready/
|
|
||||||
port: 9000
|
|
||||||
initialDelaySeconds: 60
|
|
||||||
periodSeconds: 10
|
|
||||||
timeoutSeconds: 5
|
|
||||||
failureThreshold: 12
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /-/health/live/
|
|
||||||
port: 9000
|
|
||||||
initialDelaySeconds: 300
|
|
||||||
periodSeconds: 30
|
|
||||||
timeoutSeconds: 10
|
|
||||||
failureThreshold: 3
|
|
||||||
startupProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /-/health/live/
|
|
||||||
port: 9000
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 15
|
|
||||||
timeoutSeconds: 10
|
|
||||||
failureThreshold: 40 # 30s + 40*15s = 10.5 min budget
|
|
||||||
resources:
|
|
||||||
requests: { cpu: 150m, memory: 512Mi }
|
|
||||||
limits: { cpu: 1500m, memory: 1Gi }
|
|
||||||
volumeMounts:
|
|
||||||
- name: media
|
|
||||||
mountPath: /media
|
|
||||||
volumes:
|
|
||||||
- name: media
|
|
||||||
persistentVolumeClaim:
|
|
||||||
claimName: authentik-media
|
|
||||||
|
|
||||||
---
|
|
||||||
# Authentik worker Deployment — runs Celery background tasks.
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: authentik-worker
|
|
||||||
namespace: authentik
|
|
||||||
labels:
|
|
||||||
app: authentik-worker
|
|
||||||
argocd.argoproj.io/instance: infra-authentik
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
strategy:
|
|
||||||
type: Recreate # shares /media RWO PVC with server
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: authentik-worker
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: authentik-worker
|
|
||||||
spec:
|
|
||||||
securityContext:
|
|
||||||
# Same as server pod — non-root uid 1000 needs PVC group write.
|
|
||||||
fsGroup: 1000
|
|
||||||
containers:
|
|
||||||
- name: worker
|
|
||||||
image: ghcr.io/goauthentik/server:2024.12.3
|
|
||||||
args: ["worker"]
|
|
||||||
env:
|
|
||||||
- name: AUTHENTIK_SECRET_KEY
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: AUTHENTIK_SECRET_KEY
|
|
||||||
- name: AUTHENTIK_REDIS__HOST
|
|
||||||
value: authentik-redis
|
|
||||||
- name: AUTHENTIK_REDIS__PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: REDIS_PASSWORD
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__HOST
|
|
||||||
value: authentik-postgres
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__NAME
|
|
||||||
value: authentik
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__USER
|
|
||||||
value: authentik
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: POSTGRES_PASSWORD
|
|
||||||
- name: AUTHENTIK_DISABLE_UPDATE_CHECK
|
|
||||||
value: "true"
|
|
||||||
- name: AUTHENTIK_ERROR_REPORTING__ENABLED
|
|
||||||
value: "false"
|
|
||||||
- name: AUTHENTIK_LOG_LEVEL
|
|
||||||
value: info
|
|
||||||
resources:
|
|
||||||
requests: { cpu: 100m, memory: 256Mi }
|
|
||||||
limits: { cpu: 1000m, memory: 768Mi }
|
|
||||||
volumeMounts:
|
|
||||||
- name: media
|
|
||||||
mountPath: /media
|
|
||||||
volumes:
|
|
||||||
- name: media
|
|
||||||
persistentVolumeClaim:
|
|
||||||
claimName: authentik-media
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: authentik-server
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
selector:
|
|
||||||
app: authentik-server
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
port: 9000
|
|
||||||
targetPort: 9000
|
|
||||||
- name: https
|
|
||||||
port: 9443
|
|
||||||
targetPort: 9443
|
|
||||||
|
|
||||||
---
|
|
||||||
# step-ca leaf certificate for id.iamworkin.lan.
|
|
||||||
# step-ca container resolver uses pfSense Unbound, so the public A record for id.iamworkin.lan
|
|
||||||
# MUST exist before this Certificate is applied; otherwise the cert-manager HTTP-01 challenge
# fails silently and backs off for 2h. Added 2026-05-25 via scripts/pfsense-add-id-host.py.
|
|
||||||
apiVersion: cert-manager.io/v1
|
|
||||||
kind: Certificate
|
|
||||||
metadata:
|
|
||||||
name: authentik-tls
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
secretName: authentik-tls
|
|
||||||
dnsNames:
|
|
||||||
- id.iamworkin.lan
|
|
||||||
issuerRef:
|
|
||||||
name: step-ca-acme
|
|
||||||
kind: ClusterIssuer
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: traefik.io/v1alpha1
|
|
||||||
kind: IngressRoute
|
|
||||||
metadata:
|
|
||||||
name: authentik
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
entryPoints: [websecure]
|
|
||||||
routes:
|
|
||||||
- match: Host(`id.iamworkin.lan`)
|
|
||||||
kind: Rule
|
|
||||||
services:
|
|
||||||
- name: authentik-server
|
|
||||||
port: 9000
|
|
||||||
tls:
|
|
||||||
secretName: authentik-tls
|
|
||||||
263
apps/fc-build-windows/README.md
Normal file
263
apps/fc-build-windows/README.md
Normal file
@@ -0,0 +1,263 @@
|
|||||||
|
# fc-build-windows runner gate
|
||||||
|
|
||||||
|
Status: OPEN-WITH-OPERATOR-ACTION as of 2026-05-20.
|
||||||
|
|
||||||
|
This directory is intentionally not a live runner deployment. It records the
|
||||||
|
exact gate for bringing up the Windows self-hosted runner fleet without faking
|
||||||
|
capacity in GitHub or Kubernetes.
|
||||||
|
|
||||||
|
## Lane evidence
|
||||||
|
|
||||||
|
- `D:\git\FlowerCore\FlowerCore.Notes\docs\dashboards\decisions-waiting.html`
|
||||||
|
lines 15078-15085: Q-MR-82 says the Updater Windows Sandbox E2E run is
|
||||||
|
queued and `bluejay-ws-sandbox-1` is offline.
|
||||||
|
- `D:\git\FlowerCore\FlowerCore.Notes\memory\project_morning_routine_8_2026_05_20.md`:
|
||||||
|
Morning Routine #8 carries Q-MR-82 as the fleet-wide Windows runner gap.
|
||||||
|
- `D:\git\FlowerCore\FlowerCore.Notes\docs\standards\sprint-37-codex-dispatch-log-2026-05-19.md`
|
||||||
|
lines 76, 84-85, and 97: keep BLUEJAY-WS out of runner plans, merge Linux
|
||||||
|
runner expansion separately, and keep true Windows-only workflows parked on
|
||||||
|
the Windows runner host substrate path.
|
||||||
|
- `D:\git\FlowerCore\FlowerCore.Notes\docs\ai-agents\codex-prompts\2026-05-20-xxxxl-sprint-42-orchestrator-briefs.md`
|
||||||
|
lane Cx-5: land a deployment only if a Windows runner image/substrate is
|
||||||
|
ready; otherwise commit an operator-action gate.
|
||||||
|
- `D:\git\FlowerCore\FlowerCore.Notes\memory\feedback_bluejay_ws_never_a_github_runner.md`:
|
||||||
|
BLUEJAY-WS is operator-only territory; Windows runners belong on a dedicated
|
||||||
|
KubeVirt Windows VM such as `ci1` or a sibling VM.
|
||||||
|
|
||||||
|
## Live probe summary
|
||||||
|
|
||||||
|
Commands run on 2026-05-20 from `D:\git\FlowerCore\bluejay-infra`:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
$env:KUBECONFIG="$env:USERPROFILE\.kube\rke2.yaml"
|
||||||
|
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"`t"}{.metadata.labels.kubernetes\.io/os}{"`n"}{end}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Result: `rke2-agent1`, `rke2-agent2`, and `rke2-server` all report
|
||||||
|
`kubernetes.io/os=linux`. There is no Windows Kubernetes node, so Windows
|
||||||
|
containers on RKE2 cannot satisfy `fc-build-windows`.
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
kubectl -n kubevirt-vms get vm,vmi,pods -o wide
|
||||||
|
```
|
||||||
|
|
||||||
|
Result: KubeVirt is healthy and `ci1` is `Running` / `Ready=True` on
|
||||||
|
`rke2-agent1` with VMI IP `10.42.103.35`.
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
virtctl --kubeconfig $env:USERPROFILE\.kube\rke2.yaml port-forward vm/ci1.kubevirt-vms 15985:5985
|
||||||
|
```
|
||||||
|
|
||||||
|
Result during port tests: `dial tcp 10.42.103.35:5985: connect: no route to
|
||||||
|
host`. The same result was seen for RDP 3389 and SSH 22. The VM exists, but it
|
||||||
|
is not remotely reachable for runner bootstrap from this lane.
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
gh api /repos/astoltz/FlowerCore.Updater/actions/runners `
|
||||||
|
--jq '.runners[]? | {name,status,busy,labels:[.labels[].name]}'
|
||||||
|
gh run list --repo astoltz/FlowerCore.Updater `
|
||||||
|
--workflow "Updater Windows Sandbox E2E" --limit 5
|
||||||
|
```
|
||||||
|
|
||||||
|
Result: GitHub has one Updater runner, `bluejay-ws-sandbox-1`, with
|
||||||
|
`status=offline`; run `26150689447` is still `queued`.
|
||||||
|
|
||||||
|
## Feasibility classification
|
||||||
|
|
||||||
|
### Option A: Windows containers on RKE2
|
||||||
|
|
||||||
|
Not feasible without operator-physical infrastructure work. Kubernetes Windows
|
||||||
|
containers require a Windows node. The current cluster has Linux-only RKE2
|
||||||
|
nodes.
|
||||||
|
|
||||||
|
### Option B: KubeVirt Windows VM
|
||||||
|
|
||||||
|
Partially present, not deployable from this lane.
|
||||||
|
|
||||||
|
`apps/kubevirt-vms/ci1.yaml` already defines a Windows Server 2025 KubeVirt VM
|
||||||
|
using `localhost/fc-win-server-2025:v1`, and the live VM is running. However:
|
||||||
|
|
||||||
|
- the guest is not reachable over RDP, WinRM, or SSH through `virtctl
|
||||||
|
port-forward`;
|
||||||
|
- the current root disk is a `containerDisk`, so runner installation inside the
|
||||||
|
running guest is not a durable fleet state unless the first-boot automation
|
||||||
|
re-registers on every boot or the VM is moved to a persistent PVC-backed
|
||||||
|
disk;
|
||||||
|
- FC.Updater `Updater Windows Sandbox E2E` uses
|
||||||
|
`[self-hosted, windows, windows-sandbox]`, while `fc-build-windows` build jobs
|
||||||
|
use `[self-hosted, windows, fc-build-windows]`. Do not advertise
|
||||||
|
`windows-sandbox` until Windows Sandbox has been proven in the guest.
|
||||||
|
|
||||||
|
### Option C: bluejay-ws-sandbox-1
|
||||||
|
|
||||||
|
Operator-only emergency fallback. GitHub shows it registered but offline. The
|
||||||
|
current memory says BLUEJAY-WS must not be a fleet runner host, so this lane
|
||||||
|
does not start or re-register it. If the operator deliberately overrides the
|
||||||
|
policy to drain an emergency queue, start the existing visible runner console
|
||||||
|
from the BLUEJAY-WS desktop and treat that as temporary break-glass, not the
|
||||||
|
permanent Q-MR-82 closure.
|
||||||
|
|
||||||
|
## Operator action plan
|
||||||
|
|
||||||
|
### 1. Pick the Windows host class
|
||||||
|
|
||||||
|
Use `ci1` or a sibling Windows Server 2025 VM for WPF build/test jobs that need
|
||||||
|
`fc-build-windows`.
|
||||||
|
|
||||||
|
Use a Windows 11 Pro/Enterprise KubeVirt VM for Updater or WorldBuilder
|
||||||
|
Windows Sandbox gates, unless Windows Sandbox support is explicitly proven on
|
||||||
|
the selected guest. The workflow labels must match the real capability:
|
||||||
|
|
||||||
|
- WPF build runner: `self-hosted,windows,fc-build-windows,ci1`
|
||||||
|
- Sandbox runner: `self-hosted,windows,windows-sandbox,ci-sandbox1`
|
||||||
|
|
||||||
|
### 2. Make the VM reachable and durable
|
||||||
|
|
||||||
|
From BLUEJAY-WS:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
$env:KUBECONFIG="$env:USERPROFILE\.kube\rke2.yaml"
|
||||||
|
kubectl -n kubevirt-vms get vm,vmi,pods -o wide
|
||||||
|
virtctl --kubeconfig $env:KUBECONFIG vnc ci1 -n kubevirt-vms
|
||||||
|
virtctl --kubeconfig $env:KUBECONFIG port-forward vm/ci1.kubevirt-vms 13389:3389
|
||||||
|
virtctl --kubeconfig $env:KUBECONFIG port-forward vm/ci1.kubevirt-vms 15985:5985
|
||||||
|
```
|
||||||
|
|
||||||
|
Before runner registration, fix the current port-forward failure. The expected
|
||||||
|
state is that RDP or WinRM accepts a connection through the control plane.
|
||||||
|
|
||||||
|
For durability, either:
|
||||||
|
|
||||||
|
- move the runner VM to a persistent PVC-backed root disk; or
|
||||||
|
- keep `containerDisk` and bake first-boot runner registration into the sysprep
|
||||||
|
flow using a non-expiring credential lookup path.
|
||||||
|
|
||||||
|
Do not install a runner by hand into a transient VM and call Q-MR-82 closed.
|
||||||
|
|
||||||
|
### 3. Install runner prerequisites inside the VM
|
||||||
|
|
||||||
|
Run in an elevated PowerShell session in the Windows runner guest:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
winget install Microsoft.DotNet.SDK.10 --silent
|
||||||
|
winget install Microsoft.DotNet.DesktopRuntime.8 --silent
|
||||||
|
winget install Microsoft.PowerShell --silent
|
||||||
|
winget install Git.Git --silent
|
||||||
|
winget install Microsoft.VisualStudio.2022.BuildTools --silent
|
||||||
|
winget install Google.Chrome --silent
|
||||||
|
```
|
||||||
|
|
||||||
|
For a Sandbox-capable runner only:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
Enable-WindowsOptionalFeature -Online -FeatureName Containers-DisposableClientVM -All
|
||||||
|
Restart-Computer -Force
|
||||||
|
```
|
||||||
|
|
||||||
|
After reboot:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
Get-CimInstance -ClassName Win32_OptionalFeature -Filter "Name='Containers-DisposableClientVM'"
|
||||||
|
Test-Path C:\Windows\System32\WindowsSandbox.exe
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Register repo-scoped GitHub runners
|
||||||
|
|
||||||
|
The `astoltz` account uses repo-scoped runners. Generate a fresh one-hour
|
||||||
|
registration token per repo immediately before `config.cmd`.
|
||||||
|
|
||||||
|
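From a trusted operator shell with `gh` authenticated. Note that the script below also creates `C:\fc-ghr\...` and runs `config.cmd` locally, so it must run inside the Windows runner guest (or mint the tokens separately and pass them into the guest):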
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
$repos = @(
|
||||||
|
"FlowerCore.Updater",
|
||||||
|
"FlowerCore.WorldBuilder",
|
||||||
|
"FlowerCore.DeviceManagement"
|
||||||
|
)
|
||||||
|
|
||||||
|
foreach ($repo in $repos) {
|
||||||
|
$token = gh api -X POST "/repos/astoltz/$repo/actions/runners/registration-token" --jq .token
|
||||||
|
$repoSlug = $repo.ToLowerInvariant().Replace("flowercore.", "").Replace(".", "-")
|
||||||
|
$runnerDir = "C:\fc-ghr\$repoSlug-fc-build-windows"
|
||||||
|
|
||||||
|
New-Item -ItemType Directory -Force -Path $runnerDir | Out-Null
|
||||||
|
Set-Location $runnerDir
|
||||||
|
|
||||||
|
if (-not (Test-Path ".\config.cmd")) {
|
||||||
|
Invoke-WebRequest `
|
||||||
|
-Uri "https://github.com/actions/runner/releases/download/v2.323.0/actions-runner-win-x64-2.323.0.zip" `
|
||||||
|
-OutFile "actions-runner.zip"
|
||||||
|
Add-Type -AssemblyName System.IO.Compression.FileSystem
|
||||||
|
[System.IO.Compression.ZipFile]::ExtractToDirectory((Resolve-Path actions-runner.zip), $runnerDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
.\config.cmd `
|
||||||
|
--url "https://github.com/astoltz/$repo" `
|
||||||
|
--token $token `
|
||||||
|
--name "ci1-$repoSlug-fc-build-windows" `
|
||||||
|
--labels "self-hosted,windows,fc-build-windows,ci1" `
|
||||||
|
--work "_work" `
|
||||||
|
--unattended `
|
||||||
|
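--replace `
--runasservice

# On Windows the runner service is installed and started by `config.cmd --runasservice`;
# the runner package ships svc.sh for Linux/macOS only, there is no svc.ps1.
Get-Service "actions.runner.*" | Start-Service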
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
For Updater Sandbox E2E, register only after the guest proves Sandbox support,
|
||||||
|
and use `windows-sandbox` labels:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
$token = gh api -X POST "/repos/astoltz/FlowerCore.Updater/actions/runners/registration-token" --jq .token
|
||||||
|
.\config.cmd `
|
||||||
|
--url "https://github.com/astoltz/FlowerCore.Updater" `
|
||||||
|
--token $token `
|
||||||
|
--name "ci-sandbox1-updater" `
|
||||||
|
--labels "self-hosted,windows,windows-sandbox,ci-sandbox1" `
|
||||||
|
--work "_work" `
|
||||||
|
--unattended `
|
||||||
|
--replace
|
||||||
|
```
|
||||||
|
|
||||||
|
Keep registration tokens out of Git and logs. The durable credential source for
|
||||||
|
automation should be the existing 1Password item named `GitHub PAT (Runner
|
||||||
|
Registration)`, used only to mint short-lived repo registration tokens.
|
||||||
|
|
||||||
|
### 5. Verify GitHub and workflow pickup
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
gh api /repos/astoltz/FlowerCore.Updater/actions/runners `
|
||||||
|
--jq '.runners[] | select(.labels[].name == "windows-sandbox") | {name,status,busy,labels:[.labels[].name]}'
|
||||||
|
|
||||||
|
gh api /repos/astoltz/FlowerCore.DeviceManagement/actions/runners `
|
||||||
|
--jq '.runners[] | select(.labels[].name == "fc-build-windows") | {name,status,busy,labels:[.labels[].name]}'
|
||||||
|
|
||||||
|
gh run list --repo astoltz/FlowerCore.Updater `
|
||||||
|
--workflow "Updater Windows Sandbox E2E" --limit 3
|
||||||
|
```
|
||||||
|
|
||||||
|
Q-MR-82 can be marked resolved only after the Updater run moves from `queued` to
|
||||||
|
`in_progress` or `completed` on an online runner, or after the affected WPF
|
||||||
|
build repos show online `fc-build-windows` repo-scoped runners and their queued
|
||||||
|
jobs start.
|
||||||
|
|
||||||
|
## Break-glass BLUEJAY-WS command
|
||||||
|
|
||||||
|
Only if the operator explicitly overrides the "BLUEJAY-WS is not a runner"
|
||||||
|
policy to drain a queue:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
Set-Location C:\fc-ghr\updater-sandbox
|
||||||
|
.\run.cmd
|
||||||
|
```
|
||||||
|
|
||||||
|
If a Windows service exists:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
Get-Service 'actions.runner.*'
|
||||||
|
Start-Service 'actions.runner.*'
|
||||||
|
```
|
||||||
|
|
||||||
|
This does not close Q-MR-82 permanently. It is a temporary queue drain until a
|
||||||
|
dedicated VM runner is online.
|
||||||
4
apps/fc-build-windows/kustomization.yaml
Normal file
4
apps/fc-build-windows/kustomization.yaml
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
resources:
|
||||||
|
- operator-gate-configmap.yaml
|
||||||
61
apps/fc-build-windows/operator-gate-configmap.yaml
Normal file
61
apps/fc-build-windows/operator-gate-configmap.yaml
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: fc-build-windows-operator-gate
|
||||||
|
namespace: kubevirt-vms
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: fc-build-windows
|
||||||
|
app.kubernetes.io/component: operator-gate
|
||||||
|
app.kubernetes.io/part-of: github-runner
|
||||||
|
flowercore.io/q-card: Q-MR-82
|
||||||
|
annotations:
|
||||||
|
flowercore.io/outcome: OPEN-WITH-OPERATOR-ACTION
|
||||||
|
flowercore.io/live-runner: "false"
|
||||||
|
data:
|
||||||
|
outcome: OPEN-WITH-OPERATOR-ACTION
|
||||||
|
gate.md: |
|
||||||
|
Do not treat this ConfigMap as runner capacity.
|
||||||
|
|
||||||
|
Current probe, 2026-05-20:
|
||||||
|
- RKE2 nodes are linux-only; Windows containers require a Windows node.
|
||||||
|
- KubeVirt `ci1` is Running/Ready, but RDP 3389, WinRM 5985, and SSH 22
|
||||||
|
through `virtctl port-forward` return `connect: no route to host`.
|
||||||
|
- GitHub Updater runner list has only `bluejay-ws-sandbox-1`, status
|
||||||
|
offline. Updater Windows Sandbox E2E run 26150689447 remains queued.
|
||||||
|
|
||||||
|
Required operator action:
|
||||||
|
1. Make a dedicated Windows VM reachable and durable.
|
||||||
|
2. Install .NET 10 SDK, .NET 8 Desktop Runtime, Git, VS Build Tools, and
|
||||||
|
PowerShell 7.
|
||||||
|
3. Register repo-scoped runners with short-lived GitHub registration tokens.
|
||||||
|
4. Add `fc-build-windows` labels only to WPF build-capable guests.
|
||||||
|
5. Add `windows-sandbox` labels only after Sandbox support is proven.
|
||||||
|
registration-token-pattern.ps1: |
|
||||||
|
$repo = "FlowerCore.Updater"
|
||||||
|
$token = gh api -X POST "/repos/astoltz/$repo/actions/runners/registration-token" --jq .token
|
||||||
|
$runnerDir = "C:\fc-ghr\updater-fc-build-windows"
|
||||||
|
|
||||||
|
New-Item -ItemType Directory -Force -Path $runnerDir | Out-Null
|
||||||
|
Set-Location $runnerDir
|
||||||
|
|
||||||
|
# Install the Actions runner package here if config.cmd is absent.
|
||||||
|
.\config.cmd `
|
||||||
|
--url "https://github.com/astoltz/$repo" `
|
||||||
|
--token $token `
|
||||||
|
--name "ci1-updater-fc-build-windows" `
|
||||||
|
--labels "self-hosted,windows,fc-build-windows,ci1" `
|
||||||
|
--work "_work" `
|
||||||
|
--unattended `
|
||||||
|
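--replace `
--runasservice

# On Windows the runner service is installed and started by `config.cmd --runasservice`;
# the runner package ships svc.sh for Linux/macOS only, there is no svc.ps1.
Get-Service "actions.runner.*" | Start-Service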
|
||||||
|
verification.ps1: |
|
||||||
|
gh api /repos/astoltz/FlowerCore.Updater/actions/runners `
|
||||||
|
--jq '.runners[] | {name,status,busy,labels:[.labels[].name]}'
|
||||||
|
|
||||||
|
gh run list --repo astoltz/FlowerCore.Updater `
|
||||||
|
--workflow "Updater Windows Sandbox E2E" --limit 3
|
||||||
|
|
||||||
|
$env:KUBECONFIG="$env:USERPROFILE\.kube\rke2.yaml"
|
||||||
|
kubectl -n kubevirt-vms get vm,vmi,pods -o wide
|
||||||
2
apps/github-runner/.gitattributes
vendored
2
apps/github-runner/.gitattributes
vendored
@@ -1,2 +0,0 @@
|
|||||||
*.sh text eol=lf
|
|
||||||
Dockerfile text eol=lf
|
|
||||||
@@ -1,54 +0,0 @@
|
|||||||
FROM myoung34/github-runner:latest

# Pinned Ruby inputs. RUBY_MINOR is the toolcache directory name that the PATH
# entries below point at.
ARG RUBY_VERSION=3.3.11
ARG RUBY_MINOR=3.3
ARG RUBY_BUILD_VERSION=v20260326
# Numeric owner passed through to install-ruby-toolcache.sh.
ARG RUNNER_UID=1001
ARG RUNNER_GID=1001

# RUNNER_TOOL_CACHE is the runtime toolcache under /home/runner;
# RUNNER_RUBY_TOOLCACHE is the copy baked into the image. Both Ruby bin
# directories are put on PATH, with the /home/runner one first.
ENV RUNNER_TOOL_CACHE=/home/runner/_tool \
    RUNNER_RUBY_TOOLCACHE=/opt/runner-toolcache \
    PATH="/home/runner/_tool/Ruby/${RUBY_MINOR}/x64/bin:/opt/runner-toolcache/Ruby/${RUBY_MINOR}/x64/bin:${PATH}"

USER root

# Trust the IAmWorkin step-ca root CA system-wide. The stock Ubuntu bundle in
# the runner image lacks this internal root, so .NET HttpClient calls from CI
# tests to *.iamworkin.lan hosts (e.g. https://selenium.iamworkin.lan/session)
# fail with `PartialChain`. update-ca-certificates (run below) rebuilds
# /etc/ssl/certs/ca-certificates.crt, which OpenSSL and .NET on Linux read by
# default, so no SSL_CERT_FILE override is required.
COPY step-ca-root.crt /usr/local/share/ca-certificates/iamworkin-step-ca-root.crt

# One layer: Ruby build dependencies, CA bundle refresh, and the pinned
# ruby-build release. Temporary files and apt lists are removed at the end.
RUN set -e; \
    apt-get update; \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        autoconf \
        bison \
        build-essential \
        ca-certificates \
        curl \
        libdb-dev \
        libffi-dev \
        libgdbm-dev \
        libgmp-dev \
        libncurses-dev \
        libreadline-dev \
        libssl-dev \
        libyaml-dev \
        patch \
        pkg-config \
        uuid-dev \
        zlib1g-dev; \
    update-ca-certificates; \
    curl -fsSL "https://github.com/rbenv/ruby-build/archive/refs/tags/${RUBY_BUILD_VERSION}.tar.gz" -o /tmp/ruby-build.tar.gz; \
    mkdir -p /tmp/ruby-build; \
    tar -xzf /tmp/ruby-build.tar.gz --strip-components=1 -C /tmp/ruby-build; \
    /tmp/ruby-build/install.sh; \
    rm -rf /tmp/ruby-build /tmp/ruby-build.tar.gz /var/lib/apt/lists/*

COPY install-ruby-toolcache.sh /usr/local/bin/install-ruby-toolcache.sh

# Build Ruby into the baked toolcache, then confirm `ruby` resolves via PATH.
RUN set -e; \
    chmod +x /usr/local/bin/install-ruby-toolcache.sh; \
    RUBY_VERSION="${RUBY_VERSION}" \
    RUBY_MINOR="${RUBY_MINOR}" \
    TOOLCACHE_ROOT="${RUNNER_RUBY_TOOLCACHE}" \
    RUNNER_UID="${RUNNER_UID}" \
    RUNNER_GID="${RUNNER_GID}" \
        /usr/local/bin/install-ruby-toolcache.sh; \
    ruby -v
|
|
||||||
@@ -7,17 +7,12 @@ Deployments with `kubectl`; update this manifest and let ArgoCD reconcile.
|
|||||||
|
|
||||||
All repo-scoped Linux runners use:
|
All repo-scoped Linux runners use:
|
||||||
|
|
||||||
- `localhost/fc-github-runner:v20260525-ruby3.3.11-stepca`, derived from
|
|
||||||
`myoung34/github-runner:latest`
|
|
||||||
- `ACCESS_TOKEN` from the `github-runner-token` Secret
|
- `ACCESS_TOKEN` from the `github-runner-token` Secret
|
||||||
- `RUN_AS_ROOT=false`
|
- `RUN_AS_ROOT=false`
|
||||||
- `EPHEMERAL=true`
|
- `EPHEMERAL=true`
|
||||||
- `LABELS=self-hosted,linux,fc-build-linux`
|
- `LABELS=self-hosted,linux,fc-build-linux`
|
||||||
- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
|
- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
|
||||||
Actions tool cache
|
Actions tool cache
|
||||||
- Ruby 3.3.11 seeded into `/home/runner/_tool/Ruby/3.3/x64` from the baked
|
|
||||||
`/opt/runner-toolcache` copy so `ruby/setup-ruby@v1` can discover it on
|
|
||||||
self-hosted `ubuntu-20.04-x64` runners
|
|
||||||
|
|
||||||
`github-runner` for `FlowerCore.Common` is single-replica because it retains the
|
`github-runner` for `FlowerCore.Common` is single-replica because it retains the
|
||||||
original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
|
original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
|
||||||
@@ -33,46 +28,6 @@ Sprint 32 final long-tail wave adds 16 two-replica Deployments:
|
|||||||
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
|
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
|
||||||
`FlowerCore.MenuBoard`.
|
`FlowerCore.MenuBoard`.
|
||||||
|
|
||||||
## Image Build
|
|
||||||
|
|
||||||
Ruby is baked with a pinned `ruby-build` release and Ruby patch version. The pod
|
|
||||||
still mounts an `emptyDir` over `/home/runner`, so the `setup-runner-home` init
|
|
||||||
container copies the baked toolcache from `/opt/runner-toolcache/Ruby` into
|
|
||||||
`/home/runner/_tool/Ruby` before the runner container starts.
|
|
||||||
|
|
||||||
The IAmWorkin step-ca root CA is also baked into the system trust store
|
|
||||||
(`/usr/local/share/ca-certificates/iamworkin-step-ca-root.crt`, registered by
|
|
||||||
`update-ca-certificates`). Without it, .NET HttpClient calls from CI tests
|
|
||||||
against `*.iamworkin.lan` (e.g. `https://selenium.iamworkin.lan/session`)
|
|
||||||
fail with `PartialChain`. To refresh the bundled cert when the root rotates,
|
|
||||||
re-extract from the cluster and overwrite `step-ca-root.crt`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
kubectl get secret -n cert-manager step-ca-root \
|
|
||||||
-o jsonpath='{.data.ca\.crt}' | base64 -d > step-ca-root.crt
|
|
||||||
```
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd apps/github-runner
|
|
||||||
podman build -t localhost/fc-github-runner:v20260525-ruby3.3.11-stepca .
|
|
||||||
podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca ruby -v
|
|
||||||
podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
|
|
||||||
test -f /opt/runner-toolcache/Ruby/3.3/x64.complete
|
|
||||||
podman save localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
|
|
||||||
-o fc-github-runner-v20260525-ruby3.3.11-stepca.tar
|
|
||||||
```
|
|
||||||
|
|
||||||
Import the saved image on every schedulable RKE2 node before ArgoCD rolls the
|
|
||||||
Deployments:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
for node in rke2-server rke2-agent1 rke2-agent2; do
|
|
||||||
scp fc-github-runner-v20260525-ruby3.3.11-stepca.tar "$node:/tmp/"
|
|
||||||
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca || true'
|
|
||||||
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-github-runner-v20260525-ruby3.3.11-stepca.tar'
|
|
||||||
done
|
|
||||||
```
|
|
||||||
|
|
||||||
## Post-Merge Proof
|
## Post-Merge Proof
|
||||||
|
|
||||||
After the PR is merged and ArgoCD syncs, verify the runner fleet:
|
After the PR is merged and ArgoCD syncs, verify the runner fleet:
|
||||||
@@ -81,14 +36,6 @@ After the PR is merged and ArgoCD syncs, verify the runner fleet:
|
|||||||
kubectl -n github-runner get deploy,pods,pvc
|
kubectl -n github-runner get deploy,pods,pvc
|
||||||
```
|
```
|
||||||
|
|
||||||
Verify the Ruby toolcache in a fresh pod:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- ruby -v
|
|
||||||
kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- sh -c \
|
|
||||||
'echo "$RUNNER_TOOL_CACHE" && test -f "$RUNNER_TOOL_CACHE/Ruby/3.3/x64.complete"'
|
|
||||||
```
|
|
||||||
|
|
||||||
Verify GitHub registration for the repo-scoped runners:
|
Verify GitHub registration for the repo-scoped runners:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -122,10 +69,6 @@ from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
|
|||||||
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
|
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
|
||||||
`DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
|
`DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
|
||||||
present on the runner pod.
|
present on the runner pod.
|
||||||
- `ruby/setup-ruby@v1` says self-hosted runners must install Ruby in
|
|
||||||
`$RUNNER_TOOL_CACHE`: check that the init container copied
|
|
||||||
`/opt/runner-toolcache/Ruby` into `/home/runner/_tool/Ruby` and that
|
|
||||||
`/home/runner/_tool/Ruby/3.3/x64.complete` exists.
|
|
||||||
- `404` during runner registration: the fine-grained PAT is valid but missing
|
- `404` during runner registration: the fine-grained PAT is valid but missing
|
||||||
repository access for that repo. Add the repo to the PAT access list; the PAT
|
repository access for that repo. Add the repo to the PAT access list; the PAT
|
||||||
value does not change.
|
value does not change.
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -1,19 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Builds one pinned Ruby into a toolcache tree:
#
#   ${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64           install prefix
#   ${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64.complete  completion marker
#   ${TOOLCACHE_ROOT}/Ruby/${RUBY_MINOR} -> ${RUBY_VERSION}  minor-version alias
#
# Inputs (environment variables, all optional):
#   RUBY_VERSION         full version handed to ruby-build   (default 3.3.11)
#   RUBY_MINOR           name of the alias symlink           (default 3.3)
#   TOOLCACHE_ROOT       root of the toolcache tree          (default /opt/runner-toolcache)
#   RUNNER_UID/GID       owner applied to the whole tree     (default 1001:1001)
#   RUBY_CONFIGURE_OPTS  configure flags for ruby-build
#                        (default "--disable-install-doc --disable-yjit")
#
# Requires `ruby-build` on PATH. Any failing step aborts the script.
set -euo pipefail

RUBY_VERSION="${RUBY_VERSION:-3.3.11}"
RUBY_MINOR="${RUBY_MINOR:-3.3}"
TOOLCACHE_ROOT="${TOOLCACHE_ROOT:-/opt/runner-toolcache}"
RUNNER_UID="${RUNNER_UID:-1001}"
RUNNER_GID="${RUNNER_GID:-1001}"
RUBY_PREFIX="${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64"

mkdir -p "${TOOLCACHE_ROOT}/Ruby"
RUBY_CONFIGURE_OPTS="${RUBY_CONFIGURE_OPTS:---disable-install-doc --disable-yjit}" ruby-build "${RUBY_VERSION}" "${RUBY_PREFIX}"

# Verify the interpreter starts BEFORE publishing the marker and alias, so a
# broken build never leaves a tree that looks complete to later consumers.
"${RUBY_PREFIX}/bin/ruby" -v

touch "${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64.complete"
# Relative target keeps the alias valid if the Ruby/ directory is copied
# elsewhere as a unit.
ln -sfn "${RUBY_VERSION}" "${TOOLCACHE_ROOT}/Ruby/${RUBY_MINOR}"

# Hand the tree to the runner account and make it readable/traversable by all.
chown -R "${RUNNER_UID}:${RUNNER_GID}" "${TOOLCACHE_ROOT}"
chmod -R a+rX "${TOOLCACHE_ROOT}"
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
-----BEGIN CERTIFICATE-----
|
|
||||||
MIIBxDCCAWqgAwIBAgIRAPY357G6ow6zMAL5+4bS2kkwCgYIKoZIzj0EAwIwQDEa
|
|
||||||
MBgGA1UEChMRSUFtV29ya2luIEFDTUUgQ0ExIjAgBgNVBAMTGUlBbVdvcmtpbiBB
|
|
||||||
Q01FIENBIFJvb3QgQ0EwHhcNMjYwMzA4MTgwNzExWhcNMzYwMzA1MTgwNzExWjBA
|
|
||||||
MRowGAYDVQQKExFJQW1Xb3JraW4gQUNNRSBDQTEiMCAGA1UEAxMZSUFtV29ya2lu
|
|
||||||
IEFDTUUgQ0EgUm9vdCBDQTBZMBMGByqGSM49AgEGCCqGSM49AwEHA0IABJ2n04X1
|
|
||||||
JZo5Zdq/i1Idv8+fqwZyAzBh7whbqj0SWsJL8UWRabCMqYCs7+dXO0xRSzqkwFDL
|
|
||||||
x+vooOai8RgRNhajRTBDMA4GA1UdDwEB/wQEAwIBBjASBgNVHRMBAf8ECDAGAQH/
|
|
||||||
AgEBMB0GA1UdDgQWBBRnuPPQR6iM/H6vOluiU3Sygayz8jAKBggqhkjOPQQDAgNI
|
|
||||||
ADBFAiEArQK9dYPGmAZsdYnjziuFVVE5NKZUcceYvGfGC+tLXUsCIAudF2zJrCRq
|
|
||||||
3mK50ZZET/fwTkJwiEF4824mjP8p1CKM
|
|
||||||
-----END CERTIFICATE-----
|
|
||||||
@@ -280,14 +280,13 @@ data:
|
|||||||
printer_model: "NuPrint 210"
|
printer_model: "NuPrint 210"
|
||||||
|
|
||||||
# Print.Web health (Blazor app on edge2:5200)
|
# Print.Web health (Blazor app on edge2:5200)
|
||||||
# Target `/health` (anonymous) — root path requires API key auth and returns 401.
|
|
||||||
- job_name: "probe-printweb"
|
- job_name: "probe-printweb"
|
||||||
metrics_path: /probe
|
metrics_path: /probe
|
||||||
params:
|
params:
|
||||||
module: [http_2xx]
|
module: [http_2xx]
|
||||||
scrape_interval: 30s
|
scrape_interval: 30s
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ["http://10.0.57.16:5200/health"]
|
- targets: ["http://10.0.57.16:5200/"]
|
||||||
labels:
|
labels:
|
||||||
instance: "print-web"
|
instance: "print-web"
|
||||||
service: "print-web"
|
service: "print-web"
|
||||||
|
|||||||
@@ -24,16 +24,7 @@
|
|||||||
# (10.0.57.16:5200), public internet 80/443 (excluding RFC1918), and
|
# (10.0.57.16:5200), public internet 80/443 (excluding RFC1918), and
|
||||||
# fc-signage:5190 for the signage AAT lane.
|
# fc-signage:5190 for the signage AAT lane.
|
||||||
# - Ingress: Traefik (4444 + 8089 ACME-solver-style), intra-pod,
|
# - Ingress: Traefik (4444 + 8089 ACME-solver-style), intra-pod,
|
||||||
# telephony / gitea / fc-system / fc-signage / github-runner namespaces
|
# telephony / gitea / fc-system / fc-signage namespaces on 4444.
|
||||||
# on 4444.
|
|
||||||
#
|
|
||||||
# 2026-05-25: added github-runner ingress on 4444 so CI jobs running in
|
|
||||||
# self-hosted runner pods (e.g. FlowerCore.Print.Web `help-screenshots`)
|
|
||||||
# can reach the grid. Without this allow, the session POST to
|
|
||||||
# `selenium-hub.selenium.svc.cluster.local:4444` was DNAT'd to the hub
|
|
||||||
# pod IP and then dropped at the Calico ingress hook — Selenium UI showed
|
|
||||||
# 0/4 sessions while the .NET HTTP client timed out at 60s. Same family
|
|
||||||
# as `feedback_netpol_dnat_backend_port`, wrong-source-namespace flavor.
|
|
||||||
apiVersion: networking.k8s.io/v1
|
apiVersion: networking.k8s.io/v1
|
||||||
kind: NetworkPolicy
|
kind: NetworkPolicy
|
||||||
metadata:
|
metadata:
|
||||||
@@ -212,13 +203,6 @@ spec:
|
|||||||
ports:
|
ports:
|
||||||
- port: 4444
|
- port: 4444
|
||||||
protocol: TCP
|
protocol: TCP
|
||||||
- from:
|
|
||||||
- namespaceSelector:
|
|
||||||
matchLabels:
|
|
||||||
kubernetes.io/metadata.name: github-runner
|
|
||||||
ports:
|
|
||||||
- port: 4444
|
|
||||||
protocol: TCP
|
|
||||||
podSelector: {}
|
podSelector: {}
|
||||||
policyTypes:
|
policyTypes:
|
||||||
- Ingress
|
- Ingress
|
||||||
|
|||||||
@@ -1,427 +0,0 @@
|
|||||||
# Selenium Grid 4 — RKE2 deployment
|
|
||||||
#
|
|
||||||
# Hub + chrome + firefox + edge browser nodes serving fleet-wide AAT runs from
|
|
||||||
# the GitHub Actions self-hosted runners. ArgoCD owns this namespace from
|
|
||||||
# 2026-05-25 (`infra-selenium` Application; previously these resources were
|
|
||||||
# orphan kubectl-applied since 2026-03-15).
|
|
||||||
#
|
|
||||||
# Endpoints:
|
|
||||||
# - Internal cluster: http://selenium-hub.selenium.svc.cluster.local:4444
|
|
||||||
# - LAN LoadBalancer (MetalLB): http://10.0.56.208:4444
|
|
||||||
# - Traefik public: https://selenium.iamworkin.lan
|
|
||||||
#
|
|
||||||
# Browser maxSessions:
|
|
||||||
# - chrome 2 (bumped from 1 on 2026-05-25 morning-routine — AAT-heavy
|
|
||||||
# Print.Web help-screenshots was the global bottleneck;
|
|
||||||
# see commit history for ops/runner-replica-rightsize)
|
|
||||||
# - firefox 1
|
|
||||||
# - edge 1
|
|
||||||
#
|
|
||||||
# Screenshots + video recording write to NFS via the chrome video sidecar.
|
|
||||||
# See: CLAUDE.md "Selenium Grid & Visual AAT Testing" + bluejay-infra ADR notes.
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-hub
|
|
||||||
app.kubernetes.io/name: selenium-hub
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
name: selenium-hub
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
ports:
|
|
||||||
- name: web
|
|
||||||
port: 4444
|
|
||||||
targetPort: 4444
|
|
||||||
- name: publish
|
|
||||||
port: 4442
|
|
||||||
targetPort: 4442
|
|
||||||
- name: subscribe
|
|
||||||
port: 4443
|
|
||||||
targetPort: 4443
|
|
||||||
selector:
|
|
||||||
app: selenium-hub
|
|
||||||
type: ClusterIP
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
annotations:
|
|
||||||
metallb.io/ip-allocated-from-pool: bluejay-pool
|
|
||||||
metallb.universe.tf/loadBalancerIPs: 10.0.56.208
|
|
||||||
labels:
|
|
||||||
app: selenium-hub
|
|
||||||
component: external-access
|
|
||||||
name: selenium-hub-external
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
clusterIP: 10.43.90.147
|
|
||||||
clusterIPs:
|
|
||||||
- 10.43.90.147
|
|
||||||
externalTrafficPolicy: Local
|
|
||||||
healthCheckNodePort: 32213
|
|
||||||
ports:
|
|
||||||
- name: web
|
|
||||||
nodePort: 32411
|
|
||||||
port: 4444
|
|
||||||
targetPort: 4444
|
|
||||||
- name: publish
|
|
||||||
nodePort: 32068
|
|
||||||
port: 4442
|
|
||||||
targetPort: 4442
|
|
||||||
- name: subscribe
|
|
||||||
nodePort: 31000
|
|
||||||
port: 4443
|
|
||||||
targetPort: 4443
|
|
||||||
selector:
|
|
||||||
app: selenium-hub
|
|
||||||
type: LoadBalancer
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-hub
|
|
||||||
app.kubernetes.io/name: selenium-hub
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
name: selenium-hub
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: selenium-hub
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-hub
|
|
||||||
app.kubernetes.io/name: selenium-hub
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- env:
|
|
||||||
- name: SE_NODE_SESSION_TIMEOUT
|
|
||||||
value: '300'
|
|
||||||
- name: SE_SESSION_REQUEST_TIMEOUT
|
|
||||||
value: '300'
|
|
||||||
- name: SE_SESSION_RETRY_INTERVAL
|
|
||||||
value: '5'
|
|
||||||
- name: JAVA_OPTS
|
|
||||||
value: -Xmx512m
|
|
||||||
image: selenium/hub:4.27.0
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /wd/hub/status
|
|
||||||
port: 4444
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 15
|
|
||||||
timeoutSeconds: 5
|
|
||||||
name: selenium-hub
|
|
||||||
ports:
|
|
||||||
- containerPort: 4444
|
|
||||||
name: web
|
|
||||||
- containerPort: 4442
|
|
||||||
name: publish
|
|
||||||
- containerPort: 4443
|
|
||||||
name: subscribe
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /wd/hub/status
|
|
||||||
port: 4444
|
|
||||||
initialDelaySeconds: 10
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 5
|
|
||||||
# Hub baseline working set ~766Mi on 2026-05-25 (75% of prior 1Gi
|
|
||||||
# limit). Bump to 1.5Gi / 1Gi to keep ~50% headroom; matches the
|
|
||||||
# stampede-buffer pattern documented for multus
|
|
||||||
# (feedback_k8s_cni_multus_sizing). CPU left alone — observed 54m
|
|
||||||
# against a 500m limit, no contention.
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 1536Mi
|
|
||||||
requests:
|
|
||||||
cpu: 250m
|
|
||||||
memory: 1Gi
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-node-chrome
|
|
||||||
app.kubernetes.io/name: selenium-node-chrome
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
name: selenium-node-chrome
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: selenium-node-chrome
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-node-chrome
|
|
||||||
app.kubernetes.io/name: selenium-node-chrome
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- env:
|
|
||||||
- name: SE_EVENT_BUS_HOST
|
|
||||||
value: selenium-hub
|
|
||||||
- name: SE_EVENT_BUS_PUBLISH_PORT
|
|
||||||
value: '4442'
|
|
||||||
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
|
|
||||||
value: '4443'
|
|
||||||
- name: SE_NODE_MAX_SESSIONS
|
|
||||||
value: '2'
|
|
||||||
# Must be 'true' for SE_NODE_MAX_SESSIONS=2 to take effect: Selenium caps
# max sessions at the processor count visible to the JVM unless the override
# is enabled, and this container is limited to cpu '1'.
# NOTE(review): confirm in the Grid UI that the chrome node reports 2 slots.
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
  value: 'true'
|
|
||||||
- name: SE_VNC_NO_PASSWORD
|
|
||||||
value: '1'
|
|
||||||
- name: SE_SCREEN_WIDTH
|
|
||||||
value: '1920'
|
|
||||||
- name: SE_SCREEN_HEIGHT
|
|
||||||
value: '1080'
|
|
||||||
- name: SE_NODE_SESSION_TIMEOUT
|
|
||||||
value: '300'
|
|
||||||
image: selenium/node-chrome:4.27.0
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /status
|
|
||||||
port: 5555
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 15
|
|
||||||
name: selenium-chrome
|
|
||||||
ports:
|
|
||||||
- containerPort: 5555
|
|
||||||
name: node
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /status
|
|
||||||
port: 5555
|
|
||||||
initialDelaySeconds: 15
|
|
||||||
periodSeconds: 5
|
|
||||||
# Chromium-based browser node. Bumped from 1Gi -> 2Gi (req 512Mi
|
|
||||||
# -> 1Gi) on 2026-05-25 — Edge had 51 OOMKills in 5d on the
|
|
||||||
# original 1Gi cap (~1 OOM every 2.4h), and Chrome at maxSessions=2
|
|
||||||
# was running 684Mi idle on the same cap. Matches the Firefox node's
|
|
||||||
# tested-stable 2Gi limit. CPU unchanged.
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: '1'
|
|
||||||
memory: 2Gi
|
|
||||||
requests:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 1Gi
|
|
||||||
volumeMounts:
|
|
||||||
- mountPath: /dev/shm
|
|
||||||
name: dshm
|
|
||||||
- env:
|
|
||||||
- name: DISPLAY_CONTAINER_NAME
|
|
||||||
value: localhost
|
|
||||||
- name: SE_SCREEN_WIDTH
|
|
||||||
value: '1920'
|
|
||||||
- name: SE_SCREEN_HEIGHT
|
|
||||||
value: '1080'
|
|
||||||
- name: SE_VIDEO_FILE_NAME
|
|
||||||
value: auto
|
|
||||||
- name: SE_VIDEO_UPLOAD_ENABLED
|
|
||||||
value: 'false'
|
|
||||||
image: selenium/video:ffmpeg-7.1-20250101
|
|
||||||
name: video
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 768Mi
|
|
||||||
requests:
|
|
||||||
cpu: 250m
|
|
||||||
memory: 384Mi
|
|
||||||
volumeMounts:
|
|
||||||
- mountPath: /videos
|
|
||||||
name: selenium-videos
|
|
||||||
volumes:
|
|
||||||
- emptyDir:
|
|
||||||
medium: Memory
|
|
||||||
sizeLimit: 2Gi
|
|
||||||
name: dshm
|
|
||||||
- emptyDir:
|
|
||||||
sizeLimit: 5Gi
|
|
||||||
name: selenium-videos
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-node-firefox
|
|
||||||
app.kubernetes.io/name: selenium-node-firefox
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
name: selenium-node-firefox
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: selenium-node-firefox
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-node-firefox
|
|
||||||
app.kubernetes.io/name: selenium-node-firefox
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- env:
|
|
||||||
- name: SE_EVENT_BUS_HOST
|
|
||||||
value: selenium-hub
|
|
||||||
- name: SE_EVENT_BUS_PUBLISH_PORT
|
|
||||||
value: '4442'
|
|
||||||
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
|
|
||||||
value: '4443'
|
|
||||||
- name: SE_NODE_MAX_SESSIONS
|
|
||||||
value: '1'
|
|
||||||
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
|
|
||||||
value: 'true'
|
|
||||||
- name: SE_VNC_NO_PASSWORD
|
|
||||||
value: '1'
|
|
||||||
- name: SE_START_VNC
|
|
||||||
value: 'false'
|
|
||||||
- name: SE_SCREEN_WIDTH
|
|
||||||
value: '1920'
|
|
||||||
- name: SE_SCREEN_HEIGHT
|
|
||||||
value: '1080'
|
|
||||||
- name: SE_NODE_SESSION_TIMEOUT
|
|
||||||
value: '300'
|
|
||||||
image: selenium/node-firefox:4.27.0
|
|
||||||
livenessProbe:
|
|
||||||
failureThreshold: 5
|
|
||||||
httpGet:
|
|
||||||
path: /status
|
|
||||||
port: 5555
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 15
|
|
||||||
timeoutSeconds: 5
|
|
||||||
name: selenium-firefox
|
|
||||||
ports:
|
|
||||||
- containerPort: 5555
|
|
||||||
name: node
|
|
||||||
readinessProbe:
|
|
||||||
failureThreshold: 5
|
|
||||||
httpGet:
|
|
||||||
path: /status
|
|
||||||
port: 5555
|
|
||||||
initialDelaySeconds: 15
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 5
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: '1'
|
|
||||||
memory: 2Gi
|
|
||||||
requests:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 1Gi
|
|
||||||
volumeMounts:
|
|
||||||
- mountPath: /dev/shm
|
|
||||||
name: dshm
|
|
||||||
volumes:
|
|
||||||
- emptyDir:
|
|
||||||
medium: Memory
|
|
||||||
sizeLimit: 2Gi
|
|
||||||
name: dshm
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-node-edge
|
|
||||||
app.kubernetes.io/name: selenium-node-edge
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
name: selenium-node-edge
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: selenium-node-edge
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-node-edge
|
|
||||||
app.kubernetes.io/name: selenium-node-edge
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- env:
|
|
||||||
- name: SE_EVENT_BUS_HOST
|
|
||||||
value: selenium-hub
|
|
||||||
- name: SE_EVENT_BUS_PUBLISH_PORT
|
|
||||||
value: '4442'
|
|
||||||
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
|
|
||||||
value: '4443'
|
|
||||||
- name: SE_NODE_MAX_SESSIONS
|
|
||||||
value: '1'
|
|
||||||
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
|
|
||||||
value: 'true'
|
|
||||||
- name: SE_VNC_NO_PASSWORD
|
|
||||||
value: '1'
|
|
||||||
- name: SE_SCREEN_WIDTH
|
|
||||||
value: '1920'
|
|
||||||
- name: SE_SCREEN_HEIGHT
|
|
||||||
value: '1080'
|
|
||||||
- name: SE_NODE_SESSION_TIMEOUT
|
|
||||||
value: '300'
|
|
||||||
image: selenium/node-edge:4.27.0
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /status
|
|
||||||
port: 5555
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 15
|
|
||||||
name: selenium-edge
|
|
||||||
ports:
|
|
||||||
- containerPort: 5555
|
|
||||||
name: node
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /status
|
|
||||||
port: 5555
|
|
||||||
initialDelaySeconds: 15
|
|
||||||
periodSeconds: 5
|
|
||||||
# Chromium-based browser node. Bumped from 1Gi -> 2Gi (req 512Mi
|
|
||||||
# -> 1Gi) on 2026-05-25 — Edge had 51 OOMKills in 5d on the
|
|
||||||
# original 1Gi cap (~1 OOM every 2.4h), and Chrome at maxSessions=2
|
|
||||||
# was running 684Mi idle on the same cap. Matches the Firefox node's
|
|
||||||
# tested-stable 2Gi limit. CPU unchanged.
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: '1'
|
|
||||||
memory: 2Gi
|
|
||||||
requests:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 1Gi
|
|
||||||
volumeMounts:
|
|
||||||
- mountPath: /dev/shm
|
|
||||||
name: dshm
|
|
||||||
volumes:
|
|
||||||
- emptyDir:
|
|
||||||
medium: Memory
|
|
||||||
sizeLimit: 2Gi
|
|
||||||
name: dshm
|
|
||||||
---
|
|
||||||
apiVersion: traefik.io/v1alpha1
|
|
||||||
kind: IngressRoute
|
|
||||||
metadata:
|
|
||||||
name: selenium-hub
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
entryPoints:
|
|
||||||
- websecure
|
|
||||||
routes:
|
|
||||||
- kind: Rule
|
|
||||||
match: Host(`selenium.iamworkin.lan`)
|
|
||||||
services:
|
|
||||||
- name: selenium-hub
|
|
||||||
port: 4444
|
|
||||||
tls:
|
|
||||||
secretName: selenium-tls
|
|
||||||
@@ -67,7 +67,6 @@ public sealed class FleetManifestLintTests
|
|||||||
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
|
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
|
||||||
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
|
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
|
||||||
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
|
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
|
||||||
["github-runner-updater"] = "https://github.com/astoltz/FlowerCore.Updater",
|
|
||||||
};
|
};
|
||||||
|
|
||||||
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
|
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
|
||||||
@@ -81,7 +80,6 @@ public sealed class FleetManifestLintTests
|
|||||||
"github-runner-chat",
|
"github-runner-chat",
|
||||||
"github-runner-mysql",
|
"github-runner-mysql",
|
||||||
"github-runner-kiosk-linux",
|
"github-runner-kiosk-linux",
|
||||||
"github-runner-updater",
|
|
||||||
};
|
};
|
||||||
|
|
||||||
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
|
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
|
||||||
@@ -236,7 +234,7 @@ public sealed class FleetManifestLintTests
|
|||||||
{
|
{
|
||||||
deployments.Should().ContainKey(expectedRunner.Key);
|
deployments.Should().ContainKey(expectedRunner.Key);
|
||||||
|
|
||||||
var container = deployments[expectedRunner.Key].MainContainerMappings().Should().ContainSingle().Subject;
|
var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject;
|
||||||
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
|
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
|
||||||
EnvValue(container, "EPHEMERAL").Should().Be("true");
|
EnvValue(container, "EPHEMERAL").Should().Be("true");
|
||||||
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
|
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
|
||||||
@@ -252,7 +250,7 @@ public sealed class FleetManifestLintTests
|
|||||||
{
|
{
|
||||||
foreach (var deployment in GitHubRunnerDeployments().Values)
|
foreach (var deployment in GitHubRunnerDeployments().Values)
|
||||||
{
|
{
|
||||||
var container = deployment.MainContainerMappings().Should().ContainSingle().Subject;
|
var container = deployment.ContainerMappings().Should().ContainSingle().Subject;
|
||||||
|
|
||||||
foreach (var expectedEnv in WritableRunnerEnv)
|
foreach (var expectedEnv in WritableRunnerEnv)
|
||||||
{
|
{
|
||||||
@@ -279,10 +277,7 @@ public sealed class FleetManifestLintTests
|
|||||||
foreach (var deploymentName in ScaledLinuxRunnerDeployments)
|
foreach (var deploymentName in ScaledLinuxRunnerDeployments)
|
||||||
{
|
{
|
||||||
var deployment = deployments[deploymentName];
|
var deployment = deployments[deploymentName];
|
||||||
// Scaled runners must have >= 2 replicas (avoid single-pod bottleneck).
|
ReplicaCount(deployment).Should().Be(2);
|
||||||
// Individual deployments may be tuned upward per CI activity — see
|
|
||||||
// "runners: right-size replica counts per 14d CI activity (#24)".
|
|
||||||
ReplicaCount(deployment).Should().BeGreaterOrEqualTo(2, $"{deploymentName} is in the scaled set and must run with at least 2 replicas");
|
|
||||||
|
|
||||||
var volumes = deployment.MappingSequence("spec", "template", "spec", "volumes");
|
var volumes = deployment.MappingSequence("spec", "template", "spec", "volumes");
|
||||||
var claimNames = volumes
|
var claimNames = volumes
|
||||||
@@ -308,108 +303,6 @@ public sealed class FleetManifestLintTests
|
|||||||
.Be("github-runner-nuget-cache");
|
.Be("github-runner-nuget-cache");
|
||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
public void Runners_MustNotPinToOperatorWorkstationHosts()
{
    // CRITICAL SAFETY (operator directive 2026-05-26): BLUEJAY-WS is the
    // operator's primary workstation — host of the 1Password Connect
    // bearer token, fcadmin SSH keys to noc1, signing CA private keys,
    // and source for every FC repo. A self-hosted GitHub Actions runner
    // there would execute arbitrary PR code with that local access.
    // Build-side analog of the Sprint 9 NEW safe-account exclusion gate
    // (Puppet GPO/AppLocker/WDAC/audit-forwarder modules refuse to apply
    // on BLUEJAY-WS). This lint asserts no GitHub-runner Deployment in
    // apps/github-runner/ pins to a forbidden operator-workstation host
    // via nodeName, nodeSelector, nodeAffinity (required or preferred
    // terms, matchExpressions or matchFields), or tolerations.
    // Existing legacy `bluejay-ws-sandbox-1` GitHub-registered runner is
    // out of scope here (it's a runtime registration, not a K8s
    // Deployment) — see CLAUDE.md "Common Mistakes" entry and
    // feedback_bluejay_ws_never_public_runner.md.
    var forbiddenHostPatterns = new[]
    {
        "bluejay-ws",
        "BLUEJAY-WS",
        "bluejay-ws.iamworkin.lan",
        "iamworkin-ws",
    };

    // Case-insensitive substring match; null/blank values never match.
    bool ContainsForbidden(string? value) =>
        !string.IsNullOrWhiteSpace(value)
        && forbiddenHostPatterns.Any(pattern => value!.Contains(pattern, StringComparison.OrdinalIgnoreCase));

    var violations = GitHubRunnerDeployments().Values.SelectMany(deployment =>
    {
        var local = new List<string>();
        var podSpec = ManifestNodeExtensions.Mapping(deployment.Root, "spec", "template", "spec");
        if (podSpec is null)
        {
            return local;
        }

        // Scans one node selector term. Both clause kinds are checked:
        // matchExpressions select on node labels, matchFields select on
        // node fields (such as the node's own name), so either can steer
        // a pod onto a specific host.
        void ScanSelectorTerm(YamlMappingNode term)
        {
            var clauses = new[]
            {
                (Path: "matchExpressions", Label: "matchExpression"),
                (Path: "matchFields", Label: "matchField"),
            };

            foreach (var (path, label) in clauses)
            {
                foreach (var requirement in ManifestNodeExtensions.MappingSequence(term, path))
                {
                    var key = ManifestNodeExtensions.Scalar(requirement, "key");
                    foreach (var valueNode in ManifestNodeExtensions.ScalarSequence(requirement, "values"))
                    {
                        if (ContainsForbidden(valueNode))
                        {
                            local.Add($"{deployment.Name} has nodeAffinity {label} '{key}' value '{valueNode}' which targets a forbidden operator-workstation host.");
                        }
                    }
                }
            }
        }

        // nodeName: pins the pod to a specific node by name.
        var nodeName = ManifestNodeExtensions.Scalar(podSpec, "nodeName");
        if (ContainsForbidden(nodeName))
        {
            local.Add($"{deployment.Name} sets nodeName='{nodeName}' which targets a forbidden operator-workstation host.");
        }

        // nodeSelector: dict of label → value pinning the pod to nodes
        // carrying matching labels. Examples that would trip this:
        //   kubernetes.io/hostname: bluejay-ws
        //   flowercore.io/host: bluejay-ws.iamworkin.lan
        var nodeSelector = ManifestNodeExtensions.Mapping(podSpec, "nodeSelector");
        if (nodeSelector is not null)
        {
            foreach (var entry in nodeSelector.Children)
            {
                var key = entry.Key is YamlScalarNode keyScalar ? keyScalar.Value : null;
                var value = entry.Value is YamlScalarNode valueScalar ? valueScalar.Value : null;
                if (ContainsForbidden(value))
                {
                    local.Add($"{deployment.Name} has nodeSelector entry '{key}: {value}' which targets a forbidden operator-workstation host.");
                }
            }
        }

        // nodeAffinity (hard requirement): every nodeSelectorTerm is scanned.
        foreach (var term in ManifestNodeExtensions.MappingSequence(podSpec, "affinity", "nodeAffinity", "requiredDuringSchedulingIgnoredDuringExecution", "nodeSelectorTerms"))
        {
            ScanSelectorTerm(term);
        }

        // nodeAffinity (soft preference): each entry wraps its selector term
        // under `preference`. A preference still steers the scheduler toward
        // the workstation, so it is rejected as well.
        foreach (var weighted in ManifestNodeExtensions.MappingSequence(podSpec, "affinity", "nodeAffinity", "preferredDuringSchedulingIgnoredDuringExecution"))
        {
            var preference = ManifestNodeExtensions.Mapping(weighted, "preference");
            if (preference is not null)
            {
                ScanSelectorTerm(preference);
            }
        }

        // tolerations: scheduling onto a tainted operator-workstation
        // node would let the runner run there. Forbid any toleration
        // key or value that names the workstation.
        foreach (var toleration in ManifestNodeExtensions.MappingSequence(podSpec, "tolerations"))
        {
            var key = ManifestNodeExtensions.Scalar(toleration, "key");
            var value = ManifestNodeExtensions.Scalar(toleration, "value");
            if (ContainsForbidden(key))
            {
                local.Add($"{deployment.Name} has toleration key '{key}' which targets a forbidden operator-workstation host.");
            }

            if (ContainsForbidden(value))
            {
                local.Add($"{deployment.Name} has toleration value '{value}' which targets a forbidden operator-workstation host.");
            }
        }

        return local;
    }).ToList();

    violations.Should().BeEmpty("BLUEJAY-WS / iamworkin-ws must never host a fleet GitHub Actions runner; see CLAUDE.md 'Registering BLUEJAY-WS as a fleet GitHub Actions runner' and feedback_bluejay_ws_never_public_runner.md");
}
|
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable()
|
public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable()
|
||||||
{
|
{
|
||||||
@@ -997,22 +890,6 @@ internal sealed record ManifestDocument(
|
|||||||
.ToList();
|
.ToList();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns only the entries under the pod spec's `containers` key, so
// initContainers are never included. Reach for this when an assertion is
// about the primary container (env, image, volumeMounts) and an
// initContainer would otherwise produce a false-positive match — e.g. the
// GitHub runner image's `setup-runner-home` initContainer should not count
// toward the single-container assertions on the runner deployments.
// Yields an empty list when the document has no pod spec.
public IReadOnlyList<YamlMappingNode> MainContainerMappings()
{
    if (PodSpec() is not { } mainPodSpec)
    {
        return Array.Empty<YamlMappingNode>();
    }

    var primaryContainers = ManifestNodeExtensions.MappingSequence(mainPodSpec, "containers");
    return primaryContainers.ToList();
}
|
|
||||||
|
|
||||||
public IReadOnlyList<ContainerSpec> ContainerSpecs()
|
public IReadOnlyList<ContainerSpec> ContainerSpecs()
|
||||||
{
|
{
|
||||||
return ContainerMappings()
|
return ContainerMappings()
|
||||||
|
|||||||
Reference in New Issue
Block a user