Files
bluejay-infra/apps/github-runner/github-runner.yaml
Andrew Stoltz 9a15e4ce52 feat(github-runner): pod-env DOTNET_INSTALL_DIR + initContainer for non-root runner
Sprint 30 Cl-1 acceptance fix. Sets DOTNET_INSTALL_DIR + NUGET_PACKAGES + 4
sibling env vars on both Deployments so non-root runner (UID 1001) can
write to /home/runner/.dotnet + /home/runner/.nuget without the
per-workflow patch that ~25 flipped Linux repos currently carry.

initContainer pre-creates + chowns the dirs to runner:runner so the
runtime mkdir-or-write succeeds on first restore. emptyDir mounted at
/home/runner; the .nuget/packages PVC (Common runner) wins at its nested
mount path so the persistent NuGet cache survives ephemeral pod restarts.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 21:24:30 -05:00

399 lines
16 KiB
YAML

# GitHub Actions self-hosted Linux runner — Phase 2 K8s deployment
#
# Phase 1 (current): BLUEJAY-WS registered manually as a Windows runner
# with label "fc-build-windows" via config.cmd (see docs/infrastructure/
# self-hosted-runner-fleet.md §WPF Build Runner).
#
# Phase 2 (this file): ephemeral Linux runner in RKE2 for non-WPF builds
# (Blazor Server, class libraries, operators, integration tests). Reduces
# billing for ubuntu-24.04 jobs that run on GitHub-hosted runners today.
#
# Runner image: myoung34/github-runner:latest
# EPHEMERAL=true — each pod runs exactly one job then exits; the
# Deployment controller immediately recreates it and re-registers.
# Prevents job queue starvation when two jobs overlap.
#
# NuGet cache: 5Gi Longhorn RWO PVC mounted at /home/runner/.nuget/packages
# Persists NuGet packages across ephemeral pod restarts (not shared across
# simultaneous runner pods; single-replica constraint below).
#
# Credentials:
# OnePasswordItem "GitHub PAT (Runner Registration)" → Secret
# github-runner-token with field "credential" used as ACCESS_TOKEN
# (a PAT; the runner image mints the short-lived registration token).
# Operator must create/rotate the 1P item manually; registration tokens
# expire after 1h — use a fine-grained PAT with Administration:read/write
# scope on the target repos, or a re-registration script. See
# docs/infrastructure/self-hosted-runner-fleet.md §Security.
#
# Security model:
# - No ClusterRole / ClusterRoleBinding — runner has no K8s API access.
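# - securityContext: the runner container runs as non-root (UID/GID 1001);
# only the setup-runner-home initContainer runs as root, to chown the
# /home/runner emptyDir. readOnlyRootFilesystem is not set; /tmp and
# /home/runner are emptyDir mounts because the runner needs them writable.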
# - Fork pull-request approval required on the GitHub repo settings.
# - RUNNER_ALLOW_RUNASROOT=false is the default.
#
# Cost: Phase 2 eliminates GitHub-hosted ubuntu-24.04 billing; break-even
# vs electricity is ~1 000 min/month at current TOU rates.
#
# Node placement: rke2-server (10.0.56.11) only — Longhorn RWO PVC must
# land on the same node as the volume, and the server node has the most
# spare capacity for burst CI workloads.
#
# Designs: docs/infrastructure/self-hosted-runner-fleet.md
# Questions: Q-CI-1..5 (all Recommended defaults)
---
# Namespace that every other resource in this file is created in.
apiVersion: v1
kind: Namespace
metadata:
  name: github-runner
  labels:
    app.kubernetes.io/managed-by: argocd
    app.kubernetes.io/part-of: flowercore
---
# 1Password secret sync: materialises the github-runner-token K8s Secret.
#   Source item: IAmWorkin vault > "GitHub PAT (Runner Registration)"
#   Field expected in the 1Password item:
#     credential: GitHub fine-grained PAT (Administration:read/write on
#                 target repos) used by the runner image to mint a fresh
#                 short-lived registration token at pod start.
# Operator MUST create this item before the Deployment will start cleanly.
apiVersion: onepassword.com/v1
kind: OnePasswordItem
metadata:
  name: github-runner-token
  namespace: github-runner
  labels:
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/component: credentials
spec:
  # Quoted because the path contains spaces and parentheses.
  itemPath: 'vaults/IAmWorkin/items/GitHub PAT (Runner Registration)'
---
# Persistent NuGet package cache. Claimed by the github-runner Deployment
# through its "nuget-cache" volume; ReadWriteOnce on the longhorn class.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: github-runner-nuget-cache
  namespace: github-runner
  labels:
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/component: cache
spec:
  storageClassName: longhorn
  volumeMode: Filesystem
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 5Gi
---
# Identity the runner pods run under.
# No ClusterRole or ClusterRoleBinding — runner has zero K8s API privileges.
# CI jobs that need kubectl must supply their own kubeconfig via a secret
# injected at the job level, not via this service account.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: github-runner
  namespace: github-runner
  labels:
    app.kubernetes.io/component: runner
    app.kubernetes.io/part-of: flowercore
# Having no RBAC bindings does not stop the API token from being mounted into
# every pod that uses this account. Runner pods execute arbitrary workflow
# code, so opt out of automounting to keep the credential away from CI jobs.
automountServiceAccountToken: false
---
# Common (FlowerCore.Common) repo-scoped ephemeral Linux runner.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: github-runner
  namespace: github-runner
  labels:
    app.kubernetes.io/name: github-runner
    app.kubernetes.io/component: runner
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/created-by: argocd
spec:
  # Single replica enforced: the Longhorn RWO PVC can only be mounted by
  # one pod at a time. Each pod re-registers as an ephemeral runner after
  # completing a job (EPHEMERAL=true restarts the container, not the pod,
  # so the PVC stays attached between jobs).
  #
  # 2026-05-16: bumped 0 -> 1 after operator provisioned the
  # "GitHub PAT (Runner Registration)" 1P item with field=credential.
  # Unblocks CI fleet-wide (was budget-exhausted on GH-hosted runners).
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: github-runner
  # Use Recreate to avoid the Multi-Attach RWO error during rollouts.
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app.kubernetes.io/name: github-runner
        app.kubernetes.io/component: runner
        app.kubernetes.io/part-of: flowercore
        flowercore.io/created-by: argocd
    spec:
      serviceAccountName: github-runner
      # Pin to rke2-server so the Longhorn RWO volume is always on the same node.
      nodeSelector:
        kubernetes.io/hostname: rke2-server
      # Pod-wide default: everything runs as the non-root runner user unless a
      # container overrides it (only the initContainer below does).
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
        runAsGroup: 1001
        fsGroup: 1001
      # Sprint 30 Cl-1 pod-env fix (2026-05-21): pre-create + chown
      # /home/runner/.dotnet + /home/runner/.nuget so the non-root runner
      # (UID 1001) can host setup-dotnet@v4 + dotnet restore writes without
      # the per-workflow DOTNET_INSTALL_DIR patch ~25 flipped Linux repos
      # have been carrying. Runs as root so chown succeeds; the main
      # container then runs as 1001 against an emptyDir mounted at
      # /home/runner. The PVC mount at /home/runner/.nuget/packages
      # (Common runner) still wins at its nested path because Kubernetes
      # honors the deeper mount.
      initContainers:
        - name: setup-runner-home
          image: busybox:1.36
          command:
            - sh
            - -c
            - |
              set -e
              mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet
              chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget
              chmod -R 755 /home/runner/.dotnet /home/runner/.nuget
          # Overrides the pod-level non-root default for this container only.
          securityContext:
            runAsUser: 0
            runAsNonRoot: false
          volumeMounts:
            - name: runner-home
              mountPath: /home/runner
      containers:
        - name: runner
          # NOTE(review): ":latest" + Always means the runner image can change
          # between container restarts; consider pinning a tag or digest.
          image: myoung34/github-runner:latest
          imagePullPolicy: Always
          env:
            # GitHub org/repo targeting.
            # Set REPO_URL for a repo-scoped runner (cheaper, simpler).
            # Switch to ORG_NAME + empty REPO_URL for an org-scoped runner.
            - name: REPO_URL
              value: "https://github.com/astoltz/FlowerCore.Common"
            - name: RUNNER_NAME_PREFIX
              value: "rke2-linux"
            - name: RUNNER_WORKDIR
              value: "/tmp/runner/work"
            # EPHEMERAL=true: runner deregisters after one job; container
            # exits with code 0; Deployment controller restarts it and a
            # fresh registration occurs. Prevents stale runner accumulation.
            - name: EPHEMERAL
              value: "true"
            # Labels used by workflow files: runs-on: [self-hosted, linux, fc-build-linux]
            - name: LABELS
              value: "self-hosted,linux,fc-build-linux"
            # PAT (not pre-minted registration token) — myoung34/github-runner
            # mints registration tokens itself via GitHub API when ACCESS_TOKEN
            # is set. Switched from RUNNER_TOKEN -> ACCESS_TOKEN on 2026-05-16
            # because the 1P "GitHub PAT (Runner Registration)" item stores a
            # fine-grained PAT, not a short-lived registration token.
            - name: ACCESS_TOKEN
              valueFrom:
                secretKeyRef:
                  name: github-runner-token
                  key: credential
            # myoung34/github-runner default entrypoint expects root for some
            # setup steps. With securityContext.runAsUser=1001 the entrypoint
            # short-circuits with "RUN_AS_ROOT env var is set to true but the
            # user has been overridden and is not running as root". Tell the
            # entrypoint we're explicitly NOT root so it skips the root-only
            # setup steps (cache prewarm + apt updates — both already baked).
            - name: RUN_AS_ROOT
              value: "false"
            # Sprint 30 Cl-1 pod-env fix (2026-05-21): retire the per-workflow
            # DOTNET_INSTALL_DIR patch by setting it (+ siblings) here so ALL
            # flipped Linux repos pick it up automatically. setup-dotnet@v4
            # default writes to /usr/share/dotnet (root-only) or HOME-relative
            # ~/.dotnet without HOME guard; pin both explicitly to the chowned
            # emptyDir at /home/runner.
            - name: DOTNET_INSTALL_DIR
              value: "/home/runner/.dotnet"
            - name: DOTNET_CLI_TELEMETRY_OPTOUT
              value: "1"
            - name: NUGET_PACKAGES
              value: "/home/runner/.nuget/packages"
            - name: DOTNET_NOLOGO
              value: "1"
            - name: DOTNET_GENERATE_ASPNET_CERTIFICATE
              value: "false"
          resources:
            requests:
              cpu: "500m"
              memory: "1Gi"
            limits:
              cpu: "2000m"
              memory: "4Gi"
          volumeMounts:
            # /home/runner emptyDir — owned by the non-root runner thanks
            # to the setup-runner-home initContainer chown. Hosts .dotnet
            # (setup-dotnet@v4 target) and provides a writable HOME without
            # forcing a PVC. The PVC mount below at .nuget/packages wins
            # at that nested path (deeper mount overrides), so the Common
            # NuGet cache continues to persist across ephemeral pod restarts.
            - name: runner-home
              mountPath: /home/runner
            - name: nuget-cache
              mountPath: /home/runner/.nuget/packages
            - name: tmp
              mountPath: /tmp
          # Liveness: runner process is alive.
          # pgrep is exec'd directly rather than through `/bin/sh -c "..."`:
          # the wrapping shell's own command line would contain the pattern
          # "Runner.Listener", so on a shell that forks instead of exec'ing
          # the final command, `pgrep -f` matches the shell itself and the
          # probe can never fail. pgrep never reports its own process, and an
          # exec probe only looks at the exit status, so no redirect is needed.
          livenessProbe:
            exec:
              command:
                - pgrep
                - -f
                - Runner.Listener
            initialDelaySeconds: 30
            periodSeconds: 30
            failureThreshold: 3
      volumes:
        - name: runner-home
          emptyDir: {}
        - name: nuget-cache
          persistentVolumeClaim:
            claimName: github-runner-nuget-cache
        - name: tmp
          emptyDir: {}
      # Restart policy: Always — the Deployment controller handles
      # re-registration after each ephemeral job completes.
      restartPolicy: Always
---
# Shared.Pos repo-scoped Linux runner. Added 2026-05-20 to unstick the
# FlowerCore.Shared.Pos "Build, Test & Publish" workflow, which had been
# queued indefinitely after the Sprint 26 Mac POS Phase 1/2 PRs merged
# (no fc-build-linux runner was registered to Shared.Pos — GitHub user
# accounts have only repo-scoped runners). First concrete instance of the
# Sprint 29 Cx-1 Linux-runner-fleet pattern; the full per-repo fleet is
# codified by that lane. emptyDir nuget cache (no RWO PVC) so it shares no
# volume with the Common runner and needs no node pin.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: github-runner-sharedpos
  namespace: github-runner
  labels:
    app.kubernetes.io/name: github-runner-sharedpos
    app.kubernetes.io/component: runner
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/created-by: argocd
spec:
  # UN-PARKED 2026-05-21: Shared.Pos #5 fixed the non-root setup-dotnet path
  # (DOTNET_INSTALL_DIR step-scoped). Re-enabled to run the now-fixable build.
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: github-runner-sharedpos
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app.kubernetes.io/name: github-runner-sharedpos
        app.kubernetes.io/component: runner
        app.kubernetes.io/part-of: flowercore
        flowercore.io/created-by: argocd
    spec:
      serviceAccountName: github-runner
      # Pod-wide default: everything runs as the non-root runner user unless a
      # container overrides it (only the initContainer below does).
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
        runAsGroup: 1001
        fsGroup: 1001
      # Sprint 30 Cl-1 pod-env fix (2026-05-21): see github-runner Deployment
      # above for full rationale. Mirrored on the Shared.Pos runner so the
      # per-workflow DOTNET_INSTALL_DIR patch can be retired fleet-wide
      # rather than re-applied per repo as flipped lanes land.
      initContainers:
        - name: setup-runner-home
          image: busybox:1.36
          command:
            - sh
            - -c
            - |
              set -e
              mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet
              chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget
              chmod -R 755 /home/runner/.dotnet /home/runner/.nuget
          # Overrides the pod-level non-root default for this container only,
          # because chown needs root.
          securityContext:
            runAsUser: 0
            runAsNonRoot: false
          volumeMounts:
            - name: runner-home
              mountPath: /home/runner
      containers:
        - name: runner
          # NOTE(review): ":latest" + Always means the runner image can change
          # between container restarts; consider pinning a tag or digest.
          image: myoung34/github-runner:latest
          imagePullPolicy: Always
          env:
            # Repo-scoped registration target for this runner.
            - name: REPO_URL
              value: "https://github.com/astoltz/FlowerCore.Shared.Pos"
            - name: RUNNER_NAME_PREFIX
              value: "rke2-linux-sharedpos"
            - name: RUNNER_WORKDIR
              value: "/tmp/runner/work"
            - name: EPHEMERAL
              value: "true"
            - name: LABELS
              value: "self-hosted,linux,fc-build-linux"
            # Same 1Password-synced Secret as the Common runner.
            - name: ACCESS_TOKEN
              valueFrom:
                secretKeyRef:
                  name: github-runner-token
                  key: credential
            - name: RUN_AS_ROOT
              value: "false"
            # Sprint 30 Cl-1 pod-env fix (2026-05-21): retire per-workflow patch.
            - name: DOTNET_INSTALL_DIR
              value: "/home/runner/.dotnet"
            - name: DOTNET_CLI_TELEMETRY_OPTOUT
              value: "1"
            - name: NUGET_PACKAGES
              value: "/home/runner/.nuget/packages"
            - name: DOTNET_NOLOGO
              value: "1"
            - name: DOTNET_GENERATE_ASPNET_CERTIFICATE
              value: "false"
          resources:
            requests:
              cpu: "500m"
              memory: "1Gi"
            limits:
              cpu: "2000m"
              memory: "4Gi"
          volumeMounts:
            # Shared.Pos runner uses emptyDir for nuget cache (no node pin
            # via RWO PVC). /home/runner emptyDir hosts .dotnet via the
            # setup-runner-home initContainer chown; the .nuget/packages
            # emptyDir mount still wins at its nested path.
            - name: runner-home
              mountPath: /home/runner
            - name: nuget-cache
              mountPath: /home/runner/.nuget/packages
            - name: tmp
              mountPath: /tmp
          # Liveness: runner process is alive.
          # pgrep is exec'd directly rather than through `/bin/sh -c "..."`:
          # the wrapping shell's own command line would contain the pattern
          # "Runner.Listener", so on a shell that forks instead of exec'ing
          # the final command, `pgrep -f` matches the shell itself and the
          # probe can never fail. pgrep never reports its own process, and an
          # exec probe only looks at the exit status, so no redirect is needed.
          livenessProbe:
            exec:
              command:
                - pgrep
                - -f
                - Runner.Listener
            initialDelaySeconds: 30
            periodSeconds: 30
            failureThreshold: 3
      volumes:
        - name: runner-home
          emptyDir: {}
        - name: nuget-cache
          emptyDir: {}
        - name: tmp
          emptyDir: {}
      restartPolicy: Always