From e8094eb0bd730187d45ba975ab3cb95564fbc6f6 Mon Sep 17 00:00:00 2001
From: Codex
Date: Thu, 14 May 2026 12:46:25 -0500
Subject: [PATCH] ci(github-runner): add Phase 2 ephemeral Linux runner K8s manifest

Namespace github-runner with myoung34/github-runner:latest Deployment,
5Gi Longhorn RWO NuGet cache PVC, zero-privilege ServiceAccount, and
OnePasswordItem CRD for the registration token. EPHEMERAL=true mode
re-registers after each job; Recreate strategy avoids RWO multi-attach.
Targets fc-build-linux label; single replica pinned to rke2-server node.

Co-Authored-By: Claude Sonnet 4.6
---
 apps/github-runner/github-runner.yaml | 236 ++++++++++++++++++++++++++
 1 file changed, 236 insertions(+)
 create mode 100644 apps/github-runner/github-runner.yaml

diff --git a/apps/github-runner/github-runner.yaml b/apps/github-runner/github-runner.yaml
new file mode 100644
index 0000000..e253008
--- /dev/null
+++ b/apps/github-runner/github-runner.yaml
@@ -0,0 +1,236 @@
+# GitHub Actions self-hosted Linux runner — Phase 2 K8s deployment
+#
+# Phase 1 (current): BLUEJAY-WS registered manually as a Windows runner
+# with label "fc-build-windows" via config.cmd (see docs/infrastructure/
+# self-hosted-runner-fleet.md §WPF Build Runner).
+#
+# Phase 2 (this file): ephemeral Linux runner in RKE2 for non-WPF builds
+# (Blazor Server, class libraries, operators, integration tests). Reduces
+# billing for ubuntu-24.04 jobs that run on GitHub-hosted runners today.
+#
+# Runner image: myoung34/github-runner:latest
+#   EPHEMERAL=true — the runner takes exactly one job, deregisters, and
+#   its container exits. restartPolicy: Always makes the kubelet restart
+#   the container inside the same pod, where it registers again. Restarts
+#   are subject to the kubelet back-off (capped at 5 minutes), so a job
+#   queued while the runner is busy waits until it has re-registered.
+#
+# NuGet cache: 5Gi Longhorn RWO PVC mounted at /home/runner/.nuget/packages
+#   Persists NuGet packages across ephemeral runner restarts (not shared
+#   across simultaneous runner pods; single-replica constraint below).
+#
+# Credentials:
+#   OnePasswordItem "GitHub Runner Registration Token" → Secret
+#   github-runner-token with field "credential" used as RUNNER_TOKEN.
+#   Operator must create/rotate the 1P item manually. Registration tokens
+#   expire after 1h, so re-registration fails once the stored token is
+#   older than that. For unattended operation store a PAT and expose it
+#   as ACCESS_TOKEN instead (classic PAT: "repo" scope for a repo-scoped
+#   runner, "admin:org" for an org-scoped one). See docs/infrastructure/
+#   self-hosted-runner-fleet.md §Security.
+#
+# Security model:
+#   - No Role/ClusterRole bindings and no mounted ServiceAccount token —
+#     the runner has no K8s API access.
+#   - securityContext: runAsNonRoot, no privilege escalation, all
+#     capabilities dropped, RuntimeDefault seccomp. The root filesystem is
+#     left writable (NOTE(review): the runner appears to need to write
+#     outside /tmp and /home/runner at start-up — confirm before setting
+#     readOnlyRootFilesystem).
+#   - Fork pull-request approval required on the GitHub repo settings.
+#   - RUN_AS_ROOT=false is set explicitly; the image defaults to root and
+#     rejects a non-root UID override otherwise.
+#
+# Cost: Phase 2 eliminates GitHub-hosted ubuntu-24.04 billing; break-even
+#   vs electricity is ~1 000 min/month at current TOU rates.
+#
+# Node placement: rke2-server (10.0.56.11) only — keeps the pod next to
+#   the Longhorn volume (NOTE(review): Longhorn can attach RWO volumes
+#   from other nodes; the pin is for locality, confirm it is still
+#   wanted), and the server node has the most spare capacity for burst CI
+#   workloads.
+#
+# Designs: docs/infrastructure/self-hosted-runner-fleet.md
+# Questions: Q-CI-1..5 (all Recommended defaults)
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: github-runner
+  labels:
+    app.kubernetes.io/part-of: flowercore
+    app.kubernetes.io/managed-by: argocd
+---
+# 1Password secret sync — creates github-runner-token K8s Secret.
+# Fields expected in the 1Password item:
+#   credential — GitHub runner registration token, consumed as
+#                RUNNER_TOKEN (a PAT only works if the Deployment exposes
+#                it as ACCESS_TOKEN instead)
+# Item path: IAmWorkin vault > "GitHub Runner Registration Token"
+# Operator MUST create this item before the Deployment will start cleanly.
+apiVersion: onepassword.com/v1
+kind: OnePasswordItem
+metadata:
+  name: github-runner-token
+  namespace: github-runner
+  labels:
+    app.kubernetes.io/component: credentials
+    app.kubernetes.io/part-of: flowercore
+spec:
+  itemPath: "vaults/IAmWorkin/items/GitHub Runner Registration Token"
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: github-runner-nuget-cache
+  namespace: github-runner
+  labels:
+    app.kubernetes.io/component: cache
+    app.kubernetes.io/part-of: flowercore
+spec:
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: longhorn
+  resources:
+    requests:
+      storage: 5Gi
+  volumeMode: Filesystem
+---
+# No Role or ClusterRole is bound to this account and its token is not
+# mounted into pods — the runner has zero K8s API privileges.
+# CI jobs that need kubectl must supply their own kubeconfig via a secret
+# injected at the job level, not via this service account.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: github-runner
+  namespace: github-runner
+  labels:
+    app.kubernetes.io/component: runner
+    app.kubernetes.io/part-of: flowercore
+automountServiceAccountToken: false
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: github-runner
+  namespace: github-runner
+  labels:
+    app.kubernetes.io/name: github-runner
+    app.kubernetes.io/component: runner
+    app.kubernetes.io/part-of: flowercore
+    app.kubernetes.io/managed-by: argocd
+    flowercore.io/created-by: argocd
+spec:
+  # Single replica enforced: the Longhorn RWO PVC can only be mounted by
+  # one pod at a time. After each job the ephemeral runner's container
+  # exits and is restarted inside the same pod, so the PVC stays attached
+  # between jobs.
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: github-runner
+  # Use Recreate to avoid the Multi-Attach RWO error during rollouts.
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: github-runner
+        app.kubernetes.io/component: runner
+        app.kubernetes.io/part-of: flowercore
+        flowercore.io/created-by: argocd
+    spec:
+      serviceAccountName: github-runner
+      # CI jobs run arbitrary workflow code; keep the ServiceAccount token
+      # out of the pod so "no K8s API access" holds in practice.
+      automountServiceAccountToken: false
+      # Pin to rke2-server so the pod always runs next to the Longhorn volume.
+      nodeSelector:
+        kubernetes.io/hostname: rke2-server
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1001
+        runAsGroup: 1001
+        fsGroup: 1001
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: runner
+          # NOTE(review): ":latest" with imagePullPolicy: Always means any
+          # container restart may pick up a new image; pin a version tag
+          # or digest once one has been validated.
+          image: myoung34/github-runner:latest
+          imagePullPolicy: Always
+          # Jobs cannot gain root (e.g. via sudo) inside the container.
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+          env:
+            # GitHub org/repo targeting.
+            # Set REPO_URL for a repo-scoped runner (cheaper, simpler).
+            # For an org-scoped runner set RUNNER_SCOPE=org and ORG_NAME
+            # instead of REPO_URL.
+            - name: REPO_URL
+              value: "https://github.com/astoltz/FlowerCore.Common"
+            - name: RUNNER_NAME_PREFIX
+              value: "rke2-linux"
+            - name: RUNNER_WORKDIR
+              value: "/tmp/runner/work"
+            # The image defaults to RUN_AS_ROOT=true and its entrypoint
+            # exits with an error when that meets a non-root UID, so the
+            # runAsUser override above needs this set to "false".
+            - name: RUN_AS_ROOT
+              value: "false"
+            # EPHEMERAL=true: runner deregisters after one job and the
+            # container exits with code 0; the kubelet restarts it and a
+            # fresh registration occurs. Prevents stale runner accumulation.
+            - name: EPHEMERAL
+              value: "true"
+            # Labels used by workflow files:
+            #   runs-on: [self-hosted, linux, fc-build-linux]
+            - name: LABELS
+              value: "self-hosted,linux,fc-build-linux"
+            # Registration token injected from 1Password via the
+            # OnePasswordItem CRD.
+            - name: RUNNER_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: github-runner-token
+                  key: credential
+          resources:
+            requests:
+              cpu: "500m"
+              memory: "1Gi"
+            limits:
+              cpu: "2000m"
+              memory: "4Gi"
+          volumeMounts:
+            - name: nuget-cache
+              mountPath: /home/runner/.nuget/packages
+            - name: tmp
+              mountPath: /tmp
+          # Liveness: the runner listener process is alive. pgrep runs
+          # without a shell wrapper: under "sh -c" the shell's own command
+          # line contains the pattern, so "pgrep -f" can match the wrapper
+          # and report success even when the runner is gone.
+          livenessProbe:
+            exec:
+              command:
+                - pgrep
+                - "-f"
+                - Runner.Listener
+            initialDelaySeconds: 30
+            periodSeconds: 30
+            failureThreshold: 3
+      volumes:
+        - name: nuget-cache
+          persistentVolumeClaim:
+            claimName: github-runner-nuget-cache
+        - name: tmp
+          emptyDir: {}
+      # Restart policy: Always — the kubelet restarts the runner container
+      # after each ephemeral job, which triggers re-registration.
+      restartPolicy: Always