Set replicas back to 1 — the non-root setup-dotnet path issue that drove the park (replicas: 0 on 2026-05-21) is resolved. Shared.Pos #5 landed a step-scoped DOTNET_INSTALL_DIR override and today's Shared.Pos publish succeeded at 01:47 UTC against this runner. Reconciles git with live K8s state: the github-runner-sharedpos pod is already Running on rke2-agent2 (4h12m up, 4 restarts). Without this commit, the next ArgoCD selfHeal would scale it back to 0 and re-stop Shared.Pos publish flow. Follow-on: Sprint 30+ Cx-1 fleet-wide pod-env (DOTNET_INSTALL_DIR set on the Deployment env, not per-workflow) is still queued — once landed, the per-workflow patch can be backed out. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
307 lines
11 KiB
YAML
307 lines
11 KiB
YAML
# GitHub Actions self-hosted Linux runner — Phase 2 K8s deployment
|
|
#
|
|
# Phase 1 (current): BLUEJAY-WS registered manually as a Windows runner
|
|
# with label "fc-build-windows" via config.cmd (see docs/infrastructure/
|
|
# self-hosted-runner-fleet.md §WPF Build Runner).
|
|
#
|
|
# Phase 2 (this file): ephemeral Linux runner in RKE2 for non-WPF builds
|
|
# (Blazor Server, class libraries, operators, integration tests). Reduces
|
|
# billing for ubuntu-24.04 jobs that run on GitHub-hosted runners today.
|
|
#
|
|
# Runner image: myoung34/github-runner:latest
|
|
#   EPHEMERAL=true — each runner container runs exactly one job then exits;
#   the kubelet restarts it in place (restartPolicy: Always) and the fresh
#   container re-registers. Prevents job queue starvation when two jobs overlap.
|
|
#
|
|
# NuGet cache: 5Gi Longhorn RWO PVC mounted at /home/runner/.nuget/packages
|
|
# Persists NuGet packages across ephemeral pod restarts (not shared across
|
|
# simultaneous runner pods; single-replica constraint below).
|
|
#
|
|
# Credentials:
#   OnePasswordItem "GitHub PAT (Runner Registration)" → Secret
#   github-runner-token with field "credential", injected as ACCESS_TOKEN.
#   Operator must create/rotate the 1P item manually. Registration tokens
#   expire after 1h, so the item stores a fine-grained PAT with
#   Administration:read/write scope on the target repos (chosen over a
#   re-registration script); the runner image uses it to mint a fresh
#   registration token at each start. See
#   docs/infrastructure/self-hosted-runner-fleet.md §Security.
|
|
#
|
|
# Security model:
|
|
# - No ClusterRole / ClusterRoleBinding — runner has no K8s API access.
|
|
#   - securityContext: runAsNonRoot (UID/GID 1001). readOnlyRootFilesystem is
#     not set on the Deployments below because the runner image needs /tmp
#     and /home/runner writable.
|
|
# - Fork pull-request approval required on the GitHub repo settings.
|
|
# - RUNNER_ALLOW_RUNASROOT=false is the default.
|
|
#
|
|
# Cost: Phase 2 eliminates GitHub-hosted ubuntu-24.04 billing; break-even
|
|
# vs electricity is ~1 000 min/month at current TOU rates.
|
|
#
|
|
# Node placement: rke2-server (10.0.56.11) only — Longhorn RWO PVC must
|
|
# land on the same node as the volume, and the server node has the most
|
|
# spare capacity for burst CI workloads.
|
|
#
|
|
# Designs: docs/infrastructure/self-hosted-runner-fleet.md
|
|
# Questions: Q-CI-1..5 (all Recommended defaults)
|
|
---
|
|
# Dedicated namespace that holds every self-hosted runner resource in this
# file (secret sync, cache PVC, service account, runner Deployments).
apiVersion: v1
kind: Namespace
metadata:
  name: github-runner
  labels:
    app.kubernetes.io/managed-by: argocd
    app.kubernetes.io/part-of: flowercore
|
|
---
|
|
# 1Password secret sync — materialises the K8s Secret "github-runner-token".
#
# Source item: IAmWorkin vault > "GitHub PAT (Runner Registration)"
# Expected field in that item:
#   credential — GitHub fine-grained PAT (Administration:read/write on the
#                target repos). The runner image uses it to mint a fresh,
#                short-lived registration token at pod start.
#
# The operator MUST create this item before the Deployments will start
# cleanly.
apiVersion: onepassword.com/v1
kind: OnePasswordItem
metadata:
  name: github-runner-token
  namespace: github-runner
  labels:
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/component: credentials
spec:
  itemPath: "vaults/IAmWorkin/items/GitHub PAT (Runner Registration)"
|
|
---
|
|
# NuGet package cache for the FlowerCore.Common runner. ReadWriteOnce on
# Longhorn, so only one pod can mount it at a time.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: github-runner-nuget-cache
  namespace: github-runner
  labels:
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/component: cache
spec:
  storageClassName: longhorn
  volumeMode: Filesystem
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 5Gi
|
|
---
|
|
# Identity shared by the runner Deployments in this namespace.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: github-runner
  namespace: github-runner
  labels:
    app.kubernetes.io/component: runner
    app.kubernetes.io/part-of: flowercore
# No ClusterRole or ClusterRoleBinding — runner has zero K8s API privileges.
# CI jobs that need kubectl must supply their own kubeconfig via a secret
# injected at the job level, not via this service account.
#
# Because the account is never meant to reach the API server, do not mount
# its token into the runner pods at all: workflow code running in the pod
# should not be able to read a cluster credential, even an unprivileged one.
automountServiceAccountToken: false
|
|
---
|
|
# FlowerCore.Common repo-scoped Linux runner (ephemeral, single replica,
# NuGet cache on a Longhorn RWO PVC, pinned to rke2-server).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: github-runner
  namespace: github-runner
  labels:
    app.kubernetes.io/name: github-runner
    app.kubernetes.io/component: runner
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/created-by: argocd
spec:
  # Single replica enforced: the Longhorn RWO PVC can only be mounted by
  # one pod at a time. Each pod re-registers as an ephemeral runner after
  # completing a job (EPHEMERAL=true restarts the container, not the pod,
  # so the PVC stays attached between jobs).
  #
  # 2026-05-16: bumped 0 -> 1 after operator provisioned the
  # "GitHub PAT (Runner Registration)" 1P item with field=credential.
  # Unblocks CI fleet-wide (was budget-exhausted on GH-hosted runners).
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: github-runner
  # Use Recreate to avoid the Multi-Attach RWO error during rollouts.
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app.kubernetes.io/name: github-runner
        app.kubernetes.io/component: runner
        app.kubernetes.io/part-of: flowercore
        flowercore.io/created-by: argocd
    spec:
      serviceAccountName: github-runner
      # Pin to rke2-server so the Longhorn RWO volume is always on the same node.
      nodeSelector:
        kubernetes.io/hostname: rke2-server
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
        runAsGroup: 1001
        fsGroup: 1001
      containers:
        - name: runner
          # NOTE(review): ":latest" with imagePullPolicy Always means a pod
          # start can pick up a newer upstream image without a git change;
          # consider pinning a tag or digest.
          image: myoung34/github-runner:latest
          imagePullPolicy: Always
          env:
            # GitHub org/repo targeting.
            # Set REPO_URL for a repo-scoped runner (cheaper, simpler).
            # Switch to ORG_NAME + empty REPO_URL for an org-scoped runner.
            - name: REPO_URL
              value: "https://github.com/astoltz/FlowerCore.Common"
            - name: RUNNER_NAME_PREFIX
              value: "rke2-linux"
            - name: RUNNER_WORKDIR
              value: "/tmp/runner/work"
            # EPHEMERAL=true: runner deregisters after one job and the
            # container exits with code 0; the kubelet then restarts the
            # container (restartPolicy: Always) and a fresh registration
            # occurs. Prevents stale runner accumulation.
            - name: EPHEMERAL
              value: "true"
            # Labels used by workflow files: runs-on: [self-hosted, linux, fc-build-linux]
            - name: LABELS
              value: "self-hosted,linux,fc-build-linux"
            # PAT (not pre-minted registration token) — myoung34/github-runner
            # mints registration tokens itself via GitHub API when ACCESS_TOKEN
            # is set. Switched from RUNNER_TOKEN -> ACCESS_TOKEN on 2026-05-16
            # because the 1P "GitHub PAT (Runner Registration)" item stores a
            # fine-grained PAT, not a short-lived registration token.
            - name: ACCESS_TOKEN
              valueFrom:
                secretKeyRef:
                  name: github-runner-token
                  key: credential
            # myoung34/github-runner default entrypoint expects root for some
            # setup steps. With securityContext.runAsUser=1001 the entrypoint
            # short-circuits with "RUN_AS_ROOT env var is set to true but the
            # user has been overridden and is not running as root". Tell the
            # entrypoint we're explicitly NOT root so it skips the root-only
            # setup steps (cache prewarm + apt updates — both already baked).
            - name: RUN_AS_ROOT
              value: "false"
          resources:
            requests:
              cpu: "500m"
              memory: "1Gi"
            limits:
              cpu: "2000m"
              memory: "4Gi"
          volumeMounts:
            - name: nuget-cache
              mountPath: /home/runner/.nuget/packages
            - name: tmp
              mountPath: /tmp
          # Liveness: runner process is alive.
          # pgrep is exec'd directly rather than through "/bin/sh -c": a
          # wrapper shell's own command line contains the search pattern, so
          # "pgrep -f" can match the wrapper and report success even when no
          # Runner.Listener process exists. pgrep never reports itself.
          livenessProbe:
            exec:
              command:
                - pgrep
                - "-f"
                - Runner.Listener
            initialDelaySeconds: 30
            periodSeconds: 30
            failureThreshold: 3
      volumes:
        - name: nuget-cache
          persistentVolumeClaim:
            claimName: github-runner-nuget-cache
        - name: tmp
          emptyDir: {}
      # Restart policy: Always — the kubelet restarts the runner container
      # after each ephemeral job completes, which triggers re-registration.
      restartPolicy: Always
|
|
---
|
|
# Shared.Pos repo-scoped Linux runner. Added 2026-05-20 to unstick the
# FlowerCore.Shared.Pos "Build, Test & Publish" workflow, which had been
# queued indefinitely after the Sprint 26 Mac POS Phase 1/2 PRs merged
# (no fc-build-linux runner was registered to Shared.Pos — GitHub user
# accounts have only repo-scoped runners). First concrete instance of the
# Sprint 29 Cx-1 Linux-runner-fleet pattern; the full per-repo fleet is
# codified by that lane. emptyDir nuget cache (no RWO PVC) so it shares no
# volume with the Common runner and needs no node pin.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: github-runner-sharedpos
  namespace: github-runner
  labels:
    app.kubernetes.io/name: github-runner-sharedpos
    app.kubernetes.io/component: runner
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/created-by: argocd
spec:
  # UN-PARKED 2026-05-21: Shared.Pos #5 fixed the non-root setup-dotnet path
  # (DOTNET_INSTALL_DIR step-scoped). Re-enabled to run the now-fixable build.
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: github-runner-sharedpos
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app.kubernetes.io/name: github-runner-sharedpos
        app.kubernetes.io/component: runner
        app.kubernetes.io/part-of: flowercore
        flowercore.io/created-by: argocd
    spec:
      serviceAccountName: github-runner
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
        runAsGroup: 1001
        fsGroup: 1001
      containers:
        - name: runner
          image: myoung34/github-runner:latest
          imagePullPolicy: Always
          env:
            # Repo-scoped registration target for this runner.
            - name: REPO_URL
              value: "https://github.com/astoltz/FlowerCore.Shared.Pos"
            - name: RUNNER_NAME_PREFIX
              value: "rke2-linux-sharedpos"
            # Work directory lives under the /tmp emptyDir mounted below.
            - name: RUNNER_WORKDIR
              value: "/tmp/runner/work"
            - name: EPHEMERAL
              value: "true"
            - name: LABELS
              value: "self-hosted,linux,fc-build-linux"
            # Same PAT-backed secret as the FlowerCore.Common runner.
            - name: ACCESS_TOKEN
              valueFrom:
                secretKeyRef:
                  name: github-runner-token
                  key: credential
            # The pod runs as UID 1001, so the image entrypoint is told
            # explicitly that it is not root.
            - name: RUN_AS_ROOT
              value: "false"
          resources:
            requests:
              cpu: "500m"
              memory: "1Gi"
            limits:
              cpu: "2000m"
              memory: "4Gi"
          volumeMounts:
            - name: nuget-cache
              mountPath: /home/runner/.nuget/packages
            - name: tmp
              mountPath: /tmp
          # Liveness: runner process is alive.
          # pgrep is exec'd directly rather than through "/bin/sh -c": a
          # wrapper shell's own command line contains the search pattern, so
          # "pgrep -f" can match the wrapper and report success even when no
          # Runner.Listener process exists. pgrep never reports itself.
          livenessProbe:
            exec:
              command:
                - pgrep
                - "-f"
                - Runner.Listener
            initialDelaySeconds: 30
            periodSeconds: 30
            failureThreshold: 3
      volumes:
        # Both volumes are emptyDir: nothing persists beyond the pod's lifetime.
        - name: nuget-cache
          emptyDir: {}
        - name: tmp
          emptyDir: {}
      restartPolicy: Always
|