From 67064c4129673c63e12fe6762c2574a243d0ec6a Mon Sep 17 00:00:00 2001 From: Andrew Stoltz Date: Sun, 17 May 2026 16:27:41 -0500 Subject: [PATCH] feat(github-runner): harden Linux runner fleet --- apps/github-runner/README.md | 61 + apps/github-runner/github-runner.yaml | 1136 ++++++++++++++++- apps/monitoring/noc-monitoring.yaml | 51 + .../FleetManifestLintTests.cs | 167 +++ 4 files changed, 1375 insertions(+), 40 deletions(-) create mode 100644 apps/github-runner/README.md diff --git a/apps/github-runner/README.md b/apps/github-runner/README.md new file mode 100644 index 0000000..c677c10 --- /dev/null +++ b/apps/github-runner/README.md @@ -0,0 +1,61 @@ +# GitHub Runner Fleet + +ArgoCD owns `apps/github-runner/github-runner.yaml`. Do not patch live runner +Deployments with `kubectl`; update this manifest and let ArgoCD reconcile. + +## Runner Shape + +All repo-scoped Linux runners use: + +- `ACCESS_TOKEN` from the `github-runner-token` Secret +- `RUN_AS_ROOT=false` +- `EPHEMERAL=true` +- `LABELS=self-hosted,linux,fc-build-linux` +- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and + Actions tool cache + +`github-runner` for `FlowerCore.Common` is single-replica because it retains the +original Longhorn ReadWriteOnce NuGet PVC. `github-runner-sharedpos` and the top +Linux-cost repo runners use two replicas with per-pod `emptyDir` caches. That is +the safe backlog-drain strategy: no two pods share one RWO PVC. 
+ +## Post-Merge Proof + +After the PR is merged and ArgoCD syncs, verify the runner fleet: + +```bash +kubectl -n github-runner get deploy,pods,pvc +``` + +Verify GitHub registration for the repo-scoped runners: + +```bash +for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore.Signage \ + FlowerCore.DMS FlowerCore.Telephony FlowerCore.Print.Web FlowerCore.Chat \ + FlowerCore.MySQL FlowerCore.Kiosk.Linux; do + echo "=== $repo ===" + gh api "/repos/astoltz/$repo/actions/runners" \ + --jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}' +done +``` + +Shared.Pos publish proof after the runner pod is online: + +```bash +gh run list --repo astoltz/FlowerCore.Shared.Pos \ + --workflow "Build, Test & Publish" --branch main --limit 5 +``` + +If the latest run is still queued after runner registration, rerun the workflow +from GitHub Actions and verify it lands on an `rke2-linux-*` runner. + +## Failure Notes + +- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that + `DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are + present on the runner pod. +- `404` during runner registration: the fine-grained PAT is valid but missing + repository access for that repo. Add the repo to the PAT access list; the PAT + value does not change. +- `Multi-Attach` volume error: only the Common runner uses an RWO PVC and it must + stay single-replica. New multi-replica runners use `emptyDir`. 
diff --git a/apps/github-runner/github-runner.yaml b/apps/github-runner/github-runner.yaml index 90ca5e1..85b4e30 100644 --- a/apps/github-runner/github-runner.yaml +++ b/apps/github-runner/github-runner.yaml @@ -1,46 +1,38 @@ -# GitHub Actions self-hosted Linux runner — Phase 2 K8s deployment +# GitHub Actions self-hosted Linux runner fleet - RKE2 deployment # -# Phase 1 (current): BLUEJAY-WS registered manually as a Windows runner -# with label "fc-build-windows" via config.cmd (see docs/infrastructure/ -# self-hosted-runner-fleet.md §WPF Build Runner). +# ArgoCD owns this namespace. Update this manifest and let the +# bluejay-infra ApplicationSet reconcile it. # -# Phase 2 (this file): ephemeral Linux runner in RKE2 for non-WPF builds -# (Blazor Server, class libraries, operators, integration tests). Reduces -# billing for ubuntu-24.04 jobs that run on GitHub-hosted runners today. +# astoltz is a GitHub user account, not an org, so runners are repo-scoped. +# Each Deployment registers one private FlowerCore repo with the shared +# ACCESS_TOKEN from the github-runner-token Secret. # -# Runner image: myoung34/github-runner:latest -# EPHEMERAL=true — each pod runs exactly one job then exits; the -# Deployment controller immediately recreates it and re-registers. -# Prevents job queue starvation when two jobs overlap. +# Runners registered here: +# FlowerCore.Common (single replica, 5Gi Longhorn RWO NuGet PVC) +# FlowerCore.Shared.Pos (two replicas, emptyDir cache) +# FlowerCore.Puppet, Signage, DMS, Telephony, Print.Web, Chat, MySQL, +# Kiosk.Linux (two replicas each, emptyDir cache) # -# NuGet cache: 5Gi Longhorn RWO PVC mounted at /home/runner/.nuget/packages -# Persists NuGet packages across ephemeral pod restarts (not shared across -# simultaneous runner pods; single-replica constraint below). +# Non-root CI safety: +# Runner pods run as uid 1001. 
HOME, DOTNET_INSTALL_DIR, DOTNET_CLI_HOME, +# NUGET_PACKAGES, XDG_CACHE_HOME, and RUNNER_TOOL_CACHE are all pointed at +# writable mounted paths under /home/runner so actions/setup-dotnet does not +# attempt to install into /usr/share/dotnet. # # Credentials: -# OnePasswordItem "GitHub PAT (Runner Registration)" → Secret -# github-runner-token with field "credential" used as RUNNER_TOKEN. -# Operator must create/rotate the 1P item manually; registration tokens -# expire after 1h — use a fine-grained PAT with Administration:read/write -# scope on the target repos, or a re-registration script. See -# docs/infrastructure/self-hosted-runner-fleet.md §Security. +# OnePasswordItem "GitHub PAT (Runner Registration)" syncs Secret +# github-runner-token with field "credential". myoung34/github-runner uses +# ACCESS_TOKEN to mint short-lived registration tokens on pod start. # # Security model: -# - No ClusterRole / ClusterRoleBinding — runner has no K8s API access. -# - securityContext: runAsNonRoot with read-only root filesystem where -# possible (runner image needs /tmp and /home/runner writable). -# - Fork pull-request approval required on the GitHub repo settings. -# - RUNNER_ALLOW_RUNASROOT=false is the default. -# -# Cost: Phase 2 eliminates GitHub-hosted ubuntu-24.04 billing; break-even -# vs electricity is ~1 000 min/month at current TOU rates. -# -# Node placement: rke2-server (10.0.56.11) only — Longhorn RWO PVC must -# land on the same node as the volume, and the server node has the most -# spare capacity for burst CI workloads. +# - No ClusterRole / ClusterRoleBinding. The ServiceAccount has no K8s API +# privileges. +# - Self-hosted runners are for private repos and trusted branches only. +# - Fork pull-request approval must remain required in GitHub repo settings. +# - Do not hardcode PATs or registration tokens. 
# # Designs: docs/infrastructure/self-hosted-runner-fleet.md -# Questions: Q-CI-1..5 (all Recommended defaults) +# ADR-172, Q-CI-26..40 --- apiVersion: v1 kind: Namespace @@ -108,6 +100,8 @@ metadata: app.kubernetes.io/part-of: flowercore app.kubernetes.io/managed-by: argocd flowercore.io/created-by: argocd + flowercore.io/runner-repo: common + flowercore.io/github-repo: FlowerCore.Common spec: # Single replica enforced: the Longhorn RWO PVC can only be mounted by # one pod at a time. Each pod re-registers as an ephemeral runner after @@ -131,6 +125,8 @@ spec: app.kubernetes.io/component: runner app.kubernetes.io/part-of: flowercore flowercore.io/created-by: argocd + flowercore.io/runner-repo: common + flowercore.io/github-repo: FlowerCore.Common spec: serviceAccountName: github-runner # Pin to rke2-server so the Longhorn RWO volume is always on the same node. @@ -189,10 +185,21 @@ spec: # Labels used by workflow files: runs-on: [self-hosted, linux, fc-build-linux] - name: LABELS value: "self-hosted,linux,fc-build-linux" + - name: HOME + value: "/home/runner" + - name: DOTNET_INSTALL_DIR + value: "/home/runner/.dotnet" + - name: DOTNET_CLI_HOME + value: "/home/runner" + - name: NUGET_PACKAGES + value: "/home/runner/.nuget/packages" + - name: XDG_CACHE_HOME + value: "/home/runner/.cache" + - name: RUNNER_TOOL_CACHE + value: "/home/runner/_tool" # PAT (not pre-minted registration token) — myoung34/github-runner # mints registration tokens itself via GitHub API when ACCESS_TOKEN - # is set. Switched from RUNNER_TOKEN -> ACCESS_TOKEN on 2026-05-16 - # because the 1P "GitHub PAT (Runner Registration)" item stores a + # is set. The 1P "GitHub PAT (Runner Registration)" item stores a # fine-grained PAT, not a short-lived registration token. - name: ACCESS_TOKEN valueFrom: @@ -271,8 +278,8 @@ spec: # (no fc-build-linux runner was registered to Shared.Pos — GitHub user # accounts have only repo-scoped runners). 
First concrete instance of the # Sprint 29 Cx-1 Linux-runner-fleet pattern; the full per-repo fleet is -# codified by that lane. emptyDir nuget cache (no RWO PVC) so it shares no -# volume with the Common runner and needs no node pin. +# codified by that lane. It uses emptyDir cache and two replicas so backlog +# drains without a shared RWO PVC multi-attach risk. apiVersion: apps/v1 kind: Deployment metadata: @@ -284,10 +291,13 @@ metadata: app.kubernetes.io/part-of: flowercore app.kubernetes.io/managed-by: argocd flowercore.io/created-by: argocd + flowercore.io/runner-repo: sharedpos + flowercore.io/github-repo: FlowerCore.Shared.Pos spec: # UN-PARKED 2026-05-21: Shared.Pos #5 fixed the non-root setup-dotnet path - # (DOTNET_INSTALL_DIR step-scoped). Re-enabled to run the now-fixable build. - replicas: 1 + # (DOTNET_INSTALL_DIR step-scoped). Sprint 30 Cl-8 capacity Q-CI-52: raised + # to replicas: 2 to absorb top-8 burst load per substrate-recommended default. + replicas: 2 selector: matchLabels: app.kubernetes.io/name: github-runner-sharedpos @@ -300,6 +310,8 @@ spec: app.kubernetes.io/component: runner app.kubernetes.io/part-of: flowercore flowercore.io/created-by: argocd + flowercore.io/runner-repo: sharedpos + flowercore.io/github-repo: FlowerCore.Shared.Pos spec: serviceAccountName: github-runner securityContext: @@ -343,6 +355,18 @@ spec: value: "true" - name: LABELS value: "self-hosted,linux,fc-build-linux" + - name: HOME + value: "/home/runner" + - name: DOTNET_INSTALL_DIR + value: "/home/runner/.dotnet" + - name: DOTNET_CLI_HOME + value: "/home/runner" + - name: NUGET_PACKAGES + value: "/home/runner/.nuget/packages" + - name: XDG_CACHE_HOME + value: "/home/runner/.cache" + - name: RUNNER_TOOL_CACHE + value: "/home/runner/_tool" - name: ACCESS_TOKEN valueFrom: secretKeyRef: @@ -392,7 +416,1039 @@ spec: - name: runner-home emptyDir: {} - name: nuget-cache - emptyDir: {} + emptyDir: + sizeLimit: 2Gi - name: tmp emptyDir: {} restartPolicy: Always +--- +# 
Runner for FlowerCore.Puppet. Two replicas use per-pod emptyDir caches, so +# backlog can drain without sharing a ReadWriteOnce PVC. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-puppet + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-puppet + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/runner-repo: puppet + flowercore.io/github-repo: FlowerCore.Puppet +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-puppet + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-puppet + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/runner-repo: puppet + flowercore.io/github-repo: FlowerCore.Puppet + spec: + serviceAccountName: github-runner + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + initContainers: + - name: setup-runner-home + image: busybox:1.36 + command: + - sh + - -c + - | + set -e + mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet + chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget + chmod -R 755 /home/runner/.dotnet /home/runner/.nuget + securityContext: + runAsUser: 0 + runAsNonRoot: false + volumeMounts: + - name: runner-home + mountPath: /home/runner + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.Puppet" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-puppet" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: HOME + value: "/home/runner" + - name: DOTNET_INSTALL_DIR + value: "/home/runner/.dotnet" + - name: 
DOTNET_CLI_TELEMETRY_OPTOUT + value: "1" + - name: DOTNET_NOLOGO + value: "1" + - name: DOTNET_GENERATE_ASPNET_CERTIFICATE + value: "false" + - name: DOTNET_CLI_HOME + value: "/home/runner" + - name: NUGET_PACKAGES + value: "/home/runner/.nuget/packages" + - name: XDG_CACHE_HOME + value: "/home/runner/.cache" + - name: RUNNER_TOOL_CACHE + value: "/home/runner/_tool" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: runner-home + mountPath: /home/runner + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: runner-home + emptyDir: {} + - name: nuget-cache + emptyDir: + sizeLimit: 2Gi + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +# Runner for FlowerCore.Signage. Two replicas use per-pod emptyDir caches, so +# backlog can drain without sharing a ReadWriteOnce PVC. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-signage + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-signage + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/runner-repo: signage + flowercore.io/github-repo: FlowerCore.Signage +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-signage + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-signage + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/runner-repo: signage + flowercore.io/github-repo: FlowerCore.Signage + spec: + serviceAccountName: github-runner + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + initContainers: + - name: setup-runner-home + image: busybox:1.36 + command: + - sh + - -c + - | + set -e + mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet + chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget + chmod -R 755 /home/runner/.dotnet /home/runner/.nuget + securityContext: + runAsUser: 0 + runAsNonRoot: false + volumeMounts: + - name: runner-home + mountPath: /home/runner + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.Signage" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-signage" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: HOME + value: "/home/runner" + - name: DOTNET_INSTALL_DIR + value: "/home/runner/.dotnet" + - name: DOTNET_CLI_TELEMETRY_OPTOUT + value: "1" + - name: DOTNET_NOLOGO + value: "1" + - name: DOTNET_GENERATE_ASPNET_CERTIFICATE + 
value: "false" + - name: DOTNET_CLI_HOME + value: "/home/runner" + - name: NUGET_PACKAGES + value: "/home/runner/.nuget/packages" + - name: XDG_CACHE_HOME + value: "/home/runner/.cache" + - name: RUNNER_TOOL_CACHE + value: "/home/runner/_tool" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: runner-home + mountPath: /home/runner + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: runner-home + emptyDir: {} + - name: nuget-cache + emptyDir: + sizeLimit: 2Gi + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +# Runner for FlowerCore.DMS. Two replicas use per-pod emptyDir caches, so +# backlog can drain without sharing a ReadWriteOnce PVC. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-dms + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-dms + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/runner-repo: dms + flowercore.io/github-repo: FlowerCore.DMS +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-dms + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-dms + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/runner-repo: dms + flowercore.io/github-repo: FlowerCore.DMS + spec: + serviceAccountName: github-runner + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + initContainers: + - name: setup-runner-home + image: busybox:1.36 + command: + - sh + - -c + - | + set -e + mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet + chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget + chmod -R 755 /home/runner/.dotnet /home/runner/.nuget + securityContext: + runAsUser: 0 + runAsNonRoot: false + volumeMounts: + - name: runner-home + mountPath: /home/runner + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.DMS" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-dms" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: HOME + value: "/home/runner" + - name: DOTNET_INSTALL_DIR + value: "/home/runner/.dotnet" + - name: DOTNET_CLI_TELEMETRY_OPTOUT + value: "1" + - name: DOTNET_NOLOGO + value: "1" + - name: DOTNET_GENERATE_ASPNET_CERTIFICATE + value: "false" + - name: 
DOTNET_CLI_HOME + value: "/home/runner" + - name: NUGET_PACKAGES + value: "/home/runner/.nuget/packages" + - name: XDG_CACHE_HOME + value: "/home/runner/.cache" + - name: RUNNER_TOOL_CACHE + value: "/home/runner/_tool" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: runner-home + mountPath: /home/runner + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: runner-home + emptyDir: {} + - name: nuget-cache + emptyDir: + sizeLimit: 2Gi + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +# Runner for FlowerCore.Telephony. Two replicas use per-pod emptyDir caches, +# so backlog can drain without sharing a ReadWriteOnce PVC. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-telephony + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-telephony + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/runner-repo: telephony + flowercore.io/github-repo: FlowerCore.Telephony +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-telephony + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-telephony + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/runner-repo: telephony + flowercore.io/github-repo: FlowerCore.Telephony + spec: + serviceAccountName: github-runner + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + initContainers: + - name: setup-runner-home + image: busybox:1.36 + command: + - sh + - -c + - | + set -e + mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet + chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget + chmod -R 755 /home/runner/.dotnet /home/runner/.nuget + securityContext: + runAsUser: 0 + runAsNonRoot: false + volumeMounts: + - name: runner-home + mountPath: /home/runner + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.Telephony" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-telephony" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: HOME + value: "/home/runner" + - name: DOTNET_INSTALL_DIR + value: "/home/runner/.dotnet" + - name: DOTNET_CLI_TELEMETRY_OPTOUT + value: "1" + - name: DOTNET_NOLOGO + value: "1" + - name: 
DOTNET_GENERATE_ASPNET_CERTIFICATE + value: "false" + - name: DOTNET_CLI_HOME + value: "/home/runner" + - name: NUGET_PACKAGES + value: "/home/runner/.nuget/packages" + - name: XDG_CACHE_HOME + value: "/home/runner/.cache" + - name: RUNNER_TOOL_CACHE + value: "/home/runner/_tool" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: runner-home + mountPath: /home/runner + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: runner-home + emptyDir: {} + - name: nuget-cache + emptyDir: + sizeLimit: 2Gi + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +# Runner for FlowerCore.Print.Web. Two replicas use per-pod emptyDir caches, +# so backlog can drain without sharing a ReadWriteOnce PVC. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-print-web + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-print-web + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/runner-repo: print-web + flowercore.io/github-repo: FlowerCore.Print.Web +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-print-web + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-print-web + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/runner-repo: print-web + flowercore.io/github-repo: FlowerCore.Print.Web + spec: + serviceAccountName: github-runner + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + initContainers: + - name: setup-runner-home + image: busybox:1.36 + command: + - sh + - -c + - | + set -e + mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet + chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget + chmod -R 755 /home/runner/.dotnet /home/runner/.nuget + securityContext: + runAsUser: 0 + runAsNonRoot: false + volumeMounts: + - name: runner-home + mountPath: /home/runner + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.Print.Web" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-print-web" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: HOME + value: "/home/runner" + - name: DOTNET_INSTALL_DIR + value: "/home/runner/.dotnet" + - name: DOTNET_CLI_TELEMETRY_OPTOUT + value: "1" + - name: DOTNET_NOLOGO + value: "1" + - name: 
DOTNET_GENERATE_ASPNET_CERTIFICATE + value: "false" + - name: DOTNET_CLI_HOME + value: "/home/runner" + - name: NUGET_PACKAGES + value: "/home/runner/.nuget/packages" + - name: XDG_CACHE_HOME + value: "/home/runner/.cache" + - name: RUNNER_TOOL_CACHE + value: "/home/runner/_tool" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: runner-home + mountPath: /home/runner + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: runner-home + emptyDir: {} + - name: nuget-cache + emptyDir: + sizeLimit: 2Gi + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +# Runner for FlowerCore.Chat. Two replicas use per-pod emptyDir caches, so +# backlog can drain without sharing a ReadWriteOnce PVC. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-chat + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-chat + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/runner-repo: chat + flowercore.io/github-repo: FlowerCore.Chat +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-chat + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-chat + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/runner-repo: chat + flowercore.io/github-repo: FlowerCore.Chat + spec: + serviceAccountName: github-runner + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + initContainers: + - name: setup-runner-home + image: busybox:1.36 + command: + - sh + - -c + - | + set -e + mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet + chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget + chmod -R 755 /home/runner/.dotnet /home/runner/.nuget + securityContext: + runAsUser: 0 + runAsNonRoot: false + volumeMounts: + - name: runner-home + mountPath: /home/runner + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.Chat" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-chat" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: HOME + value: "/home/runner" + - name: DOTNET_INSTALL_DIR + value: "/home/runner/.dotnet" + - name: DOTNET_CLI_TELEMETRY_OPTOUT + value: "1" + - name: DOTNET_NOLOGO + value: "1" + - name: DOTNET_GENERATE_ASPNET_CERTIFICATE + value: "false" + - name: 
DOTNET_CLI_HOME + value: "/home/runner" + - name: NUGET_PACKAGES + value: "/home/runner/.nuget/packages" + - name: XDG_CACHE_HOME + value: "/home/runner/.cache" + - name: RUNNER_TOOL_CACHE + value: "/home/runner/_tool" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: runner-home + mountPath: /home/runner + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: runner-home + emptyDir: {} + - name: nuget-cache + emptyDir: + sizeLimit: 2Gi + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +# Runner for FlowerCore.MySQL. Two replicas use per-pod emptyDir caches, so +# backlog can drain without sharing a ReadWriteOnce PVC. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-mysql + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-mysql + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/runner-repo: mysql + flowercore.io/github-repo: FlowerCore.MySQL +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-mysql + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-mysql + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/runner-repo: mysql + flowercore.io/github-repo: FlowerCore.MySQL + spec: + serviceAccountName: github-runner + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + initContainers: + - name: setup-runner-home + image: busybox:1.36 + command: + - sh + - -c + - | + set -e + mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet + chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget + chmod -R 755 /home/runner/.dotnet /home/runner/.nuget + securityContext: + runAsUser: 0 + runAsNonRoot: false + volumeMounts: + - name: runner-home + mountPath: /home/runner + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.MySQL" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-mysql" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: HOME + value: "/home/runner" + - name: DOTNET_INSTALL_DIR + value: "/home/runner/.dotnet" + - name: DOTNET_CLI_TELEMETRY_OPTOUT + value: "1" + - name: DOTNET_NOLOGO + value: "1" + - name: DOTNET_GENERATE_ASPNET_CERTIFICATE + value: "false" + - 
name: DOTNET_CLI_HOME + value: "/home/runner" + - name: NUGET_PACKAGES + value: "/home/runner/.nuget/packages" + - name: XDG_CACHE_HOME + value: "/home/runner/.cache" + - name: RUNNER_TOOL_CACHE + value: "/home/runner/_tool" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: runner-home + mountPath: /home/runner + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: runner-home + emptyDir: {} + - name: nuget-cache + emptyDir: + sizeLimit: 2Gi + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +# Runner for FlowerCore.Kiosk.Linux. Two replicas use per-pod emptyDir caches, +# so backlog can drain without sharing a ReadWriteOnce PVC. 
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: github-runner-kiosk-linux
+  namespace: github-runner
+  labels:
+    app.kubernetes.io/name: github-runner-kiosk-linux
+    app.kubernetes.io/component: runner
+    app.kubernetes.io/part-of: flowercore
+    app.kubernetes.io/managed-by: argocd
+    flowercore.io/created-by: argocd
+    flowercore.io/runner-repo: kiosk-linux
+    flowercore.io/github-repo: FlowerCore.Kiosk.Linux
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: github-runner-kiosk-linux
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: github-runner-kiosk-linux
+        app.kubernetes.io/component: runner
+        app.kubernetes.io/part-of: flowercore
+        flowercore.io/created-by: argocd
+        flowercore.io/runner-repo: kiosk-linux
+        flowercore.io/github-repo: FlowerCore.Kiosk.Linux
+    spec:
+      serviceAccountName: github-runner
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1001
+        runAsGroup: 1001
+        fsGroup: 1001
+      initContainers:
+        - name: setup-runner-home
+          image: busybox:1.36
+          command:
+            - sh
+            - -c
+            - |
+              set -e
+              mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet
+              chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget
+              chmod -R 755 /home/runner/.dotnet /home/runner/.nuget
+          securityContext:
+            runAsUser: 0
+            runAsNonRoot: false
+          volumeMounts:
+            - name: runner-home
+              mountPath: /home/runner
+      containers:
+        - name: runner
+          image: myoung34/github-runner:latest
+          imagePullPolicy: Always
+          env:
+            - name: REPO_URL
+              value: "https://github.com/astoltz/FlowerCore.Kiosk.Linux"
+            - name: RUNNER_NAME_PREFIX
+              value: "rke2-linux-kiosk-linux"
+            - name: RUNNER_WORKDIR
+              value: "/tmp/runner/work"
+            - name: EPHEMERAL
+              value: "true"
+            - name: LABELS
+              value: "self-hosted,linux,fc-build-linux"
+            - name: HOME
+              value: "/home/runner"
+            - name: DOTNET_INSTALL_DIR
+              value: "/home/runner/.dotnet"
+            - name: DOTNET_CLI_TELEMETRY_OPTOUT
+              value: "1"
+            - name: DOTNET_NOLOGO
+              value: "1"
+            - name: DOTNET_GENERATE_ASPNET_CERTIFICATE
+              value: "false"
+            - name: DOTNET_CLI_HOME
+              value: "/home/runner"
+            - name: NUGET_PACKAGES
+              value: "/home/runner/.nuget/packages"
+            - name: XDG_CACHE_HOME
+              value: "/home/runner/.cache"
+            - name: RUNNER_TOOL_CACHE
+              value: "/home/runner/_tool"
+            - name: ACCESS_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: github-runner-token
+                  key: credential
+            - name: RUN_AS_ROOT
+              value: "false"
+          resources:
+            requests:
+              cpu: "500m"
+              memory: "1Gi"
+            limits:
+              cpu: "2000m"
+              memory: "4Gi"
+          volumeMounts:
+            - name: runner-home
+              mountPath: /home/runner
+            - name: nuget-cache
+              mountPath: /home/runner/.nuget/packages
+            - name: tmp
+              mountPath: /tmp
+          livenessProbe:
+            exec:
+              command:
+                - /bin/sh
+                - -c
+                - "pgrep -f '[R]unner.Listener' > /dev/null"  # [R] stops pgrep -f from matching this probe's own "sh -c" command line
+            initialDelaySeconds: 30
+            periodSeconds: 30
+            failureThreshold: 3
+      volumes:
+        - name: runner-home
+          emptyDir: {}
+        - name: nuget-cache
+          emptyDir:
+            sizeLimit: 2Gi
+        - name: tmp
+          emptyDir: {}
+      restartPolicy: Always
+
+# Long-tail runner pattern:
+#
+# Add lower-volume repos on demand with the same emptyDir runner shape above.
+# Use replicas: 1 by default, replicas: 2 only when queue time proves it is
+# useful. Do not create a multi-replica Deployment that shares one RWO PVC.
+# Common remains the only PVC-backed runner here, and it stays replicas: 1.
diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml
index dd934b4..606776a 100644
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -723,6 +723,24 @@ data:
               summary: "Mac mini GitHub runner offline ({{ $labels.runner }})"
               description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-.plist; runners survive reboot and do not require a GUI session."

+      - name: linux-runners
+        rules:
+          - alert: LinuxRunnerOffline
+            expr: |
+              kube_deployment_status_replicas_ready{
+                namespace="github-runner",
+                deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"
+              } == 0
+            for: 5m
+            labels:
+              severity: warning
+              alert_channel: irc
+              service: github-runner
+              team: ci
+            annotations:
+              summary: "Linux CI runner offline: {{ $labels.deployment }}"
+              description: "Deployment {{ $labels.deployment }} in namespace github-runner has 0 ready replicas for more than 5 minutes. CI jobs targeting this repo will queue until the runner pod restarts and re-registers with GitHub. Check pods with: kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }}. Check logs with: kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50. Common causes: PAT missing repo access, runner CrashLoopBackOff, or node/resource pressure."
+
       - name: remote-desktop
         rules:
           - alert: RemoteDesktopWebDown
@@ -3421,6 +3439,39 @@ data:
                 relativeTimeRange: {from: 120, to: 0}
                 datasourceUid: __expr__
                 model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
+      - orgId: 1
+        name: CI Runners
+        folder: CI Alerts
+        interval: 1m
+        rules:
+          - uid: linux-runner-offline
+            title: LinuxRunnerOffline
+            condition: C
+            for: 5m
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "Linux CI runner offline: {{ $labels.deployment }}"
+              description: "A github-runner namespace Deployment has 0 ready replicas for more than 5 minutes. CI jobs targeting that repo will queue until the runner pod restarts and re-registers."
+              runbook: "1. kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }} 2. kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50 3. Verify PAT repo access if registration returns 404 4. Verify no RWO PVC is shared by scaled runners"
+            labels:
+              severity: warning
+              service: github-runner
+              alert_channel: irc
+              team: ci
+            data:
+              - refId: A
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == bool 0', instant: true, refId: A}  # "bool" makes A return 1 (offline) or 0 (healthy); a plain "== 0" only ever returns the value 0, so the "gt 0" threshold in C could never fire
+              - refId: B
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
       - orgId: 1
         name: Infrastructure
         folder: AI Stack Alerts
diff --git a/tests/bluejay-infra-lint/FleetManifestLintTests.cs b/tests/bluejay-infra-lint/FleetManifestLintTests.cs
index 4bba10f..fb3b98d 100644
--- a/tests/bluejay-infra-lint/FleetManifestLintTests.cs
+++ b/tests/bluejay-infra-lint/FleetManifestLintTests.cs
@@ -54,6 +54,43 @@ public sealed class FleetManifestLintTests
         "ttsreader-piper",
     };

+    private static readonly IReadOnlyDictionary<string, string> LinuxRunnerRepos = new Dictionary<string, string>(StringComparer.Ordinal)
+    {
+        ["github-runner"] = "https://github.com/astoltz/FlowerCore.Common",
+        ["github-runner-sharedpos"] = "https://github.com/astoltz/FlowerCore.Shared.Pos",
+        ["github-runner-puppet"] = "https://github.com/astoltz/FlowerCore.Puppet",
+        ["github-runner-signage"] = "https://github.com/astoltz/FlowerCore.Signage",
+        ["github-runner-dms"] = "https://github.com/astoltz/FlowerCore.DMS",
+        ["github-runner-telephony"] = "https://github.com/astoltz/FlowerCore.Telephony",
+        ["github-runner-print-web"] = "https://github.com/astoltz/FlowerCore.Print.Web",
+        ["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
+        ["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
+        ["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
+    };
+
+    private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
+    {
+        "github-runner-sharedpos",
+        "github-runner-puppet",
+        "github-runner-signage",
+        "github-runner-dms",
+        "github-runner-telephony",
+        "github-runner-print-web",
+        "github-runner-chat",
+        "github-runner-mysql",
+        "github-runner-kiosk-linux",
+    };
+
+    private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
+    {
+        ["HOME"] = "/home/runner",
+        ["DOTNET_INSTALL_DIR"] = "/home/runner/.dotnet",
+        ["DOTNET_CLI_HOME"] = "/home/runner",
+        ["NUGET_PACKAGES"] = "/home/runner/.nuget/packages",
+        ["XDG_CACHE_HOME"] = "/home/runner/.cache",
+        ["RUNNER_TOOL_CACHE"] = "/home/runner/_tool",
+    };
+
+
     [Fact]
     public void IngressRoutes_MustKeepServiceReferencesInTheSameNamespace()
     {
@@ -187,6 +224,98 @@ public sealed class FleetManifestLintTests
         violations.Should().BeEmpty();
     }

+    [Fact]
+    public void GitHubRunnerFleet_MustRegisterRequiredReposAsRepoScopedDeployments()
+    {
+        var deployments = GitHubRunnerDeployments();
+
+        foreach (var expectedRunner in LinuxRunnerRepos)
+        {
+            deployments.Should().ContainKey(expectedRunner.Key);
+
+            var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject;
+            EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
+            EnvValue(container, "EPHEMERAL").Should().Be("true");
+            EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
+            EnvValue(container, "RUN_AS_ROOT").Should().Be("false");
+            EnvValue(container, "ACCESS_TOKEN").Should().BeNull("ACCESS_TOKEN must come from github-runner-token Secret, not a literal");
+            EnvSecretName(container, "ACCESS_TOKEN").Should().Be("github-runner-token");
+            EnvSecretKey(container, "ACCESS_TOKEN").Should().Be("credential");
+        }
+    }
+
+    [Fact]
+    public void GitHubRunnerFleet_MustSetWritableNonRootDotnetAndCachePaths()
+    {
+        foreach (var deployment in GitHubRunnerDeployments().Values)
+        {
+            var container = deployment.ContainerMappings().Should().ContainSingle().Subject;
+
+            foreach (var expectedEnv in WritableRunnerEnv)
+            {
+                EnvValue(container, expectedEnv.Key).Should().Be(expectedEnv.Value, $"{deployment.Name} must keep .NET paths writable for uid 1001");
+            }
+
+            var mounts = ManifestNodeExtensions.MappingSequence(container, "volumeMounts")
+                .ToDictionary(
+                    mount => ManifestNodeExtensions.Scalar(mount, "name") ?? string.Empty,
+                    mount => ManifestNodeExtensions.Scalar(mount, "mountPath") ?? string.Empty,
+                    StringComparer.Ordinal);
+
+            mounts.Should().Contain("runner-home", "/home/runner");
+            mounts.Should().Contain("nuget-cache", "/home/runner/.nuget/packages");
+            mounts.Should().Contain("tmp", "/tmp");
+        }
+    }
+
+    [Fact]
+    public void GitHubRunnerFleet_MustAvoidRwoMultiAttachForScaledDeployments()
+    {
+        var deployments = GitHubRunnerDeployments();
+
+        foreach (var deploymentName in ScaledLinuxRunnerDeployments)
+        {
+            var deployment = deployments[deploymentName];
+            ReplicaCount(deployment).Should().Be(2);
+
+            var volumes = deployment.MappingSequence("spec", "template", "spec", "volumes");
+            var claimNames = volumes
+                .Select(volume => ManifestNodeExtensions.Scalar(volume, "persistentVolumeClaim", "claimName"))
+                .Where(value => !string.IsNullOrWhiteSpace(value))
+                .ToList();
+
+            claimNames.Should().BeEmpty($"{deploymentName} is scaled and must not share a RWO PVC");
+            volumes.Should().Contain(volume =>
+                string.Equals(ManifestNodeExtensions.Scalar(volume, "name"), "nuget-cache", StringComparison.Ordinal)
+                && ManifestNodeExtensions.Mapping(volume, "emptyDir") != null);
+        }
+
+        var common = deployments["github-runner"];
+        ReplicaCount(common).Should().Be(1);
+        common.MappingSequence("spec", "template", "spec", "volumes")
+            .Select(volume => ManifestNodeExtensions.Scalar(volume, "persistentVolumeClaim", "claimName"))
+            .Where(value => !string.IsNullOrWhiteSpace(value))
+            .Should()
+            .ContainSingle()
+            .Which
+            .Should()
+            .Be("github-runner-nuget-cache");
+    }
+
+    [Fact]
+    public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable()
+    {
+        var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml"));
+
+        monitoring.Should().Contain("MacMiniRunnerOffline");
+        monitoring.Should().Contain("LinuxRunnerOffline");
+        monitoring.Should().Contain("kube_deployment_status_replicas_ready");
+        monitoring.Should().Contain("github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))");
+        monitoring.Should().Contain("folder: CI Alerts");
+        monitoring.Should().Contain("uid: linux-runner-offline");
+        monitoring.Should().Contain("alert_channel: irc");
+    }
+
     [Fact]
     public void StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults()
     {
@@ -314,6 +443,44 @@ public sealed class FleetManifestLintTests
                 $"{document.Descriptor} container '{containerName}' still uses {probeKey}.httpGet on /health.",
         };
     }
+
+    private static IReadOnlyDictionary<string, ManifestDocument> GitHubRunnerDeployments()
+    {
+        return Inventory.Documents
+            .Where(document => document.Kind == "Deployment")
+            .Where(document => document.Namespace == "github-runner")
+            .ToDictionary(document => document.Name, StringComparer.Ordinal);
+    }
+
+    private static int ReplicaCount(ManifestDocument document)
+    {
+        return int.TryParse(document.Scalar("spec", "replicas"), out var replicas) ? replicas : 1; // falls back to 1 when spec.replicas is missing or unparsable
+    }
+
+    private static string? EnvValue(YamlMappingNode container, string name)
+    {
+        return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null;
+    }
+
+    private static string? EnvSecretName(YamlMappingNode container, string name)
+    {
+        return EnvMapping(container, name) is { } env
+            ? ManifestNodeExtensions.Scalar(env, "valueFrom", "secretKeyRef", "name")
+            : null;
+    }
+
+    private static string? EnvSecretKey(YamlMappingNode container, string name)
+    {
+        return EnvMapping(container, name) is { } env
+            ? ManifestNodeExtensions.Scalar(env, "valueFrom", "secretKeyRef", "key")
+            : null;
+    }
+
+    private static YamlMappingNode? EnvMapping(YamlMappingNode container, string name)
+    {
+        return ManifestNodeExtensions.MappingSequence(container, "env")
+            .SingleOrDefault(env => string.Equals(ManifestNodeExtensions.Scalar(env, "name"), name, StringComparison.Ordinal));
+    }
 }

 internal sealed class ManifestInventory
-- 
2.49.1