From 266b9cb8be60afc6f873a0e079adc41d0cfa4b61 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 13:55:55 -0500 Subject: [PATCH] feat(github-runner): add top Linux repo runners --- apps/github-runner/README.md | 38 + apps/github-runner/github-runner.yaml | 945 ++++++++++++++++-- apps/monitoring/noc-monitoring.yaml | 40 + .../FleetManifestLintTests.cs | 107 ++ 4 files changed, 1050 insertions(+), 80 deletions(-) create mode 100644 apps/github-runner/README.md diff --git a/apps/github-runner/README.md b/apps/github-runner/README.md new file mode 100644 index 0000000..b5c228d --- /dev/null +++ b/apps/github-runner/README.md @@ -0,0 +1,38 @@ +# github-runner + +ArgoCD-managed repo-scoped Linux GitHub Actions runners for FlowerCore. + +`astoltz` is a GitHub user account, not an organization, so each repository +needs its own runner registration. The existing Common runner remains +`Deployment/github-runner`; Sprint 29 adds one single-replica Deployment for +each top Linux-cost repo: + +- `FlowerCore.Puppet` +- `FlowerCore.Signage` +- `FlowerCore.DMS` +- `FlowerCore.Telephony` +- `FlowerCore.Print.Web` +- `FlowerCore.Chat` +- `FlowerCore.MySQL` +- `FlowerCore.Kiosk.Linux` + +Each runner uses `myoung34/github-runner:latest`, `EPHEMERAL=true`, and labels +`self-hosted,linux,fc-build-linux`. The shared `github-runner-token` Secret is +synced from the existing 1Password item `GitHub PAT (Runner Registration)` and +is consumed as `ACCESS_TOKEN`. + +Do not `kubectl apply` this app over ArgoCD. 
Merge to `main`, let +`infra-github-runner` sync, then verify from `noc1`: + +```bash +kubectl -n github-runner get deploy,pods,pvc + +for repo in FlowerCore.Puppet FlowerCore.Signage FlowerCore.DMS FlowerCore.Telephony FlowerCore.Print.Web FlowerCore.Chat FlowerCore.MySQL FlowerCore.Kiosk.Linux; do + gh api "/repos/astoltz/$repo/actions/runners" \ + --jq '.runners[] | select((.labels[].name == "fc-build-linux") and (.status == "online")) | {name,status,busy,labels:[.labels[].name]}' +done +``` + +`LinuxRunnerOffline` is declared in `apps/monitoring/noc-monitoring.yaml` and +fires when any Common or top-8 Linux runner deployment has no available replica +for 10 minutes. diff --git a/apps/github-runner/github-runner.yaml b/apps/github-runner/github-runner.yaml index fc08fad..66445c7 100644 --- a/apps/github-runner/github-runner.yaml +++ b/apps/github-runner/github-runner.yaml @@ -1,46 +1,23 @@ -# GitHub Actions self-hosted Linux runner — Phase 2 K8s deployment +# GitHub Actions self-hosted Linux runner fleet # -# Phase 1 (current): BLUEJAY-WS registered manually as a Windows runner -# with label "fc-build-windows" via config.cmd (see docs/infrastructure/ -# self-hosted-runner-fleet.md §WPF Build Runner). +# ArgoCD owns this namespace. Do not kubectl-apply ad hoc runner changes over +# it; update this manifest and let the bluejay-infra ApplicationSet reconcile. # -# Phase 2 (this file): ephemeral Linux runner in RKE2 for non-WPF builds -# (Blazor Server, class libraries, operators, integration tests). Reduces -# billing for ubuntu-24.04 jobs that run on GitHub-hosted runners today. +# astoltz is a GitHub user account, not an org, so runners must be repo-scoped. +# Each Deployment below registers exactly one ephemeral myoung34/github-runner +# instance against one private FlowerCore repo using the shared PAT from the +# github-runner-token Secret. 
# -# Runner image: myoung34/github-runner:latest -# EPHEMERAL=true — each pod runs exactly one job then exits; the -# Deployment controller immediately recreates it and re-registers. -# Prevents job queue starvation when two jobs overlap. +# Current shape: +# - Common runner preserved from the phase-2 pilot. +# - Sprint 29 top-8 Linux-cost repos added first: +# Puppet, Signage, DMS, Telephony, Print.Web, Chat, MySQL, Kiosk.Linux. # -# NuGet cache: 5Gi Longhorn RWO PVC mounted at /home/runner/.nuget/packages -# Persists NuGet packages across ephemeral pod restarts (not shared across -# simultaneous runner pods; single-replica constraint below). -# -# Credentials: -# OnePasswordItem "GitHub PAT (Runner Registration)" → Secret -# github-runner-token with field "credential" used as RUNNER_TOKEN. -# Operator must create/rotate the 1P item manually; registration tokens -# expire after 1h — use a fine-grained PAT with Administration:read/write -# scope on the target repos, or a re-registration script. See -# docs/infrastructure/self-hosted-runner-fleet.md §Security. -# -# Security model: -# - No ClusterRole / ClusterRoleBinding — runner has no K8s API access. -# - securityContext: runAsNonRoot with read-only root filesystem where -# possible (runner image needs /tmp and /home/runner writable). -# - Fork pull-request approval required on the GitHub repo settings. -# - RUNNER_ALLOW_RUNASROOT=false is the default. -# -# Cost: Phase 2 eliminates GitHub-hosted ubuntu-24.04 billing; break-even -# vs electricity is ~1 000 min/month at current TOU rates. -# -# Node placement: rke2-server (10.0.56.11) only — Longhorn RWO PVC must -# land on the same node as the volume, and the server node has the most -# spare capacity for burst CI workloads. -# -# Designs: docs/infrastructure/self-hosted-runner-fleet.md -# Questions: Q-CI-1..5 (all Recommended defaults) +# Security: +# - No ClusterRole / ClusterRoleBinding. +# - ServiceAccount has no K8s API privileges. 
+# - Self-hosted runners are for private repos and trusted branches only. +# - Fork pull-request approval must remain required in GitHub repo settings. --- apiVersion: v1 kind: Namespace @@ -52,11 +29,10 @@ metadata: --- # 1Password secret sync — creates github-runner-token K8s Secret. # Fields expected in the 1Password item: -# credential — GitHub fine-grained PAT (Administration:read/write on -# target repos) used by the runner image to mint a fresh -# short-lived registration token at pod start. +# credential — GitHub fine-grained PAT with Administration:read/write on +# each target repo. myoung34/github-runner uses ACCESS_TOKEN to +# mint fresh short-lived registration tokens at pod startup. # Item path: IAmWorkin vault > "GitHub PAT (Runner Registration)" -# Operator MUST create this item before the Deployment will start cleanly. apiVersion: onepassword.com/v1 kind: OnePasswordItem metadata: @@ -69,6 +45,15 @@ spec: itemPath: vaults/IAmWorkin/items/GitHub PAT (Runner Registration) --- apiVersion: v1 +kind: ServiceAccount +metadata: + name: github-runner + namespace: github-runner + labels: + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore +--- +apiVersion: v1 kind: PersistentVolumeClaim metadata: name: github-runner-nuget-cache @@ -76,6 +61,7 @@ metadata: labels: app.kubernetes.io/component: cache app.kubernetes.io/part-of: flowercore + flowercore.io/github-repo: FlowerCore.Common spec: accessModes: - ReadWriteOnce @@ -86,16 +72,148 @@ spec: volumeMode: Filesystem --- apiVersion: v1 -kind: ServiceAccount +kind: PersistentVolumeClaim metadata: - name: github-runner + name: github-runner-puppet-nuget-cache namespace: github-runner labels: - app.kubernetes.io/component: runner + app.kubernetes.io/component: cache app.kubernetes.io/part-of: flowercore -# No ClusterRole or ClusterRoleBinding — runner has zero K8s API privileges. 
-# CI jobs that need kubectl must supply their own kubeconfig via a secret -# injected at the job level, not via this service account. + flowercore.io/github-repo: FlowerCore.Puppet +spec: + accessModes: + - ReadWriteOnce + storageClassName: longhorn + resources: + requests: + storage: 5Gi + volumeMode: Filesystem +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: github-runner-signage-nuget-cache + namespace: github-runner + labels: + app.kubernetes.io/component: cache + app.kubernetes.io/part-of: flowercore + flowercore.io/github-repo: FlowerCore.Signage +spec: + accessModes: + - ReadWriteOnce + storageClassName: longhorn + resources: + requests: + storage: 5Gi + volumeMode: Filesystem +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: github-runner-dms-nuget-cache + namespace: github-runner + labels: + app.kubernetes.io/component: cache + app.kubernetes.io/part-of: flowercore + flowercore.io/github-repo: FlowerCore.DMS +spec: + accessModes: + - ReadWriteOnce + storageClassName: longhorn + resources: + requests: + storage: 5Gi + volumeMode: Filesystem +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: github-runner-telephony-nuget-cache + namespace: github-runner + labels: + app.kubernetes.io/component: cache + app.kubernetes.io/part-of: flowercore + flowercore.io/github-repo: FlowerCore.Telephony +spec: + accessModes: + - ReadWriteOnce + storageClassName: longhorn + resources: + requests: + storage: 5Gi + volumeMode: Filesystem +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: github-runner-print-web-nuget-cache + namespace: github-runner + labels: + app.kubernetes.io/component: cache + app.kubernetes.io/part-of: flowercore + flowercore.io/github-repo: FlowerCore.Print.Web +spec: + accessModes: + - ReadWriteOnce + storageClassName: longhorn + resources: + requests: + storage: 5Gi + volumeMode: Filesystem +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: 
github-runner-chat-nuget-cache + namespace: github-runner + labels: + app.kubernetes.io/component: cache + app.kubernetes.io/part-of: flowercore + flowercore.io/github-repo: FlowerCore.Chat +spec: + accessModes: + - ReadWriteOnce + storageClassName: longhorn + resources: + requests: + storage: 5Gi + volumeMode: Filesystem +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: github-runner-mysql-nuget-cache + namespace: github-runner + labels: + app.kubernetes.io/component: cache + app.kubernetes.io/part-of: flowercore + flowercore.io/github-repo: FlowerCore.MySQL +spec: + accessModes: + - ReadWriteOnce + storageClassName: longhorn + resources: + requests: + storage: 5Gi + volumeMode: Filesystem +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: github-runner-kiosk-linux-nuget-cache + namespace: github-runner + labels: + app.kubernetes.io/component: cache + app.kubernetes.io/part-of: flowercore + flowercore.io/github-repo: FlowerCore.Kiosk.Linux +spec: + accessModes: + - ReadWriteOnce + storageClassName: longhorn + resources: + requests: + storage: 5Gi + volumeMode: Filesystem --- apiVersion: apps/v1 kind: Deployment @@ -108,20 +226,12 @@ metadata: app.kubernetes.io/part-of: flowercore app.kubernetes.io/managed-by: argocd flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Common spec: - # Single replica enforced: the Longhorn RWO PVC can only be mounted by - # one pod at a time. Each pod re-registers as an ephemeral runner after - # completing a job (EPHEMERAL=true restarts the container, not the pod, - # so the PVC stays attached between jobs). - # - # 2026-05-16: bumped 0 -> 1 after operator provisioned the - # "GitHub PAT (Runner Registration)" 1P item with field=credential. - # Unblocks CI fleet-wide (was budget-exhausted on GH-hosted runners). replicas: 1 selector: matchLabels: app.kubernetes.io/name: github-runner - # Use Recreate to avoid the Multi-Attach RWO error during rollouts. 
strategy: type: Recreate template: @@ -131,9 +241,9 @@ spec: app.kubernetes.io/component: runner app.kubernetes.io/part-of: flowercore flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Common spec: serviceAccountName: github-runner - # Pin to rke2-server so the Longhorn RWO volume is always on the same node. nodeSelector: kubernetes.io/hostname: rke2-server securityContext: @@ -146,39 +256,21 @@ spec: image: myoung34/github-runner:latest imagePullPolicy: Always env: - # GitHub org/repo targeting. - # Set REPO_URL for a repo-scoped runner (cheaper, simpler). - # Switch to ORG_NAME + empty REPO_URL for an org-scoped runner. - name: REPO_URL value: "https://github.com/astoltz/FlowerCore.Common" - name: RUNNER_NAME_PREFIX value: "rke2-linux" - name: RUNNER_WORKDIR value: "/tmp/runner/work" - # EPHEMERAL=true: runner deregisters after one job; container - # exits with code 0; Deployment controller restarts it and a - # fresh registration occurs. Prevents stale runner accumulation. - name: EPHEMERAL value: "true" - # Labels used by workflow files: runs-on: [self-hosted, linux, fc-build-linux] - name: LABELS value: "self-hosted,linux,fc-build-linux" - # PAT (not pre-minted registration token) — myoung34/github-runner - # mints registration tokens itself via GitHub API when ACCESS_TOKEN - # is set. Switched from RUNNER_TOKEN -> ACCESS_TOKEN on 2026-05-16 - # because the 1P "GitHub PAT (Runner Registration)" item stores a - # fine-grained PAT, not a short-lived registration token. - name: ACCESS_TOKEN valueFrom: secretKeyRef: name: github-runner-token key: credential - # myoung34/github-runner default entrypoint expects root for some - # setup steps. With securityContext.runAsUser=1001 the entrypoint - # short-circuits with "RUN_AS_ROOT env var is set to true but the - # user has been overridden and is not running as root". 
Tell the - # entrypoint we're explicitly NOT root so it skips the root-only - # setup steps (cache prewarm + apt updates — both already baked). - name: RUN_AS_ROOT value: "false" resources: @@ -193,7 +285,6 @@ spec: mountPath: /home/runner/.nuget/packages - name: tmp mountPath: /tmp - # Liveness: runner process is alive. livenessProbe: exec: command: @@ -209,6 +300,700 @@ spec: claimName: github-runner-nuget-cache - name: tmp emptyDir: {} - # Restart policy: Always — the Deployment controller handles - # re-registration after each ephemeral job completes. + restartPolicy: Always +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-puppet + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-puppet + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Puppet +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-puppet + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-puppet + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Puppet + spec: + serviceAccountName: github-runner + nodeSelector: + kubernetes.io/hostname: rke2-server + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.Puppet" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-puppet" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - 
name: RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: nuget-cache + persistentVolumeClaim: + claimName: github-runner-puppet-nuget-cache + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-signage + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-signage + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Signage +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-signage + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-signage + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Signage + spec: + serviceAccountName: github-runner + nodeSelector: + kubernetes.io/hostname: rke2-server + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.Signage" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-signage" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: 
RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: nuget-cache + persistentVolumeClaim: + claimName: github-runner-signage-nuget-cache + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-dms + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-dms + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.DMS +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-dms + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-dms + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.DMS + spec: + serviceAccountName: github-runner + nodeSelector: + kubernetes.io/hostname: rke2-server + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.DMS" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-dms" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + 
requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: nuget-cache + persistentVolumeClaim: + claimName: github-runner-dms-nuget-cache + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-telephony + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-telephony + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Telephony +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-telephony + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-telephony + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Telephony + spec: + serviceAccountName: github-runner + nodeSelector: + kubernetes.io/hostname: rke2-server + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.Telephony" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-telephony" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + 
requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: nuget-cache + persistentVolumeClaim: + claimName: github-runner-telephony-nuget-cache + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-print-web + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-print-web + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Print.Web +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-print-web + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-print-web + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Print.Web + spec: + serviceAccountName: github-runner + nodeSelector: + kubernetes.io/hostname: rke2-server + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.Print.Web" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-print-web" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + 
resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: nuget-cache + persistentVolumeClaim: + claimName: github-runner-print-web-nuget-cache + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-chat + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-chat + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Chat +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-chat + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-chat + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Chat + spec: + serviceAccountName: github-runner + nodeSelector: + kubernetes.io/hostname: rke2-server + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.Chat" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-chat" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: 
"500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: nuget-cache + persistentVolumeClaim: + claimName: github-runner-chat-nuget-cache + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-mysql + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-mysql + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.MySQL +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-mysql + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-mysql + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.MySQL + spec: + serviceAccountName: github-runner + nodeSelector: + kubernetes.io/hostname: rke2-server + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.MySQL" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-mysql" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: 
+ cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: nuget-cache + persistentVolumeClaim: + claimName: github-runner-mysql-nuget-cache + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-kiosk-linux + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-kiosk-linux + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Kiosk.Linux +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-kiosk-linux + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-kiosk-linux + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/github-repo: FlowerCore.Kiosk.Linux + spec: + serviceAccountName: github-runner + nodeSelector: + kubernetes.io/hostname: rke2-server + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.Kiosk.Linux" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-kiosk-linux" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: "500m" + memory: 
"1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: nuget-cache + persistentVolumeClaim: + claimName: github-runner-kiosk-linux-nuget-cache + - name: tmp + emptyDir: {} restartPolicy: Always diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index 1698160..47fb00e 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -974,6 +974,19 @@ data: summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch" description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC." + - alert: LinuxRunnerOffline + expr: | + kube_deployment_status_replicas_available{namespace="github-runner",deployment=~"github-runner(|-(puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} < 1 + for: 10m + labels: + severity: warning + service: github-runner + alert_channel: thermal_print + annotations: + summary: "Linux GitHub Actions runner offline: {{ $labels.deployment }}" + description: "{{ $labels.deployment }} has no available runner pod for 10 minutes. GitHub jobs using [self-hosted, linux, fc-build-linux] for its repo will queue at $0 until the runner returns." + runbook_url: "https://gitea.iamworkin.lan/bluejay/FlowerCore.Notes/src/branch/master/docs/infrastructure/self-hosted-runner-fleet.md" + # Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM # cascade BEFORE multus is OOM-killed cluster-wide. 
The 2026-05-10 # outage (21h) hit because no alert fired on the rising multus working @@ -3427,6 +3440,33 @@ data: relativeTimeRange: {from: 120, to: 0} datasourceUid: __expr__ model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C} + - uid: linux-runner-offline + title: LinuxRunnerOffline + condition: C + for: 10m + noDataState: Alerting + execErrState: OK + annotations: + summary: Linux GitHub Actions runner offline + description: "A repo-scoped fc-build-linux runner deployment has no available pod. Jobs will queue at $0 until ArgoCD/K8s returns the runner." + runbook_url: "https://gitea.iamworkin.lan/bluejay/FlowerCore.Notes/src/branch/master/docs/infrastructure/self-hosted-runner-fleet.md" + labels: + severity: warning + service: github-runner + alert_channel: thermal_print + data: + - refId: A + relativeTimeRange: {from: 600, to: 0} + datasourceUid: prometheus + model: {expr: 'min by(deployment) (kube_deployment_status_replicas_available{namespace="github-runner",deployment=~"github-runner(|-(puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"})', instant: true, refId: A} + - refId: B + relativeTimeRange: {from: 600, to: 0} + datasourceUid: __expr__ + model: {type: reduce, expression: A, reducer: last, refId: B} + - refId: C + relativeTimeRange: {from: 600, to: 0} + datasourceUid: __expr__ + model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C} - uid: high-cpu title: High CPU (>85%) condition: C diff --git a/tests/bluejay-infra-lint/FleetManifestLintTests.cs b/tests/bluejay-infra-lint/FleetManifestLintTests.cs index 4bba10f..be6712f 100644 --- a/tests/bluejay-infra-lint/FleetManifestLintTests.cs +++ b/tests/bluejay-infra-lint/FleetManifestLintTests.cs @@ -54,6 +54,18 @@ public sealed class FleetManifestLintTests "ttsreader-piper", }; + private static readonly IReadOnlyDictionary<string, string> TopLinuxRunnerRepos = new Dictionary<string, string>(StringComparer.Ordinal) /* Deployment name in the github-runner namespace -> REPO_URL that Deployment's runner container must declare. */ + { +
["github-runner-puppet"] = "https://github.com/astoltz/FlowerCore.Puppet", + ["github-runner-signage"] = "https://github.com/astoltz/FlowerCore.Signage", + ["github-runner-dms"] = "https://github.com/astoltz/FlowerCore.DMS", + ["github-runner-telephony"] = "https://github.com/astoltz/FlowerCore.Telephony", + ["github-runner-print-web"] = "https://github.com/astoltz/FlowerCore.Print.Web", + ["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat", + ["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL", + ["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux", + }; + [Fact] public void IngressRoutes_MustKeepServiceReferencesInTheSameNamespace() { @@ -187,6 +199,76 @@ public sealed class FleetManifestLintTests violations.Should().BeEmpty(); } + [Fact] + public void GitHubRunnerFleet_MustRegisterTopLinuxReposAsRepoScopedDeployments() + { + var deployments = Inventory.Documents + .Where(document => document.Kind == "Deployment") + .Where(document => document.Namespace == "github-runner") + .ToDictionary(document => document.Name, StringComparer.Ordinal); + + foreach (var expectedRunner in TopLinuxRunnerRepos) + { + deployments.Should().ContainKey(expectedRunner.Key); + + var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject; + EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value); + EnvValue(container, "EPHEMERAL").Should().Be("true"); + EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux"); + EnvValue(container, "ACCESS_TOKEN").Should().BeNull("ACCESS_TOKEN must come from github-runner-token Secret, not a literal"); + EnvSecretName(container, "ACCESS_TOKEN").Should().Be("github-runner-token"); + EnvSecretKey(container, "ACCESS_TOKEN").Should().Be("credential"); + } + } + + [Fact] + public void GitHubRunnerFleet_MustPreserveExistingCommonRunnerShape() + { + var common = Inventory.Documents + .Single(document => document.Kind ==
"Deployment" + && document.Namespace == "github-runner" + && document.Name == "github-runner"); + + var container = common.ContainerMappings().Should().ContainSingle().Subject; + EnvValue(container, "REPO_URL").Should().Be("https://github.com/astoltz/FlowerCore.Common"); + EnvValue(container, "RUNNER_NAME_PREFIX").Should().Be("rke2-linux"); + EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux"); + + var claimNames = common.MappingSequence("spec", "template", "spec", "volumes") + .Select(volume => ManifestNodeExtensions.Scalar(volume, "persistentVolumeClaim", "claimName")) + .Where(value => !string.IsNullOrWhiteSpace(value)) + .ToList(); + + claimNames.Should().Contain("github-runner-nuget-cache"); + } + + [Fact] + public void GitHubRunnerFleet_MustUseOneRwoCachePerRepoScopedDeployment() + { + var pvcNames = Inventory.Documents + .Where(document => document.Kind == "PersistentVolumeClaim") + .Where(document => document.Namespace == "github-runner") + .Select(document => document.Name) + .ToHashSet(StringComparer.Ordinal); + + foreach (var deploymentName in TopLinuxRunnerRepos.Keys) + { + var suffix = deploymentName["github-runner-".Length..]; + pvcNames.Should().Contain($"github-runner-{suffix}-nuget-cache"); + } + } + + [Fact] + public void Monitoring_MustAlertWhenTopLinuxRunnerDeploymentIsUnavailable() + { + var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml")); + + monitoring.Should().Contain("LinuxRunnerOffline"); + monitoring.Should().Contain("kube_deployment_status_replicas_available{namespace=\"github-runner\""); + monitoring.Should().Contain("github-runner(|-(puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"); + monitoring.Should().Contain("runbook_url: \"https://gitea.iamworkin.lan/bluejay/FlowerCore.Notes/src/branch/master/docs/infrastructure/self-hosted-runner-fleet.md\""); + } + [Fact] public void 
StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults() { @@ -314,6 +396,31 @@ public sealed class FleetManifestLintTests $"{document.Descriptor} container '{containerName}' still uses {probeKey}.httpGet on /health.", }; } + + private static string? EnvValue(YamlMappingNode container, string name) + { + return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null; + } + + private static string? EnvSecretName(YamlMappingNode container, string name) + { + return EnvMapping(container, name) is { } env + ? ManifestNodeExtensions.Scalar(env, "valueFrom", "secretKeyRef", "name") + : null; + } + + private static string? EnvSecretKey(YamlMappingNode container, string name) + { + return EnvMapping(container, name) is { } env + ? ManifestNodeExtensions.Scalar(env, "valueFrom", "secretKeyRef", "key") + : null; + } + + private static YamlMappingNode? EnvMapping(YamlMappingNode container, string name) + { + return ManifestNodeExtensions.MappingSequence(container, "env") + .SingleOrDefault(env => string.Equals(ManifestNodeExtensions.Scalar(env, "name"), name, StringComparison.Ordinal)); + } } internal sealed class ManifestInventory