From 9a4a8264d9ffa8cbc44a024020ce51d5807d5e4b Mon Sep 17 00:00:00 2001 From: Andrew Stoltz Date: Mon, 18 May 2026 17:44:29 -0500 Subject: [PATCH] github-runner: add DM and WorldBuilder runners --- apps/github-runner/README.md | 20 +- apps/github-runner/github-runner.yaml | 270 +++++++++++++++++- apps/monitoring/noc-monitoring.yaml | 4 +- .../FleetManifestLintTests.cs | 19 +- 4 files changed, 304 insertions(+), 9 deletions(-) diff --git a/apps/github-runner/README.md b/apps/github-runner/README.md index a7e3552..e4fa672 100644 --- a/apps/github-runner/README.md +++ b/apps/github-runner/README.md @@ -28,6 +28,10 @@ Sprint 32 final long-tail wave adds 16 two-replica Deployments: `FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and `FlowerCore.MenuBoard`. +Sprint 37 Cx-2 closes the audited Linux runner gaps for +`FlowerCore.DeviceManagement` and `FlowerCore.WorldBuilder` with the same +two-replica `emptyDir` pattern. + ## Post-Merge Proof After the PR is merged and ArgoCD syncs, verify the runner fleet: @@ -47,7 +51,7 @@ for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore FlowerCore.Distribution FlowerCore.Scoreboard FlowerCore.SegmentDisplay \ FlowerCore.Signage.Contracts FlowerCore.SignalControl FlowerCore.Intranet.Web \ FlowerCore.Provisioning FlowerCore.Redis FlowerCore.MessageBoard \ - FlowerCore.MenuBoard; do + FlowerCore.MenuBoard FlowerCore.DeviceManagement FlowerCore.WorldBuilder; do echo "=== $repo ===" gh api "/repos/astoltz/$repo/actions/runners" \ --jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}' @@ -64,6 +68,20 @@ gh run list --repo astoltz/FlowerCore.Shared.Pos \ If the latest run is still queued after runner registration, rerun the workflow from GitHub Actions and verify it lands on an `rke2-linux-*` runner. 
+## Sprint 37 Cx-2 Gap Audit + +The 2026-05-18 GitHub workflow scan found these remaining repos with +`runs-on: [self-hosted, linux, fc-build-linux]` but no K8s runner Deployment: +`FlowerCore.AiStation.Linux`, `FlowerCore.PHP`, `FlowerCore.PiManager`, +`FlowerCore.Shared.Barcodes`, `FlowerCore.Shared.Lookup`, +`FlowerCore.Shared.Nodes`, `FlowerCore.Shared.PrintClient`, +`FlowerCore.Shared.Relay`, `FlowerCore.Shared.ShowRunner`, and +`FlowerCore.Shared.Storage`. + +Mixed/platform repos also have Linux workflow legs but need owner review before +adding Linux runner Deployments: `FlowerCore.Library.Mac`, +`FlowerCore.Signage.Agent.AppleTv`, and `FlowerCore.Signage.Player.Wpf`. + ## Failure Notes - `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that diff --git a/apps/github-runner/github-runner.yaml b/apps/github-runner/github-runner.yaml index 957391f..ec6159a 100644 --- a/apps/github-runner/github-runner.yaml +++ b/apps/github-runner/github-runner.yaml @@ -16,6 +16,8 @@ # DNS, Distribution, Scoreboard, SegmentDisplay, Signage.Contracts, # SignalControl, Intranet.Web, Provisioning, Redis, MessageBoard, MenuBoard # (Sprint 32 final long-tail wave; two replicas each, emptyDir cache) +# FlowerCore.DeviceManagement, WorldBuilder (Sprint 37 Cx-2 runner gap +# closure; two replicas each, emptyDir cache) # # Non-root CI safety: # Runner pods run as uid 1001. HOME, DOTNET_INSTALL_DIR, DOTNET_CLI_HOME, @@ -3767,9 +3769,271 @@ spec: - name: tmp emptyDir: {} restartPolicy: Always +--- +# Runner for FlowerCore.DeviceManagement. Added 2026-05-18 (Sprint 37 Cx-2) +# to close the Linux CI capacity gap for the DM service-tier workflows. Mirrors +# the Sprint 32 long-tail emptyDir pattern: two replicas, shared +# 1Password-backed ACCESS_TOKEN, and the common ServiceAccount. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-devicemgmt + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-devicemgmt + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/runner-repo: devicemgmt + flowercore.io/github-repo: FlowerCore.DeviceManagement +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-devicemgmt + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-devicemgmt + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/runner-repo: devicemgmt + flowercore.io/github-repo: FlowerCore.DeviceManagement + spec: + serviceAccountName: github-runner + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + initContainers: + - name: setup-runner-home + image: busybox:1.36 + command: + - sh + - -c + - | + set -e + mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet + chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget + chmod -R 755 /home/runner/.dotnet /home/runner/.nuget + securityContext: + runAsUser: 0 + runAsNonRoot: false + volumeMounts: + - name: runner-home + mountPath: /home/runner + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.DeviceManagement" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-devicemgmt" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: HOME + value: "/home/runner" + - name: DOTNET_INSTALL_DIR + value: "/home/runner/.dotnet" + - name: DOTNET_CLI_TELEMETRY_OPTOUT + value: "1" + - name: DOTNET_NOLOGO + value: 
"1" + - name: DOTNET_GENERATE_ASPNET_CERTIFICATE + value: "false" + - name: DOTNET_CLI_HOME + value: "/home/runner" + - name: NUGET_PACKAGES + value: "/home/runner/.nuget/packages" + - name: XDG_CACHE_HOME + value: "/home/runner/.cache" + - name: RUNNER_TOOL_CACHE + value: "/home/runner/_tool" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: runner-home + mountPath: /home/runner + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: runner-home + emptyDir: {} + - name: nuget-cache + emptyDir: + sizeLimit: 2Gi + - name: tmp + emptyDir: {} + restartPolicy: Always +--- +# Runner for FlowerCore.WorldBuilder. Added 2026-05-18 (Sprint 37 Cx-2) +# to unblock WorldBuilder Linux CI jobs after the runner fleet audit found no +# repo-scoped deployment for the GitHub repo. Mirrors the Sprint 32 long-tail +# emptyDir pattern: two replicas, shared 1Password-backed ACCESS_TOKEN, and +# the common ServiceAccount. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: github-runner-worldbuilder + namespace: github-runner + labels: + app.kubernetes.io/name: github-runner-worldbuilder + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + app.kubernetes.io/managed-by: argocd + flowercore.io/created-by: argocd + flowercore.io/runner-repo: worldbuilder + flowercore.io/github-repo: FlowerCore.WorldBuilder +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: github-runner-worldbuilder + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: github-runner-worldbuilder + app.kubernetes.io/component: runner + app.kubernetes.io/part-of: flowercore + flowercore.io/created-by: argocd + flowercore.io/runner-repo: worldbuilder + flowercore.io/github-repo: FlowerCore.WorldBuilder + spec: + serviceAccountName: github-runner + securityContext: + runAsNonRoot: true + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + initContainers: + - name: setup-runner-home + image: busybox:1.36 + command: + - sh + - -c + - | + set -e + mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet + chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget + chmod -R 755 /home/runner/.dotnet /home/runner/.nuget + securityContext: + runAsUser: 0 + runAsNonRoot: false + volumeMounts: + - name: runner-home + mountPath: /home/runner + containers: + - name: runner + image: myoung34/github-runner:latest + imagePullPolicy: Always + env: + - name: REPO_URL + value: "https://github.com/astoltz/FlowerCore.WorldBuilder" + - name: RUNNER_NAME_PREFIX + value: "rke2-linux-worldbuilder" + - name: RUNNER_WORKDIR + value: "/tmp/runner/work" + - name: EPHEMERAL + value: "true" + - name: LABELS + value: "self-hosted,linux,fc-build-linux" + - name: HOME + value: "/home/runner" + - name: DOTNET_INSTALL_DIR + value: "/home/runner/.dotnet" + - name: DOTNET_CLI_TELEMETRY_OPTOUT + value: "1" + - name: DOTNET_NOLOGO + value: 
"1" + - name: DOTNET_GENERATE_ASPNET_CERTIFICATE + value: "false" + - name: DOTNET_CLI_HOME + value: "/home/runner" + - name: NUGET_PACKAGES + value: "/home/runner/.nuget/packages" + - name: XDG_CACHE_HOME + value: "/home/runner/.cache" + - name: RUNNER_TOOL_CACHE + value: "/home/runner/_tool" + - name: ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: github-runner-token + key: credential + - name: RUN_AS_ROOT + value: "false" + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + volumeMounts: + - name: runner-home + mountPath: /home/runner + - name: nuget-cache + mountPath: /home/runner/.nuget/packages + - name: tmp + mountPath: /tmp + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "pgrep -f Runner.Listener > /dev/null" + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + volumes: + - name: runner-home + emptyDir: {} + - name: nuget-cache + emptyDir: + sizeLimit: 2Gi + - name: tmp + emptyDir: {} + restartPolicy: Always # Long-tail runner pattern: # -# Sprint 32 added the final 16 long-tail repo-scoped Deployments above. Keep -# Common as the only PVC-backed runner at replicas: 1. Any future multi-replica -# runner must use per-pod emptyDir caches, not a shared ReadWriteOnce PVC. +# Sprint 32 added the final 16 long-tail repo-scoped Deployments, and Sprint 37 +# added the DM + WorldBuilder runner gap closures above. Keep Common as the +# only PVC-backed runner at replicas: 1. Any future multi-replica runner must +# use per-pod emptyDir caches, not a shared ReadWriteOnce PVC. 
diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index ea1b175..461830a 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -729,7 +729,7 @@ data: expr: | kube_deployment_status_replicas_ready{ namespace="github-runner", - deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))" + deployment=~"github-runner(|-.+)" } == 0 for: 5m labels: @@ -3509,7 +3509,7 @@ data: - refId: A relativeTimeRange: {from: 300, to: 0} datasourceUid: prometheus - model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == 0', instant: true, refId: A} + model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-.+)"} == 0', instant: true, refId: A} - refId: B relativeTimeRange: {from: 300, to: 0} datasourceUid: __expr__ diff --git a/tests/bluejay-infra-lint/FleetManifestLintTests.cs b/tests/bluejay-infra-lint/FleetManifestLintTests.cs index eb9683d..cc5d166 100644 --- a/tests/bluejay-infra-lint/FleetManifestLintTests.cs +++ b/tests/bluejay-infra-lint/FleetManifestLintTests.cs @@ -67,6 +67,8 @@ public sealed class FleetManifestLintTests ["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat", ["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL", ["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux", + ["github-runner-devicemgmt"] = "https://github.com/astoltz/FlowerCore.DeviceManagement", + ["github-runner-worldbuilder"] = "https://github.com/astoltz/FlowerCore.WorldBuilder", }; private static readonly HashSet ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal) @@ -80,6 +82,8 @@ public sealed class FleetManifestLintTests "github-runner-chat", "github-runner-mysql", "github-runner-kiosk-linux", + "github-runner-devicemgmt", 
+ "github-runner-worldbuilder", }; private static readonly IReadOnlyDictionary WritableRunnerEnv = new Dictionary(StringComparer.Ordinal) @@ -234,7 +238,7 @@ public sealed class FleetManifestLintTests { deployments.Should().ContainKey(expectedRunner.Key); - var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject; + var container = RunnerContainer(deployments[expectedRunner.Key]); EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value); EnvValue(container, "EPHEMERAL").Should().Be("true"); EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux"); @@ -250,7 +254,7 @@ public sealed class FleetManifestLintTests { foreach (var deployment in GitHubRunnerDeployments().Values) { - var container = deployment.ContainerMappings().Should().ContainSingle().Subject; + var container = RunnerContainer(deployment); foreach (var expectedEnv in WritableRunnerEnv) { @@ -311,7 +315,7 @@ public sealed class FleetManifestLintTests monitoring.Should().Contain("MacMiniRunnerOffline"); monitoring.Should().Contain("LinuxRunnerOffline"); monitoring.Should().Contain("kube_deployment_status_replicas_ready"); - monitoring.Should().Contain("github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"); + monitoring.Should().Contain("github-runner(|-.+)"); monitoring.Should().Contain("folder: CI Alerts"); monitoring.Should().Contain("uid: linux-runner-offline"); monitoring.Should().Contain("alert_channel: irc"); @@ -641,6 +645,15 @@ public sealed class FleetManifestLintTests return EnvMapping(container, name) is { } env ? 
ManifestNodeExtensions.Scalar(env, "value") : null; } + // Returns the one container named "runner"; the predicate goes to ContainSingle so a miss reports "no such item" instead of an empty collection. + private static YamlMappingNode RunnerContainer(ManifestDocument deployment) + { + return deployment.ContainerMappings() + .Should() + .ContainSingle(container => string.Equals(ManifestNodeExtensions.Scalar(container, "name"), "runner", StringComparison.Ordinal), $"{deployment.Name} must keep exactly one main runner container") + .Subject; + } + private static string? EnvSecretName(YamlMappingNode container, string name) { return EnvMapping(container, name) is { } env -- 2.49.1