Sprint 37 Cx-2: add DM and WorldBuilder Linux runners #14

Open
bluejay wants to merge 1 commits from sprint37/cx-2-linux-runner-expansion into main
4 changed files with 304 additions and 9 deletions
Showing only changes of commit 9a4a8264d9 - Show all commits

View File

@@ -28,6 +28,10 @@ Sprint 32 final long-tail wave adds 16 two-replica Deployments:
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and `FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
`FlowerCore.MenuBoard`. `FlowerCore.MenuBoard`.
Sprint 37 Cx-2 closes the audited Linux runner gaps for
`FlowerCore.DeviceManagement` and `FlowerCore.WorldBuilder` with the same
two-replica `emptyDir` pattern.
## Post-Merge Proof ## Post-Merge Proof
After the PR is merged and ArgoCD syncs, verify the runner fleet: After the PR is merged and ArgoCD syncs, verify the runner fleet:
@@ -47,7 +51,7 @@ for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore
FlowerCore.Distribution FlowerCore.Scoreboard FlowerCore.SegmentDisplay \ FlowerCore.Distribution FlowerCore.Scoreboard FlowerCore.SegmentDisplay \
FlowerCore.Signage.Contracts FlowerCore.SignalControl FlowerCore.Intranet.Web \ FlowerCore.Signage.Contracts FlowerCore.SignalControl FlowerCore.Intranet.Web \
FlowerCore.Provisioning FlowerCore.Redis FlowerCore.MessageBoard \ FlowerCore.Provisioning FlowerCore.Redis FlowerCore.MessageBoard \
FlowerCore.MenuBoard; do FlowerCore.MenuBoard FlowerCore.DeviceManagement FlowerCore.WorldBuilder; do
echo "=== $repo ===" echo "=== $repo ==="
gh api "/repos/astoltz/$repo/actions/runners" \ gh api "/repos/astoltz/$repo/actions/runners" \
--jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}' --jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}'
@@ -64,6 +68,20 @@ gh run list --repo astoltz/FlowerCore.Shared.Pos \
If the latest run is still queued after runner registration, rerun the workflow If the latest run is still queued after runner registration, rerun the workflow
from GitHub Actions and verify it lands on an `rke2-linux-*` runner. from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
## Sprint 37 Cx-2 Gap Audit
The 2026-05-18 GitHub workflow scan found these remaining repos with
`runs-on: [self-hosted, linux, fc-build-linux]` but no K8s runner Deployment:
`FlowerCore.AiStation.Linux`, `FlowerCore.PHP`, `FlowerCore.PiManager`,
`FlowerCore.Shared.Barcodes`, `FlowerCore.Shared.Lookup`,
`FlowerCore.Shared.Nodes`, `FlowerCore.Shared.PrintClient`,
`FlowerCore.Shared.Relay`, `FlowerCore.Shared.ShowRunner`, and
`FlowerCore.Shared.Storage`.
Mixed/platform repos also have Linux workflow legs but need owner review before
adding Linux runner Deployments: `FlowerCore.Library.Mac`,
`FlowerCore.Signage.Agent.AppleTv`, and `FlowerCore.Signage.Player.Wpf`.
## Failure Notes ## Failure Notes
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that - `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that

View File

@@ -16,6 +16,8 @@
# DNS, Distribution, Scoreboard, SegmentDisplay, Signage.Contracts, # DNS, Distribution, Scoreboard, SegmentDisplay, Signage.Contracts,
# SignalControl, Intranet.Web, Provisioning, Redis, MessageBoard, MenuBoard # SignalControl, Intranet.Web, Provisioning, Redis, MessageBoard, MenuBoard
# (Sprint 32 final long-tail wave; two replicas each, emptyDir cache) # (Sprint 32 final long-tail wave; two replicas each, emptyDir cache)
# FlowerCore.DeviceManagement, WorldBuilder (Sprint 37 Cx-2 runner gap
# closure; two replicas each, emptyDir cache)
# #
# Non-root CI safety: # Non-root CI safety:
# Runner pods run as uid 1001. HOME, DOTNET_INSTALL_DIR, DOTNET_CLI_HOME, # Runner pods run as uid 1001. HOME, DOTNET_INSTALL_DIR, DOTNET_CLI_HOME,
@@ -3767,9 +3769,271 @@ spec:
- name: tmp - name: tmp
emptyDir: {} emptyDir: {}
restartPolicy: Always restartPolicy: Always
---
# Runner for FlowerCore.DeviceManagement. Added 2026-05-18 (Sprint 37 Cx-2)
# to close the Linux CI capacity gap for the DM service-tier workflows. Mirrors
# the Sprint 32 long-tail emptyDir pattern: two replicas, shared
# 1Password-backed ACCESS_TOKEN, and the common ServiceAccount.
apiVersion: apps/v1
kind: Deployment
metadata:
name: github-runner-devicemgmt
namespace: github-runner
labels:
app.kubernetes.io/name: github-runner-devicemgmt
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/created-by: argocd
flowercore.io/runner-repo: devicemgmt
flowercore.io/github-repo: FlowerCore.DeviceManagement
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/name: github-runner-devicemgmt
strategy:
type: Recreate
template:
metadata:
labels:
app.kubernetes.io/name: github-runner-devicemgmt
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
flowercore.io/created-by: argocd
flowercore.io/runner-repo: devicemgmt
flowercore.io/github-repo: FlowerCore.DeviceManagement
spec:
serviceAccountName: github-runner
securityContext:
runAsNonRoot: true
runAsUser: 1001
runAsGroup: 1001
fsGroup: 1001
initContainers:
- name: setup-runner-home
image: busybox:1.36
command:
- sh
- -c
- |
set -e
mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet
chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget
chmod -R 755 /home/runner/.dotnet /home/runner/.nuget
securityContext:
runAsUser: 0
runAsNonRoot: false
volumeMounts:
- name: runner-home
mountPath: /home/runner
containers:
- name: runner
image: myoung34/github-runner:latest
imagePullPolicy: Always
env:
- name: REPO_URL
value: "https://github.com/astoltz/FlowerCore.DeviceManagement"
- name: RUNNER_NAME_PREFIX
value: "rke2-linux-devicemgmt"
- name: RUNNER_WORKDIR
value: "/tmp/runner/work"
- name: EPHEMERAL
value: "true"
- name: LABELS
value: "self-hosted,linux,fc-build-linux"
- name: HOME
value: "/home/runner"
- name: DOTNET_INSTALL_DIR
value: "/home/runner/.dotnet"
- name: DOTNET_CLI_TELEMETRY_OPTOUT
value: "1"
- name: DOTNET_NOLOGO
value: "1"
- name: DOTNET_GENERATE_ASPNET_CERTIFICATE
value: "false"
- name: DOTNET_CLI_HOME
value: "/home/runner"
- name: NUGET_PACKAGES
value: "/home/runner/.nuget/packages"
- name: XDG_CACHE_HOME
value: "/home/runner/.cache"
- name: RUNNER_TOOL_CACHE
value: "/home/runner/_tool"
- name: ACCESS_TOKEN
valueFrom:
secretKeyRef:
name: github-runner-token
key: credential
- name: RUN_AS_ROOT
value: "false"
resources:
requests:
cpu: "500m"
memory: "1Gi"
limits:
cpu: "2000m"
memory: "4Gi"
volumeMounts:
- name: runner-home
mountPath: /home/runner
- name: nuget-cache
mountPath: /home/runner/.nuget/packages
- name: tmp
mountPath: /tmp
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "pgrep -f Runner.Listener > /dev/null"
initialDelaySeconds: 30
periodSeconds: 30
failureThreshold: 3
volumes:
- name: runner-home
emptyDir: {}
- name: nuget-cache
emptyDir:
sizeLimit: 2Gi
- name: tmp
emptyDir: {}
restartPolicy: Always
---
# Runner for FlowerCore.WorldBuilder. Added 2026-05-18 (Sprint 37 Cx-2)
# to unblock WorldBuilder Linux CI jobs after the runner fleet audit found no
# repo-scoped deployment for the GitHub repo. Mirrors the Sprint 32 long-tail
# emptyDir pattern: two replicas, shared 1Password-backed ACCESS_TOKEN, and
# the common ServiceAccount.
apiVersion: apps/v1
kind: Deployment
metadata:
name: github-runner-worldbuilder
namespace: github-runner
labels:
app.kubernetes.io/name: github-runner-worldbuilder
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/created-by: argocd
flowercore.io/runner-repo: worldbuilder
flowercore.io/github-repo: FlowerCore.WorldBuilder
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/name: github-runner-worldbuilder
strategy:
type: Recreate
template:
metadata:
labels:
app.kubernetes.io/name: github-runner-worldbuilder
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
flowercore.io/created-by: argocd
flowercore.io/runner-repo: worldbuilder
flowercore.io/github-repo: FlowerCore.WorldBuilder
spec:
serviceAccountName: github-runner
securityContext:
runAsNonRoot: true
runAsUser: 1001
runAsGroup: 1001
fsGroup: 1001
initContainers:
- name: setup-runner-home
image: busybox:1.36
command:
- sh
- -c
- |
set -e
mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet
chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget
chmod -R 755 /home/runner/.dotnet /home/runner/.nuget
securityContext:
runAsUser: 0
runAsNonRoot: false
volumeMounts:
- name: runner-home
mountPath: /home/runner
containers:
- name: runner
image: myoung34/github-runner:latest
imagePullPolicy: Always
env:
- name: REPO_URL
value: "https://github.com/astoltz/FlowerCore.WorldBuilder"
- name: RUNNER_NAME_PREFIX
value: "rke2-linux-worldbuilder"
- name: RUNNER_WORKDIR
value: "/tmp/runner/work"
- name: EPHEMERAL
value: "true"
- name: LABELS
value: "self-hosted,linux,fc-build-linux"
- name: HOME
value: "/home/runner"
- name: DOTNET_INSTALL_DIR
value: "/home/runner/.dotnet"
- name: DOTNET_CLI_TELEMETRY_OPTOUT
value: "1"
- name: DOTNET_NOLOGO
value: "1"
- name: DOTNET_GENERATE_ASPNET_CERTIFICATE
value: "false"
- name: DOTNET_CLI_HOME
value: "/home/runner"
- name: NUGET_PACKAGES
value: "/home/runner/.nuget/packages"
- name: XDG_CACHE_HOME
value: "/home/runner/.cache"
- name: RUNNER_TOOL_CACHE
value: "/home/runner/_tool"
- name: ACCESS_TOKEN
valueFrom:
secretKeyRef:
name: github-runner-token
key: credential
- name: RUN_AS_ROOT
value: "false"
resources:
requests:
cpu: "500m"
memory: "1Gi"
limits:
cpu: "2000m"
memory: "4Gi"
volumeMounts:
- name: runner-home
mountPath: /home/runner
- name: nuget-cache
mountPath: /home/runner/.nuget/packages
- name: tmp
mountPath: /tmp
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "pgrep -f Runner.Listener > /dev/null"
initialDelaySeconds: 30
periodSeconds: 30
failureThreshold: 3
volumes:
- name: runner-home
emptyDir: {}
- name: nuget-cache
emptyDir:
sizeLimit: 2Gi
- name: tmp
emptyDir: {}
restartPolicy: Always
# Long-tail runner pattern: # Long-tail runner pattern:
# #
# Sprint 32 added the final 16 long-tail repo-scoped Deployments above. Keep # Sprint 32 added the final 16 long-tail repo-scoped Deployments, and Sprint 37
# Common as the only PVC-backed runner at replicas: 1. Any future multi-replica # added the DM + WorldBuilder runner gap closures above. Keep Common as the
# runner must use per-pod emptyDir caches, not a shared ReadWriteOnce PVC. # only PVC-backed runner at replicas: 1. Any future multi-replica runner must
# use per-pod emptyDir caches, not a shared ReadWriteOnce PVC.

View File

@@ -729,7 +729,7 @@ data:
expr: | expr: |
kube_deployment_status_replicas_ready{ kube_deployment_status_replicas_ready{
namespace="github-runner", namespace="github-runner",
deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))" deployment=~"github-runner(|-.+)"
} == 0 } == 0
for: 5m for: 5m
labels: labels:
@@ -3509,7 +3509,7 @@ data:
- refId: A - refId: A
relativeTimeRange: {from: 300, to: 0} relativeTimeRange: {from: 300, to: 0}
datasourceUid: prometheus datasourceUid: prometheus
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == 0', instant: true, refId: A} model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-.+)"} == 0', instant: true, refId: A}
- refId: B - refId: B
relativeTimeRange: {from: 300, to: 0} relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__ datasourceUid: __expr__

View File

@@ -67,6 +67,8 @@ public sealed class FleetManifestLintTests
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat", ["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL", ["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux", ["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
["github-runner-devicemgmt"] = "https://github.com/astoltz/FlowerCore.DeviceManagement",
["github-runner-worldbuilder"] = "https://github.com/astoltz/FlowerCore.WorldBuilder",
}; };
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal) private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
@@ -80,6 +82,8 @@ public sealed class FleetManifestLintTests
"github-runner-chat", "github-runner-chat",
"github-runner-mysql", "github-runner-mysql",
"github-runner-kiosk-linux", "github-runner-kiosk-linux",
"github-runner-devicemgmt",
"github-runner-worldbuilder",
}; };
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal) private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
@@ -234,7 +238,7 @@ public sealed class FleetManifestLintTests
{ {
deployments.Should().ContainKey(expectedRunner.Key); deployments.Should().ContainKey(expectedRunner.Key);
var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject; var container = RunnerContainer(deployments[expectedRunner.Key]);
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value); EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
EnvValue(container, "EPHEMERAL").Should().Be("true"); EnvValue(container, "EPHEMERAL").Should().Be("true");
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux"); EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
@@ -250,7 +254,7 @@ public sealed class FleetManifestLintTests
{ {
foreach (var deployment in GitHubRunnerDeployments().Values) foreach (var deployment in GitHubRunnerDeployments().Values)
{ {
var container = deployment.ContainerMappings().Should().ContainSingle().Subject; var container = RunnerContainer(deployment);
foreach (var expectedEnv in WritableRunnerEnv) foreach (var expectedEnv in WritableRunnerEnv)
{ {
@@ -311,7 +315,7 @@ public sealed class FleetManifestLintTests
monitoring.Should().Contain("MacMiniRunnerOffline"); monitoring.Should().Contain("MacMiniRunnerOffline");
monitoring.Should().Contain("LinuxRunnerOffline"); monitoring.Should().Contain("LinuxRunnerOffline");
monitoring.Should().Contain("kube_deployment_status_replicas_ready"); monitoring.Should().Contain("kube_deployment_status_replicas_ready");
monitoring.Should().Contain("github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"); monitoring.Should().Contain("github-runner(|-.+)");
monitoring.Should().Contain("folder: CI Alerts"); monitoring.Should().Contain("folder: CI Alerts");
monitoring.Should().Contain("uid: linux-runner-offline"); monitoring.Should().Contain("uid: linux-runner-offline");
monitoring.Should().Contain("alert_channel: irc"); monitoring.Should().Contain("alert_channel: irc");
@@ -641,6 +645,15 @@ public sealed class FleetManifestLintTests
return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null; return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null;
} }
private static YamlMappingNode RunnerContainer(ManifestDocument deployment)
{
return deployment.ContainerMappings()
.Where(container => string.Equals(ManifestNodeExtensions.Scalar(container, "name"), "runner", StringComparison.Ordinal))
.Should()
.ContainSingle($"{deployment.Name} must keep exactly one main runner container")
.Subject;
}
private static string? EnvSecretName(YamlMappingNode container, string name) private static string? EnvSecretName(YamlMappingNode container, string name)
{ {
return EnvMapping(container, name) is { } env return EnvMapping(container, name) is { } env