Sprint 37 Cx-2: add DM and WorldBuilder Linux runners #14

Open
bluejay wants to merge 1 commits from sprint37/cx-2-linux-runner-expansion into main
4 changed files with 304 additions and 9 deletions
Showing only changes of commit 9a4a8264d9 - Show all commits

View File

@@ -28,6 +28,10 @@ Sprint 32 final long-tail wave adds 16 two-replica Deployments:
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
`FlowerCore.MenuBoard`.
Sprint 37 Cx-2 closes the audited Linux runner gaps for
`FlowerCore.DeviceManagement` and `FlowerCore.WorldBuilder` with the same
two-replica `emptyDir` pattern.
## Post-Merge Proof
After the PR is merged and ArgoCD syncs, verify the runner fleet:
@@ -47,7 +51,7 @@ for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore
FlowerCore.Distribution FlowerCore.Scoreboard FlowerCore.SegmentDisplay \
FlowerCore.Signage.Contracts FlowerCore.SignalControl FlowerCore.Intranet.Web \
FlowerCore.Provisioning FlowerCore.Redis FlowerCore.MessageBoard \
FlowerCore.MenuBoard; do
FlowerCore.MenuBoard FlowerCore.DeviceManagement FlowerCore.WorldBuilder; do
echo "=== $repo ==="
gh api "/repos/astoltz/$repo/actions/runners" \
--jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}'
@@ -64,6 +68,20 @@ gh run list --repo astoltz/FlowerCore.Shared.Pos \
If the latest run is still queued after runner registration, rerun the workflow
from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
## Sprint 37 Cx-2 Gap Audit
The 2026-05-18 GitHub workflow scan found these remaining repos with
`runs-on: [self-hosted, linux, fc-build-linux]` but no K8s runner Deployment:
`FlowerCore.AiStation.Linux`, `FlowerCore.PHP`, `FlowerCore.PiManager`,
`FlowerCore.Shared.Barcodes`, `FlowerCore.Shared.Lookup`,
`FlowerCore.Shared.Nodes`, `FlowerCore.Shared.PrintClient`,
`FlowerCore.Shared.Relay`, `FlowerCore.Shared.ShowRunner`, and
`FlowerCore.Shared.Storage`.
Mixed/platform repos also have Linux workflow legs but need owner review before
adding Linux runner Deployments: `FlowerCore.Library.Mac`,
`FlowerCore.Signage.Agent.AppleTv`, and `FlowerCore.Signage.Player.Wpf`.
## Failure Notes
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that

View File

@@ -16,6 +16,8 @@
# DNS, Distribution, Scoreboard, SegmentDisplay, Signage.Contracts,
# SignalControl, Intranet.Web, Provisioning, Redis, MessageBoard, MenuBoard
# (Sprint 32 final long-tail wave; two replicas each, emptyDir cache)
# FlowerCore.DeviceManagement, WorldBuilder (Sprint 37 Cx-2 runner gap
# closure; two replicas each, emptyDir cache)
#
# Non-root CI safety:
# Runner pods run as uid 1001. HOME, DOTNET_INSTALL_DIR, DOTNET_CLI_HOME,
@@ -3767,9 +3769,271 @@ spec:
- name: tmp
emptyDir: {}
restartPolicy: Always
---
# Runner for FlowerCore.DeviceManagement. Added 2026-05-18 (Sprint 37 Cx-2)
# to close the Linux CI capacity gap for the DeviceManagement (DM) service-tier
# workflows. Mirrors the Sprint 32 long-tail emptyDir pattern: two replicas,
# shared 1Password-backed ACCESS_TOKEN, and the common ServiceAccount.
# Deployment of two self-hosted GitHub Actions runner pods registered against
# the astoltz/FlowerCore.DeviceManagement repository (see REPO_URL below).
apiVersion: apps/v1
kind: Deployment
metadata:
name: github-runner-devicemgmt
namespace: github-runner
labels:
app.kubernetes.io/name: github-runner-devicemgmt
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/created-by: argocd
flowercore.io/runner-repo: devicemgmt
flowercore.io/github-repo: FlowerCore.DeviceManagement
spec:
replicas: 2
# The selector uses only the name label; it must keep matching the pod
# template's app.kubernetes.io/name label below.
selector:
matchLabels:
app.kubernetes.io/name: github-runner-devicemgmt
# Recreate: existing pods are terminated before replacement pods are created.
strategy:
type: Recreate
template:
metadata:
labels:
app.kubernetes.io/name: github-runner-devicemgmt
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
flowercore.io/created-by: argocd
flowercore.io/runner-repo: devicemgmt
flowercore.io/github-repo: FlowerCore.DeviceManagement
spec:
serviceAccountName: github-runner
# Pod-level default: containers run as non-root uid/gid 1001 unless a
# container-level securityContext overrides it (the init container does).
securityContext:
runAsNonRoot: true
runAsUser: 1001
runAsGroup: 1001
fsGroup: 1001
# The init container overrides the pod default and runs as root (uid 0) so it
# can create the .dotnet and .nuget directories under /home/runner and hand
# them to uid/gid 1001 before the runner container starts.
# It mounts only the runner-home volume. The nuget-cache volume that the
# runner container mounts over /home/runner/.nuget/packages is not touched
# here; presumably fsGroup makes that emptyDir writable for uid 1001 -- confirm.
initContainers:
- name: setup-runner-home
image: busybox:1.36
command:
- sh
- -c
- |
set -e
mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet
chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget
chmod -R 755 /home/runner/.dotnet /home/runner/.nuget
securityContext:
runAsUser: 0
runAsNonRoot: false
volumeMounts:
- name: runner-home
mountPath: /home/runner
containers:
# Main container; the name "runner" is what the fleet lint test's
# RunnerContainer helper filters on.
- name: runner
# NOTE(review): floating ":latest" tag combined with imagePullPolicy: Always
# means a pod restart can pick up a different runner image -- confirm this
# is intended rather than a pinned tag or digest.
image: myoung34/github-runner:latest
imagePullPolicy: Always
env:
# Registration settings read by the runner image: target repository,
# runner name prefix, work directory (on the tmp volume), and the labels
# that workflows select with runs-on.
- name: REPO_URL
value: "https://github.com/astoltz/FlowerCore.DeviceManagement"
- name: RUNNER_NAME_PREFIX
value: "rke2-linux-devicemgmt"
- name: RUNNER_WORKDIR
value: "/tmp/runner/work"
- name: EPHEMERAL
value: "true"
- name: LABELS
value: "self-hosted,linux,fc-build-linux"
# Every home/tool/cache path below points under /home/runner, which is the
# runner-home volume mount rather than the image's own filesystem.
- name: HOME
value: "/home/runner"
- name: DOTNET_INSTALL_DIR
value: "/home/runner/.dotnet"
- name: DOTNET_CLI_TELEMETRY_OPTOUT
value: "1"
- name: DOTNET_NOLOGO
value: "1"
- name: DOTNET_GENERATE_ASPNET_CERTIFICATE
value: "false"
- name: DOTNET_CLI_HOME
value: "/home/runner"
- name: NUGET_PACKAGES
value: "/home/runner/.nuget/packages"
- name: XDG_CACHE_HOME
value: "/home/runner/.cache"
- name: RUNNER_TOOL_CACHE
value: "/home/runner/_tool"
# Token is injected from the "credential" key of the github-runner-token
# Secret; it is never written in this manifest.
- name: ACCESS_TOKEN
valueFrom:
secretKeyRef:
name: github-runner-token
key: credential
- name: RUN_AS_ROOT
value: "false"
resources:
requests:
cpu: "500m"
memory: "1Gi"
limits:
cpu: "2000m"
memory: "4Gi"
volumeMounts:
- name: runner-home
mountPath: /home/runner
- name: nuget-cache
mountPath: /home/runner/.nuget/packages
- name: tmp
mountPath: /tmp
# Liveness: first check after 30s, then every 30s; three consecutive failures
# restart the container.
# NOTE(review): "pgrep -f" matches against full command lines, and the
# wrapping "sh -c" process carries the pattern in its own arguments -- confirm
# the probe cannot succeed without a real Runner.Listener process.
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "pgrep -f Runner.Listener > /dev/null"
initialDelaySeconds: 30
periodSeconds: 30
failureThreshold: 3
# All three volumes are per-pod emptyDir volumes (no PVC); only nuget-cache
# declares a size limit.
volumes:
- name: runner-home
emptyDir: {}
- name: nuget-cache
emptyDir:
sizeLimit: 2Gi
- name: tmp
emptyDir: {}
restartPolicy: Always
---
# Runner for FlowerCore.WorldBuilder. Added 2026-05-18 (Sprint 37 Cx-2)
# to unblock WorldBuilder Linux CI jobs after the runner fleet audit found no
# repo-scoped deployment for the GitHub repo. Mirrors the Sprint 32 long-tail
# emptyDir pattern: two replicas, shared 1Password-backed ACCESS_TOKEN, and
# the common ServiceAccount.
# Deployment of two self-hosted GitHub Actions runner pods registered against
# the astoltz/FlowerCore.WorldBuilder repository (see REPO_URL below).
apiVersion: apps/v1
kind: Deployment
metadata:
name: github-runner-worldbuilder
namespace: github-runner
labels:
app.kubernetes.io/name: github-runner-worldbuilder
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/created-by: argocd
flowercore.io/runner-repo: worldbuilder
flowercore.io/github-repo: FlowerCore.WorldBuilder
spec:
replicas: 2
# The selector uses only the name label; it must keep matching the pod
# template's app.kubernetes.io/name label below.
selector:
matchLabels:
app.kubernetes.io/name: github-runner-worldbuilder
# Recreate: existing pods are terminated before replacement pods are created.
strategy:
type: Recreate
template:
metadata:
labels:
app.kubernetes.io/name: github-runner-worldbuilder
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
flowercore.io/created-by: argocd
flowercore.io/runner-repo: worldbuilder
flowercore.io/github-repo: FlowerCore.WorldBuilder
spec:
serviceAccountName: github-runner
# Pod-level default: containers run as non-root uid/gid 1001 unless a
# container-level securityContext overrides it (the init container does).
securityContext:
runAsNonRoot: true
runAsUser: 1001
runAsGroup: 1001
fsGroup: 1001
# The init container overrides the pod default and runs as root (uid 0) so it
# can create the .dotnet and .nuget directories under /home/runner and hand
# them to uid/gid 1001 before the runner container starts.
# It mounts only the runner-home volume. The nuget-cache volume that the
# runner container mounts over /home/runner/.nuget/packages is not touched
# here; presumably fsGroup makes that emptyDir writable for uid 1001 -- confirm.
initContainers:
- name: setup-runner-home
image: busybox:1.36
command:
- sh
- -c
- |
set -e
mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet
chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget
chmod -R 755 /home/runner/.dotnet /home/runner/.nuget
securityContext:
runAsUser: 0
runAsNonRoot: false
volumeMounts:
- name: runner-home
mountPath: /home/runner
containers:
# Main container; the name "runner" is what the fleet lint test's
# RunnerContainer helper filters on.
- name: runner
# NOTE(review): floating ":latest" tag combined with imagePullPolicy: Always
# means a pod restart can pick up a different runner image -- confirm this
# is intended rather than a pinned tag or digest.
image: myoung34/github-runner:latest
imagePullPolicy: Always
env:
# Registration settings read by the runner image: target repository,
# runner name prefix, work directory (on the tmp volume), and the labels
# that workflows select with runs-on.
- name: REPO_URL
value: "https://github.com/astoltz/FlowerCore.WorldBuilder"
- name: RUNNER_NAME_PREFIX
value: "rke2-linux-worldbuilder"
- name: RUNNER_WORKDIR
value: "/tmp/runner/work"
- name: EPHEMERAL
value: "true"
- name: LABELS
value: "self-hosted,linux,fc-build-linux"
# Every home/tool/cache path below points under /home/runner, which is the
# runner-home volume mount rather than the image's own filesystem.
- name: HOME
value: "/home/runner"
- name: DOTNET_INSTALL_DIR
value: "/home/runner/.dotnet"
- name: DOTNET_CLI_TELEMETRY_OPTOUT
value: "1"
- name: DOTNET_NOLOGO
value: "1"
- name: DOTNET_GENERATE_ASPNET_CERTIFICATE
value: "false"
- name: DOTNET_CLI_HOME
value: "/home/runner"
- name: NUGET_PACKAGES
value: "/home/runner/.nuget/packages"
- name: XDG_CACHE_HOME
value: "/home/runner/.cache"
- name: RUNNER_TOOL_CACHE
value: "/home/runner/_tool"
# Token is injected from the "credential" key of the github-runner-token
# Secret; it is never written in this manifest.
- name: ACCESS_TOKEN
valueFrom:
secretKeyRef:
name: github-runner-token
key: credential
- name: RUN_AS_ROOT
value: "false"
resources:
requests:
cpu: "500m"
memory: "1Gi"
limits:
cpu: "2000m"
memory: "4Gi"
volumeMounts:
- name: runner-home
mountPath: /home/runner
- name: nuget-cache
mountPath: /home/runner/.nuget/packages
- name: tmp
mountPath: /tmp
# Liveness: first check after 30s, then every 30s; three consecutive failures
# restart the container.
# NOTE(review): "pgrep -f" matches against full command lines, and the
# wrapping "sh -c" process carries the pattern in its own arguments -- confirm
# the probe cannot succeed without a real Runner.Listener process.
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "pgrep -f Runner.Listener > /dev/null"
initialDelaySeconds: 30
periodSeconds: 30
failureThreshold: 3
# All three volumes are per-pod emptyDir volumes (no PVC); only nuget-cache
# declares a size limit.
volumes:
- name: runner-home
emptyDir: {}
- name: nuget-cache
emptyDir:
sizeLimit: 2Gi
- name: tmp
emptyDir: {}
restartPolicy: Always
# Long-tail runner pattern:
#
# Sprint 32 added the final 16 long-tail repo-scoped Deployments above. Keep
# Common as the only PVC-backed runner at replicas: 1. Any future multi-replica
# runner must use per-pod emptyDir caches, not a shared ReadWriteOnce PVC.
# Sprint 32 added the final 16 long-tail repo-scoped Deployments, and Sprint 37
# added the DM + WorldBuilder runner gap closures above. Keep Common as the
# only PVC-backed runner at replicas: 1. Any future multi-replica runner must
# use per-pod emptyDir caches, not a shared ReadWriteOnce PVC.

View File

@@ -729,7 +729,7 @@ data:
expr: |
kube_deployment_status_replicas_ready{
namespace="github-runner",
deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"
deployment=~"github-runner(|-.+)"
} == 0
for: 5m
labels:
@@ -3509,7 +3509,7 @@ data:
- refId: A
relativeTimeRange: {from: 300, to: 0}
datasourceUid: prometheus
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == 0', instant: true, refId: A}
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-.+)"} == 0', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__

View File

@@ -67,6 +67,8 @@ public sealed class FleetManifestLintTests
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
["github-runner-devicemgmt"] = "https://github.com/astoltz/FlowerCore.DeviceManagement",
["github-runner-worldbuilder"] = "https://github.com/astoltz/FlowerCore.WorldBuilder",
};
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
@@ -80,6 +82,8 @@ public sealed class FleetManifestLintTests
"github-runner-chat",
"github-runner-mysql",
"github-runner-kiosk-linux",
"github-runner-devicemgmt",
"github-runner-worldbuilder",
};
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
@@ -234,7 +238,7 @@ public sealed class FleetManifestLintTests
{
deployments.Should().ContainKey(expectedRunner.Key);
var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject;
var container = RunnerContainer(deployments[expectedRunner.Key]);
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
EnvValue(container, "EPHEMERAL").Should().Be("true");
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
@@ -250,7 +254,7 @@ public sealed class FleetManifestLintTests
{
foreach (var deployment in GitHubRunnerDeployments().Values)
{
var container = deployment.ContainerMappings().Should().ContainSingle().Subject;
var container = RunnerContainer(deployment);
foreach (var expectedEnv in WritableRunnerEnv)
{
@@ -311,7 +315,7 @@ public sealed class FleetManifestLintTests
monitoring.Should().Contain("MacMiniRunnerOffline");
monitoring.Should().Contain("LinuxRunnerOffline");
monitoring.Should().Contain("kube_deployment_status_replicas_ready");
monitoring.Should().Contain("github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))");
monitoring.Should().Contain("github-runner(|-.+)");
monitoring.Should().Contain("folder: CI Alerts");
monitoring.Should().Contain("uid: linux-runner-offline");
monitoring.Should().Contain("alert_channel: irc");
@@ -641,6 +645,15 @@ public sealed class FleetManifestLintTests
return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null;
}
/// <summary>
/// Selects the container mapping named <c>runner</c> from a runner Deployment
/// manifest and asserts that exactly one such mapping exists.
/// </summary>
/// <param name="deployment">The parsed Deployment manifest to inspect.</param>
/// <returns>The single container mapping whose <c>name</c> scalar is <c>runner</c>.</returns>
private static YamlMappingNode RunnerContainer(ManifestDocument deployment)
{
    const string RunnerContainerName = "runner";

    // Filter by name so any other container mappings the manifest exposes
    // do not trip the single-container assertion below.
    var namedRunner = deployment.ContainerMappings().Where(
        mapping => string.Equals(
            ManifestNodeExtensions.Scalar(mapping, "name"),
            RunnerContainerName,
            StringComparison.Ordinal));

    var because = $"{deployment.Name} must keep exactly one main runner container";

    return namedRunner.Should().ContainSingle(because).Subject;
}
private static string? EnvSecretName(YamlMappingNode container, string name)
{
return EnvMapping(container, name) is { } env