Compare commits

..

5 Commits

Author SHA1 Message Date
Andrew Stoltz
ec78175526 tests: add bluejay-ws runner-exclusion lint + fix 3 stale runner-fleet assertions
Adds Runners_MustNotPinToOperatorWorkstationHosts lint test enforcing
operator directive 2026-05-26: BLUEJAY-WS / iamworkin-ws must never be
a fleet GitHub Actions runner. Build-side analog of the Sprint 9 NEW
safe-account exclusion gate (Puppet GPO/AppLocker/WDAC/audit-forwarder
modules refuse to apply on BLUEJAY-WS). Scans every github-runner
Deployment for forbidden nodeName, nodeSelector, nodeAffinity match
expressions, and toleration key/value pinning. See CLAUDE.md "Common
Mistakes" entry and feedback_bluejay_ws_never_public_runner.md.

Also fixes 3 pre-existing GitHubRunnerFleet_* lint failures that broke
when the runner image bumped to v20260525-ruby3.3.11-stepca (added a
setup-runner-home initContainer):

  * Add MainContainerMappings() helper (containers only, excludes
    initContainers) and switch
    GitHubRunnerFleet_MustRegisterRequiredReposAsRepoScopedDeployments
    + GitHubRunnerFleet_MustSetWritableNonRootDotnetAndCachePaths
    over to it. Without this, ContainerMappings().Should().ContainSingle()
    found the initContainer + runner = 2 containers and failed.

  * Loosen GitHubRunnerFleet_MustAvoidRwoMultiAttachForScaledDeployments
    ReplicaCount assertion from Be(2) to BeGreaterOrEqualTo(2). The
    semantic invariant is "at least 2 replicas so no single-pod
    bottleneck"; deployments tuned upward per 14d CI activity (e.g.
    github-runner-print-web at replicas: 3, see commit 1f1f682 PR #24)
    are valid.

Lint baseline: 6 failed -> 3 failed (the 3 remaining are unrelated:
PublicReadWriteIngressRoutes_* lives in FlowerCore.Updater/k8s/
ingressroute.yaml — separate PR; FcDeviceManagement_* needs operator
domain decision on the missing apps/fc-devicemgmt/argocd-application.yaml).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 22:41:00 -05:00
Andrew Stoltz
2cc91b6df0 runners: bump tts-reader memory limit 4Gi -> 8Gi
The github-runner-tts-reader pod was being OOMKilled (exit 137)
mid-`dotnet test` on the TtsReader 1000+ test suite. PR #21 CI
(the Windows -> Linux runner migration) flapped twice with the
"self-hosted runner lost communication" annotation before the
K8s-side symptoms surfaced via kubectl describe pod.

Requests bumped 1Gi -> 2Gi, limits 4Gi -> 8Gi. Comment added
inline so future fleet runs don't trip the same wall.

Unblocks PR #21 + the 9 other open TtsReader PRs that all rebase
through it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 22:31:48 -05:00
0d2090fe81 runners: add github-runner-updater Deployment (#29)
Close runner-fleet gap for FlowerCore.Updater. Matches Sprint 32 long-tail pattern; registers entry in fleet-lint required-set.
2026-05-26 03:24:13 +00:00
Andrew Stoltz
bc3548e715 runners: add github-runner-pimanager Deployment
FlowerCore.PiManager build run 26417714843 sat queued 5h with zero
self-hosted runners registered to the repo. PiManager was missed in
the Sprint 32 long-tail sweep — every other FC repo got a dedicated
repo-scoped Deployment with its own ACCESS_TOKEN registration, but
PiManager fell through the cracks.

Adds a 2-replica ephemeral runner Deployment matching the Signage /
DMS / Print.Web pattern (per-pod emptyDir caches, no shared PVC,
labels `self-hosted,linux,fc-build-linux`, shared github-runner-token
PAT). Once ArgoCD syncs, the queued job will pick up automatically.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 20:33:44 -05:00
74333cc26b selenium: right-size hub + chrome + edge memory limits (#28) 2026-05-26 01:12:15 +00:00
2 changed files with 400 additions and 5 deletions

View File

@@ -1726,13 +1726,17 @@ spec:
key: credential
- name: RUN_AS_ROOT
value: "false"
# Bumped 2026-05-25: previous 4Gi limit caused OOMKill (exit 137)
# mid-`dotnet test` on TtsReader's 1000+ test suite. PR #21 CI flapped
# twice with "runner lost communication" before the K8s side
# symptoms surfaced. 8Gi gives ~30% headroom over peak observed.
resources:
requests:
cpu: "500m"
memory: "1Gi"
memory: "2Gi"
limits:
cpu: "2000m"
memory: "4Gi"
memory: "8Gi"
volumeMounts:
- name: runner-home
mountPath: /home/runner
@@ -3897,9 +3901,277 @@ spec:
- name: tmp
emptyDir: {}
restartPolicy: Always
---
# Runner for FlowerCore.PiManager. Two replicas use per-pod emptyDir caches, so
# backlog can drain without sharing a ReadWriteOnce PVC. Added 2026-05-25 to
# close the runner-fleet gap that left run 26417714843 queued for 5h.
apiVersion: apps/v1
kind: Deployment
metadata:
name: github-runner-pimanager
namespace: github-runner
labels:
app.kubernetes.io/name: github-runner-pimanager
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/created-by: argocd
flowercore.io/runner-repo: pimanager
flowercore.io/github-repo: FlowerCore.PiManager
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/name: github-runner-pimanager
strategy:
type: Recreate
template:
metadata:
labels:
app.kubernetes.io/name: github-runner-pimanager
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
flowercore.io/created-by: argocd
flowercore.io/runner-repo: pimanager
flowercore.io/github-repo: FlowerCore.PiManager
spec:
serviceAccountName: github-runner
securityContext:
runAsNonRoot: true
runAsUser: 1001
runAsGroup: 1001
fsGroup: 1001
initContainers:
- name: setup-runner-home
image: localhost/fc-github-runner:v20260525-ruby3.3.11-stepca
imagePullPolicy: Never
command:
- sh
- -c
- |
set -e
mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet /home/runner/.cache /home/runner/_tool
if [ -d /opt/runner-toolcache/Ruby ] && [ ! -d /home/runner/_tool/Ruby ]; then
cp -a /opt/runner-toolcache/Ruby /home/runner/_tool/
fi
chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget /home/runner/.cache /home/runner/_tool
chmod -R 755 /home/runner/.dotnet /home/runner/.nuget /home/runner/.cache /home/runner/_tool
securityContext:
runAsUser: 0
runAsNonRoot: false
volumeMounts:
- name: runner-home
mountPath: /home/runner
containers:
- name: runner
image: localhost/fc-github-runner:v20260525-ruby3.3.11-stepca
imagePullPolicy: Never
env:
- name: REPO_URL
value: "https://github.com/astoltz/FlowerCore.PiManager"
- name: RUNNER_NAME_PREFIX
value: "rke2-linux-pimanager"
- name: RUNNER_WORKDIR
value: "/tmp/runner/work"
- name: EPHEMERAL
value: "true"
- name: LABELS
value: "self-hosted,linux,fc-build-linux"
- name: HOME
value: "/home/runner"
- name: DOTNET_INSTALL_DIR
value: "/home/runner/.dotnet"
- name: DOTNET_CLI_TELEMETRY_OPTOUT
value: "1"
- name: DOTNET_NOLOGO
value: "1"
- name: DOTNET_GENERATE_ASPNET_CERTIFICATE
value: "false"
- name: DOTNET_CLI_HOME
value: "/home/runner"
- name: NUGET_PACKAGES
value: "/home/runner/.nuget/packages"
- name: XDG_CACHE_HOME
value: "/home/runner/.cache"
- name: RUNNER_TOOL_CACHE
value: "/home/runner/_tool"
- name: ACCESS_TOKEN
valueFrom:
secretKeyRef:
name: github-runner-token
key: credential
- name: RUN_AS_ROOT
value: "false"
resources:
requests:
cpu: "500m"
memory: "1Gi"
limits:
cpu: "2000m"
memory: "4Gi"
volumeMounts:
- name: runner-home
mountPath: /home/runner
- name: nuget-cache
mountPath: /home/runner/.nuget/packages
- name: tmp
mountPath: /tmp
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "pgrep -f Runner.Listener > /dev/null"
initialDelaySeconds: 30
periodSeconds: 30
failureThreshold: 3
volumes:
- name: runner-home
emptyDir: {}
- name: nuget-cache
emptyDir:
sizeLimit: 2Gi
- name: tmp
emptyDir: {}
restartPolicy: Always
---
# Runner for FlowerCore.Updater. Two replicas use per-pod emptyDir caches, so
# backlog can drain without sharing a ReadWriteOnce PVC. Added 2026-05-26 to
# close the runner-fleet gap that left the repo with only the offline
# windows-sandbox runner and no Linux PR-CI capacity for future workflows.
apiVersion: apps/v1
kind: Deployment
metadata:
name: github-runner-updater
namespace: github-runner
labels:
app.kubernetes.io/name: github-runner-updater
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/created-by: argocd
flowercore.io/runner-repo: updater
flowercore.io/github-repo: FlowerCore.Updater
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/name: github-runner-updater
strategy:
type: Recreate
template:
metadata:
labels:
app.kubernetes.io/name: github-runner-updater
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
flowercore.io/created-by: argocd
flowercore.io/runner-repo: updater
flowercore.io/github-repo: FlowerCore.Updater
spec:
serviceAccountName: github-runner
securityContext:
runAsNonRoot: true
runAsUser: 1001
runAsGroup: 1001
fsGroup: 1001
initContainers:
- name: setup-runner-home
image: localhost/fc-github-runner:v20260525-ruby3.3.11-stepca
imagePullPolicy: Never
command:
- sh
- -c
- |
set -e
mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet /home/runner/.cache /home/runner/_tool
if [ -d /opt/runner-toolcache/Ruby ] && [ ! -d /home/runner/_tool/Ruby ]; then
cp -a /opt/runner-toolcache/Ruby /home/runner/_tool/
fi
chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget /home/runner/.cache /home/runner/_tool
chmod -R 755 /home/runner/.dotnet /home/runner/.nuget /home/runner/.cache /home/runner/_tool
securityContext:
runAsUser: 0
runAsNonRoot: false
volumeMounts:
- name: runner-home
mountPath: /home/runner
containers:
- name: runner
image: localhost/fc-github-runner:v20260525-ruby3.3.11-stepca
imagePullPolicy: Never
env:
- name: REPO_URL
value: "https://github.com/astoltz/FlowerCore.Updater"
- name: RUNNER_NAME_PREFIX
value: "rke2-linux-updater"
- name: RUNNER_WORKDIR
value: "/tmp/runner/work"
- name: EPHEMERAL
value: "true"
- name: LABELS
value: "self-hosted,linux,fc-build-linux"
- name: HOME
value: "/home/runner"
- name: DOTNET_INSTALL_DIR
value: "/home/runner/.dotnet"
- name: DOTNET_CLI_TELEMETRY_OPTOUT
value: "1"
- name: DOTNET_NOLOGO
value: "1"
- name: DOTNET_GENERATE_ASPNET_CERTIFICATE
value: "false"
- name: DOTNET_CLI_HOME
value: "/home/runner"
- name: NUGET_PACKAGES
value: "/home/runner/.nuget/packages"
- name: XDG_CACHE_HOME
value: "/home/runner/.cache"
- name: RUNNER_TOOL_CACHE
value: "/home/runner/_tool"
- name: ACCESS_TOKEN
valueFrom:
secretKeyRef:
name: github-runner-token
key: credential
- name: RUN_AS_ROOT
value: "false"
resources:
requests:
cpu: "500m"
memory: "1Gi"
limits:
cpu: "2000m"
memory: "4Gi"
volumeMounts:
- name: runner-home
mountPath: /home/runner
- name: nuget-cache
mountPath: /home/runner/.nuget/packages
- name: tmp
mountPath: /tmp
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "pgrep -f Runner.Listener > /dev/null"
initialDelaySeconds: 30
periodSeconds: 30
failureThreshold: 3
volumes:
- name: runner-home
emptyDir: {}
- name: nuget-cache
emptyDir:
sizeLimit: 2Gi
- name: tmp
emptyDir: {}
restartPolicy: Always
# Long-tail runner pattern:
#
# Sprint 32 added the final 16 long-tail repo-scoped Deployments above. Keep
# Common as the only PVC-backed runner at replicas: 1. Any future multi-replica
# runner must use per-pod emptyDir caches, not a shared ReadWriteOnce PVC.
# 2026-05-25: PiManager added (was missed in the Sprint 32 long-tail sweep).

View File

@@ -67,6 +67,7 @@ public sealed class FleetManifestLintTests
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
["github-runner-updater"] = "https://github.com/astoltz/FlowerCore.Updater",
};
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
@@ -80,6 +81,7 @@ public sealed class FleetManifestLintTests
"github-runner-chat",
"github-runner-mysql",
"github-runner-kiosk-linux",
"github-runner-updater",
};
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
@@ -234,7 +236,7 @@ public sealed class FleetManifestLintTests
{
deployments.Should().ContainKey(expectedRunner.Key);
var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject;
var container = deployments[expectedRunner.Key].MainContainerMappings().Should().ContainSingle().Subject;
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
EnvValue(container, "EPHEMERAL").Should().Be("true");
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
@@ -250,7 +252,7 @@ public sealed class FleetManifestLintTests
{
foreach (var deployment in GitHubRunnerDeployments().Values)
{
var container = deployment.ContainerMappings().Should().ContainSingle().Subject;
var container = deployment.MainContainerMappings().Should().ContainSingle().Subject;
foreach (var expectedEnv in WritableRunnerEnv)
{
@@ -277,7 +279,10 @@ public sealed class FleetManifestLintTests
foreach (var deploymentName in ScaledLinuxRunnerDeployments)
{
var deployment = deployments[deploymentName];
ReplicaCount(deployment).Should().Be(2);
// Scaled runners must have >= 2 replicas (avoid single-pod bottleneck).
// Individual deployments may be tuned upward per CI activity — see
// "runners: right-size replica counts per 14d CI activity (#24)".
ReplicaCount(deployment).Should().BeGreaterOrEqualTo(2, $"{deploymentName} is in the scaled set and must run with at least 2 replicas");
var volumes = deployment.MappingSequence("spec", "template", "spec", "volumes");
var claimNames = volumes
@@ -303,6 +308,108 @@ public sealed class FleetManifestLintTests
.Be("github-runner-nuget-cache");
}
[Fact]
public void Runners_MustNotPinToOperatorWorkstationHosts()
{
// CRITICAL SAFETY (operator directive 2026-05-26): BLUEJAY-WS is the
// operator's primary workstation — host of the 1Password Connect
// bearer token, fcadmin SSH keys to noc1, signing CA private keys,
// and source for every FC repo. A self-hosted GitHub Actions runner
// there would execute arbitrary PR code with that local access.
// Build-side analog of the Sprint 9 NEW safe-account exclusion gate
// (Puppet GPO/AppLocker/WDAC/audit-forwarder modules refuse to apply
// on BLUEJAY-WS). This lint asserts no GitHub-runner Deployment in
// apps/github-runner/ pins to a forbidden operator-workstation host
// via nodeName, nodeSelector, nodeAffinity, or tolerations.
// Existing legacy `bluejay-ws-sandbox-1` GitHub-registered runner is
// out of scope here (it's a runtime registration, not a K8s
// Deployment) — see CLAUDE.md "Common Mistakes" entry and
// feedback_bluejay_ws_never_public_runner.md.
var forbiddenHostPatterns = new[]
{
"bluejay-ws",
"BLUEJAY-WS",
"bluejay-ws.iamworkin.lan",
"iamworkin-ws",
};
bool ContainsForbidden(string? value) =>
!string.IsNullOrWhiteSpace(value)
&& forbiddenHostPatterns.Any(pattern => value!.Contains(pattern, StringComparison.OrdinalIgnoreCase));
var violations = GitHubRunnerDeployments().Values.SelectMany(deployment =>
{
var local = new List<string>();
var podSpec = ManifestNodeExtensions.Mapping(deployment.Root, "spec", "template", "spec");
if (podSpec is null)
{
return local;
}
// nodeName: pins the pod to a specific node by name.
var nodeName = ManifestNodeExtensions.Scalar(podSpec, "nodeName");
if (ContainsForbidden(nodeName))
{
local.Add($"{deployment.Name} sets nodeName='{nodeName}' which targets a forbidden operator-workstation host.");
}
// nodeSelector: dict of label → value pinning the pod to nodes
// carrying matching labels. Examples that would trip this:
// kubernetes.io/hostname: bluejay-ws
// flowercore.io/host: bluejay-ws.iamworkin.lan
var nodeSelector = ManifestNodeExtensions.Mapping(podSpec, "nodeSelector");
if (nodeSelector is not null)
{
foreach (var entry in nodeSelector.Children)
{
var key = entry.Key is YamlScalarNode keyScalar ? keyScalar.Value : null;
var value = entry.Value is YamlScalarNode valueScalar ? valueScalar.Value : null;
if (ContainsForbidden(value))
{
local.Add($"{deployment.Name} has nodeSelector entry '{key}: {value}' which targets a forbidden operator-workstation host.");
}
}
}
// nodeAffinity: matchExpressions over node labels.
foreach (var term in ManifestNodeExtensions.MappingSequence(podSpec, "affinity", "nodeAffinity", "requiredDuringSchedulingIgnoredDuringExecution", "nodeSelectorTerms"))
{
foreach (var expr in ManifestNodeExtensions.MappingSequence(term, "matchExpressions"))
{
var key = ManifestNodeExtensions.Scalar(expr, "key");
foreach (var valueNode in ManifestNodeExtensions.ScalarSequence(expr, "values"))
{
if (ContainsForbidden(valueNode))
{
local.Add($"{deployment.Name} has nodeAffinity matchExpression '{key}' value '{valueNode}' which targets a forbidden operator-workstation host.");
}
}
}
}
// tolerations: scheduling onto a tainted operator-workstation
// node would let the runner run there. Forbid any toleration
// value that names the workstation.
foreach (var toleration in ManifestNodeExtensions.MappingSequence(podSpec, "tolerations"))
{
var key = ManifestNodeExtensions.Scalar(toleration, "key");
var value = ManifestNodeExtensions.Scalar(toleration, "value");
if (ContainsForbidden(key))
{
local.Add($"{deployment.Name} has toleration key '{key}' which targets a forbidden operator-workstation host.");
}
if (ContainsForbidden(value))
{
local.Add($"{deployment.Name} has toleration value '{value}' which targets a forbidden operator-workstation host.");
}
}
return local;
}).ToList();
violations.Should().BeEmpty("BLUEJAY-WS / iamworkin-ws must never host a fleet GitHub Actions runner; see CLAUDE.md 'Registering BLUEJAY-WS as a fleet GitHub Actions runner' and feedback_bluejay_ws_never_public_runner.md");
}
[Fact]
public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable()
{
@@ -890,6 +997,22 @@ internal sealed record ManifestDocument(
.ToList();
}
// MainContainerMappings excludes initContainers. Use this when asserting
// properties of the primary container (env, image, volumeMounts) where an
// initContainer would be a false-positive match — e.g. the GitHub runner
// image's `setup-runner-home` initContainer should not count toward the
// single-container assertions on the runner deployments.
public IReadOnlyList<YamlMappingNode> MainContainerMappings()
{
var podSpec = PodSpec();
if (podSpec is null)
{
return Array.Empty<YamlMappingNode>();
}
return ManifestNodeExtensions.MappingSequence(podSpec, "containers").ToList();
}
public IReadOnlyList<ContainerSpec> ContainerSpecs()
{
return ContainerMappings()