Compare commits

...

6 Commits

Author SHA1 Message Date
Andrew Stoltz
67064c4129 feat(github-runner): harden Linux runner fleet 2026-05-17 21:50:29 -05:00
b8c7e59005 Tighten Pi signage HDMI settle coverage (#3) 2026-05-18 02:35:17 +00:00
65ac8d6f01 feat(github-runner): pod-env DOTNET_INSTALL_DIR + initContainer for non-root runner (#7) 2026-05-18 02:25:18 +00:00
35844e0dbd chore(github-runner): un-park github-runner-sharedpos (Shared.Pos non-root build fixed) (#6) 2026-05-18 02:20:00 +00:00
b1e307151e chore(github-runner): un-park github-runner-sharedpos (replicas 1) after Shared.Pos CI fix merged 2026-05-17 21:54:16 +00:00
12b07219c7 chore(github-runner): park github-runner-sharedpos (replicas 0) until Cx-1 non-root fix
Shared.Pos build fails on non-root runner (setup-dotnet /usr/share/dotnet denied); churning runner drove HighCPU on rke2-agent2. Re-enable in the Sprint 30+ Cx-1 Linux-runner-fleet lane (DOTNET_INSTALL_DIR on pod env).
2026-05-17 21:50:35 +00:00
9 changed files with 1629 additions and 44 deletions

View File

@@ -1,3 +1,2 @@
# Restart kiosk and redeclare capabilities when HDMI connect/disconnect changes DRM state.
SUBSYSTEM=="drm", KERNEL=="card?-HDMI-A-?", ACTION=="change", RUN+="/usr/bin/systemctl restart flowercore-signage-player-pi.service"
SUBSYSTEM=="drm", KERNEL=="card?-HDMI-A-?", ACTION=="change", RUN+="/usr/bin/systemctl start flowercore-signage-detect-display.service"
# Settle DRM for 2s before restarting Chromium, then redeclare capabilities.
SUBSYSTEM=="drm", KERNEL=="card?-HDMI-A-?", ACTION=="change", RUN+="/usr/bin/systemctl start flowercore-signage-player-pi-hdmi.service"

View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bats
setup() {
APP_ROOT="$(cd "$BATS_TEST_DIRNAME/.." && pwd)"
DETECT="$APP_ROOT/scripts/fc-signage-detect-display"
}
@test "display detection emits graceful disconnected profile when no hdmi connector is present" {
script="$(cat "$DETECT")"
[[ "$script" == *"displayConnected: false"* ]]
[[ "$script" == *"No HDMI display detected"* ]]
}
@test "display detection parses edid, falls back to kmsprint, and logs endpoint failures locally" {
script="$(cat "$DETECT")"
[[ "$script" == *"edid-decode"* ]]
[[ "$script" == *"HDR (Static|Dynamic) Metadata Block"* ]]
[[ "$script" == *"kmsprint"* ]]
[[ "$script" == *"/api/v1/nodes/\${NODE_ID}/capabilities"* ]]
[[ "$script" == *"/api/v1/displays/\${NODE_ID}/capability-profile"* ]]
[[ "$script" == *"capabilities.log"* ]]
}

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env bats
setup() {
APP_ROOT="$(cd "$BATS_TEST_DIRNAME/.." && pwd)"
BOOTSTRAP="$APP_ROOT/scripts/flowercore-signage-bootstrap.sh"
RENEW="$APP_ROOT/scripts/flowercore-signage-renew-cert.sh"
}
@test "bootstrap is idempotent when node is already enrolled" {
script="$(cat "$BOOTSTRAP")"
[[ "$script" == *'[[ -s "$NODE_JSON" && -s "$CERT_DIR/client.p12" ]]'* ]]
[[ "$script" == *"already enrolled"* ]]
[[ "$script" == *"exit 0"* ]]
}
@test "bootstrap generates a stable node uuid and machine id" {
script="$(cat "$BOOTSTRAP")"
[[ "$script" == *"uuidgen"* ]]
[[ "$script" == *"nodeUuid"* ]]
[[ "$script" == *"machineId"* ]]
[[ "$script" == *"cut -c1-16"* ]]
}
@test "bootstrap posts to the canonical register endpoint" {
grep -q '/api/v1/nodes/register' "$BOOTSTRAP"
grep -q '"linux-arm64-pi"' "$BOOTSTRAP"
}
@test "bootstrap retries registration once for first-call races" {
script="$(cat "$BOOTSTRAP")"
[[ "$script" == *"for attempt in 1 2"* ]]
[[ "$script" == *"register attempt \$attempt returned"* ]]
[[ "$script" == *"sleep 5"* ]]
}
@test "bootstrap supports setup-code approval with manual polling fallback" {
script="$(cat "$BOOTSTRAP")"
[[ "$script" == *"signage-setup-code"* ]]
[[ "$script" == *"approve-via-setup-code"* ]]
[[ "$script" == *"+ 1800"* ]]
[[ "$script" == *"sleep 15"* ]]
}
@test "bootstrap generates an ecdsa p256 csr for the signage pi subject" {
script="$(cat "$BOOTSTRAP")"
[[ "$script" == *"ecparam -genkey -name prime256v1"* ]]
[[ "$script" == *'/CN=${NODE_ID}/O=FlowerCore/OU=SignagePlayer-Pi'* ]]
}
@test "bootstrap writes pkcs12 bundle with restrictive permissions" {
script="$(cat "$BOOTSTRAP")"
[[ "$script" == *"openssl pkcs12 -export"* ]]
[[ "$script" == *"client.p12.pass"* ]]
[[ "$script" == *"chmod 0640"* ]]
[[ "$script" == *"chmod 0600"* ]]
}
@test "renewal only calls renew endpoint inside the thirty-day window and swaps atomically" {
script="$(cat "$RENEW")"
[[ "$script" == *'-checkend $((30*24*3600))'* ]]
[[ "$script" == *"/api/v1/nodes/\${NODE_ID}/renew"* ]]
[[ "$script" == *"client.key.new"* ]]
[[ "$script" == *'mv "$CERT_DIR/client.p12.new" "$CERT_DIR/client.p12"'* ]]
}

View File

@@ -0,0 +1,68 @@
#!/usr/bin/env bats
setup() {
APP_ROOT="$(cd "$BATS_TEST_DIRNAME/.." && pwd)"
}
@test "player unit exists" {
[ -f "$APP_ROOT/systemd/flowercore-signage-player-pi.service" ]
}
@test "player unit uses simple chromium service with restart backoff" {
unit="$(cat "$APP_ROOT/systemd/flowercore-signage-player-pi.service")"
[[ "$unit" == *"Type=simple"* ]]
[[ "$unit" == *"Restart=always"* ]]
[[ "$unit" == *"RestartSec=10s"* ]]
[[ "$unit" == *"StartLimitBurst=5"* ]]
[[ "$unit" == *"StartLimitIntervalSec=300s"* ]]
}
@test "player unit caps chromium memory at two gigabytes" {
grep -q '^MemoryMax=2G$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
grep -q '^MemoryHigh=1500M$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
}
@test "player unit condition-gates startup on identity and p12 certificate" {
grep -q '^ConditionPathExists=/etc/flowercore/signage-node.json$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
grep -q '^ConditionPathExists=/etc/fc-signage-player/client.p12$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
}
@test "player unit runs prelaunch checks before chromium" {
grep -q '^ExecStartPre=/usr/local/bin/flowercore-signage-prelaunch.sh$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
grep -q '^ExecStart=/usr/local/bin/flowercore-signage-launch.sh$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
}
@test "hdmi udev rule routes through the two-second settle service" {
rule="$(cat "$APP_ROOT/systemd/99-flowercore-signage-hdmi.rules")"
[[ "$rule" == *'KERNEL=="card?-HDMI-A-?"'* ]]
[[ "$rule" == *"systemctl start flowercore-signage-player-pi-hdmi.service"* ]]
[[ "$rule" != *"systemctl restart flowercore-signage-player-pi.service"* ]]
}
@test "hdmi responder settles, declares display, then restarts chromium" {
responder="$(cat "$APP_ROOT/scripts/flowercore-signage-hdmi-respond.sh")"
[[ "$responder" == *"sleep 2"* ]]
[[ "$responder" == *"systemctl start flowercore-signage-detect-display.service"* ]]
[[ "$responder" == *"systemctl restart flowercore-signage-player-pi.service"* ]]
}
@test "chromium policy json is valid and disables credential prompts" {
command -v jq >/dev/null || skip "jq not installed"
jq -e '.AutofillAddressEnabled == false and .AutofillCreditCardEnabled == false and .PasswordManagerEnabled == false' \
"$APP_ROOT/chromium-policies/flowercore-signage.json" >/dev/null
}
@test "launch script tries embed URL and logs bare-player fallback" {
launch="$(cat "$APP_ROOT/scripts/flowercore-signage-launch.sh")"
[[ "$launch" == *'/player/${NODE_ID}/embed?token=${CERT_THUMB}'* ]]
[[ "$launch" == *"url-divergence.log"* ]]
[[ "$launch" == *'/player/${NODE_ID}?token=${CERT_THUMB}'* ]]
}
@test "prelaunch script validates required node and cert files" {
prelaunch="$(cat "$APP_ROOT/scripts/flowercore-signage-prelaunch.sh")"
[[ "$prelaunch" == *"/etc/flowercore/signage-node.json"* ]]
[[ "$prelaunch" == *"/etc/fc-signage-player/client.p12"* ]]
[[ "$prelaunch" == *"/etc/fc-signage-player/client.p12.pass"* ]]
[[ "$prelaunch" == *"exit 1"* ]]
}

View File

@@ -0,0 +1,61 @@
# GitHub Runner Fleet
ArgoCD owns `apps/github-runner/github-runner.yaml`. Do not patch live runner
Deployments with `kubectl`; update this manifest and let ArgoCD reconcile.
## Runner Shape
All repo-scoped Linux runners use:
- `ACCESS_TOKEN` from the `github-runner-token` Secret
- `RUN_AS_ROOT=false`
- `EPHEMERAL=true`
- `LABELS=self-hosted,linux,fc-build-linux`
- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
Actions tool cache
`github-runner` for `FlowerCore.Common` is single-replica because it retains the
original Longhorn ReadWriteOnce NuGet PVC. `github-runner-sharedpos` and the top
Linux-cost repo runners use two replicas with per-pod `emptyDir` caches. That is
the safe backlog-drain strategy: no two pods share one RWO PVC.
## Post-Merge Proof
After the PR is merged and ArgoCD syncs, verify the runner fleet:
```bash
kubectl -n github-runner get deploy,pods,pvc
```
Verify GitHub registration for the repo-scoped runners:
```bash
for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore.Signage \
FlowerCore.DMS FlowerCore.Telephony FlowerCore.Print.Web FlowerCore.Chat \
FlowerCore.MySQL FlowerCore.Kiosk.Linux; do
echo "=== $repo ==="
gh api "/repos/astoltz/$repo/actions/runners" \
--jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}'
done
```
Shared.Pos publish proof after the runner pod is online:
```bash
gh run list --repo astoltz/FlowerCore.Shared.Pos \
--workflow "Build, Test & Publish" --branch main --limit 5
```
If the latest run is still queued after runner registration, rerun the workflow
from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
## Failure Notes
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
`DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
present on the runner pod.
- `404` during runner registration: the fine-grained PAT is valid but missing
repository access for that repo. Add the repo to the PAT access list; the PAT
value does not change.
- `Multi-Attach` volume error: only the Common runner uses a RWO PVC and it must
stay single-replica. New multi-replica runners use `emptyDir`.

File diff suppressed because it is too large Load Diff

View File

@@ -723,6 +723,24 @@ data:
summary: "Mac mini GitHub runner offline ({{ $labels.runner }})"
description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-<slug>.plist; runners survive reboot and do not require a GUI session."
- name: linux-runners
rules:
- alert: LinuxRunnerOffline
expr: |
kube_deployment_status_replicas_ready{
namespace="github-runner",
deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"
} == 0
for: 5m
labels:
severity: warning
alert_channel: irc
service: github-runner
team: ci
annotations:
summary: "Linux CI runner offline: {{ $labels.deployment }}"
description: "Deployment {{ $labels.deployment }} in namespace github-runner has 0 ready replicas for more than 5 minutes. CI jobs targeting this repo will queue until the runner pod restarts and re-registers with GitHub. Check pods with: kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }}. Check logs with: kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50. Common causes: PAT missing repo access, runner CrashLoopBackOff, or node/resource pressure."
- name: remote-desktop
rules:
- alert: RemoteDesktopWebDown
@@ -3421,6 +3439,39 @@ data:
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
- orgId: 1
name: CI Runners
folder: CI Alerts
interval: 1m
rules:
- uid: linux-runner-offline
title: LinuxRunnerOffline
condition: C
for: 5m
noDataState: OK
execErrState: Error
annotations:
summary: "Linux CI runner offline: {{ $labels.deployment }}"
description: "A github-runner namespace Deployment has 0 ready replicas for more than 5 minutes. CI jobs targeting that repo will queue until the runner pod restarts and re-registers."
runbook: "1. kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }} 2. kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50 3. Verify PAT repo access if registration returns 404 4. Verify no RWO PVC is shared by scaled runners"
labels:
severity: warning
service: github-runner
alert_channel: irc
team: ci
data:
- refId: A
relativeTimeRange: {from: 300, to: 0}
datasourceUid: prometheus
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == 0', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__
model: {type: reduce, expression: A, reducer: last, refId: B}
- refId: C
relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
- orgId: 1
name: Infrastructure
folder: AI Stack Alerts

View File

@@ -54,6 +54,43 @@ public sealed class FleetManifestLintTests
"ttsreader-piper",
};
private static readonly IReadOnlyDictionary<string, string> LinuxRunnerRepos = new Dictionary<string, string>(StringComparer.Ordinal)
{
["github-runner"] = "https://github.com/astoltz/FlowerCore.Common",
["github-runner-sharedpos"] = "https://github.com/astoltz/FlowerCore.Shared.Pos",
["github-runner-puppet"] = "https://github.com/astoltz/FlowerCore.Puppet",
["github-runner-signage"] = "https://github.com/astoltz/FlowerCore.Signage",
["github-runner-dms"] = "https://github.com/astoltz/FlowerCore.DMS",
["github-runner-telephony"] = "https://github.com/astoltz/FlowerCore.Telephony",
["github-runner-print-web"] = "https://github.com/astoltz/FlowerCore.Print.Web",
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
};
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
{
"github-runner-sharedpos",
"github-runner-puppet",
"github-runner-signage",
"github-runner-dms",
"github-runner-telephony",
"github-runner-print-web",
"github-runner-chat",
"github-runner-mysql",
"github-runner-kiosk-linux",
};
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
{
["HOME"] = "/home/runner",
["DOTNET_INSTALL_DIR"] = "/home/runner/.dotnet",
["DOTNET_CLI_HOME"] = "/home/runner",
["NUGET_PACKAGES"] = "/home/runner/.nuget/packages",
["XDG_CACHE_HOME"] = "/home/runner/.cache",
["RUNNER_TOOL_CACHE"] = "/home/runner/_tool",
};
[Fact]
public void IngressRoutes_MustKeepServiceReferencesInTheSameNamespace()
{
@@ -187,6 +224,98 @@ public sealed class FleetManifestLintTests
violations.Should().BeEmpty();
}
[Fact]
public void GitHubRunnerFleet_MustRegisterRequiredReposAsRepoScopedDeployments()
{
var deployments = GitHubRunnerDeployments();
foreach (var expectedRunner in LinuxRunnerRepos)
{
deployments.Should().ContainKey(expectedRunner.Key);
var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject;
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
EnvValue(container, "EPHEMERAL").Should().Be("true");
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
EnvValue(container, "RUN_AS_ROOT").Should().Be("false");
EnvValue(container, "ACCESS_TOKEN").Should().BeNull("ACCESS_TOKEN must come from github-runner-token Secret, not a literal");
EnvSecretName(container, "ACCESS_TOKEN").Should().Be("github-runner-token");
EnvSecretKey(container, "ACCESS_TOKEN").Should().Be("credential");
}
}
[Fact]
public void GitHubRunnerFleet_MustSetWritableNonRootDotnetAndCachePaths()
{
foreach (var deployment in GitHubRunnerDeployments().Values)
{
var container = deployment.ContainerMappings().Should().ContainSingle().Subject;
foreach (var expectedEnv in WritableRunnerEnv)
{
EnvValue(container, expectedEnv.Key).Should().Be(expectedEnv.Value, $"{deployment.Name} must keep .NET paths writable for uid 1001");
}
var mounts = ManifestNodeExtensions.MappingSequence(container, "volumeMounts")
.ToDictionary(
mount => ManifestNodeExtensions.Scalar(mount, "name") ?? string.Empty,
mount => ManifestNodeExtensions.Scalar(mount, "mountPath") ?? string.Empty,
StringComparer.Ordinal);
mounts.Should().Contain("runner-home", "/home/runner");
mounts.Should().Contain("nuget-cache", "/home/runner/.nuget/packages");
mounts.Should().Contain("tmp", "/tmp");
}
}
[Fact]
public void GitHubRunnerFleet_MustAvoidRwoMultiAttachForScaledDeployments()
{
var deployments = GitHubRunnerDeployments();
foreach (var deploymentName in ScaledLinuxRunnerDeployments)
{
var deployment = deployments[deploymentName];
ReplicaCount(deployment).Should().Be(2);
var volumes = deployment.MappingSequence("spec", "template", "spec", "volumes");
var claimNames = volumes
.Select(volume => ManifestNodeExtensions.Scalar(volume, "persistentVolumeClaim", "claimName"))
.Where(value => !string.IsNullOrWhiteSpace(value))
.ToList();
claimNames.Should().BeEmpty($"{deploymentName} is scaled and must not share a RWO PVC");
volumes.Should().Contain(volume =>
string.Equals(ManifestNodeExtensions.Scalar(volume, "name"), "nuget-cache", StringComparison.Ordinal)
&& ManifestNodeExtensions.Mapping(volume, "emptyDir") != null);
}
var common = deployments["github-runner"];
ReplicaCount(common).Should().Be(1);
common.MappingSequence("spec", "template", "spec", "volumes")
.Select(volume => ManifestNodeExtensions.Scalar(volume, "persistentVolumeClaim", "claimName"))
.Where(value => !string.IsNullOrWhiteSpace(value))
.Should()
.ContainSingle()
.Which
.Should()
.Be("github-runner-nuget-cache");
}
[Fact]
public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable()
{
var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml"));
monitoring.Should().Contain("MacMiniRunnerOffline");
monitoring.Should().Contain("LinuxRunnerOffline");
monitoring.Should().Contain("kube_deployment_status_replicas_ready");
monitoring.Should().Contain("github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))");
monitoring.Should().Contain("folder: CI Alerts");
monitoring.Should().Contain("uid: linux-runner-offline");
monitoring.Should().Contain("alert_channel: irc");
}
[Fact]
public void StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults()
{
@@ -314,6 +443,44 @@ public sealed class FleetManifestLintTests
$"{document.Descriptor} container '{containerName}' still uses {probeKey}.httpGet on /health.",
};
}
private static IReadOnlyDictionary<string, ManifestDocument> GitHubRunnerDeployments()
{
return Inventory.Documents
.Where(document => document.Kind == "Deployment")
.Where(document => document.Namespace == "github-runner")
.ToDictionary(document => document.Name, StringComparer.Ordinal);
}
private static int ReplicaCount(ManifestDocument document)
{
return int.TryParse(document.Scalar("spec", "replicas"), out var replicas) ? replicas : 1;
}
private static string? EnvValue(YamlMappingNode container, string name)
{
return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null;
}
private static string? EnvSecretName(YamlMappingNode container, string name)
{
return EnvMapping(container, name) is { } env
? ManifestNodeExtensions.Scalar(env, "valueFrom", "secretKeyRef", "name")
: null;
}
private static string? EnvSecretKey(YamlMappingNode container, string name)
{
return EnvMapping(container, name) is { } env
? ManifestNodeExtensions.Scalar(env, "valueFrom", "secretKeyRef", "key")
: null;
}
private static YamlMappingNode? EnvMapping(YamlMappingNode container, string name)
{
return ManifestNodeExtensions.MappingSequence(container, "env")
.SingleOrDefault(env => string.Equals(ManifestNodeExtensions.Scalar(env, "name"), name, StringComparison.Ordinal));
}
}
internal sealed class ManifestInventory

View File

@@ -174,10 +174,13 @@ public sealed class PiSignagePlayerArtifactTests
public void HdmiRule_RestartsPlayerAndRunsCapabilityDetection()
{
var rule = Read("systemd/99-flowercore-signage-hdmi.rules");
var responder = Read("scripts/flowercore-signage-hdmi-respond.sh");
rule.Should().Contain("KERNEL==\"card?-HDMI-A-?\"");
rule.Should().Contain("restart flowercore-signage-player-pi.service");
rule.Should().Contain("start flowercore-signage-detect-display.service");
rule.Should().Contain("start flowercore-signage-player-pi-hdmi.service");
responder.Should().Contain("sleep 2");
responder.Should().Contain("start flowercore-signage-detect-display.service");
responder.Should().Contain("restart flowercore-signage-player-pi.service");
}
[Fact]