Compare commits
11 Commits
d6f4468a9c
...
runners/ad
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ae030a5f33 | ||
| bc8c35896f | |||
|
|
2cc91b6df0 | ||
| 0d2090fe81 | |||
|
|
bc3548e715 | ||
| 74333cc26b | |||
|
|
7310fb88c2 | ||
| 148bc87b9a | |||
|
|
2a1e842100 | ||
| bc28430d24 | |||
|
|
cc92272217 |
@@ -12,6 +12,15 @@ ENV PATH="/home/runner/_tool/Ruby/${RUBY_MINOR}/x64/bin:/opt/runner-toolcache/Ru
|
||||
|
||||
USER root
|
||||
|
||||
# Bake the IAmWorkin step-ca root CA into the system trust store. Without
|
||||
# this, .NET HttpClient calls from CI tests against *.iamworkin.lan
|
||||
# (e.g. https://selenium.iamworkin.lan/session) fail with `PartialChain`
|
||||
# because the runner image's default Ubuntu trust bundle doesn't include
|
||||
# our internal Root CA. update-ca-certificates regenerates
|
||||
# /etc/ssl/certs/ca-certificates.crt, which OpenSSL + .NET on Linux read
|
||||
# automatically — no SSL_CERT_FILE env var needed.
|
||||
COPY step-ca-root.crt /usr/local/share/ca-certificates/iamworkin-step-ca-root.crt
|
||||
|
||||
RUN apt-get update \
|
||||
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||
autoconf \
|
||||
@@ -31,6 +40,7 @@ RUN apt-get update \
|
||||
pkg-config \
|
||||
uuid-dev \
|
||||
zlib1g-dev \
|
||||
&& update-ca-certificates \
|
||||
&& curl -fsSL "https://github.com/rbenv/ruby-build/archive/refs/tags/${RUBY_BUILD_VERSION}.tar.gz" -o /tmp/ruby-build.tar.gz \
|
||||
&& mkdir -p /tmp/ruby-build \
|
||||
&& tar -xzf /tmp/ruby-build.tar.gz --strip-components=1 -C /tmp/ruby-build \
|
||||
|
||||
@@ -7,7 +7,7 @@ Deployments with `kubectl`; update this manifest and let ArgoCD reconcile.
|
||||
|
||||
All repo-scoped Linux runners use:
|
||||
|
||||
- `localhost/fc-github-runner:v20260520-ruby3.3.11`, derived from
|
||||
- `localhost/fc-github-runner:v20260525-ruby3.3.11-stepca`, derived from
|
||||
`myoung34/github-runner:latest`
|
||||
- `ACCESS_TOKEN` from the `github-runner-token` Secret
|
||||
- `RUN_AS_ROOT=false`
|
||||
@@ -40,14 +40,26 @@ still mounts an `emptyDir` over `/home/runner`, so the `setup-runner-home` init
|
||||
container copies the baked toolcache from `/opt/runner-toolcache/Ruby` into
|
||||
`/home/runner/_tool/Ruby` before the runner container starts.
|
||||
|
||||
The IAmWorkin step-ca root CA is also baked into the system trust store
|
||||
(`/usr/local/share/ca-certificates/iamworkin-step-ca-root.crt`, registered by
|
||||
`update-ca-certificates`). Without it, .NET HttpClient calls from CI tests
|
||||
against `*.iamworkin.lan` (e.g. `https://selenium.iamworkin.lan/session`)
|
||||
fail with `PartialChain`. To refresh the bundled cert when the root rotates,
|
||||
re-extract from the cluster and overwrite `step-ca-root.crt`:
|
||||
|
||||
```bash
|
||||
kubectl get secret -n cert-manager step-ca-root \
|
||||
-o jsonpath='{.data.ca\.crt}' | base64 -d > step-ca-root.crt
|
||||
```
|
||||
|
||||
```bash
|
||||
cd apps/github-runner
|
||||
podman build -t localhost/fc-github-runner:v20260520-ruby3.3.11 .
|
||||
podman run --rm localhost/fc-github-runner:v20260520-ruby3.3.11 ruby -v
|
||||
podman run --rm localhost/fc-github-runner:v20260520-ruby3.3.11 \
|
||||
podman build -t localhost/fc-github-runner:v20260525-ruby3.3.11-stepca .
|
||||
podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca ruby -v
|
||||
podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
|
||||
test -f /opt/runner-toolcache/Ruby/3.3/x64.complete
|
||||
podman save localhost/fc-github-runner:v20260520-ruby3.3.11 \
|
||||
-o fc-github-runner-v20260520-ruby3.3.11.tar
|
||||
podman save localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
|
||||
-o fc-github-runner-v20260525-ruby3.3.11-stepca.tar
|
||||
```
|
||||
|
||||
Import the saved image on every schedulable RKE2 node before ArgoCD rolls the
|
||||
@@ -55,9 +67,9 @@ Deployments:
|
||||
|
||||
```bash
|
||||
for node in rke2-server rke2-agent1 rke2-agent2; do
|
||||
scp fc-github-runner-v20260520-ruby3.3.11.tar "$node:/tmp/"
|
||||
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images rm localhost/fc-github-runner:v20260520-ruby3.3.11 || true'
|
||||
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-github-runner-v20260520-ruby3.3.11.tar'
|
||||
scp fc-github-runner-v20260525-ruby3.3.11-stepca.tar "$node:/tmp/"
|
||||
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca || true'
|
||||
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-github-runner-v20260525-ruby3.3.11-stepca.tar'
|
||||
done
|
||||
```
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
12
apps/github-runner/step-ca-root.crt
Normal file
12
apps/github-runner/step-ca-root.crt
Normal file
@@ -0,0 +1,12 @@
|
||||
-----BEGIN CERTIFICATE-----
|
||||
MIIBxDCCAWqgAwIBAgIRAPY357G6ow6zMAL5+4bS2kkwCgYIKoZIzj0EAwIwQDEa
|
||||
MBgGA1UEChMRSUFtV29ya2luIEFDTUUgQ0ExIjAgBgNVBAMTGUlBbVdvcmtpbiBB
|
||||
Q01FIENBIFJvb3QgQ0EwHhcNMjYwMzA4MTgwNzExWhcNMzYwMzA1MTgwNzExWjBA
|
||||
MRowGAYDVQQKExFJQW1Xb3JraW4gQUNNRSBDQTEiMCAGA1UEAxMZSUFtV29ya2lu
|
||||
IEFDTUUgQ0EgUm9vdCBDQTBZMBMGByqGSM49AgEGCCqGSM49AwEHA0IABJ2n04X1
|
||||
JZo5Zdq/i1Idv8+fqwZyAzBh7whbqj0SWsJL8UWRabCMqYCs7+dXO0xRSzqkwFDL
|
||||
x+vooOai8RgRNhajRTBDMA4GA1UdDwEB/wQEAwIBBjASBgNVHRMBAf8ECDAGAQH/
|
||||
AgEBMB0GA1UdDgQWBBRnuPPQR6iM/H6vOluiU3Sygayz8jAKBggqhkjOPQQDAgNI
|
||||
ADBFAiEArQK9dYPGmAZsdYnjziuFVVE5NKZUcceYvGfGC+tLXUsCIAudF2zJrCRq
|
||||
3mK50ZZET/fwTkJwiEF4824mjP8p1CKM
|
||||
-----END CERTIFICATE-----
|
||||
@@ -24,7 +24,16 @@
|
||||
# (10.0.57.16:5200), public internet 80/443 (excluding RFC1918), and
|
||||
# fc-signage:5190 for the signage AAT lane.
|
||||
# - Ingress: Traefik (4444 + 8089 ACME-solver-style), intra-pod,
|
||||
# telephony / gitea / fc-system / fc-signage namespaces on 4444.
|
||||
# telephony / gitea / fc-system / fc-signage / github-runner namespaces
|
||||
# on 4444.
|
||||
#
|
||||
# 2026-05-25: added github-runner ingress on 4444 so CI jobs running in
|
||||
# self-hosted runner pods (e.g. FlowerCore.Print.Web `help-screenshots`)
|
||||
# can reach the grid. Without this allow, the session POST to
|
||||
# `selenium-hub.selenium.svc.cluster.local:4444` was DNAT'd to the hub
|
||||
# pod IP and then dropped at the Calico ingress hook — Selenium UI showed
|
||||
# 0/4 sessions while the .NET HTTP client timed out at 60s. Same family
|
||||
# as `feedback_netpol_dnat_backend_port`, wrong-source-namespace flavor.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
@@ -203,6 +212,13 @@ spec:
|
||||
ports:
|
||||
- port: 4444
|
||||
protocol: TCP
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: github-runner
|
||||
ports:
|
||||
- port: 4444
|
||||
protocol: TCP
|
||||
podSelector: {}
|
||||
policyTypes:
|
||||
- Ingress
|
||||
|
||||
@@ -132,13 +132,18 @@ spec:
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 5
|
||||
# Hub baseline working set ~766Mi on 2026-05-25 (75% of prior 1Gi
|
||||
# limit). Bump to 1.5Gi / 1Gi to keep ~50% headroom; matches the
|
||||
# stampede-buffer pattern documented for multus
|
||||
# (feedback_k8s_cni_multus_sizing). CPU left alone — observed 54m
|
||||
# against a 500m limit, no contention.
|
||||
resources:
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
memory: 1536Mi
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 512Mi
|
||||
memory: 1Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
@@ -198,13 +203,18 @@ spec:
|
||||
port: 5555
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 5
|
||||
# Chromium-based browser node. Bumped from 1Gi -> 2Gi (req 512Mi
|
||||
# -> 1Gi) on 2026-05-25 — Edge had 51 OOMKills in 5d on the
|
||||
# original 1Gi cap (~1 OOM every 2.4h), and Chrome at maxSessions=2
|
||||
# was running 684Mi idle on the same cap. Matches the Firefox node's
|
||||
# tested-stable 2Gi limit. CPU unchanged.
|
||||
resources:
|
||||
limits:
|
||||
cpu: '1'
|
||||
memory: 1Gi
|
||||
memory: 2Gi
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
memory: 1Gi
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
@@ -378,13 +388,18 @@ spec:
|
||||
port: 5555
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 5
|
||||
# Chromium-based browser node. Bumped from 1Gi -> 2Gi (req 512Mi
|
||||
# -> 1Gi) on 2026-05-25 — Edge had 51 OOMKills in 5d on the
|
||||
# original 1Gi cap (~1 OOM every 2.4h), and Chrome at maxSessions=2
|
||||
# was running 684Mi idle on the same cap. Matches the Firefox node's
|
||||
# tested-stable 2Gi limit. CPU unchanged.
|
||||
resources:
|
||||
limits:
|
||||
cpu: '1'
|
||||
memory: 1Gi
|
||||
memory: 2Gi
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
memory: 1Gi
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
|
||||
@@ -67,6 +67,7 @@ public sealed class FleetManifestLintTests
|
||||
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
|
||||
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
|
||||
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
|
||||
["github-runner-updater"] = "https://github.com/astoltz/FlowerCore.Updater",
|
||||
};
|
||||
|
||||
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
|
||||
@@ -80,6 +81,7 @@ public sealed class FleetManifestLintTests
|
||||
"github-runner-chat",
|
||||
"github-runner-mysql",
|
||||
"github-runner-kiosk-linux",
|
||||
"github-runner-updater",
|
||||
};
|
||||
|
||||
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
|
||||
@@ -234,7 +236,7 @@ public sealed class FleetManifestLintTests
|
||||
{
|
||||
deployments.Should().ContainKey(expectedRunner.Key);
|
||||
|
||||
var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject;
|
||||
var container = deployments[expectedRunner.Key].MainContainerMappings().Should().ContainSingle().Subject;
|
||||
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
|
||||
EnvValue(container, "EPHEMERAL").Should().Be("true");
|
||||
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
|
||||
@@ -250,7 +252,7 @@ public sealed class FleetManifestLintTests
|
||||
{
|
||||
foreach (var deployment in GitHubRunnerDeployments().Values)
|
||||
{
|
||||
var container = deployment.ContainerMappings().Should().ContainSingle().Subject;
|
||||
var container = deployment.MainContainerMappings().Should().ContainSingle().Subject;
|
||||
|
||||
foreach (var expectedEnv in WritableRunnerEnv)
|
||||
{
|
||||
@@ -277,7 +279,10 @@ public sealed class FleetManifestLintTests
|
||||
foreach (var deploymentName in ScaledLinuxRunnerDeployments)
|
||||
{
|
||||
var deployment = deployments[deploymentName];
|
||||
ReplicaCount(deployment).Should().Be(2);
|
||||
// Scaled runners must have >= 2 replicas (avoid single-pod bottleneck).
|
||||
// Individual deployments may be tuned upward per CI activity — see
|
||||
// "runners: right-size replica counts per 14d CI activity (#24)".
|
||||
ReplicaCount(deployment).Should().BeGreaterOrEqualTo(2, $"{deploymentName} is in the scaled set and must run with at least 2 replicas");
|
||||
|
||||
var volumes = deployment.MappingSequence("spec", "template", "spec", "volumes");
|
||||
var claimNames = volumes
|
||||
@@ -303,6 +308,108 @@ public sealed class FleetManifestLintTests
|
||||
.Be("github-runner-nuget-cache");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Runners_MustNotPinToOperatorWorkstationHosts()
|
||||
{
|
||||
// CRITICAL SAFETY (operator directive 2026-05-26): BLUEJAY-WS is the
|
||||
// operator's primary workstation — host of the 1Password Connect
|
||||
// bearer token, fcadmin SSH keys to noc1, signing CA private keys,
|
||||
// and source for every FC repo. A self-hosted GitHub Actions runner
|
||||
// there would execute arbitrary PR code with that local access.
|
||||
// Build-side analog of the Sprint 9 NEW safe-account exclusion gate
|
||||
// (Puppet GPO/AppLocker/WDAC/audit-forwarder modules refuse to apply
|
||||
// on BLUEJAY-WS). This lint asserts no GitHub-runner Deployment in
|
||||
// apps/github-runner/ pins to a forbidden operator-workstation host
|
||||
// via nodeName, nodeSelector, nodeAffinity, or tolerations.
|
||||
// Existing legacy `bluejay-ws-sandbox-1` GitHub-registered runner is
|
||||
// out of scope here (it's a runtime registration, not a K8s
|
||||
// Deployment) — see CLAUDE.md "Common Mistakes" entry and
|
||||
// feedback_bluejay_ws_never_public_runner.md.
|
||||
var forbiddenHostPatterns = new[]
|
||||
{
|
||||
"bluejay-ws",
|
||||
"BLUEJAY-WS",
|
||||
"bluejay-ws.iamworkin.lan",
|
||||
"iamworkin-ws",
|
||||
};
|
||||
|
||||
bool ContainsForbidden(string? value) =>
|
||||
!string.IsNullOrWhiteSpace(value)
|
||||
&& forbiddenHostPatterns.Any(pattern => value!.Contains(pattern, StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
var violations = GitHubRunnerDeployments().Values.SelectMany(deployment =>
|
||||
{
|
||||
var local = new List<string>();
|
||||
var podSpec = ManifestNodeExtensions.Mapping(deployment.Root, "spec", "template", "spec");
|
||||
if (podSpec is null)
|
||||
{
|
||||
return local;
|
||||
}
|
||||
|
||||
// nodeName: pins the pod to a specific node by name.
|
||||
var nodeName = ManifestNodeExtensions.Scalar(podSpec, "nodeName");
|
||||
if (ContainsForbidden(nodeName))
|
||||
{
|
||||
local.Add($"{deployment.Name} sets nodeName='{nodeName}' which targets a forbidden operator-workstation host.");
|
||||
}
|
||||
|
||||
// nodeSelector: dict of label → value pinning the pod to nodes
|
||||
// carrying matching labels. Examples that would trip this:
|
||||
// kubernetes.io/hostname: bluejay-ws
|
||||
// flowercore.io/host: bluejay-ws.iamworkin.lan
|
||||
var nodeSelector = ManifestNodeExtensions.Mapping(podSpec, "nodeSelector");
|
||||
if (nodeSelector is not null)
|
||||
{
|
||||
foreach (var entry in nodeSelector.Children)
|
||||
{
|
||||
var key = entry.Key is YamlScalarNode keyScalar ? keyScalar.Value : null;
|
||||
var value = entry.Value is YamlScalarNode valueScalar ? valueScalar.Value : null;
|
||||
if (ContainsForbidden(value))
|
||||
{
|
||||
local.Add($"{deployment.Name} has nodeSelector entry '{key}: {value}' which targets a forbidden operator-workstation host.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// nodeAffinity: matchExpressions over node labels.
|
||||
foreach (var term in ManifestNodeExtensions.MappingSequence(podSpec, "affinity", "nodeAffinity", "requiredDuringSchedulingIgnoredDuringExecution", "nodeSelectorTerms"))
|
||||
{
|
||||
foreach (var expr in ManifestNodeExtensions.MappingSequence(term, "matchExpressions"))
|
||||
{
|
||||
var key = ManifestNodeExtensions.Scalar(expr, "key");
|
||||
foreach (var valueNode in ManifestNodeExtensions.ScalarSequence(expr, "values"))
|
||||
{
|
||||
if (ContainsForbidden(valueNode))
|
||||
{
|
||||
local.Add($"{deployment.Name} has nodeAffinity matchExpression '{key}' value '{valueNode}' which targets a forbidden operator-workstation host.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// tolerations: scheduling onto a tainted operator-workstation
|
||||
// node would let the runner run there. Forbid any toleration
|
||||
// value that names the workstation.
|
||||
foreach (var toleration in ManifestNodeExtensions.MappingSequence(podSpec, "tolerations"))
|
||||
{
|
||||
var key = ManifestNodeExtensions.Scalar(toleration, "key");
|
||||
var value = ManifestNodeExtensions.Scalar(toleration, "value");
|
||||
if (ContainsForbidden(key))
|
||||
{
|
||||
local.Add($"{deployment.Name} has toleration key '{key}' which targets a forbidden operator-workstation host.");
|
||||
}
|
||||
if (ContainsForbidden(value))
|
||||
{
|
||||
local.Add($"{deployment.Name} has toleration value '{value}' which targets a forbidden operator-workstation host.");
|
||||
}
|
||||
}
|
||||
|
||||
return local;
|
||||
}).ToList();
|
||||
|
||||
violations.Should().BeEmpty("BLUEJAY-WS / iamworkin-ws must never host a fleet GitHub Actions runner; see CLAUDE.md 'Registering BLUEJAY-WS as a fleet GitHub Actions runner' and feedback_bluejay_ws_never_public_runner.md");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable()
|
||||
{
|
||||
@@ -890,6 +997,22 @@ internal sealed record ManifestDocument(
|
||||
.ToList();
|
||||
}
|
||||
|
||||
// MainContainerMappings excludes initContainers. Use this when asserting
|
||||
// properties of the primary container (env, image, volumeMounts) where an
|
||||
// initContainer would be a false-positive match — e.g. the GitHub runner
|
||||
// image's `setup-runner-home` initContainer should not count toward the
|
||||
// single-container assertions on the runner deployments.
|
||||
public IReadOnlyList<YamlMappingNode> MainContainerMappings()
|
||||
{
|
||||
var podSpec = PodSpec();
|
||||
if (podSpec is null)
|
||||
{
|
||||
return Array.Empty<YamlMappingNode>();
|
||||
}
|
||||
|
||||
return ManifestNodeExtensions.MappingSequence(podSpec, "containers").ToList();
|
||||
}
|
||||
|
||||
public IReadOnlyList<ContainerSpec> ContainerSpecs()
|
||||
{
|
||||
return ContainerMappings()
|
||||
|
||||
Reference in New Issue
Block a user