Compare commits

..

5 Commits

Author SHA1 Message Date
Codex
266b9cb8be feat(github-runner): add top Linux repo runners 2026-05-17 13:55:55 -05:00
Codex
6f6ca50987 fix(github-runner): switch RUNNER_TOKEN -> ACCESS_TOKEN; set RUN_AS_ROOT=false
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 22:08:56 +00:00
Codex
c7be58c1f7 chore(github-runner): bump replicas 0 -> 1 (PAT provisioned)
Operator provisioned GitHub PAT (Runner Registration) 1P item. OnePasswordItem CRD already materialized the secret. Unblocks CI fleet-wide.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 22:04:03 +00:00
Codex
a1f5a393cd chore(github-runner): rename 1P item to GitHub PAT (Runner Registration)
Renames the OnePasswordItem.itemPath from "GitHub Runner Registration
Token" to "GitHub PAT (Runner Registration)" so the runner 1P entry
sits next to its siblings — GitHub PAT (Gitea Mirrors) and GitHub PAT
(NuGet Packages) — under a consistent "GitHub PAT (...)" naming pattern
and API_CREDENTIAL category.

Existing field "credential" remains the consumer (RUNNER_TOKEN env).
Comment block clarified to require Administration:read/write fine-grained
PAT scope on target repos.

Old 1P item renamed to "[DEPRECATED 2026-05-16] GitHub Runner
Registration" — kept as recovery backup; can be hard-deleted after the
first successful runner pod start against the new item path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 22:01:41 +00:00
Codex
710340d8be chore(github-runner): rename 1P item to GitHub PAT (Runner Registration)
Renames the OnePasswordItem.itemPath from "GitHub Runner Registration
Token" to "GitHub PAT (Runner Registration)" so the runner 1P entry
sits next to its siblings — GitHub PAT (Gitea Mirrors) and GitHub PAT
(NuGet Packages) — under a consistent "GitHub PAT (...)" naming pattern
and API_CREDENTIAL category.

Existing field "credential" remains the consumer (RUNNER_TOKEN env).
Comment block clarified to require Administration:read/write fine-grained
PAT scope on target repos.

Old 1P item renamed to "[DEPRECATED 2026-05-16] GitHub Runner
Registration" — kept as recovery backup; can be hard-deleted after the
first successful runner pod start against the new item path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 10:27:58 -05:00
4 changed files with 1056 additions and 80 deletions

View File

@@ -0,0 +1,38 @@
# github-runner
ArgoCD-managed repo-scoped Linux GitHub Actions runners for FlowerCore.
`astoltz` is a GitHub user account, not an organization, so each repository
needs its own runner registration. The existing Common runner remains
`Deployment/github-runner`; Sprint 29 adds one single-replica Deployment for
each top Linux-cost repo:
- `FlowerCore.Puppet`
- `FlowerCore.Signage`
- `FlowerCore.DMS`
- `FlowerCore.Telephony`
- `FlowerCore.Print.Web`
- `FlowerCore.Chat`
- `FlowerCore.MySQL`
- `FlowerCore.Kiosk.Linux`
Each runner uses `myoung34/github-runner:latest`, `EPHEMERAL=true`, and labels
`self-hosted,linux,fc-build-linux`. The shared `github-runner-token` Secret is
synced from the existing 1Password item `GitHub PAT (Runner Registration)` and
is consumed as `ACCESS_TOKEN`.
Do not `kubectl apply` this app over ArgoCD. Merge to `main`, let
`infra-github-runner` sync, then verify from `noc1`:
```bash
kubectl -n github-runner get deploy,pods,pvc
for repo in FlowerCore.Puppet FlowerCore.Signage FlowerCore.DMS FlowerCore.Telephony FlowerCore.Print.Web FlowerCore.Chat FlowerCore.MySQL FlowerCore.Kiosk.Linux; do
gh api "/repos/astoltz/$repo/actions/runners" \
--jq '.runners[] | select((.labels[].name == "fc-build-linux") and (.status == "online")) | {name,status,busy,labels:[.labels[].name]}'
done
```
`LinuxRunnerOffline` is declared in `apps/monitoring/noc-monitoring.yaml` and
fires when any Common or top-8 Linux runner deployment has no available replica
for 10 minutes.

File diff suppressed because it is too large Load Diff

View File

@@ -974,6 +974,19 @@ data:
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch" summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC." description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."
- alert: LinuxRunnerOffline
expr: |
kube_deployment_status_replicas_available{namespace="github-runner",deployment=~"github-runner(|-(puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} < 1
for: 10m
labels:
severity: warning
service: github-runner
alert_channel: thermal_print
annotations:
summary: "Linux GitHub Actions runner offline: {{ $labels.deployment }}"
description: "{{ $labels.deployment }} has no available runner pod for 10 minutes. GitHub jobs using [self-hosted, linux, fc-build-linux] for its repo will queue at $0 until the runner returns."
runbook_url: "https://gitea.iamworkin.lan/bluejay/FlowerCore.Notes/src/branch/master/docs/infrastructure/self-hosted-runner-fleet.md"
# Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM # Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
# cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10 # cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
# outage (21h) hit because no alert fired on the rising multus working # outage (21h) hit because no alert fired on the rising multus working
@@ -3427,6 +3440,33 @@ data:
relativeTimeRange: {from: 120, to: 0} relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__ datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C} model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
- uid: linux-runner-offline
title: LinuxRunnerOffline
condition: C
for: 10m
noDataState: Alerting
execErrState: OK
annotations:
summary: Linux GitHub Actions runner offline
description: "A repo-scoped fc-build-linux runner deployment has no available pod. Jobs will queue at $0 until ArgoCD/K8s returns the runner."
runbook_url: "https://gitea.iamworkin.lan/bluejay/FlowerCore.Notes/src/branch/master/docs/infrastructure/self-hosted-runner-fleet.md"
labels:
severity: warning
service: github-runner
alert_channel: thermal_print
data:
- refId: A
relativeTimeRange: {from: 600, to: 0}
datasourceUid: prometheus
model: {expr: 'min by(deployment) (kube_deployment_status_replicas_available{namespace="github-runner",deployment=~"github-runner(|-(puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"})', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 600, to: 0}
datasourceUid: __expr__
model: {type: reduce, expression: A, reducer: last, refId: B}
- refId: C
relativeTimeRange: {from: 600, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
- uid: high-cpu - uid: high-cpu
title: High CPU (>85%) title: High CPU (>85%)
condition: C condition: C

View File

@@ -54,6 +54,18 @@ public sealed class FleetManifestLintTests
"ttsreader-piper", "ttsreader-piper",
}; };
private static readonly IReadOnlyDictionary<string, string> TopLinuxRunnerRepos = new Dictionary<string, string>(StringComparer.Ordinal)
{
["github-runner-puppet"] = "https://github.com/astoltz/FlowerCore.Puppet",
["github-runner-signage"] = "https://github.com/astoltz/FlowerCore.Signage",
["github-runner-dms"] = "https://github.com/astoltz/FlowerCore.DMS",
["github-runner-telephony"] = "https://github.com/astoltz/FlowerCore.Telephony",
["github-runner-print-web"] = "https://github.com/astoltz/FlowerCore.Print.Web",
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
};
[Fact] [Fact]
public void IngressRoutes_MustKeepServiceReferencesInTheSameNamespace() public void IngressRoutes_MustKeepServiceReferencesInTheSameNamespace()
{ {
@@ -187,6 +199,76 @@ public sealed class FleetManifestLintTests
violations.Should().BeEmpty(); violations.Should().BeEmpty();
} }
[Fact]
public void GitHubRunnerFleet_MustRegisterTopLinuxReposAsRepoScopedDeployments()
{
var deployments = Inventory.Documents
.Where(document => document.Kind == "Deployment")
.Where(document => document.Namespace == "github-runner")
.ToDictionary(document => document.Name, StringComparer.Ordinal);
foreach (var expectedRunner in TopLinuxRunnerRepos)
{
deployments.Should().ContainKey(expectedRunner.Key);
var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject;
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
EnvValue(container, "EPHEMERAL").Should().Be("true");
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
EnvValue(container, "ACCESS_TOKEN").Should().BeNull("ACCESS_TOKEN must come from github-runner-token Secret, not a literal");
EnvSecretName(container, "ACCESS_TOKEN").Should().Be("github-runner-token");
EnvSecretKey(container, "ACCESS_TOKEN").Should().Be("credential");
}
}
[Fact]
public void GitHubRunnerFleet_MustPreserveExistingCommonRunnerShape()
{
var common = Inventory.Documents
.Single(document => document.Kind == "Deployment"
&& document.Namespace == "github-runner"
&& document.Name == "github-runner");
var container = common.ContainerMappings().Should().ContainSingle().Subject;
EnvValue(container, "REPO_URL").Should().Be("https://github.com/astoltz/FlowerCore.Common");
EnvValue(container, "RUNNER_NAME_PREFIX").Should().Be("rke2-linux");
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
var claimNames = common.MappingSequence("spec", "template", "spec", "volumes")
.Select(volume => ManifestNodeExtensions.Scalar(volume, "persistentVolumeClaim", "claimName"))
.Where(value => !string.IsNullOrWhiteSpace(value))
.ToList();
claimNames.Should().Contain("github-runner-nuget-cache");
}
[Fact]
public void GitHubRunnerFleet_MustUseOneRwoCachePerRepoScopedDeployment()
{
var pvcNames = Inventory.Documents
.Where(document => document.Kind == "PersistentVolumeClaim")
.Where(document => document.Namespace == "github-runner")
.Select(document => document.Name)
.ToHashSet(StringComparer.Ordinal);
foreach (var deploymentName in TopLinuxRunnerRepos.Keys)
{
var suffix = deploymentName["github-runner-".Length..];
pvcNames.Should().Contain($"github-runner-{suffix}-nuget-cache");
}
}
[Fact]
public void Monitoring_MustAlertWhenTopLinuxRunnerDeploymentIsUnavailable()
{
var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml"));
monitoring.Should().Contain("LinuxRunnerOffline");
monitoring.Should().Contain("kube_deployment_status_replicas_available{namespace=\"github-runner\"");
monitoring.Should().Contain("github-runner(|-(puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))");
monitoring.Should().Contain("runbook_url: \"https://gitea.iamworkin.lan/bluejay/FlowerCore.Notes/src/branch/master/docs/infrastructure/self-hosted-runner-fleet.md\"");
}
[Fact] [Fact]
public void StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults() public void StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults()
{ {
@@ -314,6 +396,31 @@ public sealed class FleetManifestLintTests
$"{document.Descriptor} container '{containerName}' still uses {probeKey}.httpGet on /health.", $"{document.Descriptor} container '{containerName}' still uses {probeKey}.httpGet on /health.",
}; };
} }
private static string? EnvValue(YamlMappingNode container, string name)
{
return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null;
}
private static string? EnvSecretName(YamlMappingNode container, string name)
{
return EnvMapping(container, name) is { } env
? ManifestNodeExtensions.Scalar(env, "valueFrom", "secretKeyRef", "name")
: null;
}
private static string? EnvSecretKey(YamlMappingNode container, string name)
{
return EnvMapping(container, name) is { } env
? ManifestNodeExtensions.Scalar(env, "valueFrom", "secretKeyRef", "key")
: null;
}
private static YamlMappingNode? EnvMapping(YamlMappingNode container, string name)
{
return ManifestNodeExtensions.MappingSequence(container, "env")
.SingleOrDefault(env => string.Equals(ManifestNodeExtensions.Scalar(env, "name"), name, StringComparison.Ordinal));
}
} }
internal sealed class ManifestInventory internal sealed class ManifestInventory