feat(github-runner): harden Linux runner fleet (#5)
This commit was merged in pull request #5.
This commit is contained in:
61
apps/github-runner/README.md
Normal file
61
apps/github-runner/README.md
Normal file
@@ -0,0 +1,61 @@
|
||||
# GitHub Runner Fleet
|
||||
|
||||
ArgoCD owns `apps/github-runner/github-runner.yaml`. Do not patch live runner
|
||||
Deployments with `kubectl`; update this manifest and let ArgoCD reconcile.
|
||||
|
||||
## Runner Shape
|
||||
|
||||
All repo-scoped Linux runners use:
|
||||
|
||||
- `ACCESS_TOKEN` from the `github-runner-token` Secret
|
||||
- `RUN_AS_ROOT=false`
|
||||
- `EPHEMERAL=true`
|
||||
- `LABELS=self-hosted,linux,fc-build-linux`
|
||||
- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
|
||||
Actions tool cache
|
||||
|
||||
`github-runner` for `FlowerCore.Common` is single-replica because it retains the
|
||||
original Longhorn ReadWriteOnce NuGet PVC. `github-runner-sharedpos` and the top
|
||||
Linux-cost repo runners use two replicas with per-pod `emptyDir` caches. That is
|
||||
the safe backlog-drain strategy: no two pods share one RWO PVC.
|
||||
|
||||
## Post-Merge Proof
|
||||
|
||||
After the PR is merged and ArgoCD syncs, verify the runner fleet:
|
||||
|
||||
```bash
|
||||
kubectl -n github-runner get deploy,pods,pvc
|
||||
```
|
||||
|
||||
Verify GitHub registration for the repo-scoped runners:
|
||||
|
||||
```bash
|
||||
for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore.Signage \
|
||||
FlowerCore.DMS FlowerCore.Telephony FlowerCore.Print.Web FlowerCore.Chat \
|
||||
FlowerCore.MySQL FlowerCore.Kiosk.Linux; do
|
||||
echo "=== $repo ==="
|
||||
gh api "/repos/astoltz/$repo/actions/runners" \
|
||||
--jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}'
|
||||
done
|
||||
```
|
||||
|
||||
Shared.Pos publish proof after the runner pod is online:
|
||||
|
||||
```bash
|
||||
gh run list --repo astoltz/FlowerCore.Shared.Pos \
|
||||
--workflow "Build, Test & Publish" --branch main --limit 5
|
||||
```
|
||||
|
||||
If the latest run is still queued after runner registration, rerun the workflow
|
||||
from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
|
||||
|
||||
## Failure Notes
|
||||
|
||||
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
|
||||
`DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
|
||||
present on the runner pod.
|
||||
- `404` during runner registration: the fine-grained PAT is valid but missing
|
||||
repository access for that repo. Add the repo to the PAT access list; the PAT
|
||||
value does not change.
|
||||
- `Multi-Attach` volume error: only the Common runner uses a RWO PVC and it must
|
||||
stay single-replica. New multi-replica runners use `emptyDir`.
|
||||
File diff suppressed because it is too large
Load Diff
@@ -723,6 +723,24 @@ data:
|
||||
summary: "Mac mini GitHub runner offline ({{ $labels.runner }})"
|
||||
description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-<slug>.plist; runners survive reboot and do not require a GUI session."
|
||||
|
||||
- name: linux-runners
|
||||
rules:
|
||||
- alert: LinuxRunnerOffline
|
||||
expr: |
|
||||
kube_deployment_status_replicas_ready{
|
||||
namespace="github-runner",
|
||||
deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"
|
||||
} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
alert_channel: irc
|
||||
service: github-runner
|
||||
team: ci
|
||||
annotations:
|
||||
summary: "Linux CI runner offline: {{ $labels.deployment }}"
|
||||
description: "Deployment {{ $labels.deployment }} in namespace github-runner has 0 ready replicas for more than 5 minutes. CI jobs targeting this repo will queue until the runner pod restarts and re-registers with GitHub. Check pods with: kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }}. Check logs with: kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50. Common causes: PAT missing repo access, runner CrashLoopBackOff, or node/resource pressure."
|
||||
|
||||
- name: remote-desktop
|
||||
rules:
|
||||
- alert: RemoteDesktopWebDown
|
||||
@@ -3421,6 +3439,39 @@ data:
|
||||
relativeTimeRange: {from: 120, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
||||
- orgId: 1
|
||||
name: CI Runners
|
||||
folder: CI Alerts
|
||||
interval: 1m
|
||||
rules:
|
||||
- uid: linux-runner-offline
|
||||
title: LinuxRunnerOffline
|
||||
condition: C
|
||||
for: 5m
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "Linux CI runner offline: {{ $labels.deployment }}"
|
||||
description: "A github-runner namespace Deployment has 0 ready replicas for more than 5 minutes. CI jobs targeting that repo will queue until the runner pod restarts and re-registers."
|
||||
runbook: "1. kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }} 2. kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50 3. Verify PAT repo access if registration returns 404 4. Verify no RWO PVC is shared by scaled runners"
|
||||
labels:
|
||||
severity: warning
|
||||
service: github-runner
|
||||
alert_channel: irc
|
||||
team: ci
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: prometheus
|
||||
# `== bool 0` yields 1 for a deployment with zero ready replicas and 0 otherwise, so the downstream reduce (B) and `gt 0` threshold (C) can fire. A plain `== 0` filter only returns samples whose value is 0, which can never exceed the threshold.
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == bool 0', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
|
||||
- orgId: 1
|
||||
name: Infrastructure
|
||||
folder: AI Stack Alerts
|
||||
|
||||
Reference in New Issue
Block a user