feat(github-runner): add top Linux repo runners
This commit is contained in:
38
apps/github-runner/README.md
Normal file
38
apps/github-runner/README.md
Normal file
@@ -0,0 +1,38 @@
|
||||
# github-runner
|
||||
|
||||
ArgoCD-managed repo-scoped Linux GitHub Actions runners for FlowerCore.
|
||||
|
||||
`astoltz` is a GitHub user account, not an organization, so each repository
|
||||
needs its own runner registration. The existing Common runner remains
|
||||
`Deployment/github-runner`; Sprint 29 adds one single-replica Deployment for
|
||||
each top Linux-cost repo:
|
||||
|
||||
- `FlowerCore.Puppet`
|
||||
- `FlowerCore.Signage`
|
||||
- `FlowerCore.DMS`
|
||||
- `FlowerCore.Telephony`
|
||||
- `FlowerCore.Print.Web`
|
||||
- `FlowerCore.Chat`
|
||||
- `FlowerCore.MySQL`
|
||||
- `FlowerCore.Kiosk.Linux`
|
||||
|
||||
Each runner uses `myoung34/github-runner:latest`, `EPHEMERAL=true`, and labels
|
||||
`self-hosted,linux,fc-build-linux`. The shared `github-runner-token` Secret is
|
||||
synced from the existing 1Password item `GitHub PAT (Runner Registration)` and
|
||||
is consumed as `ACCESS_TOKEN`.
|
||||
|
||||
Do not `kubectl apply` this app directly, bypassing ArgoCD. Merge to `main`, let
|
||||
`infra-github-runner` sync, then verify from `noc1`:
|
||||
|
||||
```bash
|
||||
# Post-sync check, step 1: list the Deployments, Pods and PVCs in the
# github-runner namespace.
kubectl -n github-runner get deploy,pods,pvc
|
||||
|
||||
# Post-sync check, step 2: for each listed repo, ask the GitHub API for its
# registered runners and print only those that are online and carry the
# fc-build-linux label (name, status, busy flag and label names).
for repo in FlowerCore.Puppet FlowerCore.Signage FlowerCore.DMS FlowerCore.Telephony FlowerCore.Print.Web FlowerCore.Chat FlowerCore.MySQL FlowerCore.Kiosk.Linux; do
|
||||
gh api "/repos/astoltz/$repo/actions/runners" \
|
||||
--jq '.runners[] | select((.labels[].name == "fc-build-linux") and (.status == "online")) | {name,status,busy,labels:[.labels[].name]}'
|
||||
done
|
||||
```
|
||||
|
||||
`LinuxRunnerOffline` is declared in `apps/monitoring/noc-monitoring.yaml` and
|
||||
fires when any Common or top-8 Linux runner deployment has no available replica
|
||||
for 10 minutes.
|
||||
File diff suppressed because it is too large
Load Diff
@@ -974,6 +974,19 @@ data:
|
||||
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
|
||||
description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."
|
||||
|
||||
# LinuxRunnerOffline: warns when a runner Deployment in the github-runner
# namespace has reported fewer than one available replica for 10 minutes.
# The deployment matcher covers the bare `github-runner` Deployment (the
# empty alternative in `(|-...)`) plus the eight `github-runner-<repo>`
# Deployments; Prometheus anchors label regexes, so no other names match.
# The rule compares an existing series against 1, so a Deployment whose
# metric is missing entirely produces no result here.
- alert: LinuxRunnerOffline
|
||||
expr: |
|
||||
kube_deployment_status_replicas_available{namespace="github-runner",deployment=~"github-runner(|-(puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} < 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: github-runner
|
||||
alert_channel: thermal_print
|
||||
annotations:
|
||||
summary: "Linux GitHub Actions runner offline: {{ $labels.deployment }}"
|
||||
description: "{{ $labels.deployment }} has no available runner pod for 10 minutes. GitHub jobs using [self-hosted, linux, fc-build-linux] for its repo will queue at $0 until the runner returns."
|
||||
runbook_url: "https://gitea.iamworkin.lan/bluejay/FlowerCore.Notes/src/branch/master/docs/infrastructure/self-hosted-runner-fleet.md"
|
||||
|
||||
# Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
|
||||
# cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
|
||||
# outage (21h) hit because no alert fired on the rising multus working
|
||||
@@ -3427,6 +3440,33 @@ data:
|
||||
relativeTimeRange: {from: 120, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||
# Grafana-managed rule using the same deployment selector as the
# LinuxRunnerOffline Prometheus rule. Evaluation pipeline:
#   A: instant Prometheus query, minimum available replicas per deployment
#   B: reduce A to its last value
#   C: threshold, true when B < 1 (this is the alert condition)
# The condition must hold for 10m before the alert fires.
- uid: linux-runner-offline
|
||||
title: LinuxRunnerOffline
|
||||
condition: C
|
||||
for: 10m
|
||||
# Missing data is treated as alerting, while query execution errors are
# treated as OK.
noDataState: Alerting
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: Linux GitHub Actions runner offline
|
||||
description: "A repo-scoped fc-build-linux runner deployment has no available pod. Jobs will queue at $0 until ArgoCD/K8s returns the runner."
|
||||
runbook_url: "https://gitea.iamworkin.lan/bluejay/FlowerCore.Notes/src/branch/master/docs/infrastructure/self-hosted-runner-fleet.md"
|
||||
labels:
|
||||
severity: warning
|
||||
service: github-runner
|
||||
alert_channel: thermal_print
|
||||
data:
|
||||
# A: available replicas per runner deployment (instant query).
- refId: A
|
||||
relativeTimeRange: {from: 600, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: 'min by(deployment) (kube_deployment_status_replicas_available{namespace="github-runner",deployment=~"github-runner(|-(puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"})', instant: true, refId: A}
|
||||
# B: collapse each series from A to its last value.
- refId: B
|
||||
relativeTimeRange: {from: 600, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
# C: alert condition, true when the reduced value is below 1.
- refId: C
|
||||
relativeTimeRange: {from: 600, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||
- uid: high-cpu
|
||||
title: High CPU (>85%)
|
||||
condition: C
|
||||
|
||||
Reference in New Issue
Block a user