Compare commits
12 Commits
claude/git
...
67064c4129
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
67064c4129 | ||
| b8c7e59005 | |||
| 65ac8d6f01 | |||
| 35844e0dbd | |||
| b1e307151e | |||
| 12b07219c7 | |||
| 9fd32c4415 | |||
| ad670fb344 | |||
|
|
6f6ca50987 | ||
|
|
c7be58c1f7 | ||
|
|
a1f5a393cd | ||
|
|
710340d8be |
@@ -1,3 +1,2 @@
|
|||||||
# Restart kiosk and redeclare capabilities when HDMI connect/disconnect changes DRM state.
|
# Settle DRM for 2s before restarting Chromium, then redeclare capabilities.
|
||||||
SUBSYSTEM=="drm", KERNEL=="card?-HDMI-A-?", ACTION=="change", RUN+="/usr/bin/systemctl restart flowercore-signage-player-pi.service"
|
SUBSYSTEM=="drm", KERNEL=="card?-HDMI-A-?", ACTION=="change", RUN+="/usr/bin/systemctl start flowercore-signage-player-pi-hdmi.service"
|
||||||
SUBSYSTEM=="drm", KERNEL=="card?-HDMI-A-?", ACTION=="change", RUN+="/usr/bin/systemctl start flowercore-signage-detect-display.service"
|
|
||||||
|
|||||||
22
apps/fc-signage-pi-player/tests/display_capability.bats
Normal file
22
apps/fc-signage-pi-player/tests/display_capability.bats
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
#!/usr/bin/env bats
|
||||||
|
|
||||||
|
setup() {
|
||||||
|
APP_ROOT="$(cd "$BATS_TEST_DIRNAME/.." && pwd)"
|
||||||
|
DETECT="$APP_ROOT/scripts/fc-signage-detect-display"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "display detection emits graceful disconnected profile when no hdmi connector is present" {
|
||||||
|
script="$(cat "$DETECT")"
|
||||||
|
[[ "$script" == *"displayConnected: false"* ]]
|
||||||
|
[[ "$script" == *"No HDMI display detected"* ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "display detection parses edid, falls back to kmsprint, and logs endpoint failures locally" {
|
||||||
|
script="$(cat "$DETECT")"
|
||||||
|
[[ "$script" == *"edid-decode"* ]]
|
||||||
|
[[ "$script" == *"HDR (Static|Dynamic) Metadata Block"* ]]
|
||||||
|
[[ "$script" == *"kmsprint"* ]]
|
||||||
|
[[ "$script" == *"/api/v1/nodes/\${NODE_ID}/capabilities"* ]]
|
||||||
|
[[ "$script" == *"/api/v1/displays/\${NODE_ID}/capability-profile"* ]]
|
||||||
|
[[ "$script" == *"capabilities.log"* ]]
|
||||||
|
}
|
||||||
64
apps/fc-signage-pi-player/tests/identity_bootstrap.bats
Normal file
64
apps/fc-signage-pi-player/tests/identity_bootstrap.bats
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
#!/usr/bin/env bats
|
||||||
|
|
||||||
|
setup() {
|
||||||
|
APP_ROOT="$(cd "$BATS_TEST_DIRNAME/.." && pwd)"
|
||||||
|
BOOTSTRAP="$APP_ROOT/scripts/flowercore-signage-bootstrap.sh"
|
||||||
|
RENEW="$APP_ROOT/scripts/flowercore-signage-renew-cert.sh"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "bootstrap is idempotent when node is already enrolled" {
|
||||||
|
script="$(cat "$BOOTSTRAP")"
|
||||||
|
[[ "$script" == *'[[ -s "$NODE_JSON" && -s "$CERT_DIR/client.p12" ]]'* ]]
|
||||||
|
[[ "$script" == *"already enrolled"* ]]
|
||||||
|
[[ "$script" == *"exit 0"* ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "bootstrap generates a stable node uuid and machine id" {
|
||||||
|
script="$(cat "$BOOTSTRAP")"
|
||||||
|
[[ "$script" == *"uuidgen"* ]]
|
||||||
|
[[ "$script" == *"nodeUuid"* ]]
|
||||||
|
[[ "$script" == *"machineId"* ]]
|
||||||
|
[[ "$script" == *"cut -c1-16"* ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "bootstrap posts to the canonical register endpoint" {
|
||||||
|
grep -q '/api/v1/nodes/register' "$BOOTSTRAP"
|
||||||
|
grep -q '"linux-arm64-pi"' "$BOOTSTRAP"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "bootstrap retries registration once for first-call races" {
|
||||||
|
script="$(cat "$BOOTSTRAP")"
|
||||||
|
[[ "$script" == *"for attempt in 1 2"* ]]
|
||||||
|
[[ "$script" == *"register attempt \$attempt returned"* ]]
|
||||||
|
[[ "$script" == *"sleep 5"* ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "bootstrap supports setup-code approval with manual polling fallback" {
|
||||||
|
script="$(cat "$BOOTSTRAP")"
|
||||||
|
[[ "$script" == *"signage-setup-code"* ]]
|
||||||
|
[[ "$script" == *"approve-via-setup-code"* ]]
|
||||||
|
[[ "$script" == *"+ 1800"* ]]
|
||||||
|
[[ "$script" == *"sleep 15"* ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "bootstrap generates an ecdsa p256 csr for the signage pi subject" {
|
||||||
|
script="$(cat "$BOOTSTRAP")"
|
||||||
|
[[ "$script" == *"ecparam -genkey -name prime256v1"* ]]
|
||||||
|
[[ "$script" == *'/CN=${NODE_ID}/O=FlowerCore/OU=SignagePlayer-Pi'* ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "bootstrap writes pkcs12 bundle with restrictive permissions" {
|
||||||
|
script="$(cat "$BOOTSTRAP")"
|
||||||
|
[[ "$script" == *"openssl pkcs12 -export"* ]]
|
||||||
|
[[ "$script" == *"client.p12.pass"* ]]
|
||||||
|
[[ "$script" == *"chmod 0640"* ]]
|
||||||
|
[[ "$script" == *"chmod 0600"* ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "renewal only calls renew endpoint inside the thirty-day window and swaps atomically" {
|
||||||
|
script="$(cat "$RENEW")"
|
||||||
|
[[ "$script" == *'-checkend $((30*24*3600))'* ]]
|
||||||
|
[[ "$script" == *"/api/v1/nodes/\${NODE_ID}/renew"* ]]
|
||||||
|
[[ "$script" == *"client.key.new"* ]]
|
||||||
|
[[ "$script" == *'mv "$CERT_DIR/client.p12.new" "$CERT_DIR/client.p12"'* ]]
|
||||||
|
}
|
||||||
68
apps/fc-signage-pi-player/tests/systemd_kiosk_wrapper.bats
Normal file
68
apps/fc-signage-pi-player/tests/systemd_kiosk_wrapper.bats
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
#!/usr/bin/env bats
|
||||||
|
|
||||||
|
setup() {
|
||||||
|
APP_ROOT="$(cd "$BATS_TEST_DIRNAME/.." && pwd)"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "player unit exists" {
|
||||||
|
[ -f "$APP_ROOT/systemd/flowercore-signage-player-pi.service" ]
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "player unit uses simple chromium service with restart backoff" {
|
||||||
|
unit="$(cat "$APP_ROOT/systemd/flowercore-signage-player-pi.service")"
|
||||||
|
[[ "$unit" == *"Type=simple"* ]]
|
||||||
|
[[ "$unit" == *"Restart=always"* ]]
|
||||||
|
[[ "$unit" == *"RestartSec=10s"* ]]
|
||||||
|
[[ "$unit" == *"StartLimitBurst=5"* ]]
|
||||||
|
[[ "$unit" == *"StartLimitIntervalSec=300s"* ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "player unit caps chromium memory at two gigabytes" {
|
||||||
|
grep -q '^MemoryMax=2G$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
|
||||||
|
grep -q '^MemoryHigh=1500M$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "player unit condition-gates startup on identity and p12 certificate" {
|
||||||
|
grep -q '^ConditionPathExists=/etc/flowercore/signage-node.json$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
|
||||||
|
grep -q '^ConditionPathExists=/etc/fc-signage-player/client.p12$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "player unit runs prelaunch checks before chromium" {
|
||||||
|
grep -q '^ExecStartPre=/usr/local/bin/flowercore-signage-prelaunch.sh$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
|
||||||
|
grep -q '^ExecStart=/usr/local/bin/flowercore-signage-launch.sh$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "hdmi udev rule routes through the two-second settle service" {
|
||||||
|
rule="$(cat "$APP_ROOT/systemd/99-flowercore-signage-hdmi.rules")"
|
||||||
|
[[ "$rule" == *'KERNEL=="card?-HDMI-A-?"'* ]]
|
||||||
|
[[ "$rule" == *"systemctl start flowercore-signage-player-pi-hdmi.service"* ]]
|
||||||
|
[[ "$rule" != *"systemctl restart flowercore-signage-player-pi.service"* ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "hdmi responder settles, declares display, then restarts chromium" {
|
||||||
|
responder="$(cat "$APP_ROOT/scripts/flowercore-signage-hdmi-respond.sh")"
|
||||||
|
[[ "$responder" == *"sleep 2"* ]]
|
||||||
|
[[ "$responder" == *"systemctl start flowercore-signage-detect-display.service"* ]]
|
||||||
|
[[ "$responder" == *"systemctl restart flowercore-signage-player-pi.service"* ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "chromium policy json is valid and disables credential prompts" {
|
||||||
|
command -v jq >/dev/null || skip "jq not installed"
|
||||||
|
jq -e '.AutofillAddressEnabled == false and .AutofillCreditCardEnabled == false and .PasswordManagerEnabled == false' \
|
||||||
|
"$APP_ROOT/chromium-policies/flowercore-signage.json" >/dev/null
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "launch script tries embed URL and logs bare-player fallback" {
|
||||||
|
launch="$(cat "$APP_ROOT/scripts/flowercore-signage-launch.sh")"
|
||||||
|
[[ "$launch" == *'/player/${NODE_ID}/embed?token=${CERT_THUMB}'* ]]
|
||||||
|
[[ "$launch" == *"url-divergence.log"* ]]
|
||||||
|
[[ "$launch" == *'/player/${NODE_ID}?token=${CERT_THUMB}'* ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "prelaunch script validates required node and cert files" {
|
||||||
|
prelaunch="$(cat "$APP_ROOT/scripts/flowercore-signage-prelaunch.sh")"
|
||||||
|
[[ "$prelaunch" == *"/etc/flowercore/signage-node.json"* ]]
|
||||||
|
[[ "$prelaunch" == *"/etc/fc-signage-player/client.p12"* ]]
|
||||||
|
[[ "$prelaunch" == *"/etc/fc-signage-player/client.p12.pass"* ]]
|
||||||
|
[[ "$prelaunch" == *"exit 1"* ]]
|
||||||
|
}
|
||||||
61
apps/github-runner/README.md
Normal file
61
apps/github-runner/README.md
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
# GitHub Runner Fleet
|
||||||
|
|
||||||
|
ArgoCD owns `apps/github-runner/github-runner.yaml`. Do not patch live runner
|
||||||
|
Deployments with `kubectl`; update this manifest and let ArgoCD reconcile.
|
||||||
|
|
||||||
|
## Runner Shape
|
||||||
|
|
||||||
|
All repo-scoped Linux runners use:
|
||||||
|
|
||||||
|
- `ACCESS_TOKEN` from the `github-runner-token` Secret
|
||||||
|
- `RUN_AS_ROOT=false`
|
||||||
|
- `EPHEMERAL=true`
|
||||||
|
- `LABELS=self-hosted,linux,fc-build-linux`
|
||||||
|
- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
|
||||||
|
Actions tool cache
|
||||||
|
|
||||||
|
`github-runner` for `FlowerCore.Common` is single-replica because it retains the
|
||||||
|
original Longhorn ReadWriteOnce NuGet PVC. `github-runner-sharedpos` and the top
|
||||||
|
Linux-cost repo runners use two replicas with per-pod `emptyDir` caches. That is
|
||||||
|
the safe backlog-drain strategy: no two pods share one RWO PVC.
|
||||||
|
|
||||||
|
## Post-Merge Proof
|
||||||
|
|
||||||
|
After the PR is merged and ArgoCD syncs, verify the runner fleet:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kubectl -n github-runner get deploy,pods,pvc
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify GitHub registration for the repo-scoped runners:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore.Signage \
|
||||||
|
FlowerCore.DMS FlowerCore.Telephony FlowerCore.Print.Web FlowerCore.Chat \
|
||||||
|
FlowerCore.MySQL FlowerCore.Kiosk.Linux; do
|
||||||
|
echo "=== $repo ==="
|
||||||
|
gh api "/repos/astoltz/$repo/actions/runners" \
|
||||||
|
--jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}'
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
|
Shared.Pos publish proof after the runner pod is online:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
gh run list --repo astoltz/FlowerCore.Shared.Pos \
|
||||||
|
--workflow "Build, Test & Publish" --branch main --limit 5
|
||||||
|
```
|
||||||
|
|
||||||
|
If the latest run is still queued after runner registration, rerun the workflow
|
||||||
|
from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
|
||||||
|
|
||||||
|
## Failure Notes
|
||||||
|
|
||||||
|
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
|
||||||
|
`DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
|
||||||
|
present on the runner pod.
|
||||||
|
- `404` during runner registration: the fine-grained PAT is valid but missing
|
||||||
|
repository access for that repo. Add the repo to the PAT access list; the PAT
|
||||||
|
value does not change.
|
||||||
|
- `Multi-Attach` volume error: only the Common runner uses a RWO PVC and it must
|
||||||
|
stay single-replica. New multi-replica runners use `emptyDir`.
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -75,6 +75,20 @@ data:
|
|||||||
cluster: "rke2"
|
cluster: "rke2"
|
||||||
role: "agent"
|
role: "agent"
|
||||||
|
|
||||||
|
# Mac mini macOS runner node (INFRA VLAN)
|
||||||
|
- job_name: "macmini-node"
|
||||||
|
scrape_timeout: 15s
|
||||||
|
static_configs:
|
||||||
|
- targets: ["10.0.56.115:9100"]
|
||||||
|
labels:
|
||||||
|
instance: "macmini"
|
||||||
|
host: "macmini.iamworkin.lan"
|
||||||
|
vlan: "infra"
|
||||||
|
arch: "arm64"
|
||||||
|
role: "macos-runner"
|
||||||
|
puppet_managed: "true"
|
||||||
|
puppet_server: "puppet.iamworkin.lan"
|
||||||
|
|
||||||
# In-cluster node-exporter DaemonSet
|
# In-cluster node-exporter DaemonSet
|
||||||
- job_name: "k8s-node-exporter"
|
- job_name: "k8s-node-exporter"
|
||||||
kubernetes_sd_configs:
|
kubernetes_sd_configs:
|
||||||
@@ -697,6 +711,36 @@ data:
|
|||||||
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
|
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
|
||||||
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
|
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
|
||||||
|
|
||||||
|
- name: macmini-runners
|
||||||
|
rules:
|
||||||
|
- alert: MacMiniRunnerOffline
|
||||||
|
expr: (flowercore_github_runner_online{runner=~"macmini-.*"} == 0) or absent(flowercore_github_runner_online{runner=~"macmini-.*"})
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: github-runner
|
||||||
|
annotations:
|
||||||
|
summary: "Mac mini GitHub runner offline ({{ $labels.runner }})"
|
||||||
|
description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-<slug>.plist; runners survive reboot and do not require a GUI session."
|
||||||
|
|
||||||
|
- name: linux-runners
|
||||||
|
rules:
|
||||||
|
- alert: LinuxRunnerOffline
|
||||||
|
expr: |
|
||||||
|
kube_deployment_status_replicas_ready{
|
||||||
|
namespace="github-runner",
|
||||||
|
deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"
|
||||||
|
} == 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
alert_channel: irc
|
||||||
|
service: github-runner
|
||||||
|
team: ci
|
||||||
|
annotations:
|
||||||
|
summary: "Linux CI runner offline: {{ $labels.deployment }}"
|
||||||
|
description: "Deployment {{ $labels.deployment }} in namespace github-runner has 0 ready replicas for more than 5 minutes. CI jobs targeting this repo will queue until the runner pod restarts and re-registers with GitHub. Check pods with: kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }}. Check logs with: kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50. Common causes: PAT missing repo access, runner CrashLoopBackOff, or node/resource pressure."
|
||||||
|
|
||||||
- name: remote-desktop
|
- name: remote-desktop
|
||||||
rules:
|
rules:
|
||||||
- alert: RemoteDesktopWebDown
|
- alert: RemoteDesktopWebDown
|
||||||
@@ -3395,6 +3439,39 @@ data:
|
|||||||
relativeTimeRange: {from: 120, to: 0}
|
relativeTimeRange: {from: 120, to: 0}
|
||||||
datasourceUid: __expr__
|
datasourceUid: __expr__
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
||||||
|
- orgId: 1
|
||||||
|
name: CI Runners
|
||||||
|
folder: CI Alerts
|
||||||
|
interval: 1m
|
||||||
|
rules:
|
||||||
|
- uid: linux-runner-offline
|
||||||
|
title: LinuxRunnerOffline
|
||||||
|
condition: C
|
||||||
|
for: 5m
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: Error
|
||||||
|
annotations:
|
||||||
|
summary: "Linux CI runner offline: {{ $labels.deployment }}"
|
||||||
|
description: "A github-runner namespace Deployment has 0 ready replicas for more than 5 minutes. CI jobs targeting that repo will queue until the runner pod restarts and re-registers."
|
||||||
|
runbook: "1. kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }} 2. kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50 3. Verify PAT repo access if registration returns 404 4. Verify no RWO PVC is shared by scaled runners"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: github-runner
|
||||||
|
alert_channel: irc
|
||||||
|
team: ci
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange: {from: 300, to: 0}
|
||||||
|
datasourceUid: prometheus
|
||||||
|
# "== bool 0" returns 1 for a Deployment with no ready replicas and 0 otherwise, so the "gt 0" threshold in C can fire. A plain "== 0" filter only ever returns samples whose value is 0, which never exceeds the threshold.
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == bool 0', instant: true, refId: A}
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange: {from: 300, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 300, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
name: Infrastructure
|
name: Infrastructure
|
||||||
folder: AI Stack Alerts
|
folder: AI Stack Alerts
|
||||||
@@ -3427,6 +3504,32 @@ data:
|
|||||||
relativeTimeRange: {from: 120, to: 0}
|
relativeTimeRange: {from: 120, to: 0}
|
||||||
datasourceUid: __expr__
|
datasourceUid: __expr__
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||||
|
- uid: macmini-runner-offline
|
||||||
|
title: MacMiniRunnerOffline
|
||||||
|
condition: C
|
||||||
|
for: 10m
|
||||||
|
noDataState: Alerting
|
||||||
|
execErrState: OK
|
||||||
|
annotations:
|
||||||
|
summary: Mac mini GitHub runner offline
|
||||||
|
description: "One or more macmini-* GitHub Actions runners have not reported online for more than 10 minutes. LaunchDaemons survive reboot and do not require the bluejay GUI session."
|
||||||
|
runbook: "1. ssh fcadmin@macmini.iamworkin.lan 2. launchctl print system/io.flowercore.github-runner-<slug> 3. Check /Users/fcadmin/Library/Logs/github-runners/<slug>/stderr.log 4. Re-register the repo runner if .runner is missing"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: github-runner
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange: {from: 600, to: 0}
|
||||||
|
datasourceUid: prometheus
|
||||||
|
# Aggregate first, then fall back. Placing "or vector(0)" inside min() always adds an unlabeled 0 sample (its empty label set never matches a labeled runner series), which pins the result at 0 and keeps the "lt 1" threshold firing.
model: {expr: 'min(flowercore_github_runner_online{runner=~"macmini-.*"}) or vector(0)', instant: true, refId: A}
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange: {from: 600, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 600, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||||
- uid: high-cpu
|
- uid: high-cpu
|
||||||
title: High CPU (>85%)
|
title: High CPU (>85%)
|
||||||
condition: C
|
condition: C
|
||||||
|
|||||||
@@ -54,6 +54,43 @@ public sealed class FleetManifestLintTests
|
|||||||
"ttsreader-piper",
|
"ttsreader-piper",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
private static readonly IReadOnlyDictionary<string, string> LinuxRunnerRepos = new Dictionary<string, string>(StringComparer.Ordinal)
|
||||||
|
{
|
||||||
|
["github-runner"] = "https://github.com/astoltz/FlowerCore.Common",
|
||||||
|
["github-runner-sharedpos"] = "https://github.com/astoltz/FlowerCore.Shared.Pos",
|
||||||
|
["github-runner-puppet"] = "https://github.com/astoltz/FlowerCore.Puppet",
|
||||||
|
["github-runner-signage"] = "https://github.com/astoltz/FlowerCore.Signage",
|
||||||
|
["github-runner-dms"] = "https://github.com/astoltz/FlowerCore.DMS",
|
||||||
|
["github-runner-telephony"] = "https://github.com/astoltz/FlowerCore.Telephony",
|
||||||
|
["github-runner-print-web"] = "https://github.com/astoltz/FlowerCore.Print.Web",
|
||||||
|
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
|
||||||
|
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
|
||||||
|
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
|
||||||
|
};
|
||||||
|
|
||||||
|
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
|
||||||
|
{
|
||||||
|
"github-runner-sharedpos",
|
||||||
|
"github-runner-puppet",
|
||||||
|
"github-runner-signage",
|
||||||
|
"github-runner-dms",
|
||||||
|
"github-runner-telephony",
|
||||||
|
"github-runner-print-web",
|
||||||
|
"github-runner-chat",
|
||||||
|
"github-runner-mysql",
|
||||||
|
"github-runner-kiosk-linux",
|
||||||
|
};
|
||||||
|
|
||||||
|
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
|
||||||
|
{
|
||||||
|
["HOME"] = "/home/runner",
|
||||||
|
["DOTNET_INSTALL_DIR"] = "/home/runner/.dotnet",
|
||||||
|
["DOTNET_CLI_HOME"] = "/home/runner",
|
||||||
|
["NUGET_PACKAGES"] = "/home/runner/.nuget/packages",
|
||||||
|
["XDG_CACHE_HOME"] = "/home/runner/.cache",
|
||||||
|
["RUNNER_TOOL_CACHE"] = "/home/runner/_tool",
|
||||||
|
};
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void IngressRoutes_MustKeepServiceReferencesInTheSameNamespace()
|
public void IngressRoutes_MustKeepServiceReferencesInTheSameNamespace()
|
||||||
{
|
{
|
||||||
@@ -187,6 +224,98 @@ public sealed class FleetManifestLintTests
|
|||||||
violations.Should().BeEmpty();
|
violations.Should().BeEmpty();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void GitHubRunnerFleet_MustRegisterRequiredReposAsRepoScopedDeployments()
|
||||||
|
{
|
||||||
|
var deployments = GitHubRunnerDeployments();
|
||||||
|
|
||||||
|
foreach (var expectedRunner in LinuxRunnerRepos)
|
||||||
|
{
|
||||||
|
deployments.Should().ContainKey(expectedRunner.Key);
|
||||||
|
|
||||||
|
var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject;
|
||||||
|
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
|
||||||
|
EnvValue(container, "EPHEMERAL").Should().Be("true");
|
||||||
|
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
|
||||||
|
EnvValue(container, "RUN_AS_ROOT").Should().Be("false");
|
||||||
|
EnvValue(container, "ACCESS_TOKEN").Should().BeNull("ACCESS_TOKEN must come from github-runner-token Secret, not a literal");
|
||||||
|
EnvSecretName(container, "ACCESS_TOKEN").Should().Be("github-runner-token");
|
||||||
|
EnvSecretKey(container, "ACCESS_TOKEN").Should().Be("credential");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void GitHubRunnerFleet_MustSetWritableNonRootDotnetAndCachePaths()
|
||||||
|
{
|
||||||
|
foreach (var deployment in GitHubRunnerDeployments().Values)
|
||||||
|
{
|
||||||
|
var container = deployment.ContainerMappings().Should().ContainSingle().Subject;
|
||||||
|
|
||||||
|
foreach (var expectedEnv in WritableRunnerEnv)
|
||||||
|
{
|
||||||
|
EnvValue(container, expectedEnv.Key).Should().Be(expectedEnv.Value, $"{deployment.Name} must keep .NET paths writable for uid 1001");
|
||||||
|
}
|
||||||
|
|
||||||
|
var mounts = ManifestNodeExtensions.MappingSequence(container, "volumeMounts")
|
||||||
|
.ToDictionary(
|
||||||
|
mount => ManifestNodeExtensions.Scalar(mount, "name") ?? string.Empty,
|
||||||
|
mount => ManifestNodeExtensions.Scalar(mount, "mountPath") ?? string.Empty,
|
||||||
|
StringComparer.Ordinal);
|
||||||
|
|
||||||
|
mounts.Should().Contain("runner-home", "/home/runner");
|
||||||
|
mounts.Should().Contain("nuget-cache", "/home/runner/.nuget/packages");
|
||||||
|
mounts.Should().Contain("tmp", "/tmp");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Verifies the volume layout that keeps scaled runners from sharing a RWO PVC:
/// every Deployment in <c>ScaledLinuxRunnerDeployments</c> must run two replicas, reference no
/// PersistentVolumeClaim, and back its <c>nuget-cache</c> volume with <c>emptyDir</c>; the
/// <c>github-runner</c> Deployment must stay at one replica with exactly one claim named
/// <c>github-runner-nuget-cache</c>.
/// </summary>
[Fact]
public void GitHubRunnerFleet_MustAvoidRwoMultiAttachForScaledDeployments()
{
    var deployments = GitHubRunnerDeployments();

    foreach (var deploymentName in ScaledLinuxRunnerDeployments)
    {
        // Assert presence before indexing so a missing Deployment is reported as a named
        // assertion failure instead of a bare KeyNotFoundException.
        deployments.Should().ContainKey(deploymentName);

        var deployment = deployments[deploymentName];
        ReplicaCount(deployment).Should().Be(2);

        var volumes = deployment.MappingSequence("spec", "template", "spec", "volumes");
        var claimNames = volumes
            .Select(volume => ManifestNodeExtensions.Scalar(volume, "persistentVolumeClaim", "claimName"))
            .Where(value => !string.IsNullOrWhiteSpace(value))
            .ToList();

        claimNames.Should().BeEmpty($"{deploymentName} is scaled and must not share a RWO PVC");
        volumes.Should().Contain(volume =>
            string.Equals(ManifestNodeExtensions.Scalar(volume, "name"), "nuget-cache", StringComparison.Ordinal)
            && ManifestNodeExtensions.Mapping(volume, "emptyDir") != null);
    }

    // Same guard for the single-replica runner that keeps the PVC.
    deployments.Should().ContainKey("github-runner");

    var common = deployments["github-runner"];
    ReplicaCount(common).Should().Be(1);
    common.MappingSequence("spec", "template", "spec", "volumes")
        .Select(volume => ManifestNodeExtensions.Scalar(volume, "persistentVolumeClaim", "claimName"))
        .Where(value => !string.IsNullOrWhiteSpace(value))
        .Should()
        .ContainSingle()
        .Which
        .Should()
        .Be("github-runner-nuget-cache");
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable()
|
||||||
|
{
|
||||||
|
var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml"));
|
||||||
|
|
||||||
|
monitoring.Should().Contain("MacMiniRunnerOffline");
|
||||||
|
monitoring.Should().Contain("LinuxRunnerOffline");
|
||||||
|
monitoring.Should().Contain("kube_deployment_status_replicas_ready");
|
||||||
|
monitoring.Should().Contain("github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))");
|
||||||
|
monitoring.Should().Contain("folder: CI Alerts");
|
||||||
|
monitoring.Should().Contain("uid: linux-runner-offline");
|
||||||
|
monitoring.Should().Contain("alert_channel: irc");
|
||||||
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults()
|
public void StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults()
|
||||||
{
|
{
|
||||||
@@ -314,6 +443,44 @@ public sealed class FleetManifestLintTests
|
|||||||
$"{document.Descriptor} container '{containerName}' still uses {probeKey}.httpGet on /health.",
|
$"{document.Descriptor} container '{containerName}' still uses {probeKey}.httpGet on /health.",
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Collects every inventory document whose kind is <c>Deployment</c> and whose namespace is
/// <c>github-runner</c>, keyed by document name with ordinal comparison.
/// </summary>
private static IReadOnlyDictionary<string, ManifestDocument> GitHubRunnerDeployments() =>
    Inventory.Documents
        .Where(doc => doc.Kind == "Deployment" && doc.Namespace == "github-runner")
        .ToDictionary(doc => doc.Name, StringComparer.Ordinal);
|
||||||
|
|
||||||
|
/// <summary>
/// Reads <c>spec.replicas</c> from the document as an integer.
/// </summary>
/// <returns>The parsed replica count, or 1 when the scalar is missing or not an integer.</returns>
private static int ReplicaCount(ManifestDocument document)
{
    if (int.TryParse(document.Scalar("spec", "replicas"), out var parsed))
    {
        return parsed;
    }

    return 1;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Looks up the container env entry called <paramref name="name"/> and returns its
/// <c>value</c> scalar.
/// </summary>
/// <returns>The <c>value</c> scalar, or <c>null</c> when no env entry has that name.</returns>
private static string? EnvValue(YamlMappingNode container, string name)
{
    var entry = EnvMapping(container, name);
    return entry is null ? null : ManifestNodeExtensions.Scalar(entry, "value");
}
|
||||||
|
|
||||||
|
/// <summary>
/// Looks up the container env entry called <paramref name="name"/> and returns the scalar at
/// <c>valueFrom.secretKeyRef.name</c>.
/// </summary>
/// <returns>The referenced Secret name, or <c>null</c> when no env entry has that name.</returns>
private static string? EnvSecretName(YamlMappingNode container, string name)
{
    var entry = EnvMapping(container, name);
    if (entry is null)
    {
        return null;
    }

    return ManifestNodeExtensions.Scalar(entry, "valueFrom", "secretKeyRef", "name");
}
|
||||||
|
|
||||||
|
/// <summary>
/// Looks up the container env entry called <paramref name="name"/> and returns the scalar at
/// <c>valueFrom.secretKeyRef.key</c>.
/// </summary>
/// <returns>The referenced Secret key, or <c>null</c> when no env entry has that name.</returns>
private static string? EnvSecretKey(YamlMappingNode container, string name)
{
    var entry = EnvMapping(container, name);
    if (entry is null)
    {
        return null;
    }

    return ManifestNodeExtensions.Scalar(entry, "valueFrom", "secretKeyRef", "key");
}
|
||||||
|
|
||||||
|
/// <summary>
/// Finds the entry in the container's <c>env</c> sequence whose <c>name</c> scalar equals
/// <paramref name="name"/> (ordinal comparison).
/// </summary>
/// <returns>The matching mapping, or <c>null</c> when none matches.</returns>
/// <exception cref="InvalidOperationException">More than one env entry has the requested name.</exception>
private static YamlMappingNode? EnvMapping(YamlMappingNode container, string name)
{
    var envEntries = ManifestNodeExtensions.MappingSequence(container, "env");

    return envEntries
        .Where(entry => string.Equals(ManifestNodeExtensions.Scalar(entry, "name"), name, StringComparison.Ordinal))
        .SingleOrDefault();
}
|
||||||
}
|
}
|
||||||
|
|
||||||
internal sealed class ManifestInventory
|
internal sealed class ManifestInventory
|
||||||
|
|||||||
@@ -174,10 +174,13 @@ public sealed class PiSignagePlayerArtifactTests
|
|||||||
public void HdmiRule_RestartsPlayerAndRunsCapabilityDetection()
|
public void HdmiRule_RestartsPlayerAndRunsCapabilityDetection()
|
||||||
{
|
{
|
||||||
var rule = Read("systemd/99-flowercore-signage-hdmi.rules");
|
var rule = Read("systemd/99-flowercore-signage-hdmi.rules");
|
||||||
|
var responder = Read("scripts/flowercore-signage-hdmi-respond.sh");
|
||||||
|
|
||||||
rule.Should().Contain("KERNEL==\"card?-HDMI-A-?\"");
|
rule.Should().Contain("KERNEL==\"card?-HDMI-A-?\"");
|
||||||
rule.Should().Contain("restart flowercore-signage-player-pi.service");
|
rule.Should().Contain("start flowercore-signage-player-pi-hdmi.service");
|
||||||
rule.Should().Contain("start flowercore-signage-detect-display.service");
|
responder.Should().Contain("sleep 2");
|
||||||
|
responder.Should().Contain("start flowercore-signage-detect-display.service");
|
||||||
|
responder.Should().Contain("restart flowercore-signage-player-pi.service");
|
||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
|
|||||||
Reference in New Issue
Block a user