Compare commits

..

1 Commits

Author SHA1 Message Date
Andrew Stoltz
9a4a8264d9 github-runner: add DM and WorldBuilder runners 2026-05-18 17:44:29 -05:00
12 changed files with 549 additions and 557 deletions

27
apps/brochure/README.md Normal file
View File

@@ -0,0 +1,27 @@
# FlowerCore Brochure
`apps/brochure` hosts the public brochure split from `FlowerCore.Intranet.Web`.
ArgoCD's `apps/*` ApplicationSet will create `infra-brochure` after this
directory lands on `main`.
## Runtime
- Host: `https://brochure.flowercore.io`
- Namespace: `brochure`
- Deployment: `brochure-web`
- Image: `localhost/fc-brochure-web:v20260524-sprint32`
- Port: `8080`
- Public route method allowlist: `GET` and `HEAD`
## Operator Actions
1. Publish and import `localhost/fc-brochure-web:v20260524-sprint32` to every
RKE2 node before sync, using the same podman save + `ctr images import`
flow as the Intranet deployment.
2. Create the Cloudflare DNS record for `brochure.flowercore.io` pointing at
the FlowerCore public edge.
3. Verify `infra-brochure` appears in ArgoCD, the certificate becomes Ready,
and `GET https://brochure.flowercore.io/` returns `200`.
The public surface intentionally does not expose `/ops/*` or `/admin/*`: the
Brochure web app itself returns `404` for those paths (the Traefik route has no
path filter), and Traefik only forwards `GET` and `HEAD` requests.

131
apps/brochure/brochure.yaml Normal file
View File

@@ -0,0 +1,131 @@
# FlowerCore Brochure public host
#
# Thin Blazor host for public What's New, walkthrough, and gallery content
# carved out of FlowerCore.Intranet.Web. The ApplicationSet creates
# infra-brochure from this directory after merge.
---
apiVersion: v1
kind: Namespace
metadata:
name: brochure
labels:
app.kubernetes.io/part-of: flowercore
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: brochure-web
namespace: brochure
labels:
app: brochure-web
app.kubernetes.io/name: brochure-web
app.kubernetes.io/part-of: flowercore
spec:
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app: brochure-web
template:
metadata:
labels:
app: brochure-web
app.kubernetes.io/name: brochure-web
app.kubernetes.io/part-of: flowercore
spec:
containers:
- name: brochure-web
image: localhost/fc-brochure-web:v20260524-sprint32
imagePullPolicy: Never
ports:
- containerPort: 8080
name: http
env:
- name: ASPNETCORE_ENVIRONMENT
value: Production
- name: ASPNETCORE_URLS
value: "http://+:8080"
resources:
requests:
cpu: "25m"
memory: "128Mi"
limits:
cpu: "500m"
memory: "512Mi"
readinessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 30
periodSeconds: 30
securityContext:
runAsNonRoot: true
runAsUser: 1654
runAsGroup: 1654
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop:
- ALL
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
name: brochure-web
namespace: brochure
labels:
app: brochure-web
app.kubernetes.io/name: brochure-web
app.kubernetes.io/part-of: flowercore
spec:
type: ClusterIP
selector:
app: brochure-web
ports:
- name: http
port: 8080
targetPort: http
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: brochure-web-tls
namespace: brochure
spec:
secretName: brochure-web-tls
issuerRef:
name: step-ca-acme
kind: ClusterIssuer
dnsNames:
- brochure.flowercore.io
duration: 720h
renewBefore: 240h
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: brochure-web-public
namespace: brochure
spec:
entryPoints:
- websecure
routes:
- match: Host(`brochure.flowercore.io`) && (Method(`GET`) || Method(`HEAD`))
kind: Rule
services:
- name: brochure-web
port: 8080
tls:
secretName: brochure-web-tls

View File

@@ -1,263 +0,0 @@
# fc-build-windows runner gate
Status: OPEN-WITH-OPERATOR-ACTION as of 2026-05-20.
This directory is intentionally not a live runner deployment. It records the
exact gate for bringing up the Windows self-hosted runner fleet without faking
capacity in GitHub or Kubernetes.
## Lane evidence
- `D:\git\FlowerCore\FlowerCore.Notes\docs\dashboards\decisions-waiting.html`
lines 15078-15085: Q-MR-82 says the Updater Windows Sandbox E2E run is
queued and `bluejay-ws-sandbox-1` is offline.
- `D:\git\FlowerCore\FlowerCore.Notes\memory\project_morning_routine_8_2026_05_20.md`:
Morning Routine #8 carries Q-MR-82 as the fleet-wide Windows runner gap.
- `D:\git\FlowerCore\FlowerCore.Notes\docs\standards\sprint-37-codex-dispatch-log-2026-05-19.md`
lines 76, 84-85, and 97: keep BLUEJAY-WS out of runner plans, merge Linux
runner expansion separately, and keep true Windows-only workflows parked on
the Windows runner host substrate path.
- `D:\git\FlowerCore\FlowerCore.Notes\docs\ai-agents\codex-prompts\2026-05-20-xxxxl-sprint-42-orchestrator-briefs.md`
lane Cx-5: land a deployment only if a Windows runner image/substrate is
ready; otherwise commit an operator-action gate.
- `D:\git\FlowerCore\FlowerCore.Notes\memory\feedback_bluejay_ws_never_a_github_runner.md`:
BLUEJAY-WS is operator-only territory; Windows runners belong on a dedicated
KubeVirt Windows VM such as `ci1` or a sibling VM.
## Live probe summary
Commands run on 2026-05-20 from `D:\git\FlowerCore\bluejay-infra`:
```powershell
$env:KUBECONFIG="$env:USERPROFILE\.kube\rke2.yaml"
kubectl get nodes -o jsonpath="{range .items[*]}{.metadata.name}{'\t'}{.metadata.labels.kubernetes\.io/os}{'\n'}{end}"
```
Result: `rke2-agent1`, `rke2-agent2`, and `rke2-server` all report
`kubernetes.io/os=linux`. There is no Windows Kubernetes node, so Windows
containers on RKE2 cannot satisfy `fc-build-windows`.
```powershell
kubectl -n kubevirt-vms get vm,vmi,pods -o wide
```
Result: KubeVirt is healthy and `ci1` is `Running` / `Ready=True` on
`rke2-agent1` with VMI IP `10.42.103.35`.
```powershell
virtctl --kubeconfig $env:USERPROFILE\.kube\rke2.yaml port-forward vm/ci1.kubevirt-vms 15985:5985
```
Result during port tests: `dial tcp 10.42.103.35:5985: connect: no route to
host`. The same result was seen for RDP 3389 and SSH 22. The VM exists, but it
is not remotely reachable for runner bootstrap from this lane.
```powershell
gh api /repos/astoltz/FlowerCore.Updater/actions/runners `
--jq '.runners[]? | {name,status,busy,labels:[.labels[].name]}'
gh run list --repo astoltz/FlowerCore.Updater `
--workflow "Updater Windows Sandbox E2E" --limit 5
```
Result: GitHub has one Updater runner, `bluejay-ws-sandbox-1`, with
`status=offline`; run `26150689447` is still `queued`.
## Feasibility classification
### Option A: Windows containers on RKE2
Not feasible without operator-physical infrastructure work. Kubernetes Windows
containers require a Windows node. The current cluster has Linux-only RKE2
nodes.
### Option B: KubeVirt Windows VM
Partially present, not deployable from this lane.
`apps/kubevirt-vms/ci1.yaml` already defines a Windows Server 2025 KubeVirt VM
using `localhost/fc-win-server-2025:v1`, and the live VM is running. However:
- the guest is not reachable over RDP, WinRM, or SSH through `virtctl
port-forward`;
- the current root disk is a `containerDisk`, so runner installation inside the
running guest is not a durable fleet state unless the first-boot automation
re-registers on every boot or the VM is moved to a persistent PVC-backed
disk;
- FC.Updater `Updater Windows Sandbox E2E` uses
`[self-hosted, windows, windows-sandbox]`, while `fc-build-windows` build jobs
use `[self-hosted, windows, fc-build-windows]`. Do not advertise
`windows-sandbox` until Windows Sandbox has been proven in the guest.
### Option C: bluejay-ws-sandbox-1
Operator-only emergency fallback. GitHub shows it registered but offline. The
current memory says BLUEJAY-WS must not be a fleet runner host, so this lane
does not start or re-register it. If the operator deliberately overrides the
policy to drain an emergency queue, start the existing visible runner console
from the BLUEJAY-WS desktop and treat that as temporary break-glass, not the
permanent Q-MR-82 closure.
## Operator action plan
### 1. Pick the Windows host class
Use `ci1` or a sibling Windows Server 2025 VM for WPF build/test jobs that need
`fc-build-windows`.
Use a Windows 11 Pro/Enterprise KubeVirt VM for Updater or WorldBuilder
Windows Sandbox gates, unless Windows Sandbox support is explicitly proven on
the selected guest. The workflow labels must match the real capability:
- WPF build runner: `self-hosted,windows,fc-build-windows,ci1`
- Sandbox runner: `self-hosted,windows,windows-sandbox,ci-sandbox1`
### 2. Make the VM reachable and durable
From BLUEJAY-WS:
```powershell
$env:KUBECONFIG="$env:USERPROFILE\.kube\rke2.yaml"
kubectl -n kubevirt-vms get vm,vmi,pods -o wide
virtctl --kubeconfig $env:KUBECONFIG vnc ci1 -n kubevirt-vms
virtctl --kubeconfig $env:KUBECONFIG port-forward vm/ci1.kubevirt-vms 13389:3389
virtctl --kubeconfig $env:KUBECONFIG port-forward vm/ci1.kubevirt-vms 15985:5985
```
Before runner registration, fix the current port-forward failure. The expected
state is that RDP or WinRM accepts a connection through the control plane.
For durability, either:
- move the runner VM to a persistent PVC-backed root disk; or
- keep `containerDisk` and bake first-boot runner registration into the sysprep
flow using a non-expiring credential lookup path.
Do not install a runner by hand into a transient VM and call Q-MR-82 closed.
### 3. Install runner prerequisites inside the VM
Run in an elevated PowerShell session in the Windows runner guest:
```powershell
winget install Microsoft.DotNet.SDK.10 --silent
winget install Microsoft.DotNet.DesktopRuntime.8 --silent
winget install Microsoft.PowerShell --silent
winget install Git.Git --silent
winget install Microsoft.VisualStudio.2022.BuildTools --silent
winget install Google.Chrome --silent
```
For a Sandbox-capable runner only:
```powershell
Enable-WindowsOptionalFeature -Online -FeatureName Containers-DisposableClientVM -All
Restart-Computer -Force
```
After reboot:
```powershell
Get-CimInstance -ClassName Win32_OptionalFeature -Filter "Name='Containers-DisposableClientVM'"
Test-Path C:\Windows\System32\WindowsSandbox.exe
```
### 4. Register repo-scoped GitHub runners
The `astoltz` account uses repo-scoped runners. Generate a fresh one-hour
registration token per repo immediately before `config.cmd`.
From a trusted operator shell with `gh` authenticated:
```powershell
$repos = @(
"FlowerCore.Updater",
"FlowerCore.WorldBuilder",
"FlowerCore.DeviceManagement"
)
foreach ($repo in $repos) {
$token = gh api -X POST "/repos/astoltz/$repo/actions/runners/registration-token" --jq .token
$repoSlug = $repo.ToLowerInvariant().Replace("flowercore.", "").Replace(".", "-")
$runnerDir = "C:\fc-ghr\$repoSlug-fc-build-windows"
New-Item -ItemType Directory -Force -Path $runnerDir | Out-Null
Set-Location $runnerDir
if (-not (Test-Path ".\config.cmd")) {
Invoke-WebRequest `
-Uri "https://github.com/actions/runner/releases/download/v2.323.0/actions-runner-win-x64-2.323.0.zip" `
-OutFile "actions-runner.zip"
Add-Type -AssemblyName System.IO.Compression.FileSystem
[System.IO.Compression.ZipFile]::ExtractToDirectory((Resolve-Path actions-runner.zip), $runnerDir)
}
.\config.cmd `
--url "https://github.com/astoltz/$repo" `
--token $token `
--name "ci1-$repoSlug-fc-build-windows" `
--labels "self-hosted,windows,fc-build-windows,ci1" `
--work "_work" `
--unattended `
--replace
.\svc.ps1 install
.\svc.ps1 start
}
```
For Updater Sandbox E2E, register only after the guest proves Sandbox support,
and use `windows-sandbox` labels:
```powershell
$token = gh api -X POST "/repos/astoltz/FlowerCore.Updater/actions/runners/registration-token" --jq .token
.\config.cmd `
--url "https://github.com/astoltz/FlowerCore.Updater" `
--token $token `
--name "ci-sandbox1-updater" `
--labels "self-hosted,windows,windows-sandbox,ci-sandbox1" `
--work "_work" `
--unattended `
--replace
```
Keep registration tokens out of Git and logs. The durable credential source for
automation should be the existing 1Password item named `GitHub PAT (Runner
Registration)`, used only to mint short-lived repo registration tokens.
### 5. Verify GitHub and workflow pickup
```powershell
gh api /repos/astoltz/FlowerCore.Updater/actions/runners `
--jq '.runners[] | select(.labels[].name == "windows-sandbox") | {name,status,busy,labels:[.labels[].name]}'
gh api /repos/astoltz/FlowerCore.DeviceManagement/actions/runners `
--jq '.runners[] | select(.labels[].name == "fc-build-windows") | {name,status,busy,labels:[.labels[].name]}'
gh run list --repo astoltz/FlowerCore.Updater `
--workflow "Updater Windows Sandbox E2E" --limit 3
```
Q-MR-82 can be marked resolved only after the Updater run moves from `queued` to
`in_progress` or `completed` on an online runner, or after the affected WPF
build repos show online `fc-build-windows` repo-scoped runners and their queued
jobs start.
## Break-glass BLUEJAY-WS command
Only if the operator explicitly overrides the "BLUEJAY-WS is not a runner"
policy to drain a queue:
```powershell
Set-Location C:\fc-ghr\updater-sandbox
.\run.cmd
```
If a Windows service exists:
```powershell
Get-Service 'actions.runner.*'
Start-Service 'actions.runner.*'
```
This does not close Q-MR-82 permanently. It is a temporary queue drain until a
dedicated VM runner is online.

View File

@@ -1,4 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- operator-gate-configmap.yaml

View File

@@ -1,61 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: fc-build-windows-operator-gate
namespace: kubevirt-vms
labels:
app.kubernetes.io/name: fc-build-windows
app.kubernetes.io/component: operator-gate
app.kubernetes.io/part-of: github-runner
flowercore.io/q-card: Q-MR-82
annotations:
flowercore.io/outcome: OPEN-WITH-OPERATOR-ACTION
flowercore.io/live-runner: "false"
data:
outcome: OPEN-WITH-OPERATOR-ACTION
gate.md: |
Do not treat this ConfigMap as runner capacity.
Current probe, 2026-05-20:
- RKE2 nodes are linux-only; Windows containers require a Windows node.
- KubeVirt `ci1` is Running/Ready, but RDP 3389, WinRM 5985, and SSH 22
through `virtctl port-forward` return `connect: no route to host`.
- GitHub Updater runner list has only `bluejay-ws-sandbox-1`, status
offline. Updater Windows Sandbox E2E run 26150689447 remains queued.
Required operator action:
1. Make a dedicated Windows VM reachable and durable.
2. Install .NET 10 SDK, .NET 8 Desktop Runtime, Git, VS Build Tools, and
PowerShell 7.
3. Register repo-scoped runners with short-lived GitHub registration tokens.
4. Add `fc-build-windows` labels only to WPF build-capable guests.
5. Add `windows-sandbox` labels only after Sandbox support is proven.
registration-token-pattern.ps1: |
$repo = "FlowerCore.Updater"
$token = gh api -X POST "/repos/astoltz/$repo/actions/runners/registration-token" --jq .token
$runnerDir = "C:\fc-ghr\updater-fc-build-windows"
New-Item -ItemType Directory -Force -Path $runnerDir | Out-Null
Set-Location $runnerDir
# Install the Actions runner package here if config.cmd is absent.
.\config.cmd `
--url "https://github.com/astoltz/$repo" `
--token $token `
--name "ci1-updater-fc-build-windows" `
--labels "self-hosted,windows,fc-build-windows,ci1" `
--work "_work" `
--unattended `
--replace
.\svc.ps1 install
.\svc.ps1 start
verification.ps1: |
gh api /repos/astoltz/FlowerCore.Updater/actions/runners `
--jq '.runners[] | {name,status,busy,labels:[.labels[].name]}'
gh run list --repo astoltz/FlowerCore.Updater `
--workflow "Updater Windows Sandbox E2E" --limit 3
$env:KUBECONFIG="$env:USERPROFILE\.kube\rke2.yaml"
kubectl -n kubevirt-vms get vm,vmi,pods -o wide

View File

@@ -0,0 +1,33 @@
# Explicit ArgoCD Application shape for bootstrap/review.
#
# The live bluejay-infra ApplicationSet already discovers apps/* directories
# and creates this same Application name (`infra-fc-devicemgmt`) automatically.
# Keep repoURL on the internal Gitea ClusterIP URL; ArgoCD does not trust the
# external step-ca HTTPS endpoint.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: infra-fc-devicemgmt
namespace: argocd
labels:
app.kubernetes.io/name: fc-devicemgmt
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
project: default
source:
repoURL: http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git
targetRevision: main
path: apps/fc-devicemgmt
destination:
server: https://kubernetes.default.svc
namespace: fc-devicemgmt
syncPolicy:
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=true
- ServerSideApply=true

View File

@@ -47,7 +47,7 @@ spec:
fsGroupChangePolicy: OnRootMismatch fsGroupChangePolicy: OnRootMismatch
containers: containers:
- name: operator - name: operator
image: localhost/fc-devicemgmt-operator:v20260519-sp34cl3-fix image: localhost/fc-devicemgmt-operator:v20260512-cx5
imagePullPolicy: Never imagePullPolicy: Never
ports: ports:
- name: metrics - name: metrics

View File

@@ -4,22 +4,6 @@
# Sprint 9+ lane. This manifest is static-valid without requiring the image to # Sprint 9+ lane. This manifest is static-valid without requiring the image to
# exist yet; import localhost/fc-devicemgmt-web:<tag> to all schedulable RKE2 # exist yet; import localhost/fc-devicemgmt-web:<tag> to all schedulable RKE2
# nodes before letting ArgoCD sync a live rollout. # nodes before letting ArgoCD sync a live rollout.
#
# SCALED TO 0 — 2026-05-19 morning-routine cleanup.
# The Web pod cannot start until TWO upstream gaps close:
# 1. MySQL DB instance `flowercore_devicemgmt` (user `fc_devicemgmt`) is
# provisioned via fc-mysql Manager. The cluster currently has ZERO
# MySqlInstanceCrds and no `mysql.fc-mysql.svc:3306` Service, so the
# deployment-web container env `FlowerCore__Database__Host=mysql.fc-mysql.svc`
# points at nothing. Provision via the fc-mysql Manager UI/REST/MCP.
# 2. 1Password vault item `IAmWorkin/FlowerCore DeviceManagement Runtime`
# with 5 fields (DB-Password, mtls-ca.pem, mtls-client.crt, mtls-client.key,
# mtls-chain.pem) — see apps/fc-devicemgmt/1password-item.yaml. Mint mTLS
# from step-ca-agent ClusterIssuer per ADR-126; DB-Password must match the
# password configured for the MySQL user.
# Re-enable: change replicas back to 2 after both gaps close. The image tag
# in this file (v20260512-cx5) MAY also need a refresh — it predates the
# Sprint 34 Cl-3 operator fix; Web may have an analogous bug.
apiVersion: apps/v1 apiVersion: apps/v1
kind: Deployment kind: Deployment
metadata: metadata:
@@ -36,7 +20,7 @@ metadata:
annotations: annotations:
flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
spec: spec:
replicas: 0 replicas: 2
revisionHistoryLimit: 3 revisionHistoryLimit: 3
selector: selector:
matchLabels: matchLabels:

View File

@@ -28,6 +28,10 @@ Sprint 32 final long-tail wave adds 16 two-replica Deployments:
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and `FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
`FlowerCore.MenuBoard`. `FlowerCore.MenuBoard`.
Sprint 37 Cx-2 closes the audited Linux runner gaps for
`FlowerCore.DeviceManagement` and `FlowerCore.WorldBuilder` with the same
two-replica `emptyDir` pattern.
## Post-Merge Proof ## Post-Merge Proof
After the PR is merged and ArgoCD syncs, verify the runner fleet: After the PR is merged and ArgoCD syncs, verify the runner fleet:
@@ -47,7 +51,7 @@ for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore
FlowerCore.Distribution FlowerCore.Scoreboard FlowerCore.SegmentDisplay \ FlowerCore.Distribution FlowerCore.Scoreboard FlowerCore.SegmentDisplay \
FlowerCore.Signage.Contracts FlowerCore.SignalControl FlowerCore.Intranet.Web \ FlowerCore.Signage.Contracts FlowerCore.SignalControl FlowerCore.Intranet.Web \
FlowerCore.Provisioning FlowerCore.Redis FlowerCore.MessageBoard \ FlowerCore.Provisioning FlowerCore.Redis FlowerCore.MessageBoard \
FlowerCore.MenuBoard; do FlowerCore.MenuBoard FlowerCore.DeviceManagement FlowerCore.WorldBuilder; do
echo "=== $repo ===" echo "=== $repo ==="
gh api "/repos/astoltz/$repo/actions/runners" \ gh api "/repos/astoltz/$repo/actions/runners" \
--jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}' --jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}'
@@ -64,6 +68,20 @@ gh run list --repo astoltz/FlowerCore.Shared.Pos \
If the latest run is still queued after runner registration, rerun the workflow If the latest run is still queued after runner registration, rerun the workflow
from GitHub Actions and verify it lands on an `rke2-linux-*` runner. from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
## Sprint 37 Cx-2 Gap Audit
The 2026-05-18 GitHub workflow scan found these remaining repos with
`runs-on: [self-hosted, linux, fc-build-linux]` but no K8s runner Deployment:
`FlowerCore.AiStation.Linux`, `FlowerCore.PHP`, `FlowerCore.PiManager`,
`FlowerCore.Shared.Barcodes`, `FlowerCore.Shared.Lookup`,
`FlowerCore.Shared.Nodes`, `FlowerCore.Shared.PrintClient`,
`FlowerCore.Shared.Relay`, `FlowerCore.Shared.ShowRunner`, and
`FlowerCore.Shared.Storage`.
Mixed/platform repos also have Linux workflow legs but need owner review before
adding Linux runner Deployments: `FlowerCore.Library.Mac`,
`FlowerCore.Signage.Agent.AppleTv`, and `FlowerCore.Signage.Player.Wpf`.
## Failure Notes ## Failure Notes
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that - `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that

View File

@@ -16,6 +16,8 @@
# DNS, Distribution, Scoreboard, SegmentDisplay, Signage.Contracts, # DNS, Distribution, Scoreboard, SegmentDisplay, Signage.Contracts,
# SignalControl, Intranet.Web, Provisioning, Redis, MessageBoard, MenuBoard # SignalControl, Intranet.Web, Provisioning, Redis, MessageBoard, MenuBoard
# (Sprint 32 final long-tail wave; two replicas each, emptyDir cache) # (Sprint 32 final long-tail wave; two replicas each, emptyDir cache)
# FlowerCore.DeviceManagement, WorldBuilder (Sprint 37 Cx-2 runner gap
# closure; two replicas each, emptyDir cache)
# #
# Non-root CI safety: # Non-root CI safety:
# Runner pods run as uid 1001. HOME, DOTNET_INSTALL_DIR, DOTNET_CLI_HOME, # Runner pods run as uid 1001. HOME, DOTNET_INSTALL_DIR, DOTNET_CLI_HOME,
@@ -3767,9 +3769,271 @@ spec:
- name: tmp - name: tmp
emptyDir: {} emptyDir: {}
restartPolicy: Always restartPolicy: Always
---
# Runner for FlowerCore.DeviceManagement. Added 2026-05-18 (Sprint 37 Cx-2)
# to close the Linux CI capacity gap for the DM service-tier workflows. Mirrors
# the Sprint 32 long-tail emptyDir pattern: two replicas, shared
# 1Password-backed ACCESS_TOKEN, and the common ServiceAccount.
apiVersion: apps/v1
kind: Deployment
metadata:
name: github-runner-devicemgmt
namespace: github-runner
labels:
app.kubernetes.io/name: github-runner-devicemgmt
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/created-by: argocd
flowercore.io/runner-repo: devicemgmt
flowercore.io/github-repo: FlowerCore.DeviceManagement
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/name: github-runner-devicemgmt
strategy:
type: Recreate
template:
metadata:
labels:
app.kubernetes.io/name: github-runner-devicemgmt
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
flowercore.io/created-by: argocd
flowercore.io/runner-repo: devicemgmt
flowercore.io/github-repo: FlowerCore.DeviceManagement
spec:
serviceAccountName: github-runner
securityContext:
runAsNonRoot: true
runAsUser: 1001
runAsGroup: 1001
fsGroup: 1001
initContainers:
- name: setup-runner-home
image: busybox:1.36
command:
- sh
- -c
- |
set -e
mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet
chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget
chmod -R 755 /home/runner/.dotnet /home/runner/.nuget
securityContext:
runAsUser: 0
runAsNonRoot: false
volumeMounts:
- name: runner-home
mountPath: /home/runner
containers:
- name: runner
image: myoung34/github-runner:latest
imagePullPolicy: Always
env:
- name: REPO_URL
value: "https://github.com/astoltz/FlowerCore.DeviceManagement"
- name: RUNNER_NAME_PREFIX
value: "rke2-linux-devicemgmt"
- name: RUNNER_WORKDIR
value: "/tmp/runner/work"
- name: EPHEMERAL
value: "true"
- name: LABELS
value: "self-hosted,linux,fc-build-linux"
- name: HOME
value: "/home/runner"
- name: DOTNET_INSTALL_DIR
value: "/home/runner/.dotnet"
- name: DOTNET_CLI_TELEMETRY_OPTOUT
value: "1"
- name: DOTNET_NOLOGO
value: "1"
- name: DOTNET_GENERATE_ASPNET_CERTIFICATE
value: "false"
- name: DOTNET_CLI_HOME
value: "/home/runner"
- name: NUGET_PACKAGES
value: "/home/runner/.nuget/packages"
- name: XDG_CACHE_HOME
value: "/home/runner/.cache"
- name: RUNNER_TOOL_CACHE
value: "/home/runner/_tool"
- name: ACCESS_TOKEN
valueFrom:
secretKeyRef:
name: github-runner-token
key: credential
- name: RUN_AS_ROOT
value: "false"
resources:
requests:
cpu: "500m"
memory: "1Gi"
limits:
cpu: "2000m"
memory: "4Gi"
volumeMounts:
- name: runner-home
mountPath: /home/runner
- name: nuget-cache
mountPath: /home/runner/.nuget/packages
- name: tmp
mountPath: /tmp
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "pgrep -f Runner.Listener > /dev/null"
initialDelaySeconds: 30
periodSeconds: 30
failureThreshold: 3
volumes:
- name: runner-home
emptyDir: {}
- name: nuget-cache
emptyDir:
sizeLimit: 2Gi
- name: tmp
emptyDir: {}
restartPolicy: Always
---
# Runner for FlowerCore.WorldBuilder. Added 2026-05-18 (Sprint 37 Cx-2)
# to unblock WorldBuilder Linux CI jobs after the runner fleet audit found no
# repo-scoped deployment for the GitHub repo. Mirrors the Sprint 32 long-tail
# emptyDir pattern: two replicas, shared 1Password-backed ACCESS_TOKEN, and
# the common ServiceAccount.
apiVersion: apps/v1
kind: Deployment
metadata:
name: github-runner-worldbuilder
namespace: github-runner
labels:
app.kubernetes.io/name: github-runner-worldbuilder
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/created-by: argocd
flowercore.io/runner-repo: worldbuilder
flowercore.io/github-repo: FlowerCore.WorldBuilder
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/name: github-runner-worldbuilder
strategy:
type: Recreate
template:
metadata:
labels:
app.kubernetes.io/name: github-runner-worldbuilder
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
flowercore.io/created-by: argocd
flowercore.io/runner-repo: worldbuilder
flowercore.io/github-repo: FlowerCore.WorldBuilder
spec:
serviceAccountName: github-runner
securityContext:
runAsNonRoot: true
runAsUser: 1001
runAsGroup: 1001
fsGroup: 1001
initContainers:
- name: setup-runner-home
image: busybox:1.36
command:
- sh
- -c
- |
set -e
mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet
chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget
chmod -R 755 /home/runner/.dotnet /home/runner/.nuget
securityContext:
runAsUser: 0
runAsNonRoot: false
volumeMounts:
- name: runner-home
mountPath: /home/runner
containers:
- name: runner
image: myoung34/github-runner:latest
imagePullPolicy: Always
env:
- name: REPO_URL
value: "https://github.com/astoltz/FlowerCore.WorldBuilder"
- name: RUNNER_NAME_PREFIX
value: "rke2-linux-worldbuilder"
- name: RUNNER_WORKDIR
value: "/tmp/runner/work"
- name: EPHEMERAL
value: "true"
- name: LABELS
value: "self-hosted,linux,fc-build-linux"
- name: HOME
value: "/home/runner"
- name: DOTNET_INSTALL_DIR
value: "/home/runner/.dotnet"
- name: DOTNET_CLI_TELEMETRY_OPTOUT
value: "1"
- name: DOTNET_NOLOGO
value: "1"
- name: DOTNET_GENERATE_ASPNET_CERTIFICATE
value: "false"
- name: DOTNET_CLI_HOME
value: "/home/runner"
- name: NUGET_PACKAGES
value: "/home/runner/.nuget/packages"
- name: XDG_CACHE_HOME
value: "/home/runner/.cache"
- name: RUNNER_TOOL_CACHE
value: "/home/runner/_tool"
- name: ACCESS_TOKEN
valueFrom:
secretKeyRef:
name: github-runner-token
key: credential
- name: RUN_AS_ROOT
value: "false"
resources:
requests:
cpu: "500m"
memory: "1Gi"
limits:
cpu: "2000m"
memory: "4Gi"
volumeMounts:
- name: runner-home
mountPath: /home/runner
- name: nuget-cache
mountPath: /home/runner/.nuget/packages
- name: tmp
mountPath: /tmp
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "pgrep -f Runner.Listener > /dev/null"
initialDelaySeconds: 30
periodSeconds: 30
failureThreshold: 3
volumes:
- name: runner-home
emptyDir: {}
- name: nuget-cache
emptyDir:
sizeLimit: 2Gi
- name: tmp
emptyDir: {}
restartPolicy: Always
# Long-tail runner pattern: # Long-tail runner pattern:
# #
# Sprint 32 added the final 16 long-tail repo-scoped Deployments above. Keep # Sprint 32 added the final 16 long-tail repo-scoped Deployments, and Sprint 37
# Common as the only PVC-backed runner at replicas: 1. Any future multi-replica # added the DM + WorldBuilder runner gap closures above. Keep Common as the
# runner must use per-pod emptyDir caches, not a shared ReadWriteOnce PVC. # only PVC-backed runner at replicas: 1. Any future multi-replica runner must
# use per-pod emptyDir caches, not a shared ReadWriteOnce PVC.

View File

@@ -729,7 +729,7 @@ data:
expr: | expr: |
kube_deployment_status_replicas_ready{ kube_deployment_status_replicas_ready{
namespace="github-runner", namespace="github-runner",
deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))" deployment=~"github-runner(|-.+)"
} == 0 } == 0
for: 5m for: 5m
labels: labels:
@@ -1273,55 +1273,24 @@ metadata:
data: data:
notify.py: | notify.py: |
#!/usr/bin/env python3 #!/usr/bin/env python3
"""HTTP->IRC alert relay with thermal-printer DIGEST forwarding. """HTTP->IRC alert relay with thermal printer forwarding for Grafana webhooks.
Listens on :9119, posts to #alerts on UnrealIRCd via raw IRC protocol.
Listens on :9119, posts to #alerts on UnrealIRCd, forwards to Print.Web Alerts tagged alert_channel=thermal_print also POST to Print.Web /api/print/alert.
/api/print/alert. Thermal printing is BATCHED into hourly digests by
default so the printer no longer spam-fires per Grafana webhook.
Routing (per Grafana webhook alert):
- IRC: always per-event (operator likes the stream)
- Thermal printer:
* severity in {critical,disaster,page} OR
label alert_channel=thermal_print_immediate -> print NOW
* label alert_channel=thermal_print -> enqueue into hourly digest
* everything else -> IRC only
- RESOLVED webhooks remove the alert from the digest buffer
Env vars (defaults preserve old behavior on first deploy):
THERMAL_PRINT_ENABLED default "true" - master kill switch
BATCH_INTERVAL_MIN default "60" - minutes between digest prints
BATCH_MAX_PENDING default "50" - force-flush threshold
HTTP surface:
POST / - Grafana webhook entry
POST /flush - manual digest flush (idempotent)
GET / - status + config + buffer depth + stats
""" """
import json, os, socket, sys, threading, time import json, socket, sys, time
from collections import defaultdict
from datetime import datetime, timezone
from http.server import HTTPServer, BaseHTTPRequestHandler from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
from urllib.error import URLError
THERMAL_PRINT_ENABLED = os.environ.get("THERMAL_PRINT_ENABLED", "true").lower() == "true" IRC_HOST = "unrealircd.irc.svc" # short name: CoreDNS ndots:5 + iamworkin.lan template hijacks full .cluster.local (see memory)
BATCH_INTERVAL_MIN = int(os.environ.get("BATCH_INTERVAL_MIN", "60")) IRC_PORT = 6667
BATCH_MAX_PENDING = int(os.environ.get("BATCH_MAX_PENDING", "50")) IRC_NICK = "grafana-bot"
IRC_CHANNEL = "#alerts"
IRC_HOST = os.environ.get("IRC_HOST", "unrealircd.irc.svc") PRINT_WEB_URL = "http://10.0.57.16:5200/api/print/alert"
IRC_PORT = int(os.environ.get("IRC_PORT", "6667")) PRINT_ENABLED = True
IRC_NICK = os.environ.get("IRC_NICK", "grafana-bot")
IRC_CHANNEL = os.environ.get("IRC_CHANNEL", "#alerts")
PRINT_WEB_URL = os.environ.get("PRINT_WEB_URL", "http://10.0.57.16:5200/api/print/alert")
_buffer_lock = threading.Lock()
_buffer = {} # fingerprint -> {"alert": dict, "first_seen": float, "last_seen": float}
_last_flush_time = time.time()
_stats = {"webhooks_received": 0, "irc_sent": 0, "print_immediate": 0,
"digest_flushed": 0, "buffer_dedup": 0, "buffer_added": 0,
"buffer_resolved": 0, "started_at": time.time()}
def send_irc(message): def send_irc(message):
"""Connect, handle PING, join, send, quit."""
try: try:
sock = socket.create_connection((IRC_HOST, IRC_PORT), timeout=15) sock = socket.create_connection((IRC_HOST, IRC_PORT), timeout=15)
sock.sendall(f"NICK {IRC_NICK}\r\n".encode()) sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
@@ -1354,137 +1323,52 @@ data:
time.sleep(0.5) time.sleep(0.5)
sock.sendall(b"QUIT :alert delivered\r\n") sock.sendall(b"QUIT :alert delivered\r\n")
sock.close() sock.close()
_stats["irc_sent"] += 1
return True return True
except Exception as e: except Exception as e:
print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr) print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
return False return False
def post_thermal(payload, kind): def send_thermal_print(alert):
if not THERMAL_PRINT_ENABLED: if not PRINT_ENABLED: return
print(f"[irc-notify] thermal disabled; skip {kind} ({payload.get('title','?')[:40]})", file=sys.stderr) labels = alert.get("labels", {})
return False annotations = alert.get("annotations", {})
status = alert.get("status", "firing").upper()
summary = annotations.get("summary", "")
description = annotations.get("description", "")
runbook = annotations.get("runbook", "")
# Build a useful message: summary + description + runbook steps
parts = []
if summary: parts.append(summary)
if description and description != summary: parts.append(description)
if runbook: parts.append("STEPS: " + runbook)
message = " | ".join(parts) if parts else labels.get("alertname", "Unknown alert")
payload = {
"title": labels.get("alertname", "Unknown"),
"severity": labels.get("severity", "warning").capitalize(),
"host": labels.get("instance", labels.get("host", "unknown")),
"message": message,
"eventId": alert.get("fingerprint", ""),
"source": "Grafana",
"status": "RESOLVED" if status == "RESOLVED" else "PROBLEM",
"acknowledged": False
}
try: try:
req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"), req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
headers={"Content-Type": "application/json"}, method="POST") headers={"Content-Type": "application/json"}, method="POST")
resp = urlopen(req, timeout=10) resp = urlopen(req, timeout=10)
if kind == "immediate": _stats["print_immediate"] += 1 print(f"[irc-notify] Thermal print sent: {resp.read().decode()}", file=sys.stderr)
print(f"[irc-notify] thermal {kind} sent: {payload.get('title','?')[:50]}", file=sys.stderr)
return True
except Exception as e: except Exception as e:
print(f"[irc-notify] thermal {kind} failed: {e}", file=sys.stderr) print(f"[irc-notify] Thermal print failed: {e}", file=sys.stderr)
return False
def fingerprint_of(alert): def should_print(alert):
fp = alert.get("fingerprint", "")
if fp: return fp
labels = alert.get("labels", {}) labels = alert.get("labels", {})
target = labels.get("pod") or labels.get("instance") or labels.get("deployment") or labels.get("statefulset") or labels.get("namespace") or "" if labels.get("alert_channel") == "thermal_print": return True
return f"{labels.get('alertname','?')}/{labels.get('namespace','')}/{target}" if labels.get("severity", "").lower() in ("critical", "disaster"): return True
if alert.get("status", "").upper() == "RESOLVED": return False
def is_critical(alert): return False
return alert.get("labels", {}).get("severity", "").lower() in ("critical", "disaster", "page")
def is_immediate_label(alert):
    """Return True when the alert is labelled ``alert_channel=thermal_print_immediate``.

    ``alert`` is one alert dict from the webhook body. A missing ``labels``
    key is treated as an empty mapping, so such alerts return False.
    """
    return alert.get("labels", {}).get("alert_channel") == "thermal_print_immediate"
def is_batched_label(alert):
    """Return True when the alert is labelled ``alert_channel=thermal_print``.

    ``alert`` is one alert dict from the webhook body. A missing ``labels``
    key is treated as an empty mapping, so such alerts return False.
    """
    return alert.get("labels", {}).get("alert_channel") == "thermal_print"
def add_to_digest(alert):
    """Record ``alert`` in the digest buffer, keyed by its fingerprint.

    Returns True only if the buffer GREW (the fingerprint was not buffered
    before). Returns False when thermal printing is disabled, when the alert
    is a resolution (any buffered entry for it is removed), or when the
    fingerprint was already buffered (the entry is refreshed in place).
    """
    if not THERMAL_PRINT_ENABLED:
        return False
    fp = fingerprint_of(alert)
    resolved = alert.get("status", "firing").lower() == "resolved"
    # One timestamp per call so a new entry has first_seen == last_seen.
    now = time.time()
    with _buffer_lock:
        entry = _buffer.get(fp)
        if resolved:
            if entry is not None:
                del _buffer[fp]
                _stats["buffer_resolved"] += 1
            return False
        if entry is not None:
            # Repeat webhook for a fingerprint already queued: keep the
            # newest alert body and bump last_seen, but do not grow.
            entry["last_seen"] = now
            entry["alert"] = alert
            _stats["buffer_dedup"] += 1
            return False
        _buffer[fp] = {"alert": alert, "first_seen": now, "last_seen": now}
        _stats["buffer_added"] += 1
        return True
def _render_digest(items):
    """Format buffered entries into a Print.Web payload dict.

    ``items`` is a list of buffer entries (each holding an ``"alert"`` dict).
    Returns None when ``items`` is empty. Alerts are grouped by ``alertname``;
    each group lists at most five targets followed by a ``(+N)`` overflow count.
    """
    if not items:
        return None
    by_name = defaultdict(list)
    for item in items:
        labels = item["alert"].get("labels", {})
        by_name[labels.get("alertname", "Unknown")].append(item)
    lines = []
    for name, group in sorted(by_name.items()):
        targets = []
        for it in group[:5]:
            labels = it["alert"].get("labels", {})
            t = (labels.get("pod") or labels.get("instance") or labels.get("deployment")
                 or labels.get("statefulset") or labels.get("namespace") or "?")
            targets.append(t)
        more = f" (+{len(group)-5})" if len(group) > 5 else ""
        sevs = sorted({it["alert"].get("labels", {}).get("severity", "warning") for it in group})
        lines.append(f"[{'/'.join(sevs)}] {name} x{len(group)}: {', '.join(targets)}{more}")
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    title = f"Alert digest: {len(items)} firing"
    body = "\n".join([
        f"=== {title} ===",
        f"as of {now}",
        "",
        *lines,
        "",
        "Stream: #alerts (IRC) | Triage: grafana-noc1.iamworkin.lan",
        "Force-flush: POST irc-notify.monitoring.svc:9119/flush",
    ])
    return {"title": title, "severity": "Warning", "host": "monitoring",
            "message": body, "eventId": f"digest-{int(time.time())}",
            "source": "Grafana digest", "status": "PROBLEM", "acknowledged": False}


def build_digest_payload():
    """Build the digest payload for everything currently buffered.

    Takes a snapshot of the buffer under the lock and renders it outside the
    lock. Returns None when the buffer is empty.
    """
    with _buffer_lock:
        items = list(_buffer.values())
    return _render_digest(items)


def flush_digest():
    """Print one digest for the buffered alerts and drop them from the buffer.

    Returns True if the digest was posted, False if the buffer was empty or
    the post failed. Entries that were rendered are dropped whether or not
    the post succeeded.
    """
    with _buffer_lock:
        snapshot = dict(_buffer)
    payload = _render_digest(list(snapshot.values()))
    if payload is None:
        print("[irc-notify] flush: buffer empty, no digest sent", file=sys.stderr)
        return False
    sent = post_thermal(payload, "digest")
    with _buffer_lock:
        # Remove only what was rendered above. Clearing the whole buffer
        # would silently discard alerts added while the POST was in flight.
        # The identity check keeps an alert that resolved and re-fired
        # meanwhile, since that is a new entry object.
        for fp, entry in snapshot.items():
            if _buffer.get(fp) is entry:
                del _buffer[fp]
    if sent:
        _stats["digest_flushed"] += 1
    return sent
def digest_loop():
    """Poll forever, flushing the digest when it is due or the buffer is full.

    Checks every 15 seconds. Flushes when BATCH_INTERVAL_MIN minutes have
    passed since the last flush, or else when the buffer holds at least
    BATCH_MAX_PENDING entries. Any exception is logged and followed by a
    60-second pause, so the loop never exits.
    """
    global _last_flush_time
    while True:
        try:
            now = time.time()
            elapsed = now - _last_flush_time
            if elapsed >= BATCH_INTERVAL_MIN * 60:
                print(f"[irc-notify] digest tick: interval reached ({BATCH_INTERVAL_MIN}m); buffer={len(_buffer)}", file=sys.stderr)
                flush_digest()
                _last_flush_time = now
            elif len(_buffer) >= BATCH_MAX_PENDING:
                print(f"[irc-notify] digest tick: buffer full ({len(_buffer)}); force flush", file=sys.stderr)
                flush_digest()
                # A forced flush also restarts the interval timer.
                _last_flush_time = now
            time.sleep(15)
        except Exception as e:
            print(f"[irc-notify] digest loop error: {e}", file=sys.stderr)
            time.sleep(60)
class Handler(BaseHTTPRequestHandler): class Handler(BaseHTTPRequestHandler):
def do_POST(self): def do_POST(self):
if self.path == "/flush":
ok = flush_digest()
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
self.wfile.write(json.dumps({"flushed": ok, "buffer_after": len(_buffer)}).encode())
return
_stats["webhooks_received"] += 1
length = int(self.headers.get("Content-Length", 0)) length = int(self.headers.get("Content-Length", 0))
body = json.loads(self.rfile.read(length)) if length else {} body = json.loads(self.rfile.read(length)) if length else {}
for alert in body.get("alerts", []): for alert in body.get("alerts", []):
@@ -1499,56 +1383,22 @@ data:
msg = f"{icon}{sev_tag} {name}: {summary}" msg = f"{icon}{sev_tag} {name}: {summary}"
if desc: msg += f"\n {desc}" if desc: msg += f"\n {desc}"
send_irc(msg) send_irc(msg)
# Thermal routing — EVERYTHING (including criticals) goes into if should_print(alert): send_thermal_print(alert)
# the hourly digest. Only the explicit `alert_channel=thermal_print_immediate` self.send_response(200)
# label bypasses, and even that flushes-the-current-digest rather self.send_header("Content-Type", "application/json")
# than printing a standalone job, so the same fingerprint can't self.end_headers()
# spam the printer per webhook cycle.
if status == "RESOLVED":
add_to_digest(alert) # removes from buffer
continue
if is_immediate_label(alert):
# Explicit opt-in for "paper this NOW" — first arrival of a
# new fingerprint triggers an immediate digest flush; repeat
# webhooks for the same fingerprint dedupe in the buffer
# until the next interval or until the alert resolves.
new_in_buffer = add_to_digest(alert)
if new_in_buffer:
global _last_flush_time
flush_digest()
_last_flush_time = time.time()
elif is_critical(alert) or is_batched_label(alert):
add_to_digest(alert)
# else: IRC-only (warnings without thermal_print label)
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
self.wfile.write(b'{"status":"ok"}') self.wfile.write(b'{"status":"ok"}')
def do_GET(self): def do_GET(self):
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers() self.send_response(200)
with _buffer_lock: self.send_header("Content-Type", "application/json")
alertnames = sorted({it["alert"].get("labels", {}).get("alertname", "?") for it in _buffer.values()}) self.end_headers()
depth = len(_buffer) self.wfile.write(json.dumps({"service":"irc-notify","thermal_print":PRINT_ENABLED}).encode())
info = {
"service": "irc-notify",
"config": {"thermal_print_enabled": THERMAL_PRINT_ENABLED,
"batch_interval_min": BATCH_INTERVAL_MIN,
"batch_max_pending": BATCH_MAX_PENDING,
"irc_target": f"{IRC_HOST}:{IRC_PORT} {IRC_CHANNEL}",
"print_web_url": PRINT_WEB_URL},
"buffer": {"depth": depth, "alertnames": alertnames,
"seconds_since_last_flush": int(time.time() - _last_flush_time),
"seconds_until_next_flush": max(0, int(BATCH_INTERVAL_MIN*60 - (time.time() - _last_flush_time)))},
"stats": _stats,
}
self.wfile.write(json.dumps(info, indent=2).encode())
def log_message(self, format, *args): def log_message(self, format, *args):
print(f"[irc-notify] {args[0]}", file=sys.stderr) print(f"[irc-notify] {args[0]}", file=sys.stderr)
if __name__ == "__main__": if __name__ == "__main__":
threading.Thread(target=digest_loop, daemon=True).start()
server = HTTPServer(("0.0.0.0", 9119), Handler) server = HTTPServer(("0.0.0.0", 9119), Handler)
print(f"[irc-notify] :9119 -> IRC {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} | thermal={'ON' if THERMAL_PRINT_ENABLED else 'OFF'} | digest={BATCH_INTERVAL_MIN}m max={BATCH_MAX_PENDING}", file=sys.stderr) print(f"IRC alert relay :9119 -> {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} (thermal: {PRINT_ENABLED})")
server.serve_forever() server.serve_forever()
# ============================================================================= # =============================================================================
@@ -3659,7 +3509,7 @@ data:
- refId: A - refId: A
relativeTimeRange: {from: 300, to: 0} relativeTimeRange: {from: 300, to: 0}
datasourceUid: prometheus datasourceUid: prometheus
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == 0', instant: true, refId: A} model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-.+)"} == 0', instant: true, refId: A}
- refId: B - refId: B
relativeTimeRange: {from: 300, to: 0} relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__ datasourceUid: __expr__

View File

@@ -67,6 +67,8 @@ public sealed class FleetManifestLintTests
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat", ["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL", ["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux", ["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
["github-runner-devicemgmt"] = "https://github.com/astoltz/FlowerCore.DeviceManagement",
["github-runner-worldbuilder"] = "https://github.com/astoltz/FlowerCore.WorldBuilder",
}; };
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal) private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
@@ -80,6 +82,8 @@ public sealed class FleetManifestLintTests
"github-runner-chat", "github-runner-chat",
"github-runner-mysql", "github-runner-mysql",
"github-runner-kiosk-linux", "github-runner-kiosk-linux",
"github-runner-devicemgmt",
"github-runner-worldbuilder",
}; };
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal) private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
@@ -234,7 +238,7 @@ public sealed class FleetManifestLintTests
{ {
deployments.Should().ContainKey(expectedRunner.Key); deployments.Should().ContainKey(expectedRunner.Key);
var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject; var container = RunnerContainer(deployments[expectedRunner.Key]);
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value); EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
EnvValue(container, "EPHEMERAL").Should().Be("true"); EnvValue(container, "EPHEMERAL").Should().Be("true");
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux"); EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
@@ -250,7 +254,7 @@ public sealed class FleetManifestLintTests
{ {
foreach (var deployment in GitHubRunnerDeployments().Values) foreach (var deployment in GitHubRunnerDeployments().Values)
{ {
var container = deployment.ContainerMappings().Should().ContainSingle().Subject; var container = RunnerContainer(deployment);
foreach (var expectedEnv in WritableRunnerEnv) foreach (var expectedEnv in WritableRunnerEnv)
{ {
@@ -311,7 +315,7 @@ public sealed class FleetManifestLintTests
monitoring.Should().Contain("MacMiniRunnerOffline"); monitoring.Should().Contain("MacMiniRunnerOffline");
monitoring.Should().Contain("LinuxRunnerOffline"); monitoring.Should().Contain("LinuxRunnerOffline");
monitoring.Should().Contain("kube_deployment_status_replicas_ready"); monitoring.Should().Contain("kube_deployment_status_replicas_ready");
monitoring.Should().Contain("github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"); monitoring.Should().Contain("github-runner(|-.+)");
monitoring.Should().Contain("folder: CI Alerts"); monitoring.Should().Contain("folder: CI Alerts");
monitoring.Should().Contain("uid: linux-runner-offline"); monitoring.Should().Contain("uid: linux-runner-offline");
monitoring.Should().Contain("alert_channel: irc"); monitoring.Should().Contain("alert_channel: irc");
@@ -641,6 +645,15 @@ public sealed class FleetManifestLintTests
return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null; return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null;
} }
/// <summary>
/// Selects the container named <c>runner</c> from a deployment and asserts that exactly one exists.
/// </summary>
/// <param name="deployment">Deployment manifest whose container mappings are searched.</param>
/// <returns>The single container mapping whose <c>name</c> scalar equals <c>runner</c> (ordinal comparison).</returns>
private static YamlMappingNode RunnerContainer(ManifestDocument deployment) =>
    deployment.ContainerMappings()
        .Where(container => string.Equals(ManifestNodeExtensions.Scalar(container, "name"), "runner", StringComparison.Ordinal))
        .Should()
        .ContainSingle($"{deployment.Name} must keep exactly one main runner container")
        .Subject;
private static string? EnvSecretName(YamlMappingNode container, string name) private static string? EnvSecretName(YamlMappingNode container, string name)
{ {
return EnvMapping(container, name) is { } env return EnvMapping(container, name) is { } env