Compare commits

..

1 Commits

Author SHA1 Message Date
Andrew Stoltz
9a4a8264d9 github-runner: add DM and WorldBuilder runners 2026-05-18 17:44:29 -05:00
9 changed files with 516 additions and 308 deletions

27
apps/brochure/README.md Normal file
View File

@@ -0,0 +1,27 @@
# FlowerCore Brochure
`apps/brochure` hosts the public brochure split from `FlowerCore.Intranet.Web`.
ArgoCD's `apps/*` ApplicationSet will create `infra-brochure` after this
directory lands on `main`.
## Runtime
- Host: `https://brochure.flowercore.io`
- Namespace: `brochure`
- Deployment: `brochure-web`
- Image: `localhost/fc-brochure-web:v20260524-sprint32`
- Port: `8080`
- Public route method allowlist: `GET` and `HEAD`
## Operator Actions
1. Publish and import `localhost/fc-brochure-web:v20260524-sprint32` to every
RKE2 node before sync, using the same podman save + `ctr images import`
flow as the Intranet deployment.
2. Create the Cloudflare DNS record for `brochure.flowercore.io` pointing at
the FlowerCore public edge.
3. Verify `infra-brochure` appears in ArgoCD, the certificate becomes Ready,
and `GET https://brochure.flowercore.io/` returns `200`.
The IngressRoute matches on host and method only; it does not filter paths.
`/ops/*` and `/admin/*` stay off the public host because the Brochure web app
returns `404` for them, and Traefik forwards only `GET` and `HEAD` requests.

131
apps/brochure/brochure.yaml Normal file
View File

@@ -0,0 +1,131 @@
# FlowerCore Brochure public host
#
# Thin Blazor host for public What's New, walkthrough, and gallery content
# carved out of FlowerCore.Intranet.Web. The ApplicationSet creates
# infra-brochure from this directory after merge.
---
# Namespace that every other Brochure resource in this file is placed in.
apiVersion: v1
kind: Namespace
metadata:
  name: brochure
  labels:
    app.kubernetes.io/part-of: flowercore
---
# Single-replica Deployment for the Brochure web host.
#
# The image is never pulled (imagePullPolicy: Never), so the tag below must
# already be imported on every node that can schedule this pod before sync.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: brochure-web
  namespace: brochure
  labels:
    app: brochure-web
    app.kubernetes.io/name: brochure-web
    app.kubernetes.io/part-of: flowercore
spec:
  replicas: 1
  revisionHistoryLimit: 3
  selector:
    matchLabels:
      # Must match the pod template label below and the Service selector.
      app: brochure-web
  template:
    metadata:
      labels:
        app: brochure-web
        app.kubernetes.io/name: brochure-web
        app.kubernetes.io/part-of: flowercore
    spec:
      containers:
        - name: brochure-web
          image: localhost/fc-brochure-web:v20260524-sprint32
          imagePullPolicy: Never
          ports:
            # Named port; the probes and the Service target it by name.
            - containerPort: 8080
              name: http
          env:
            - name: ASPNETCORE_ENVIRONMENT
              value: Production
            # Bind address must agree with containerPort above.
            - name: ASPNETCORE_URLS
              value: "http://+:8080"
          resources:
            requests:
              cpu: "25m"
              memory: "128Mi"
            limits:
              cpu: "500m"
              memory: "512Mi"
          # Readiness and liveness share /health; liveness starts later and
          # polls less often.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 10
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 30
            periodSeconds: 30
          securityContext:
            runAsNonRoot: true
            # NOTE(review): 1654 is presumably the non-root user baked into the
            # image - confirm against the image build.
            runAsUser: 1654
            runAsGroup: 1654
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
            # Completes the hardening above: without an explicit profile the
            # container can run with seccomp unconfined.
            seccompProfile:
              type: RuntimeDefault
          volumeMounts:
            # The root filesystem is read-only, so /tmp is the only writable
            # path this manifest provides.
            - name: tmp
              mountPath: /tmp
      volumes:
        - name: tmp
          emptyDir: {}
---
# ClusterIP Service in front of the brochure-web pods; the IngressRoute in this
# file sends traffic to it on port 8080.
apiVersion: v1
kind: Service
metadata:
  name: brochure-web
  namespace: brochure
  labels:
    app: brochure-web
    app.kubernetes.io/name: brochure-web
    app.kubernetes.io/part-of: flowercore
spec:
  type: ClusterIP
  selector:
    # Selects pods by the same label the Deployment's pod template sets.
    app: brochure-web
  ports:
    - name: http
      port: 8080
      # Resolved by name against the container port called "http".
      targetPort: http
---
# cert-manager Certificate for brochure.flowercore.io. The issued key pair is
# written to the Secret named in secretName, which the IngressRoute references
# for TLS.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: brochure-web-tls
  namespace: brochure
spec:
  secretName: brochure-web-tls
  issuerRef:
    name: step-ca-acme
    kind: ClusterIssuer
  dnsNames:
    - brochure.flowercore.io
  # 720h = 30 days of validity; renewal is attempted 240h = 10 days before
  # expiry.
  duration: 720h
  renewBefore: 240h
---
# Traefik route for the Brochure host on the websecure entry point.
# The rule matches on host and HTTP method only (GET or HEAD); it applies no
# path filtering, so every path under the host is forwarded for those methods.
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: brochure-web-public
  namespace: brochure
spec:
  entryPoints:
    - websecure
  routes:
    - match: Host(`brochure.flowercore.io`) && (Method(`GET`) || Method(`HEAD`))
      kind: Rule
      services:
        # Service and port defined earlier in this file.
        - name: brochure-web
          port: 8080
  tls:
    # Secret populated by the brochure-web-tls Certificate.
    secretName: brochure-web-tls

View File

@@ -1,31 +0,0 @@
# Step issuer for FlowerCore.DeviceManagement runtime mTLS leaves.
#
# Requires the smallstep step-issuer CRDs/controller:
# stepclusterissuers.certmanager.step.sm
# The provisioner password lives in the live cert-manager Secret below; do not
# commit the password or generated private key material to this repo.
# Cluster-scoped step-issuer definition named step-ca-agent.
apiVersion: certmanager.step.sm/v1beta1
kind: StepClusterIssuer
metadata:
  name: step-ca-agent
  labels:
    app.kubernetes.io/name: step-ca-agent
    app.kubernetes.io/component: pki
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/tenant-id: system
    flowercore.io/created-by: bluejay-infra
  annotations:
    flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
    flowercore.io/provisioner-source: profile::pki::stepca
    flowercore.io/secret-source: cert-manager/step-ca-agent-provisioner-password
spec:
  # CA endpoint the issuer talks to.
  url: https://10.0.56.10:9443
  # Base64-encoded PEM certificate (public material only; no key is stored here).
  caBundle: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJ4RENDQVdxZ0F3SUJBZ0lSQVBZMzU3RzZvdzZ6TUFMNSs0YlMya2t3Q2dZSUtvWkl6ajBFQXdJd1FERWEKTUJnR0ExVUVDaE1SU1VGdFYyOXlhMmx1SUVGRFRVVWdRMEV4SWpBZ0JnTlZCQU1UR1VsQmJWZHZjbXRwYmlCQgpRMDFGSUVOQklGSnZiM1FnUTBFd0hoY05Nall3TXpBNE1UZ3dOekV4V2hjTk16WXdNekExTVRnd056RXhXakJBCk1Sb3dHQVlEVlFRS0V4RkpRVzFYYjNKcmFXNGdRVU5OUlNCRFFURWlNQ0FHQTFVRUF4TVpTVUZ0VjI5eWEybHUKSUVGRFRVVWdRMEVnVW05dmRDQkRRVEJaTUJNR0J5cUdTTTQ5QWdFR0NDcUdTTTQ5QXdFSEEwSUFCSjJuMDRYMQpKWm81WmRxL2kxSWR2OCtmcXdaeUF6Qmg3d2hicWowU1dzSkw4VVdSYWJDTXFZQ3M3K2RYTzB4UlN6cWt3RkRMCngrdm9vT2FpOFJnUk5oYWpSVEJETUE0R0ExVWREd0VCL3dRRUF3SUJCakFTQmdOVkhSTUJBZjhFQ0RBR0FRSC8KQWdFQk1CMEdBMVVkRGdRV0JCUm51UFBRUjZpTS9INnZPbHVpVTNTeWdheXo4akFLQmdncWhrak9QUVFEQWdOSQpBREJGQWlFQXJRSzlkWVBHbUFac2RZbmp6aXVGVlZFNU5LWlVjY2VZdkdmR0MrdExYVXNDSUF1ZEYyekpyQ1JxCjNtSzUwWlpFVC9md1RrSndpRUY0ODI0bWpQOHAxQ0tNCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K
  provisioner:
    name: step-ca-agent
    kid: RF3A9welUYVOWBX8tr19aWyA2kQlxoGZN1dRwTElUEM
    # The provisioner password is read from this Secret in the cert-manager
    # namespace; it is intentionally not stored in the repo.
    passwordRef:
      name: step-ca-agent-provisioner-password
      namespace: cert-manager
      key: password

View File

@@ -47,7 +47,7 @@ spec:
fsGroupChangePolicy: OnRootMismatch fsGroupChangePolicy: OnRootMismatch
containers: containers:
- name: operator - name: operator
image: localhost/fc-devicemgmt-operator:v20260519-sp34cl3-fix image: localhost/fc-devicemgmt-operator:v20260512-cx5
imagePullPolicy: Never imagePullPolicy: Never
ports: ports:
- name: metrics - name: metrics

View File

@@ -4,22 +4,6 @@
# Sprint 9+ lane. This manifest is static-valid without requiring the image to # Sprint 9+ lane. This manifest is static-valid without requiring the image to
# exist yet; import localhost/fc-devicemgmt-web:<tag> to all schedulable RKE2 # exist yet; import localhost/fc-devicemgmt-web:<tag> to all schedulable RKE2
# nodes before letting ArgoCD sync a live rollout. # nodes before letting ArgoCD sync a live rollout.
#
# SCALED TO 0 — 2026-05-19 morning-routine cleanup.
# The Web pod cannot start until TWO upstream gaps close:
# 1. MySQL DB instance `flowercore_devicemgmt` (user `fc_devicemgmt`) is
# provisioned via fc-mysql Manager. The cluster currently has ZERO
# MySqlInstanceCrds and no `mysql.fc-mysql.svc:3306` Service, so the
# deployment-web container env `FlowerCore__Database__Host=mysql.fc-mysql.svc`
# points at nothing. Provision via the fc-mysql Manager UI/REST/MCP.
# 2. 1Password vault item `IAmWorkin/FlowerCore DeviceManagement Runtime`
# with 5 fields (DB-Password, mtls-ca.pem, mtls-client.crt, mtls-client.key,
# mtls-chain.pem) — see apps/fc-devicemgmt/1password-item.yaml. Mint mTLS
# from step-ca-agent ClusterIssuer per ADR-126; DB-Password must match the
# password configured for the MySQL user.
# Re-enable: change replicas back to 2 after both gaps close. The image tag
# in this file (v20260512-cx5) MAY also need a refresh — it predates the
# Sprint 34 Cl-3 operator fix; Web may have an analogous bug.
apiVersion: apps/v1 apiVersion: apps/v1
kind: Deployment kind: Deployment
metadata: metadata:
@@ -36,7 +20,7 @@ metadata:
annotations: annotations:
flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
spec: spec:
replicas: 0 replicas: 2
revisionHistoryLimit: 3 revisionHistoryLimit: 3
selector: selector:
matchLabels: matchLabels:

View File

@@ -28,6 +28,10 @@ Sprint 32 final long-tail wave adds 16 two-replica Deployments:
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and `FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
`FlowerCore.MenuBoard`. `FlowerCore.MenuBoard`.
Sprint 37 Cx-2 closes the audited Linux runner gaps for
`FlowerCore.DeviceManagement` and `FlowerCore.WorldBuilder` with the same
two-replica `emptyDir` pattern.
## Post-Merge Proof ## Post-Merge Proof
After the PR is merged and ArgoCD syncs, verify the runner fleet: After the PR is merged and ArgoCD syncs, verify the runner fleet:
@@ -47,7 +51,7 @@ for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore
FlowerCore.Distribution FlowerCore.Scoreboard FlowerCore.SegmentDisplay \ FlowerCore.Distribution FlowerCore.Scoreboard FlowerCore.SegmentDisplay \
FlowerCore.Signage.Contracts FlowerCore.SignalControl FlowerCore.Intranet.Web \ FlowerCore.Signage.Contracts FlowerCore.SignalControl FlowerCore.Intranet.Web \
FlowerCore.Provisioning FlowerCore.Redis FlowerCore.MessageBoard \ FlowerCore.Provisioning FlowerCore.Redis FlowerCore.MessageBoard \
FlowerCore.MenuBoard; do FlowerCore.MenuBoard FlowerCore.DeviceManagement FlowerCore.WorldBuilder; do
echo "=== $repo ===" echo "=== $repo ==="
gh api "/repos/astoltz/$repo/actions/runners" \ gh api "/repos/astoltz/$repo/actions/runners" \
--jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}' --jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}'
@@ -64,6 +68,20 @@ gh run list --repo astoltz/FlowerCore.Shared.Pos \
If the latest run is still queued after runner registration, rerun the workflow If the latest run is still queued after runner registration, rerun the workflow
from GitHub Actions and verify it lands on an `rke2-linux-*` runner. from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
## Sprint 37 Cx-2 Gap Audit
The 2026-05-18 GitHub workflow scan found these remaining repos with
`runs-on: [self-hosted, linux, fc-build-linux]` but no K8s runner Deployment:
`FlowerCore.AiStation.Linux`, `FlowerCore.PHP`, `FlowerCore.PiManager`,
`FlowerCore.Shared.Barcodes`, `FlowerCore.Shared.Lookup`,
`FlowerCore.Shared.Nodes`, `FlowerCore.Shared.PrintClient`,
`FlowerCore.Shared.Relay`, `FlowerCore.Shared.ShowRunner`, and
`FlowerCore.Shared.Storage`.
Mixed/platform repos also have Linux workflow legs but need owner review before
adding Linux runner Deployments: `FlowerCore.Library.Mac`,
`FlowerCore.Signage.Agent.AppleTv`, and `FlowerCore.Signage.Player.Wpf`.
## Failure Notes ## Failure Notes
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that - `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that

View File

@@ -16,6 +16,8 @@
# DNS, Distribution, Scoreboard, SegmentDisplay, Signage.Contracts, # DNS, Distribution, Scoreboard, SegmentDisplay, Signage.Contracts,
# SignalControl, Intranet.Web, Provisioning, Redis, MessageBoard, MenuBoard # SignalControl, Intranet.Web, Provisioning, Redis, MessageBoard, MenuBoard
# (Sprint 32 final long-tail wave; two replicas each, emptyDir cache) # (Sprint 32 final long-tail wave; two replicas each, emptyDir cache)
# FlowerCore.DeviceManagement, WorldBuilder (Sprint 37 Cx-2 runner gap
# closure; two replicas each, emptyDir cache)
# #
# Non-root CI safety: # Non-root CI safety:
# Runner pods run as uid 1001. HOME, DOTNET_INSTALL_DIR, DOTNET_CLI_HOME, # Runner pods run as uid 1001. HOME, DOTNET_INSTALL_DIR, DOTNET_CLI_HOME,
@@ -3767,9 +3769,271 @@ spec:
- name: tmp - name: tmp
emptyDir: {} emptyDir: {}
restartPolicy: Always restartPolicy: Always
---
# Runner for FlowerCore.DeviceManagement. Added 2026-05-18 (Sprint 37 Cx-2)
# to close the Linux CI capacity gap for the DM service-tier workflows. Mirrors
# the Sprint 32 long-tail emptyDir pattern: two replicas, shared
# 1Password-backed ACCESS_TOKEN, and the common ServiceAccount.
# Two-replica, repo-scoped runner Deployment for FlowerCore.DeviceManagement.
# All state lives in per-pod emptyDir volumes; nothing here uses a PVC.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: github-runner-devicemgmt
  namespace: github-runner
  labels:
    app.kubernetes.io/name: github-runner-devicemgmt
    app.kubernetes.io/component: runner
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/created-by: argocd
    flowercore.io/runner-repo: devicemgmt
    flowercore.io/github-repo: FlowerCore.DeviceManagement
spec:
  replicas: 2
  selector:
    matchLabels:
      app.kubernetes.io/name: github-runner-devicemgmt
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app.kubernetes.io/name: github-runner-devicemgmt
        app.kubernetes.io/component: runner
        app.kubernetes.io/part-of: flowercore
        flowercore.io/created-by: argocd
        flowercore.io/runner-repo: devicemgmt
        flowercore.io/github-repo: FlowerCore.DeviceManagement
    spec:
      serviceAccountName: github-runner
      # Pod-level default: every container runs as uid/gid 1001 unless it
      # overrides this (the init container below does).
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
        runAsGroup: 1001
        fsGroup: 1001
      initContainers:
        # Pre-creates the .dotnet and .nuget directories in the runner-home
        # volume and hands them to uid/gid 1001.
        - name: setup-runner-home
          image: busybox:1.36
          command:
            - sh
            - -c
            - |
              set -e
              mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet
              chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget
              chmod -R 755 /home/runner/.dotnet /home/runner/.nuget
          # Runs as root, overriding the pod-level non-root setting, so the
          # chown above can succeed.
          securityContext:
            runAsUser: 0
            runAsNonRoot: false
          volumeMounts:
            - name: runner-home
              mountPath: /home/runner
      containers:
        - name: runner
          image: myoung34/github-runner:latest
          imagePullPolicy: Always
          env:
            - name: REPO_URL
              value: "https://github.com/astoltz/FlowerCore.DeviceManagement"
            - name: RUNNER_NAME_PREFIX
              value: "rke2-linux-devicemgmt"
            # Work directory sits on the tmp emptyDir mounted at /tmp.
            - name: RUNNER_WORKDIR
              value: "/tmp/runner/work"
            - name: EPHEMERAL
              value: "true"
            - name: LABELS
              value: "self-hosted,linux,fc-build-linux"
            # HOME and the .NET/NuGet/cache paths below all point inside
            # /home/runner, which is backed by the runner-home volume.
            - name: HOME
              value: "/home/runner"
            - name: DOTNET_INSTALL_DIR
              value: "/home/runner/.dotnet"
            - name: DOTNET_CLI_TELEMETRY_OPTOUT
              value: "1"
            - name: DOTNET_NOLOGO
              value: "1"
            - name: DOTNET_GENERATE_ASPNET_CERTIFICATE
              value: "false"
            - name: DOTNET_CLI_HOME
              value: "/home/runner"
            - name: NUGET_PACKAGES
              value: "/home/runner/.nuget/packages"
            - name: XDG_CACHE_HOME
              value: "/home/runner/.cache"
            - name: RUNNER_TOOL_CACHE
              value: "/home/runner/_tool"
            # Token comes from the github-runner-token Secret, key "credential".
            - name: ACCESS_TOKEN
              valueFrom:
                secretKeyRef:
                  name: github-runner-token
                  key: credential
            - name: RUN_AS_ROOT
              value: "false"
          resources:
            requests:
              cpu: "500m"
              memory: "1Gi"
            limits:
              cpu: "2000m"
              memory: "4Gi"
          volumeMounts:
            - name: runner-home
              mountPath: /home/runner
            # Mounted over the packages directory inside runner-home.
            - name: nuget-cache
              mountPath: /home/runner/.nuget/packages
            - name: tmp
              mountPath: /tmp
          # Passes while a process matching "Runner.Listener" exists in the
          # container.
          livenessProbe:
            exec:
              command:
                - /bin/sh
                - -c
                - "pgrep -f Runner.Listener > /dev/null"
            initialDelaySeconds: 30
            periodSeconds: 30
            failureThreshold: 3
      volumes:
        - name: runner-home
          emptyDir: {}
        # Per-pod NuGet package cache, capped at 2Gi.
        - name: nuget-cache
          emptyDir:
            sizeLimit: 2Gi
        - name: tmp
          emptyDir: {}
      restartPolicy: Always
---
# Runner for FlowerCore.WorldBuilder. Added 2026-05-18 (Sprint 37 Cx-2)
# to unblock WorldBuilder Linux CI jobs after the runner fleet audit found no
# repo-scoped deployment for the GitHub repo. Mirrors the Sprint 32 long-tail
# emptyDir pattern: two replicas, shared 1Password-backed ACCESS_TOKEN, and
# the common ServiceAccount.
# Two-replica, repo-scoped runner Deployment for FlowerCore.WorldBuilder.
# All state lives in per-pod emptyDir volumes; nothing here uses a PVC.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: github-runner-worldbuilder
  namespace: github-runner
  labels:
    app.kubernetes.io/name: github-runner-worldbuilder
    app.kubernetes.io/component: runner
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/created-by: argocd
    flowercore.io/runner-repo: worldbuilder
    flowercore.io/github-repo: FlowerCore.WorldBuilder
spec:
  replicas: 2
  selector:
    matchLabels:
      app.kubernetes.io/name: github-runner-worldbuilder
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app.kubernetes.io/name: github-runner-worldbuilder
        app.kubernetes.io/component: runner
        app.kubernetes.io/part-of: flowercore
        flowercore.io/created-by: argocd
        flowercore.io/runner-repo: worldbuilder
        flowercore.io/github-repo: FlowerCore.WorldBuilder
    spec:
      serviceAccountName: github-runner
      # Pod-level default: every container runs as uid/gid 1001 unless it
      # overrides this (the init container below does).
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
        runAsGroup: 1001
        fsGroup: 1001
      initContainers:
        # Pre-creates the .dotnet and .nuget directories in the runner-home
        # volume and hands them to uid/gid 1001.
        - name: setup-runner-home
          image: busybox:1.36
          command:
            - sh
            - -c
            - |
              set -e
              mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet
              chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget
              chmod -R 755 /home/runner/.dotnet /home/runner/.nuget
          # Runs as root, overriding the pod-level non-root setting, so the
          # chown above can succeed.
          securityContext:
            runAsUser: 0
            runAsNonRoot: false
          volumeMounts:
            - name: runner-home
              mountPath: /home/runner
      containers:
        - name: runner
          image: myoung34/github-runner:latest
          imagePullPolicy: Always
          env:
            - name: REPO_URL
              value: "https://github.com/astoltz/FlowerCore.WorldBuilder"
            - name: RUNNER_NAME_PREFIX
              value: "rke2-linux-worldbuilder"
            # Work directory sits on the tmp emptyDir mounted at /tmp.
            - name: RUNNER_WORKDIR
              value: "/tmp/runner/work"
            - name: EPHEMERAL
              value: "true"
            - name: LABELS
              value: "self-hosted,linux,fc-build-linux"
            # HOME and the .NET/NuGet/cache paths below all point inside
            # /home/runner, which is backed by the runner-home volume.
            - name: HOME
              value: "/home/runner"
            - name: DOTNET_INSTALL_DIR
              value: "/home/runner/.dotnet"
            - name: DOTNET_CLI_TELEMETRY_OPTOUT
              value: "1"
            - name: DOTNET_NOLOGO
              value: "1"
            - name: DOTNET_GENERATE_ASPNET_CERTIFICATE
              value: "false"
            - name: DOTNET_CLI_HOME
              value: "/home/runner"
            - name: NUGET_PACKAGES
              value: "/home/runner/.nuget/packages"
            - name: XDG_CACHE_HOME
              value: "/home/runner/.cache"
            - name: RUNNER_TOOL_CACHE
              value: "/home/runner/_tool"
            # Token comes from the github-runner-token Secret, key "credential".
            - name: ACCESS_TOKEN
              valueFrom:
                secretKeyRef:
                  name: github-runner-token
                  key: credential
            - name: RUN_AS_ROOT
              value: "false"
          resources:
            requests:
              cpu: "500m"
              memory: "1Gi"
            limits:
              cpu: "2000m"
              memory: "4Gi"
          volumeMounts:
            - name: runner-home
              mountPath: /home/runner
            # Mounted over the packages directory inside runner-home.
            - name: nuget-cache
              mountPath: /home/runner/.nuget/packages
            - name: tmp
              mountPath: /tmp
          # Passes while a process matching "Runner.Listener" exists in the
          # container.
          livenessProbe:
            exec:
              command:
                - /bin/sh
                - -c
                - "pgrep -f Runner.Listener > /dev/null"
            initialDelaySeconds: 30
            periodSeconds: 30
            failureThreshold: 3
      volumes:
        - name: runner-home
          emptyDir: {}
        # Per-pod NuGet package cache, capped at 2Gi.
        - name: nuget-cache
          emptyDir:
            sizeLimit: 2Gi
        - name: tmp
          emptyDir: {}
      restartPolicy: Always
# Long-tail runner pattern: # Long-tail runner pattern:
# #
# Sprint 32 added the final 16 long-tail repo-scoped Deployments above. Keep # Sprint 32 added the final 16 long-tail repo-scoped Deployments, and Sprint 37
# Common as the only PVC-backed runner at replicas: 1. Any future multi-replica # added the DM + WorldBuilder runner gap closures above. Keep Common as the
# runner must use per-pod emptyDir caches, not a shared ReadWriteOnce PVC. # only PVC-backed runner at replicas: 1. Any future multi-replica runner must
# use per-pod emptyDir caches, not a shared ReadWriteOnce PVC.

View File

@@ -729,7 +729,7 @@ data:
expr: | expr: |
kube_deployment_status_replicas_ready{ kube_deployment_status_replicas_ready{
namespace="github-runner", namespace="github-runner",
deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))" deployment=~"github-runner(|-.+)"
} == 0 } == 0
for: 5m for: 5m
labels: labels:
@@ -1273,55 +1273,24 @@ metadata:
data: data:
notify.py: | notify.py: |
#!/usr/bin/env python3 #!/usr/bin/env python3
"""HTTP->IRC alert relay with thermal-printer DIGEST forwarding. """HTTP->IRC alert relay with thermal printer forwarding for Grafana webhooks.
Listens on :9119, posts to #alerts on UnrealIRCd via raw IRC protocol.
Listens on :9119, posts to #alerts on UnrealIRCd, forwards to Print.Web Alerts tagged alert_channel=thermal_print also POST to Print.Web /api/print/alert.
/api/print/alert. Thermal printing is BATCHED into hourly digests by
default so the printer no longer spam-fires per Grafana webhook.
Routing (per Grafana webhook alert):
- IRC: always per-event (operator likes the stream)
- Thermal printer:
* severity in {critical,disaster,page} OR
label alert_channel=thermal_print_immediate -> print NOW
* label alert_channel=thermal_print -> enqueue into hourly digest
* everything else -> IRC only
- RESOLVED webhooks remove the alert from the digest buffer
Env vars (defaults preserve old behavior on first deploy):
THERMAL_PRINT_ENABLED default "true" - master kill switch
BATCH_INTERVAL_MIN default "60" - minutes between digest prints
BATCH_MAX_PENDING default "50" - force-flush threshold
HTTP surface:
POST / - Grafana webhook entry
POST /flush - manual digest flush (idempotent)
GET / - status + config + buffer depth + stats
""" """
import json, os, socket, sys, threading, time import json, socket, sys, time
from collections import defaultdict
from datetime import datetime, timezone
from http.server import HTTPServer, BaseHTTPRequestHandler from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
from urllib.error import URLError
THERMAL_PRINT_ENABLED = os.environ.get("THERMAL_PRINT_ENABLED", "true").lower() == "true" IRC_HOST = "unrealircd.irc.svc" # short name: CoreDNS ndots:5 + iamworkin.lan template hijacks full .cluster.local (see memory)
BATCH_INTERVAL_MIN = int(os.environ.get("BATCH_INTERVAL_MIN", "60")) IRC_PORT = 6667
BATCH_MAX_PENDING = int(os.environ.get("BATCH_MAX_PENDING", "50")) IRC_NICK = "grafana-bot"
IRC_CHANNEL = "#alerts"
IRC_HOST = os.environ.get("IRC_HOST", "unrealircd.irc.svc") PRINT_WEB_URL = "http://10.0.57.16:5200/api/print/alert"
IRC_PORT = int(os.environ.get("IRC_PORT", "6667")) PRINT_ENABLED = True
IRC_NICK = os.environ.get("IRC_NICK", "grafana-bot")
IRC_CHANNEL = os.environ.get("IRC_CHANNEL", "#alerts")
PRINT_WEB_URL = os.environ.get("PRINT_WEB_URL", "http://10.0.57.16:5200/api/print/alert")
_buffer_lock = threading.Lock()
_buffer = {} # fingerprint -> {"alert": dict, "first_seen": float, "last_seen": float}
_last_flush_time = time.time()
_stats = {"webhooks_received": 0, "irc_sent": 0, "print_immediate": 0,
"digest_flushed": 0, "buffer_dedup": 0, "buffer_added": 0,
"buffer_resolved": 0, "started_at": time.time()}
def send_irc(message): def send_irc(message):
"""Connect, handle PING, join, send, quit."""
try: try:
sock = socket.create_connection((IRC_HOST, IRC_PORT), timeout=15) sock = socket.create_connection((IRC_HOST, IRC_PORT), timeout=15)
sock.sendall(f"NICK {IRC_NICK}\r\n".encode()) sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
@@ -1354,137 +1323,52 @@ data:
time.sleep(0.5) time.sleep(0.5)
sock.sendall(b"QUIT :alert delivered\r\n") sock.sendall(b"QUIT :alert delivered\r\n")
sock.close() sock.close()
_stats["irc_sent"] += 1
return True return True
except Exception as e: except Exception as e:
print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr) print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
return False return False
def post_thermal(payload, kind): def send_thermal_print(alert):
if not THERMAL_PRINT_ENABLED: if not PRINT_ENABLED: return
print(f"[irc-notify] thermal disabled; skip {kind} ({payload.get('title','?')[:40]})", file=sys.stderr) labels = alert.get("labels", {})
return False annotations = alert.get("annotations", {})
status = alert.get("status", "firing").upper()
summary = annotations.get("summary", "")
description = annotations.get("description", "")
runbook = annotations.get("runbook", "")
# Build a useful message: summary + description + runbook steps
parts = []
if summary: parts.append(summary)
if description and description != summary: parts.append(description)
if runbook: parts.append("STEPS: " + runbook)
message = " | ".join(parts) if parts else labels.get("alertname", "Unknown alert")
payload = {
"title": labels.get("alertname", "Unknown"),
"severity": labels.get("severity", "warning").capitalize(),
"host": labels.get("instance", labels.get("host", "unknown")),
"message": message,
"eventId": alert.get("fingerprint", ""),
"source": "Grafana",
"status": "RESOLVED" if status == "RESOLVED" else "PROBLEM",
"acknowledged": False
}
try: try:
req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"), req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
headers={"Content-Type": "application/json"}, method="POST") headers={"Content-Type": "application/json"}, method="POST")
resp = urlopen(req, timeout=10) resp = urlopen(req, timeout=10)
if kind == "immediate": _stats["print_immediate"] += 1 print(f"[irc-notify] Thermal print sent: {resp.read().decode()}", file=sys.stderr)
print(f"[irc-notify] thermal {kind} sent: {payload.get('title','?')[:50]}", file=sys.stderr)
return True
except Exception as e: except Exception as e:
print(f"[irc-notify] thermal {kind} failed: {e}", file=sys.stderr) print(f"[irc-notify] Thermal print failed: {e}", file=sys.stderr)
return False
def fingerprint_of(alert): def should_print(alert):
fp = alert.get("fingerprint", "")
if fp: return fp
labels = alert.get("labels", {}) labels = alert.get("labels", {})
target = labels.get("pod") or labels.get("instance") or labels.get("deployment") or labels.get("statefulset") or labels.get("namespace") or "" if labels.get("alert_channel") == "thermal_print": return True
return f"{labels.get('alertname','?')}/{labels.get('namespace','')}/{target}" if labels.get("severity", "").lower() in ("critical", "disaster"): return True
if alert.get("status", "").upper() == "RESOLVED": return False
def is_critical(alert): return False
return alert.get("labels", {}).get("severity", "").lower() in ("critical", "disaster", "page")
def is_immediate_label(alert):
return alert.get("labels", {}).get("alert_channel") == "thermal_print_immediate"
def is_batched_label(alert):
return alert.get("labels", {}).get("alert_channel") == "thermal_print"
def add_to_digest(alert):
"""Add an alert to the digest buffer. Returns True if the buffer GREW
(new fingerprint), False if it was a dedup, resolution, or no-op.
"""
if not THERMAL_PRINT_ENABLED: return False
fp = fingerprint_of(alert)
status = alert.get("status", "firing").lower()
with _buffer_lock:
if status == "resolved":
if fp in _buffer:
del _buffer[fp]
_stats["buffer_resolved"] += 1
return False
if fp in _buffer:
_buffer[fp]["last_seen"] = time.time()
_buffer[fp]["alert"] = alert
_stats["buffer_dedup"] += 1
return False
_buffer[fp] = {"alert": alert, "first_seen": time.time(), "last_seen": time.time()}
_stats["buffer_added"] += 1
return True
def build_digest_payload():
with _buffer_lock:
items = list(_buffer.values())
if not items: return None
by_name = defaultdict(list)
for item in items:
labels = item["alert"].get("labels", {})
by_name[labels.get("alertname", "Unknown")].append(item)
lines = []
for name, group in sorted(by_name.items()):
targets = []
for it in group[:5]:
labels = it["alert"].get("labels", {})
t = (labels.get("pod") or labels.get("instance") or labels.get("deployment")
or labels.get("statefulset") or labels.get("namespace") or "?")
targets.append(t)
more = f" (+{len(group)-5})" if len(group) > 5 else ""
sevs = sorted({it["alert"].get("labels", {}).get("severity", "warning") for it in group})
lines.append(f"[{'/'.join(sevs)}] {name} x{len(group)}: {', '.join(targets)}{more}")
now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
title = f"Alert digest: {len(items)} firing"
body = "\n".join([
f"=== {title} ===",
f"as of {now}",
"",
*lines,
"",
"Stream: #alerts (IRC) | Triage: grafana-noc1.iamworkin.lan",
"Force-flush: POST irc-notify.monitoring.svc:9119/flush",
])
return {"title": title, "severity": "Warning", "host": "monitoring",
"message": body, "eventId": f"digest-{int(time.time())}",
"source": "Grafana digest", "status": "PROBLEM", "acknowledged": False}
def flush_digest():
payload = build_digest_payload()
if payload is None:
print("[irc-notify] flush: buffer empty, no digest sent", file=sys.stderr)
return False
sent = post_thermal(payload, "digest")
with _buffer_lock:
_buffer.clear()
if sent: _stats["digest_flushed"] += 1
return sent
def digest_loop():
global _last_flush_time
while True:
try:
now = time.time()
elapsed = now - _last_flush_time
if elapsed >= BATCH_INTERVAL_MIN * 60:
print(f"[irc-notify] digest tick: interval reached ({BATCH_INTERVAL_MIN}m); buffer={len(_buffer)}", file=sys.stderr)
flush_digest()
_last_flush_time = now
elif len(_buffer) >= BATCH_MAX_PENDING:
print(f"[irc-notify] digest tick: buffer full ({len(_buffer)}); force flush", file=sys.stderr)
flush_digest()
_last_flush_time = now
time.sleep(15)
except Exception as e:
print(f"[irc-notify] digest loop error: {e}", file=sys.stderr)
time.sleep(60)
class Handler(BaseHTTPRequestHandler): class Handler(BaseHTTPRequestHandler):
def do_POST(self): def do_POST(self):
if self.path == "/flush":
ok = flush_digest()
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
self.wfile.write(json.dumps({"flushed": ok, "buffer_after": len(_buffer)}).encode())
return
_stats["webhooks_received"] += 1
length = int(self.headers.get("Content-Length", 0)) length = int(self.headers.get("Content-Length", 0))
body = json.loads(self.rfile.read(length)) if length else {} body = json.loads(self.rfile.read(length)) if length else {}
for alert in body.get("alerts", []): for alert in body.get("alerts", []):
@@ -1499,56 +1383,22 @@ data:
msg = f"{icon}{sev_tag} {name}: {summary}" msg = f"{icon}{sev_tag} {name}: {summary}"
if desc: msg += f"\n {desc}" if desc: msg += f"\n {desc}"
send_irc(msg) send_irc(msg)
# Thermal routing — EVERYTHING (including criticals) goes into if should_print(alert): send_thermal_print(alert)
# the hourly digest. Only the explicit `alert_channel=thermal_print_immediate` self.send_response(200)
# label bypasses, and even that flushes-the-current-digest rather self.send_header("Content-Type", "application/json")
# than printing a standalone job, so the same fingerprint can't self.end_headers()
# spam the printer per webhook cycle.
if status == "RESOLVED":
add_to_digest(alert) # removes from buffer
continue
if is_immediate_label(alert):
# Explicit opt-in for "paper this NOW" — first arrival of a
# new fingerprint triggers an immediate digest flush; repeat
# webhooks for the same fingerprint dedupe in the buffer
# until the next interval or until the alert resolves.
new_in_buffer = add_to_digest(alert)
if new_in_buffer:
global _last_flush_time
flush_digest()
_last_flush_time = time.time()
elif is_critical(alert) or is_batched_label(alert):
add_to_digest(alert)
# else: IRC-only (warnings without thermal_print label)
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
self.wfile.write(b'{"status":"ok"}') self.wfile.write(b'{"status":"ok"}')
def do_GET(self): def do_GET(self):
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers() self.send_response(200)
with _buffer_lock: self.send_header("Content-Type", "application/json")
alertnames = sorted({it["alert"].get("labels", {}).get("alertname", "?") for it in _buffer.values()}) self.end_headers()
depth = len(_buffer) self.wfile.write(json.dumps({"service":"irc-notify","thermal_print":PRINT_ENABLED}).encode())
info = {
"service": "irc-notify",
"config": {"thermal_print_enabled": THERMAL_PRINT_ENABLED,
"batch_interval_min": BATCH_INTERVAL_MIN,
"batch_max_pending": BATCH_MAX_PENDING,
"irc_target": f"{IRC_HOST}:{IRC_PORT} {IRC_CHANNEL}",
"print_web_url": PRINT_WEB_URL},
"buffer": {"depth": depth, "alertnames": alertnames,
"seconds_since_last_flush": int(time.time() - _last_flush_time),
"seconds_until_next_flush": max(0, int(BATCH_INTERVAL_MIN*60 - (time.time() - _last_flush_time)))},
"stats": _stats,
}
self.wfile.write(json.dumps(info, indent=2).encode())
def log_message(self, format, *args): def log_message(self, format, *args):
print(f"[irc-notify] {args[0]}", file=sys.stderr) print(f"[irc-notify] {args[0]}", file=sys.stderr)
if __name__ == "__main__": if __name__ == "__main__":
threading.Thread(target=digest_loop, daemon=True).start()
server = HTTPServer(("0.0.0.0", 9119), Handler) server = HTTPServer(("0.0.0.0", 9119), Handler)
print(f"[irc-notify] :9119 -> IRC {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} | thermal={'ON' if THERMAL_PRINT_ENABLED else 'OFF'} | digest={BATCH_INTERVAL_MIN}m max={BATCH_MAX_PENDING}", file=sys.stderr) print(f"IRC alert relay :9119 -> {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} (thermal: {PRINT_ENABLED})")
server.serve_forever() server.serve_forever()
# ============================================================================= # =============================================================================
@@ -3659,7 +3509,7 @@ data:
- refId: A - refId: A
relativeTimeRange: {from: 300, to: 0} relativeTimeRange: {from: 300, to: 0}
datasourceUid: prometheus datasourceUid: prometheus
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == 0', instant: true, refId: A} model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-.+)"} == 0', instant: true, refId: A}
- refId: B - refId: B
relativeTimeRange: {from: 300, to: 0} relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__ datasourceUid: __expr__

View File

@@ -67,6 +67,8 @@ public sealed class FleetManifestLintTests
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat", ["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL", ["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux", ["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
["github-runner-devicemgmt"] = "https://github.com/astoltz/FlowerCore.DeviceManagement",
["github-runner-worldbuilder"] = "https://github.com/astoltz/FlowerCore.WorldBuilder",
}; };
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal) private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
@@ -80,6 +82,8 @@ public sealed class FleetManifestLintTests
"github-runner-chat", "github-runner-chat",
"github-runner-mysql", "github-runner-mysql",
"github-runner-kiosk-linux", "github-runner-kiosk-linux",
"github-runner-devicemgmt",
"github-runner-worldbuilder",
}; };
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal) private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
@@ -234,7 +238,7 @@ public sealed class FleetManifestLintTests
{ {
deployments.Should().ContainKey(expectedRunner.Key); deployments.Should().ContainKey(expectedRunner.Key);
var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject; var container = RunnerContainer(deployments[expectedRunner.Key]);
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value); EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
EnvValue(container, "EPHEMERAL").Should().Be("true"); EnvValue(container, "EPHEMERAL").Should().Be("true");
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux"); EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
@@ -250,7 +254,7 @@ public sealed class FleetManifestLintTests
{ {
foreach (var deployment in GitHubRunnerDeployments().Values) foreach (var deployment in GitHubRunnerDeployments().Values)
{ {
var container = deployment.ContainerMappings().Should().ContainSingle().Subject; var container = RunnerContainer(deployment);
foreach (var expectedEnv in WritableRunnerEnv) foreach (var expectedEnv in WritableRunnerEnv)
{ {
@@ -311,7 +315,7 @@ public sealed class FleetManifestLintTests
monitoring.Should().Contain("MacMiniRunnerOffline"); monitoring.Should().Contain("MacMiniRunnerOffline");
monitoring.Should().Contain("LinuxRunnerOffline"); monitoring.Should().Contain("LinuxRunnerOffline");
monitoring.Should().Contain("kube_deployment_status_replicas_ready"); monitoring.Should().Contain("kube_deployment_status_replicas_ready");
monitoring.Should().Contain("github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"); monitoring.Should().Contain("github-runner(|-.+)");
monitoring.Should().Contain("folder: CI Alerts"); monitoring.Should().Contain("folder: CI Alerts");
monitoring.Should().Contain("uid: linux-runner-offline"); monitoring.Should().Contain("uid: linux-runner-offline");
monitoring.Should().Contain("alert_channel: irc"); monitoring.Should().Contain("alert_channel: irc");
@@ -432,7 +436,6 @@ public sealed class FleetManifestLintTests
"1password-item.yaml", "1password-item.yaml",
"argocd-application.yaml", "argocd-application.yaml",
"certificate-web.yaml", "certificate-web.yaml",
"clusterissuer-step-ca-agent.yaml",
"clusterrole-operator.yaml", "clusterrole-operator.yaml",
"clusterrolebinding-operator.yaml", "clusterrolebinding-operator.yaml",
"deployment-operator.yaml", "deployment-operator.yaml",
@@ -517,53 +520,6 @@ public sealed class FleetManifestLintTests
.ContainSingle("devices.iamworkin.lan"); .ContainSingle("devices.iamworkin.lan");
} }
/// <summary>
/// Pins the fields of the <c>StepClusterIssuer</c> document named <c>step-ca-agent</c>:
/// <c>apiVersion</c>, <c>spec.url</c>, <c>spec.caBundle</c>, <c>spec.provisioner.name</c>
/// and <c>spec.provisioner.kid</c>.
/// </summary>
/// <remarks>
/// NOTE(review): assuming <c>Single</c> here is LINQ's <c>Enumerable.Single</c>, the lookup
/// itself fails the test when the issuer document is missing or appears more than once.
/// </remarks>
[Fact]
public void FcDeviceManagement_StepCaAgentIssuerMustTargetNocProvisioner()
{
// Select the one document that is both kind StepClusterIssuer and named step-ca-agent.
var issuer = FcDeviceManagementDocuments()
.Single(document => document.Kind == "StepClusterIssuer" && document.Name == "step-ca-agent");
issuer.Scalar("apiVersion").Should().Be("certmanager.step.sm/v1beta1");
issuer.Scalar("spec", "url").Should().Be("https://10.0.56.10:9443");
// caBundle only has to be present and non-blank; its content is not pinned by this test.
issuer.Scalar("spec", "caBundle").Should().NotBeNullOrWhiteSpace();
issuer.Scalar("spec", "provisioner", "name").Should().Be("step-ca-agent");
issuer.Scalar("spec", "provisioner", "kid").Should().Be("RF3A9welUYVOWBX8tr19aWyA2kQlxoGZN1dRwTElUEM");
}
/// <summary>
/// Checks the <c>step-ca-agent</c> <c>StepClusterIssuer</c> in two passes: the parsed
/// document must carry the expected <c>spec.provisioner.passwordRef</c> name, namespace
/// and key, and the raw text of <c>clusterissuer-step-ca-agent.yaml</c> must not contain
/// the substrings <c>stringData:</c>, <c>password:</c> or <c>privateKey</c>.
/// </summary>
/// <remarks>
/// NOTE(review): judging by the test name, the raw-text pass is meant to catch secret
/// material written inline in the manifest instead of being referenced — confirm intent.
/// </remarks>
[Fact]
public void FcDeviceManagement_StepCaAgentIssuerMustReferencePasswordSecretOnly()
{
var issuer = FcDeviceManagementDocuments()
.Single(document => document.Kind == "StepClusterIssuer" && document.Name == "step-ca-agent");
// Pass 1: the structured passwordRef must point at the expected secret coordinates.
issuer.Scalar("spec", "provisioner", "passwordRef", "name")
.Should()
.Be("step-ca-agent-provisioner-password");
issuer.Scalar("spec", "provisioner", "passwordRef", "namespace").Should().Be("cert-manager");
issuer.Scalar("spec", "provisioner", "passwordRef", "key").Should().Be("password");
// Pass 2: scan the manifest file as plain text, read from
// <Inventory.BluejayRoot>/apps/fc-devicemgmt/clusterissuer-step-ca-agent.yaml.
var issuerText = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "fc-devicemgmt", "clusterissuer-step-ca-agent.yaml"));
issuerText.Should().NotContain("stringData:");
// "passwordRef:" does not trip this check: the substring searched for is "password:"
// with the colon directly after the word.
issuerText.Should().NotContain("password:");
issuerText.Should().NotContain("privateKey");
}
/// <summary>
/// Pins the metadata of the <c>step-ca-agent</c> <c>StepClusterIssuer</c>: the
/// <c>app.kubernetes.io/managed-by</c> and <c>flowercore.io/tenant-id</c> labels, and the
/// <c>flowercore.io/provisioner-source</c> and <c>flowercore.io/secret-source</c>
/// annotations.
/// </summary>
[Fact]
public void FcDeviceManagement_StepCaAgentIssuerMustCarryTraceabilityMetadata()
{
var issuer = FcDeviceManagementDocuments()
.Single(document => document.Kind == "StepClusterIssuer" && document.Name == "step-ca-agent");
// Labels.
issuer.Scalar("metadata", "labels", "app.kubernetes.io/managed-by").Should().Be("argocd");
issuer.Scalar("metadata", "labels", "flowercore.io/tenant-id").Should().Be("system");
// Annotations. The secret-source value is "<namespace>/<name>" using the same
// cert-manager / step-ca-agent-provisioner-password pair that the sibling
// ...MustReferencePasswordSecretOnly test expects in spec.provisioner.passwordRef.
issuer.Scalar("metadata", "annotations", "flowercore.io/provisioner-source")
.Should()
.Be("profile::pki::stepca");
issuer.Scalar("metadata", "annotations", "flowercore.io/secret-source")
.Should()
.Be("cert-manager/step-ca-agent-provisioner-password");
}
[Fact] [Fact]
public void FcDeviceManagement_OperatorRbacMustCoverDevicesAndOwnerLookup() public void FcDeviceManagement_OperatorRbacMustCoverDevicesAndOwnerLookup()
{ {
@@ -689,6 +645,15 @@ public sealed class FleetManifestLintTests
return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null; return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null;
} }
/// <summary>
/// Returns the container mapping of <paramref name="deployment"/> whose <c>name</c> scalar
/// is exactly <c>runner</c> (ordinal comparison), asserting that exactly one such
/// container exists.
/// </summary>
/// <param name="deployment">Deployment manifest whose container mappings are searched.</param>
/// <returns>The <c>Subject</c> of the <c>ContainSingle</c> assertion, i.e. the matching container.</returns>
/// <remarks>
/// This is an assertion helper: when zero or several containers are named <c>runner</c>,
/// <c>ContainSingle</c> fails with a reason that includes <c>deployment.Name</c>.
/// </remarks>
private static YamlMappingNode RunnerContainer(ManifestDocument deployment)
{
// Filter by name before asserting, so containers with other names do not count
// against the "exactly one" check.
return deployment.ContainerMappings()
.Where(container => string.Equals(ManifestNodeExtensions.Scalar(container, "name"), "runner", StringComparison.Ordinal))
.Should()
.ContainSingle($"{deployment.Name} must keep exactly one main runner container")
.Subject;
}
private static string? EnvSecretName(YamlMappingNode container, string name) private static string? EnvSecretName(YamlMappingNode container, string name)
{ {
return EnvMapping(container, name) is { } env return EnvMapping(container, name) is { } env