Compare commits

..

1 Commits

Author SHA1 Message Date
Andrew Stoltz
9a4a8264d9 github-runner: add DM and WorldBuilder runners 2026-05-18 17:44:29 -05:00
10 changed files with 519 additions and 368 deletions

27
apps/brochure/README.md Normal file
View File

@@ -0,0 +1,27 @@
# FlowerCore Brochure
`apps/brochure` hosts the public brochure split from `FlowerCore.Intranet.Web`.
ArgoCD's `apps/*` ApplicationSet will create `infra-brochure` after this
directory lands on `main`.
## Runtime
- Host: `https://brochure.flowercore.io`
- Namespace: `brochure`
- Deployment: `brochure-web`
- Image: `localhost/fc-brochure-web:v20260524-sprint32`
- Port: `8080`
- Public route method allowlist: `GET` and `HEAD`
## Operator Actions
1. Publish and import `localhost/fc-brochure-web:v20260524-sprint32` to every
RKE2 node before sync, using the same podman save + `ctr images import`
flow as the Intranet deployment.
2. Create the Cloudflare DNS record for `brochure.flowercore.io` pointing at
the FlowerCore public edge.
3. Verify `infra-brochure` appears in ArgoCD, the certificate becomes Ready,
and `GET https://brochure.flowercore.io/` returns `200`.
The route intentionally does not expose `/ops/*` or `/admin/*`; the Brochure
web app returns `404` for those paths and Traefik only forwards read methods.

131
apps/brochure/brochure.yaml Normal file
View File

@@ -0,0 +1,131 @@
# FlowerCore Brochure public host
#
# Thin Blazor host for public What's New, walkthrough, and gallery content
# carved out of FlowerCore.Intranet.Web. The ApplicationSet creates
# infra-brochure from this directory after merge.
---
apiVersion: v1
kind: Namespace
metadata:
name: brochure
labels:
app.kubernetes.io/part-of: flowercore
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: brochure-web
namespace: brochure
labels:
app: brochure-web
app.kubernetes.io/name: brochure-web
app.kubernetes.io/part-of: flowercore
spec:
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app: brochure-web
template:
metadata:
labels:
app: brochure-web
app.kubernetes.io/name: brochure-web
app.kubernetes.io/part-of: flowercore
spec:
containers:
- name: brochure-web
image: localhost/fc-brochure-web:v20260524-sprint32
imagePullPolicy: Never
ports:
- containerPort: 8080
name: http
env:
- name: ASPNETCORE_ENVIRONMENT
value: Production
- name: ASPNETCORE_URLS
value: "http://+:8080"
resources:
requests:
cpu: "25m"
memory: "128Mi"
limits:
cpu: "500m"
memory: "512Mi"
readinessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 30
periodSeconds: 30
securityContext:
runAsNonRoot: true
runAsUser: 1654
runAsGroup: 1654
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop:
- ALL
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
name: brochure-web
namespace: brochure
labels:
app: brochure-web
app.kubernetes.io/name: brochure-web
app.kubernetes.io/part-of: flowercore
spec:
type: ClusterIP
selector:
app: brochure-web
ports:
- name: http
port: 8080
targetPort: http
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: brochure-web-tls
namespace: brochure
spec:
secretName: brochure-web-tls
issuerRef:
name: step-ca-acme
kind: ClusterIssuer
dnsNames:
- brochure.flowercore.io
duration: 720h
renewBefore: 240h
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: brochure-web-public
namespace: brochure
spec:
entryPoints:
- websecure
routes:
- match: Host(`brochure.flowercore.io`) && (Method(`GET`) || Method(`HEAD`))
kind: Rule
services:
- name: brochure-web
port: 8080
tls:
secretName: brochure-web-tls

View File

@@ -1,13 +1,10 @@
# FlowerCore Remote Desktop — TLS + Ingress
#
# Source-of-truth split:
# - bluejay-infra OWNS: Certificate, IngressRoute, all NetworkPolicies,
# and the explicit RemoteDesktopPoolCrd warm-pool intent in
# remotedesktop-pools.yaml.
# - bluejay-infra OWNS: Certificate, IngressRoute, all NetworkPolicies
# (see network-policies.yaml in this directory).
# - FlowerCore.RemoteDesktop OWNS: CRD definition/operator Deployment and
# scripts/deploy-web.sh Deployment + Service. Reason: image refs like
# `localhost/fc-desktop:linux-xfce`
# - FlowerCore.RemoteDesktop scripts/deploy-web.sh OWNS: Deployment +
# Service. Reason: image refs like `localhost/fc-desktop:linux-xfce`
# only exist on each node's containerd after a manual import, so a
# Deployment manifest in bluejay-infra would race the image-import
# step and crash-loop.

View File

@@ -1,101 +0,0 @@
# FlowerCore RemoteDesktop warm-pool intent.
#
# These CRDs are deliberately explicit. The RemoteDesktop warmup loop no
# longer scans template defaults to decide what to warm; every enabled pool
# here represents operator/GitOps intent and prevents a repeat of the
# orphan-pool leak from 2026-05-08.
---
apiVersion: flowercore.io/v1
kind: RemoteDesktopPoolCrd
metadata:
name: browser-lab-pool
namespace: fc-desktop
labels:
app.kubernetes.io/name: remotedesktop-pool
app.kubernetes.io/part-of: flowercore-remotedesktop
app.kubernetes.io/managed-by: bluejay-infra
spec:
templateSlug: browser-only
desiredSize: 1
enabled: true
reconcileNow: true
---
apiVersion: flowercore.io/v1
kind: RemoteDesktopPoolCrd
metadata:
name: opensuse-xfce-pool
namespace: fc-desktop
labels:
app.kubernetes.io/name: remotedesktop-pool
app.kubernetes.io/part-of: flowercore-remotedesktop
app.kubernetes.io/managed-by: bluejay-infra
spec:
templateSlug: opensuse-xfce
desiredSize: 1
enabled: true
userVolumeMode: LateAttach
reconcileNow: true
---
apiVersion: flowercore.io/v1
kind: RemoteDesktopPoolCrd
metadata:
name: dev-workstation-pool
namespace: fc-desktop
labels:
app.kubernetes.io/name: remotedesktop-pool
app.kubernetes.io/part-of: flowercore-remotedesktop
app.kubernetes.io/managed-by: bluejay-infra
spec:
templateSlug: dev-workstation
desiredSize: 1
enabled: true
userVolumeMode: LateAttach
reconcileNow: true
---
apiVersion: flowercore.io/v1
kind: RemoteDesktopPoolCrd
metadata:
name: ai-station-pool
namespace: fc-desktop
labels:
app.kubernetes.io/name: remotedesktop-pool
app.kubernetes.io/part-of: flowercore-remotedesktop
app.kubernetes.io/managed-by: bluejay-infra
spec:
templateSlug: ai-station
desiredSize: 1
enabled: true
userVolumeMode: LateAttach
reconcileNow: true
---
apiVersion: flowercore.io/v1
kind: RemoteDesktopPoolCrd
metadata:
name: linux-xfce-pool
namespace: fc-desktop
labels:
app.kubernetes.io/name: remotedesktop-pool
app.kubernetes.io/part-of: flowercore-remotedesktop
app.kubernetes.io/managed-by: bluejay-infra
spec:
templateSlug: linux-xfce
desiredSize: 1
enabled: true
userVolumeMode: LateAttach
reconcileNow: true
---
apiVersion: flowercore.io/v1
kind: RemoteDesktopPoolCrd
metadata:
name: linux-xfce-rdp-pool
namespace: fc-desktop
labels:
app.kubernetes.io/name: remotedesktop-pool
app.kubernetes.io/part-of: flowercore-remotedesktop
app.kubernetes.io/managed-by: bluejay-infra
spec:
templateSlug: linux-xfce-rdp
desiredSize: 1
enabled: true
userVolumeMode: LateAttach
reconcileNow: true

View File

@@ -47,7 +47,7 @@ spec:
fsGroupChangePolicy: OnRootMismatch
containers:
- name: operator
image: localhost/fc-devicemgmt-operator:v20260519-sp34cl3-fix
image: localhost/fc-devicemgmt-operator:v20260512-cx5
imagePullPolicy: Never
ports:
- name: metrics

View File

@@ -4,22 +4,6 @@
# Sprint 9+ lane. This manifest is static-valid without requiring the image to
# exist yet; import localhost/fc-devicemgmt-web:<tag> to all schedulable RKE2
# nodes before letting ArgoCD sync a live rollout.
#
# SCALED TO 0 — 2026-05-19 morning-routine cleanup.
# The Web pod cannot start until TWO upstream gaps close:
# 1. MySQL DB instance `flowercore_devicemgmt` (user `fc_devicemgmt`) is
# provisioned via fc-mysql Manager. The cluster currently has ZERO
# MySqlInstanceCrds and no `mysql.fc-mysql.svc:3306` Service, so the
# deployment-web container env `FlowerCore__Database__Host=mysql.fc-mysql.svc`
# points at nothing. Provision via the fc-mysql Manager UI/REST/MCP.
# 2. 1Password vault item `IAmWorkin/FlowerCore DeviceManagement Runtime`
# with 5 fields (DB-Password, mtls-ca.pem, mtls-client.crt, mtls-client.key,
# mtls-chain.pem) — see apps/fc-devicemgmt/1password-item.yaml. Mint mTLS
# from step-ca-agent ClusterIssuer per ADR-126; DB-Password must match the
# password configured for the MySQL user.
# Re-enable: change replicas back to 2 after both gaps close. The image tag
# in this file (v20260512-cx5) MAY also need a refresh — it predates the
# Sprint 34 Cl-3 operator fix; Web may have an analogous bug.
apiVersion: apps/v1
kind: Deployment
metadata:
@@ -36,7 +20,7 @@ metadata:
annotations:
flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
spec:
replicas: 0
replicas: 2
revisionHistoryLimit: 3
selector:
matchLabels:

View File

@@ -28,6 +28,10 @@ Sprint 32 final long-tail wave adds 16 two-replica Deployments:
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
`FlowerCore.MenuBoard`.
Sprint 37 Cx-2 closes the audited Linux runner gaps for
`FlowerCore.DeviceManagement` and `FlowerCore.WorldBuilder` with the same
two-replica `emptyDir` pattern.
## Post-Merge Proof
After the PR is merged and ArgoCD syncs, verify the runner fleet:
@@ -47,7 +51,7 @@ for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore
FlowerCore.Distribution FlowerCore.Scoreboard FlowerCore.SegmentDisplay \
FlowerCore.Signage.Contracts FlowerCore.SignalControl FlowerCore.Intranet.Web \
FlowerCore.Provisioning FlowerCore.Redis FlowerCore.MessageBoard \
FlowerCore.MenuBoard; do
FlowerCore.MenuBoard FlowerCore.DeviceManagement FlowerCore.WorldBuilder; do
echo "=== $repo ==="
gh api "/repos/astoltz/$repo/actions/runners" \
--jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}'
@@ -64,6 +68,20 @@ gh run list --repo astoltz/FlowerCore.Shared.Pos \
If the latest run is still queued after runner registration, rerun the workflow
from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
## Sprint 37 Cx-2 Gap Audit
The 2026-05-18 GitHub workflow scan found these remaining repos with
`runs-on: [self-hosted, linux, fc-build-linux]` but no K8s runner Deployment:
`FlowerCore.AiStation.Linux`, `FlowerCore.PHP`, `FlowerCore.PiManager`,
`FlowerCore.Shared.Barcodes`, `FlowerCore.Shared.Lookup`,
`FlowerCore.Shared.Nodes`, `FlowerCore.Shared.PrintClient`,
`FlowerCore.Shared.Relay`, `FlowerCore.Shared.ShowRunner`, and
`FlowerCore.Shared.Storage`.
Mixed/platform repos also have Linux workflow legs but need owner review before
adding Linux runner Deployments: `FlowerCore.Library.Mac`,
`FlowerCore.Signage.Agent.AppleTv`, and `FlowerCore.Signage.Player.Wpf`.
## Failure Notes
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that

View File

@@ -16,6 +16,8 @@
# DNS, Distribution, Scoreboard, SegmentDisplay, Signage.Contracts,
# SignalControl, Intranet.Web, Provisioning, Redis, MessageBoard, MenuBoard
# (Sprint 32 final long-tail wave; two replicas each, emptyDir cache)
# FlowerCore.DeviceManagement, WorldBuilder (Sprint 37 Cx-2 runner gap
# closure; two replicas each, emptyDir cache)
#
# Non-root CI safety:
# Runner pods run as uid 1001. HOME, DOTNET_INSTALL_DIR, DOTNET_CLI_HOME,
@@ -3767,9 +3769,271 @@ spec:
- name: tmp
emptyDir: {}
restartPolicy: Always
---
# Runner for FlowerCore.DeviceManagement. Added 2026-05-18 (Sprint 37 Cx-2)
# to close the Linux CI capacity gap for the DM service-tier workflows. Mirrors
# the Sprint 32 long-tail emptyDir pattern: two replicas, shared
# 1Password-backed ACCESS_TOKEN, and the common ServiceAccount.
apiVersion: apps/v1
kind: Deployment
metadata:
name: github-runner-devicemgmt
namespace: github-runner
labels:
app.kubernetes.io/name: github-runner-devicemgmt
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/created-by: argocd
flowercore.io/runner-repo: devicemgmt
flowercore.io/github-repo: FlowerCore.DeviceManagement
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/name: github-runner-devicemgmt
strategy:
type: Recreate
template:
metadata:
labels:
app.kubernetes.io/name: github-runner-devicemgmt
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
flowercore.io/created-by: argocd
flowercore.io/runner-repo: devicemgmt
flowercore.io/github-repo: FlowerCore.DeviceManagement
spec:
serviceAccountName: github-runner
securityContext:
runAsNonRoot: true
runAsUser: 1001
runAsGroup: 1001
fsGroup: 1001
initContainers:
- name: setup-runner-home
image: busybox:1.36
command:
- sh
- -c
- |
set -e
mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet
chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget
chmod -R 755 /home/runner/.dotnet /home/runner/.nuget
securityContext:
runAsUser: 0
runAsNonRoot: false
volumeMounts:
- name: runner-home
mountPath: /home/runner
containers:
- name: runner
image: myoung34/github-runner:latest
imagePullPolicy: Always
env:
- name: REPO_URL
value: "https://github.com/astoltz/FlowerCore.DeviceManagement"
- name: RUNNER_NAME_PREFIX
value: "rke2-linux-devicemgmt"
- name: RUNNER_WORKDIR
value: "/tmp/runner/work"
- name: EPHEMERAL
value: "true"
- name: LABELS
value: "self-hosted,linux,fc-build-linux"
- name: HOME
value: "/home/runner"
- name: DOTNET_INSTALL_DIR
value: "/home/runner/.dotnet"
- name: DOTNET_CLI_TELEMETRY_OPTOUT
value: "1"
- name: DOTNET_NOLOGO
value: "1"
- name: DOTNET_GENERATE_ASPNET_CERTIFICATE
value: "false"
- name: DOTNET_CLI_HOME
value: "/home/runner"
- name: NUGET_PACKAGES
value: "/home/runner/.nuget/packages"
- name: XDG_CACHE_HOME
value: "/home/runner/.cache"
- name: RUNNER_TOOL_CACHE
value: "/home/runner/_tool"
- name: ACCESS_TOKEN
valueFrom:
secretKeyRef:
name: github-runner-token
key: credential
- name: RUN_AS_ROOT
value: "false"
resources:
requests:
cpu: "500m"
memory: "1Gi"
limits:
cpu: "2000m"
memory: "4Gi"
volumeMounts:
- name: runner-home
mountPath: /home/runner
- name: nuget-cache
mountPath: /home/runner/.nuget/packages
- name: tmp
mountPath: /tmp
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "pgrep -f Runner.Listener > /dev/null"
initialDelaySeconds: 30
periodSeconds: 30
failureThreshold: 3
volumes:
- name: runner-home
emptyDir: {}
- name: nuget-cache
emptyDir:
sizeLimit: 2Gi
- name: tmp
emptyDir: {}
restartPolicy: Always
---
# Runner for FlowerCore.WorldBuilder. Added 2026-05-18 (Sprint 37 Cx-2)
# to unblock WorldBuilder Linux CI jobs after the runner fleet audit found no
# repo-scoped deployment for the GitHub repo. Mirrors the Sprint 32 long-tail
# emptyDir pattern: two replicas, shared 1Password-backed ACCESS_TOKEN, and
# the common ServiceAccount.
apiVersion: apps/v1
kind: Deployment
metadata:
name: github-runner-worldbuilder
namespace: github-runner
labels:
app.kubernetes.io/name: github-runner-worldbuilder
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/created-by: argocd
flowercore.io/runner-repo: worldbuilder
flowercore.io/github-repo: FlowerCore.WorldBuilder
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/name: github-runner-worldbuilder
strategy:
type: Recreate
template:
metadata:
labels:
app.kubernetes.io/name: github-runner-worldbuilder
app.kubernetes.io/component: runner
app.kubernetes.io/part-of: flowercore
flowercore.io/created-by: argocd
flowercore.io/runner-repo: worldbuilder
flowercore.io/github-repo: FlowerCore.WorldBuilder
spec:
serviceAccountName: github-runner
securityContext:
runAsNonRoot: true
runAsUser: 1001
runAsGroup: 1001
fsGroup: 1001
initContainers:
- name: setup-runner-home
image: busybox:1.36
command:
- sh
- -c
- |
set -e
mkdir -p /home/runner/.dotnet /home/runner/.nuget/packages /home/runner/.nuget/NuGet
chown -R 1001:1001 /home/runner/.dotnet /home/runner/.nuget
chmod -R 755 /home/runner/.dotnet /home/runner/.nuget
securityContext:
runAsUser: 0
runAsNonRoot: false
volumeMounts:
- name: runner-home
mountPath: /home/runner
containers:
- name: runner
image: myoung34/github-runner:latest
imagePullPolicy: Always
env:
- name: REPO_URL
value: "https://github.com/astoltz/FlowerCore.WorldBuilder"
- name: RUNNER_NAME_PREFIX
value: "rke2-linux-worldbuilder"
- name: RUNNER_WORKDIR
value: "/tmp/runner/work"
- name: EPHEMERAL
value: "true"
- name: LABELS
value: "self-hosted,linux,fc-build-linux"
- name: HOME
value: "/home/runner"
- name: DOTNET_INSTALL_DIR
value: "/home/runner/.dotnet"
- name: DOTNET_CLI_TELEMETRY_OPTOUT
value: "1"
- name: DOTNET_NOLOGO
value: "1"
- name: DOTNET_GENERATE_ASPNET_CERTIFICATE
value: "false"
- name: DOTNET_CLI_HOME
value: "/home/runner"
- name: NUGET_PACKAGES
value: "/home/runner/.nuget/packages"
- name: XDG_CACHE_HOME
value: "/home/runner/.cache"
- name: RUNNER_TOOL_CACHE
value: "/home/runner/_tool"
- name: ACCESS_TOKEN
valueFrom:
secretKeyRef:
name: github-runner-token
key: credential
- name: RUN_AS_ROOT
value: "false"
resources:
requests:
cpu: "500m"
memory: "1Gi"
limits:
cpu: "2000m"
memory: "4Gi"
volumeMounts:
- name: runner-home
mountPath: /home/runner
- name: nuget-cache
mountPath: /home/runner/.nuget/packages
- name: tmp
mountPath: /tmp
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "pgrep -f Runner.Listener > /dev/null"
initialDelaySeconds: 30
periodSeconds: 30
failureThreshold: 3
volumes:
- name: runner-home
emptyDir: {}
- name: nuget-cache
emptyDir:
sizeLimit: 2Gi
- name: tmp
emptyDir: {}
restartPolicy: Always
# Long-tail runner pattern:
#
# Sprint 32 added the final 16 long-tail repo-scoped Deployments above. Keep
# Common as the only PVC-backed runner at replicas: 1. Any future multi-replica
# runner must use per-pod emptyDir caches, not a shared ReadWriteOnce PVC.
# Sprint 32 added the final 16 long-tail repo-scoped Deployments, and Sprint 37
# added the DM + WorldBuilder runner gap closures above. Keep Common as the
# only PVC-backed runner at replicas: 1. Any future multi-replica runner must
# use per-pod emptyDir caches, not a shared ReadWriteOnce PVC.

View File

@@ -729,7 +729,7 @@ data:
expr: |
kube_deployment_status_replicas_ready{
namespace="github-runner",
deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"
deployment=~"github-runner(|-.+)"
} == 0
for: 5m
labels:
@@ -1273,55 +1273,24 @@ metadata:
data:
notify.py: |
#!/usr/bin/env python3
"""HTTP->IRC alert relay with thermal-printer DIGEST forwarding.
Listens on :9119, posts to #alerts on UnrealIRCd, forwards to Print.Web
/api/print/alert. Thermal printing is BATCHED into hourly digests by
default so the printer no longer spam-fires per Grafana webhook.
Routing (per Grafana webhook alert):
- IRC: always per-event (operator likes the stream)
- Thermal printer:
* severity in {critical,disaster,page} OR
label alert_channel=thermal_print_immediate -> print NOW
* label alert_channel=thermal_print -> enqueue into hourly digest
* everything else -> IRC only
- RESOLVED webhooks remove the alert from the digest buffer
Env vars (defaults preserve old behavior on first deploy):
THERMAL_PRINT_ENABLED default "true" - master kill switch
BATCH_INTERVAL_MIN default "60" - minutes between digest prints
BATCH_MAX_PENDING default "50" - force-flush threshold
HTTP surface:
POST / - Grafana webhook entry
POST /flush - manual digest flush (idempotent)
GET / - status + config + buffer depth + stats
"""HTTP->IRC alert relay with thermal printer forwarding for Grafana webhooks.
Listens on :9119, posts to #alerts on UnrealIRCd via raw IRC protocol.
Alerts tagged alert_channel=thermal_print also POST to Print.Web /api/print/alert.
"""
import json, os, socket, sys, threading, time
from collections import defaultdict
from datetime import datetime, timezone
import json, socket, sys, time
from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.request import Request, urlopen
from urllib.error import URLError
THERMAL_PRINT_ENABLED = os.environ.get("THERMAL_PRINT_ENABLED", "true").lower() == "true"
BATCH_INTERVAL_MIN = int(os.environ.get("BATCH_INTERVAL_MIN", "60"))
BATCH_MAX_PENDING = int(os.environ.get("BATCH_MAX_PENDING", "50"))
IRC_HOST = os.environ.get("IRC_HOST", "unrealircd.irc.svc")
IRC_PORT = int(os.environ.get("IRC_PORT", "6667"))
IRC_NICK = os.environ.get("IRC_NICK", "grafana-bot")
IRC_CHANNEL = os.environ.get("IRC_CHANNEL", "#alerts")
PRINT_WEB_URL = os.environ.get("PRINT_WEB_URL", "http://10.0.57.16:5200/api/print/alert")
_buffer_lock = threading.Lock()
_buffer = {} # fingerprint -> {"alert": dict, "first_seen": float, "last_seen": float}
_last_flush_time = time.time()
_stats = {"webhooks_received": 0, "irc_sent": 0, "print_immediate": 0,
"digest_flushed": 0, "buffer_dedup": 0, "buffer_added": 0,
"buffer_resolved": 0, "started_at": time.time()}
IRC_HOST = "unrealircd.irc.svc" # short name: CoreDNS ndots:5 + iamworkin.lan template hijacks full .cluster.local (see memory)
IRC_PORT = 6667
IRC_NICK = "grafana-bot"
IRC_CHANNEL = "#alerts"
PRINT_WEB_URL = "http://10.0.57.16:5200/api/print/alert"
PRINT_ENABLED = True
def send_irc(message):
"""Connect, handle PING, join, send, quit."""
try:
sock = socket.create_connection((IRC_HOST, IRC_PORT), timeout=15)
sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
@@ -1354,137 +1323,52 @@ data:
time.sleep(0.5)
sock.sendall(b"QUIT :alert delivered\r\n")
sock.close()
_stats["irc_sent"] += 1
return True
except Exception as e:
print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
return False
def post_thermal(payload, kind):
if not THERMAL_PRINT_ENABLED:
print(f"[irc-notify] thermal disabled; skip {kind} ({payload.get('title','?')[:40]})", file=sys.stderr)
return False
def send_thermal_print(alert):
if not PRINT_ENABLED: return
labels = alert.get("labels", {})
annotations = alert.get("annotations", {})
status = alert.get("status", "firing").upper()
summary = annotations.get("summary", "")
description = annotations.get("description", "")
runbook = annotations.get("runbook", "")
# Build a useful message: summary + description + runbook steps
parts = []
if summary: parts.append(summary)
if description and description != summary: parts.append(description)
if runbook: parts.append("STEPS: " + runbook)
message = " | ".join(parts) if parts else labels.get("alertname", "Unknown alert")
payload = {
"title": labels.get("alertname", "Unknown"),
"severity": labels.get("severity", "warning").capitalize(),
"host": labels.get("instance", labels.get("host", "unknown")),
"message": message,
"eventId": alert.get("fingerprint", ""),
"source": "Grafana",
"status": "RESOLVED" if status == "RESOLVED" else "PROBLEM",
"acknowledged": False
}
try:
req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
headers={"Content-Type": "application/json"}, method="POST")
resp = urlopen(req, timeout=10)
if kind == "immediate": _stats["print_immediate"] += 1
print(f"[irc-notify] thermal {kind} sent: {payload.get('title','?')[:50]}", file=sys.stderr)
return True
print(f"[irc-notify] Thermal print sent: {resp.read().decode()}", file=sys.stderr)
except Exception as e:
print(f"[irc-notify] thermal {kind} failed: {e}", file=sys.stderr)
return False
print(f"[irc-notify] Thermal print failed: {e}", file=sys.stderr)
def fingerprint_of(alert):
fp = alert.get("fingerprint", "")
if fp: return fp
def should_print(alert):
labels = alert.get("labels", {})
target = labels.get("pod") or labels.get("instance") or labels.get("deployment") or labels.get("statefulset") or labels.get("namespace") or ""
return f"{labels.get('alertname','?')}/{labels.get('namespace','')}/{target}"
def is_critical(alert):
return alert.get("labels", {}).get("severity", "").lower() in ("critical", "disaster", "page")
def is_immediate_label(alert):
return alert.get("labels", {}).get("alert_channel") == "thermal_print_immediate"
def is_batched_label(alert):
return alert.get("labels", {}).get("alert_channel") == "thermal_print"
def add_to_digest(alert):
"""Add an alert to the digest buffer. Returns True if the buffer GREW
(new fingerprint), False if it was a dedup, resolution, or no-op.
"""
if not THERMAL_PRINT_ENABLED: return False
fp = fingerprint_of(alert)
status = alert.get("status", "firing").lower()
with _buffer_lock:
if status == "resolved":
if fp in _buffer:
del _buffer[fp]
_stats["buffer_resolved"] += 1
if labels.get("alert_channel") == "thermal_print": return True
if labels.get("severity", "").lower() in ("critical", "disaster"): return True
if alert.get("status", "").upper() == "RESOLVED": return False
return False
if fp in _buffer:
_buffer[fp]["last_seen"] = time.time()
_buffer[fp]["alert"] = alert
_stats["buffer_dedup"] += 1
return False
_buffer[fp] = {"alert": alert, "first_seen": time.time(), "last_seen": time.time()}
_stats["buffer_added"] += 1
return True
def build_digest_payload():
with _buffer_lock:
items = list(_buffer.values())
if not items: return None
by_name = defaultdict(list)
for item in items:
labels = item["alert"].get("labels", {})
by_name[labels.get("alertname", "Unknown")].append(item)
lines = []
for name, group in sorted(by_name.items()):
targets = []
for it in group[:5]:
labels = it["alert"].get("labels", {})
t = (labels.get("pod") or labels.get("instance") or labels.get("deployment")
or labels.get("statefulset") or labels.get("namespace") or "?")
targets.append(t)
more = f" (+{len(group)-5})" if len(group) > 5 else ""
sevs = sorted({it["alert"].get("labels", {}).get("severity", "warning") for it in group})
lines.append(f"[{'/'.join(sevs)}] {name} x{len(group)}: {', '.join(targets)}{more}")
now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
title = f"Alert digest: {len(items)} firing"
body = "\n".join([
f"=== {title} ===",
f"as of {now}",
"",
*lines,
"",
"Stream: #alerts (IRC) | Triage: grafana-noc1.iamworkin.lan",
"Force-flush: POST irc-notify.monitoring.svc:9119/flush",
])
return {"title": title, "severity": "Warning", "host": "monitoring",
"message": body, "eventId": f"digest-{int(time.time())}",
"source": "Grafana digest", "status": "PROBLEM", "acknowledged": False}
def flush_digest():
payload = build_digest_payload()
if payload is None:
print("[irc-notify] flush: buffer empty, no digest sent", file=sys.stderr)
return False
sent = post_thermal(payload, "digest")
with _buffer_lock:
_buffer.clear()
if sent: _stats["digest_flushed"] += 1
return sent
def digest_loop():
global _last_flush_time
while True:
try:
now = time.time()
elapsed = now - _last_flush_time
if elapsed >= BATCH_INTERVAL_MIN * 60:
print(f"[irc-notify] digest tick: interval reached ({BATCH_INTERVAL_MIN}m); buffer={len(_buffer)}", file=sys.stderr)
flush_digest()
_last_flush_time = now
elif len(_buffer) >= BATCH_MAX_PENDING:
print(f"[irc-notify] digest tick: buffer full ({len(_buffer)}); force flush", file=sys.stderr)
flush_digest()
_last_flush_time = now
time.sleep(15)
except Exception as e:
print(f"[irc-notify] digest loop error: {e}", file=sys.stderr)
time.sleep(60)
class Handler(BaseHTTPRequestHandler):
def do_POST(self):
if self.path == "/flush":
ok = flush_digest()
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
self.wfile.write(json.dumps({"flushed": ok, "buffer_after": len(_buffer)}).encode())
return
_stats["webhooks_received"] += 1
length = int(self.headers.get("Content-Length", 0))
body = json.loads(self.rfile.read(length)) if length else {}
for alert in body.get("alerts", []):
@@ -1499,56 +1383,22 @@ data:
msg = f"{icon}{sev_tag} {name}: {summary}"
if desc: msg += f"\n {desc}"
send_irc(msg)
# Thermal routing — EVERYTHING (including criticals) goes into
# the hourly digest. Only the explicit `alert_channel=thermal_print_immediate`
# label bypasses, and even that flushes-the-current-digest rather
# than printing a standalone job, so the same fingerprint can't
# spam the printer per webhook cycle.
if status == "RESOLVED":
add_to_digest(alert) # removes from buffer
continue
if is_immediate_label(alert):
# Explicit opt-in for "paper this NOW" — first arrival of a
# new fingerprint triggers an immediate digest flush; repeat
# webhooks for the same fingerprint dedupe in the buffer
# until the next interval or until the alert resolves.
new_in_buffer = add_to_digest(alert)
if new_in_buffer:
global _last_flush_time
flush_digest()
_last_flush_time = time.time()
elif is_critical(alert) or is_batched_label(alert):
add_to_digest(alert)
# else: IRC-only (warnings without thermal_print label)
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
if should_print(alert): send_thermal_print(alert)
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(b'{"status":"ok"}')
def do_GET(self):
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
with _buffer_lock:
alertnames = sorted({it["alert"].get("labels", {}).get("alertname", "?") for it in _buffer.values()})
depth = len(_buffer)
info = {
"service": "irc-notify",
"config": {"thermal_print_enabled": THERMAL_PRINT_ENABLED,
"batch_interval_min": BATCH_INTERVAL_MIN,
"batch_max_pending": BATCH_MAX_PENDING,
"irc_target": f"{IRC_HOST}:{IRC_PORT} {IRC_CHANNEL}",
"print_web_url": PRINT_WEB_URL},
"buffer": {"depth": depth, "alertnames": alertnames,
"seconds_since_last_flush": int(time.time() - _last_flush_time),
"seconds_until_next_flush": max(0, int(BATCH_INTERVAL_MIN*60 - (time.time() - _last_flush_time)))},
"stats": _stats,
}
self.wfile.write(json.dumps(info, indent=2).encode())
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(json.dumps({"service":"irc-notify","thermal_print":PRINT_ENABLED}).encode())
def log_message(self, format, *args):
print(f"[irc-notify] {args[0]}", file=sys.stderr)
if __name__ == "__main__":
threading.Thread(target=digest_loop, daemon=True).start()
server = HTTPServer(("0.0.0.0", 9119), Handler)
print(f"[irc-notify] :9119 -> IRC {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} | thermal={'ON' if THERMAL_PRINT_ENABLED else 'OFF'} | digest={BATCH_INTERVAL_MIN}m max={BATCH_MAX_PENDING}", file=sys.stderr)
print(f"IRC alert relay :9119 -> {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} (thermal: {PRINT_ENABLED})")
server.serve_forever()
# =============================================================================
@@ -3659,7 +3509,7 @@ data:
- refId: A
relativeTimeRange: {from: 300, to: 0}
datasourceUid: prometheus
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == 0', instant: true, refId: A}
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-.+)"} == 0', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__

View File

@@ -67,6 +67,8 @@ public sealed class FleetManifestLintTests
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
["github-runner-devicemgmt"] = "https://github.com/astoltz/FlowerCore.DeviceManagement",
["github-runner-worldbuilder"] = "https://github.com/astoltz/FlowerCore.WorldBuilder",
};
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
@@ -80,6 +82,8 @@ public sealed class FleetManifestLintTests
"github-runner-chat",
"github-runner-mysql",
"github-runner-kiosk-linux",
"github-runner-devicemgmt",
"github-runner-worldbuilder",
};
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
@@ -234,7 +238,7 @@ public sealed class FleetManifestLintTests
{
deployments.Should().ContainKey(expectedRunner.Key);
var container = deployments[expectedRunner.Key].ContainerMappings().Should().ContainSingle().Subject;
var container = RunnerContainer(deployments[expectedRunner.Key]);
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
EnvValue(container, "EPHEMERAL").Should().Be("true");
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
@@ -250,7 +254,7 @@ public sealed class FleetManifestLintTests
{
foreach (var deployment in GitHubRunnerDeployments().Values)
{
var container = deployment.ContainerMappings().Should().ContainSingle().Subject;
var container = RunnerContainer(deployment);
foreach (var expectedEnv in WritableRunnerEnv)
{
@@ -311,7 +315,7 @@ public sealed class FleetManifestLintTests
monitoring.Should().Contain("MacMiniRunnerOffline");
monitoring.Should().Contain("LinuxRunnerOffline");
monitoring.Should().Contain("kube_deployment_status_replicas_ready");
monitoring.Should().Contain("github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))");
monitoring.Should().Contain("github-runner(|-.+)");
monitoring.Should().Contain("folder: CI Alerts");
monitoring.Should().Contain("uid: linux-runner-offline");
monitoring.Should().Contain("alert_channel: irc");
@@ -387,38 +391,6 @@ public sealed class FleetManifestLintTests
violations.Should().BeEmpty();
}
[Fact]
public void RemoteDesktopPoolCrds_MustExplicitlyOptInHookReadyTemplates()
{
var expectedModes = new Dictionary<string, string?>(StringComparer.Ordinal)
{
["browser-only"] = null,
["opensuse-xfce"] = "LateAttach",
["dev-workstation"] = "LateAttach",
["ai-station"] = "LateAttach",
["linux-xfce"] = "LateAttach",
["linux-xfce-rdp"] = "LateAttach",
};
var pools = Inventory.Documents
.Where(document => document.Kind == "RemoteDesktopPoolCrd")
.Where(document => document.RelativePath == "fc-desktop/remotedesktop-pools.yaml")
.ToDictionary(
document => document.Scalar("spec", "templateSlug") ?? string.Empty,
StringComparer.Ordinal);
pools.Keys.Should().BeEquivalentTo(expectedModes.Keys);
foreach (var expected in expectedModes)
{
var pool = pools[expected.Key];
pool.Namespace.Should().Be("fc-desktop");
pool.Scalar("spec", "desiredSize").Should().Be("1");
pool.Scalar("spec", "enabled").Should().Be("true");
pool.Scalar("spec", "reconcileNow").Should().Be("true");
pool.Scalar("spec", "userVolumeMode").Should().Be(expected.Value);
}
}
[Fact]
public void PublicEgressDeployments_MustOptOutOfIamworkinLanSearchSuffixes()
{
@@ -673,6 +645,15 @@ public sealed class FleetManifestLintTests
return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null;
}
private static YamlMappingNode RunnerContainer(ManifestDocument deployment)
{
return deployment.ContainerMappings()
.Where(container => string.Equals(ManifestNodeExtensions.Scalar(container, "name"), "runner", StringComparison.Ordinal))
.Should()
.ContainSingle($"{deployment.Name} must keep exactly one main runner container")
.Subject;
}
private static string? EnvSecretName(YamlMappingNode container, string name)
{
return EnvMapping(container, name) is { } env