Compare commits

..

1 Commits

Author SHA1 Message Date
Andrew Stoltz
9a4a8264d9 github-runner: add DM and WorldBuilder runners 2026-05-18 17:44:29 -05:00
15 changed files with 726 additions and 991 deletions

27
apps/brochure/README.md Normal file
View File

@@ -0,0 +1,27 @@
# FlowerCore Brochure
`apps/brochure` hosts the public brochure split from `FlowerCore.Intranet.Web`.
ArgoCD's `apps/*` ApplicationSet will create `infra-brochure` after this
directory lands on `main`.
## Runtime
- Host: `https://brochure.flowercore.io`
- Namespace: `brochure`
- Deployment: `brochure-web`
- Image: `localhost/fc-brochure-web:v20260524-sprint32`
- Port: `8080`
- Public route method allowlist: `GET` and `HEAD`
## Operator Actions
1. Publish and import `localhost/fc-brochure-web:v20260524-sprint32` to every
RKE2 node before sync, using the same podman save + `ctr images import`
flow as the Intranet deployment.
2. Create the Cloudflare DNS record for `brochure.flowercore.io` pointing at
the FlowerCore public edge.
3. Verify `infra-brochure` appears in ArgoCD, the certificate becomes Ready,
and `GET https://brochure.flowercore.io/` returns `200`.
The route intentionally does not expose `/ops/*` or `/admin/*`; the Brochure
web app returns `404` for those paths and Traefik only forwards read methods.

131
apps/brochure/brochure.yaml Normal file
View File

@@ -0,0 +1,131 @@
# FlowerCore Brochure public host
#
# Thin Blazor host for public What's New, walkthrough, and gallery content
# carved out of FlowerCore.Intranet.Web. The ApplicationSet creates
# infra-brochure from this directory after merge.
---
apiVersion: v1
kind: Namespace
metadata:
name: brochure
labels:
app.kubernetes.io/part-of: flowercore
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: brochure-web
namespace: brochure
labels:
app: brochure-web
app.kubernetes.io/name: brochure-web
app.kubernetes.io/part-of: flowercore
spec:
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app: brochure-web
template:
metadata:
labels:
app: brochure-web
app.kubernetes.io/name: brochure-web
app.kubernetes.io/part-of: flowercore
spec:
containers:
- name: brochure-web
image: localhost/fc-brochure-web:v20260524-sprint32
imagePullPolicy: Never
ports:
- containerPort: 8080
name: http
env:
- name: ASPNETCORE_ENVIRONMENT
value: Production
- name: ASPNETCORE_URLS
value: "http://+:8080"
resources:
requests:
cpu: "25m"
memory: "128Mi"
limits:
cpu: "500m"
memory: "512Mi"
readinessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 30
periodSeconds: 30
securityContext:
runAsNonRoot: true
runAsUser: 1654
runAsGroup: 1654
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop:
- ALL
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
name: brochure-web
namespace: brochure
labels:
app: brochure-web
app.kubernetes.io/name: brochure-web
app.kubernetes.io/part-of: flowercore
spec:
type: ClusterIP
selector:
app: brochure-web
ports:
- name: http
port: 8080
targetPort: http
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: brochure-web-tls
namespace: brochure
spec:
secretName: brochure-web-tls
issuerRef:
name: step-ca-acme
kind: ClusterIssuer
dnsNames:
- brochure.flowercore.io
duration: 720h
renewBefore: 240h
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: brochure-web-public
namespace: brochure
spec:
entryPoints:
- websecure
routes:
- match: Host(`brochure.flowercore.io`) && (Method(`GET`) || Method(`HEAD`))
kind: Rule
services:
- name: brochure-web
port: 8080
tls:
secretName: brochure-web-tls

View File

@@ -1,33 +0,0 @@
# FlowerCore Remote Desktop - session pod resource defaults
#
# Namespace-level LimitRange for Sprint 44 Phase 1. This defends the
# fc-desktop namespace from unbounded container requests while the
# per-tenant advisory FairShareEvaluator lands in FlowerCore.RemoteDesktop.
apiVersion: v1
kind: LimitRange
metadata:
name: fc-desktop-pod-defaults
namespace: fc-desktop
labels:
app.kubernetes.io/name: fc-desktop
app.kubernetes.io/part-of: remotedesktop
app.kubernetes.io/component: capacity-guard
app.kubernetes.io/managed-by: argocd
flowercore.io/owner: infra
annotations:
flowercore.io/phase: sprint-44-cx-9-phase-a
spec:
limits:
- type: Container
default:
cpu: "1.0"
memory: "2Gi"
defaultRequest:
cpu: "500m"
memory: "1Gi"
max:
cpu: "2.0"
memory: "4Gi"
min:
cpu: "100m"
memory: "128Mi"

View File

@@ -1,36 +0,0 @@
# FlowerCore Remote Desktop - namespace ResourceQuota (GitOps-managed)
#
# Adopts the live fc-desktop-session-cap object created during the
# 2026-05-19 prewarm-cascade triage. Sprint 44 Phase 1 keeps the pod,
# CPU, and memory guard unchanged, then adds storage/PVC backstops from
# the fc-desktop CPU expansion substrate.
#
# Two-phase deploy note:
# Phase A: apply this ResourceQuota and limitrange.yaml with the current
# FlowerCore.RemoteDesktop image.
# Phase B: bump the service image only after the RemoteDesktop service
# admission/fair-share code lands in that repo.
apiVersion: v1
kind: ResourceQuota
metadata:
name: fc-desktop-session-cap
namespace: fc-desktop
labels:
app.kubernetes.io/name: fc-desktop
app.kubernetes.io/part-of: remotedesktop
app.kubernetes.io/component: capacity-guard
app.kubernetes.io/managed-by: argocd
flowercore.io/owner: infra
annotations:
flowercore.io/rationale: |
Operator-requested limit 2026-05-19: cluster CPU exhausted by RD
pool prewarm cascade. Preserve count/pods=15 plus requests.cpu=8
and requests.memory=16Gi until capacity expansion lands.
flowercore.io/phase: sprint-44-cx-9-phase-a
spec:
hard:
count/pods: "15"
requests.cpu: "8"
requests.memory: "16Gi"
requests.storage: "500Gi"
persistentvolumeclaims: "30"

View File

@@ -0,0 +1,33 @@
# Explicit ArgoCD Application shape for bootstrap/review.
#
# The live bluejay-infra ApplicationSet already discovers apps/* directories
# and creates this same Application name (`infra-fc-devicemgmt`) automatically.
# Keep repoURL on the internal Gitea ClusterIP URL; ArgoCD does not trust the
# external step-ca HTTPS endpoint.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: infra-fc-devicemgmt
namespace: argocd
labels:
app.kubernetes.io/name: fc-devicemgmt
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
project: default
source:
repoURL: http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git
targetRevision: main
path: apps/fc-devicemgmt
destination:
server: https://kubernetes.default.svc
namespace: fc-devicemgmt
syncPolicy:
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=true
- ServerSideApply=true

View File

@@ -47,7 +47,7 @@ spec:
fsGroupChangePolicy: OnRootMismatch
containers:
- name: operator
image: localhost/fc-devicemgmt-operator:v20260519-sp34cl3-fix
image: localhost/fc-devicemgmt-operator:v20260512-cx5
imagePullPolicy: Never
ports:
- name: metrics

View File

@@ -4,22 +4,6 @@
# Sprint 9+ lane. This manifest is static-valid without requiring the image to
# exist yet; import localhost/fc-devicemgmt-web:<tag> to all schedulable RKE2
# nodes before letting ArgoCD sync a live rollout.
#
# SCALED TO 0 — 2026-05-19 morning-routine cleanup.
# The Web pod cannot start until TWO upstream gaps close:
# 1. MySQL DB instance `flowercore_devicemgmt` (user `fc_devicemgmt`) is
# provisioned via fc-mysql Manager. The cluster currently has ZERO
# MySqlInstanceCrds and no `mysql.fc-mysql.svc:3306` Service, so the
# deployment-web container env `FlowerCore__Database__Host=mysql.fc-mysql.svc`
# points at nothing. Provision via the fc-mysql Manager UI/REST/MCP.
# 2. 1Password vault item `IAmWorkin/FlowerCore DeviceManagement Runtime`
# with 5 fields (DB-Password, mtls-ca.pem, mtls-client.crt, mtls-client.key,
# mtls-chain.pem) — see apps/fc-devicemgmt/1password-item.yaml. Mint mTLS
# from step-ca-agent ClusterIssuer per ADR-126; DB-Password must match the
# password configured for the MySQL user.
# Re-enable: change replicas back to 2 after both gaps close. The image tag
# in this file (v20260512-cx5) MAY also need a refresh — it predates the
# Sprint 34 Cl-3 operator fix; Web may have an analogous bug.
apiVersion: apps/v1
kind: Deployment
metadata:
@@ -36,7 +20,7 @@ metadata:
annotations:
flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
spec:
replicas: 0
replicas: 2
revisionHistoryLimit: 3
selector:
matchLabels:

View File

@@ -1,2 +0,0 @@
*.sh text eol=lf
Dockerfile text eol=lf

View File

@@ -1,44 +0,0 @@
FROM myoung34/github-runner:latest
ARG RUBY_VERSION=3.3.11
ARG RUBY_MINOR=3.3
ARG RUBY_BUILD_VERSION=v20260326
ARG RUNNER_UID=1001
ARG RUNNER_GID=1001
ENV RUNNER_TOOL_CACHE=/home/runner/_tool
ENV RUNNER_RUBY_TOOLCACHE=/opt/runner-toolcache
ENV PATH="/home/runner/_tool/Ruby/${RUBY_MINOR}/x64/bin:/opt/runner-toolcache/Ruby/${RUBY_MINOR}/x64/bin:${PATH}"
USER root
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
autoconf \
bison \
build-essential \
ca-certificates \
curl \
libdb-dev \
libffi-dev \
libgdbm-dev \
libgmp-dev \
libncurses-dev \
libreadline-dev \
libssl-dev \
libyaml-dev \
patch \
pkg-config \
uuid-dev \
zlib1g-dev \
&& curl -fsSL "https://github.com/rbenv/ruby-build/archive/refs/tags/${RUBY_BUILD_VERSION}.tar.gz" -o /tmp/ruby-build.tar.gz \
&& mkdir -p /tmp/ruby-build \
&& tar -xzf /tmp/ruby-build.tar.gz --strip-components=1 -C /tmp/ruby-build \
&& /tmp/ruby-build/install.sh \
&& rm -rf /tmp/ruby-build /tmp/ruby-build.tar.gz /var/lib/apt/lists/*
COPY install-ruby-toolcache.sh /usr/local/bin/install-ruby-toolcache.sh
RUN chmod +x /usr/local/bin/install-ruby-toolcache.sh \
&& RUBY_VERSION="${RUBY_VERSION}" RUBY_MINOR="${RUBY_MINOR}" TOOLCACHE_ROOT="${RUNNER_RUBY_TOOLCACHE}" RUNNER_UID="${RUNNER_UID}" RUNNER_GID="${RUNNER_GID}" /usr/local/bin/install-ruby-toolcache.sh \
&& ruby -v

View File

@@ -7,17 +7,12 @@ Deployments with `kubectl`; update this manifest and let ArgoCD reconcile.
All repo-scoped Linux runners use:
- `localhost/fc-github-runner:v20260520-ruby3.3.11`, derived from
`myoung34/github-runner:latest`
- `ACCESS_TOKEN` from the `github-runner-token` Secret
- `RUN_AS_ROOT=false`
- `EPHEMERAL=true`
- `LABELS=self-hosted,linux,fc-build-linux`
- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
Actions tool cache
- Ruby 3.3.11 seeded into `/home/runner/_tool/Ruby/3.3/x64` from the baked
`/opt/runner-toolcache` copy so `ruby/setup-ruby@v1` can discover it on
self-hosted `ubuntu-20.04-x64` runners
`github-runner` for `FlowerCore.Common` is single-replica because it retains the
original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
@@ -33,33 +28,9 @@ Sprint 32 final long-tail wave adds 16 two-replica Deployments:
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
`FlowerCore.MenuBoard`.
## Image Build
Ruby is baked with a pinned `ruby-build` release and Ruby patch version. The pod
still mounts an `emptyDir` over `/home/runner`, so the `setup-runner-home` init
container copies the baked toolcache from `/opt/runner-toolcache/Ruby` into
`/home/runner/_tool/Ruby` before the runner container starts.
```bash
cd apps/github-runner
podman build -t localhost/fc-github-runner:v20260520-ruby3.3.11 .
podman run --rm localhost/fc-github-runner:v20260520-ruby3.3.11 ruby -v
podman run --rm localhost/fc-github-runner:v20260520-ruby3.3.11 \
test -f /opt/runner-toolcache/Ruby/3.3/x64.complete
podman save localhost/fc-github-runner:v20260520-ruby3.3.11 \
-o fc-github-runner-v20260520-ruby3.3.11.tar
```
Import the saved image on every schedulable RKE2 node before ArgoCD rolls the
Deployments:
```bash
for node in rke2-server rke2-agent1 rke2-agent2; do
scp fc-github-runner-v20260520-ruby3.3.11.tar "$node:/tmp/"
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images rm localhost/fc-github-runner:v20260520-ruby3.3.11 || true'
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-github-runner-v20260520-ruby3.3.11.tar'
done
```
Sprint 37 Cx-2 closes the audited Linux runner gaps for
`FlowerCore.DeviceManagement` and `FlowerCore.WorldBuilder` with the same
two-replica `emptyDir` pattern.
## Post-Merge Proof
@@ -69,14 +40,6 @@ After the PR is merged and ArgoCD syncs, verify the runner fleet:
kubectl -n github-runner get deploy,pods,pvc
```
Verify the Ruby toolcache in a fresh pod:
```bash
kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- ruby -v
kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- sh -c \
'echo "$RUNNER_TOOL_CACHE" && test -f "$RUNNER_TOOL_CACHE/Ruby/3.3/x64.complete"'
```
Verify GitHub registration for the repo-scoped runners:
```bash
@@ -88,7 +51,7 @@ for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore
FlowerCore.Distribution FlowerCore.Scoreboard FlowerCore.SegmentDisplay \
FlowerCore.Signage.Contracts FlowerCore.SignalControl FlowerCore.Intranet.Web \
FlowerCore.Provisioning FlowerCore.Redis FlowerCore.MessageBoard \
FlowerCore.MenuBoard; do
FlowerCore.MenuBoard FlowerCore.DeviceManagement FlowerCore.WorldBuilder; do
echo "=== $repo ==="
gh api "/repos/astoltz/$repo/actions/runners" \
--jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}'
@@ -105,15 +68,25 @@ gh run list --repo astoltz/FlowerCore.Shared.Pos \
If the latest run is still queued after runner registration, rerun the workflow
from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
## Sprint 37 Cx-2 Gap Audit
The 2026-05-18 GitHub workflow scan found these remaining repos with
`runs-on: [self-hosted, linux, fc-build-linux]` but no K8s runner Deployment:
`FlowerCore.AiStation.Linux`, `FlowerCore.PHP`, `FlowerCore.PiManager`,
`FlowerCore.Shared.Barcodes`, `FlowerCore.Shared.Lookup`,
`FlowerCore.Shared.Nodes`, `FlowerCore.Shared.PrintClient`,
`FlowerCore.Shared.Relay`, `FlowerCore.Shared.ShowRunner`, and
`FlowerCore.Shared.Storage`.
Mixed/platform repos also have Linux workflow legs but need owner review before
adding Linux runner Deployments: `FlowerCore.Library.Mac`,
`FlowerCore.Signage.Agent.AppleTv`, and `FlowerCore.Signage.Player.Wpf`.
## Failure Notes
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
`DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
present on the runner pod.
- `ruby/setup-ruby@v1` says self-hosted runners must install Ruby in
`$RUNNER_TOOL_CACHE`: check that the init container copied
`/opt/runner-toolcache/Ruby` into `/home/runner/_tool/Ruby` and that
`/home/runner/_tool/Ruby/3.3/x64.complete` exists.
- `404` during runner registration: the fine-grained PAT is valid but missing
repository access for that repo. Add the repo to the PAT access list; the PAT
value does not change.

File diff suppressed because it is too large Load Diff

View File

@@ -1,19 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
RUBY_VERSION="${RUBY_VERSION:-3.3.11}"
RUBY_MINOR="${RUBY_MINOR:-3.3}"
TOOLCACHE_ROOT="${TOOLCACHE_ROOT:-/opt/runner-toolcache}"
RUNNER_UID="${RUNNER_UID:-1001}"
RUNNER_GID="${RUNNER_GID:-1001}"
RUBY_PREFIX="${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64"
mkdir -p "${TOOLCACHE_ROOT}/Ruby"
RUBY_CONFIGURE_OPTS="${RUBY_CONFIGURE_OPTS:---disable-install-doc --disable-yjit}" ruby-build "${RUBY_VERSION}" "${RUBY_PREFIX}"
touch "${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64.complete"
ln -sfn "${RUBY_VERSION}" "${TOOLCACHE_ROOT}/Ruby/${RUBY_MINOR}"
"${RUBY_PREFIX}/bin/ruby" -v
chown -R "${RUNNER_UID}:${RUNNER_GID}" "${TOOLCACHE_ROOT}"
chmod -R a+rX "${TOOLCACHE_ROOT}"

View File

@@ -280,14 +280,13 @@ data:
printer_model: "NuPrint 210"
# Print.Web health (Blazor app on edge2:5200)
# Target `/health` (anonymous) — root path requires API key auth and returns 401.
- job_name: "probe-printweb"
metrics_path: /probe
params:
module: [http_2xx]
scrape_interval: 30s
static_configs:
- targets: ["http://10.0.57.16:5200/health"]
- targets: ["http://10.0.57.16:5200/"]
labels:
instance: "print-web"
service: "print-web"
@@ -730,7 +729,7 @@ data:
expr: |
kube_deployment_status_replicas_ready{
namespace="github-runner",
deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"
deployment=~"github-runner(|-.+)"
} == 0
for: 5m
labels:
@@ -1274,55 +1273,24 @@ metadata:
data:
notify.py: |
#!/usr/bin/env python3
"""HTTP->IRC alert relay with thermal-printer DIGEST forwarding.
Listens on :9119, posts to #alerts on UnrealIRCd, forwards to Print.Web
/api/print/alert. Thermal printing is BATCHED into hourly digests by
default so the printer no longer spam-fires per Grafana webhook.
Routing (per Grafana webhook alert):
- IRC: always per-event (operator likes the stream)
- Thermal printer:
* severity in {critical,disaster,page} OR
label alert_channel=thermal_print_immediate -> print NOW
* label alert_channel=thermal_print -> enqueue into hourly digest
* everything else -> IRC only
- RESOLVED webhooks remove the alert from the digest buffer
Env vars (defaults preserve old behavior on first deploy):
THERMAL_PRINT_ENABLED default "true" - master kill switch
BATCH_INTERVAL_MIN default "60" - minutes between digest prints
BATCH_MAX_PENDING default "50" - force-flush threshold
HTTP surface:
POST / - Grafana webhook entry
POST /flush - manual digest flush (idempotent)
GET / - status + config + buffer depth + stats
"""HTTP->IRC alert relay with thermal printer forwarding for Grafana webhooks.
Listens on :9119, posts to #alerts on UnrealIRCd via raw IRC protocol.
Alerts tagged alert_channel=thermal_print also POST to Print.Web /api/print/alert.
"""
import json, os, socket, sys, threading, time
from collections import defaultdict
from datetime import datetime, timezone
import json, socket, sys, time
from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.request import Request, urlopen
from urllib.error import URLError
THERMAL_PRINT_ENABLED = os.environ.get("THERMAL_PRINT_ENABLED", "true").lower() == "true"
BATCH_INTERVAL_MIN = int(os.environ.get("BATCH_INTERVAL_MIN", "60"))
BATCH_MAX_PENDING = int(os.environ.get("BATCH_MAX_PENDING", "50"))
IRC_HOST = os.environ.get("IRC_HOST", "unrealircd.irc.svc")
IRC_PORT = int(os.environ.get("IRC_PORT", "6667"))
IRC_NICK = os.environ.get("IRC_NICK", "grafana-bot")
IRC_CHANNEL = os.environ.get("IRC_CHANNEL", "#alerts")
PRINT_WEB_URL = os.environ.get("PRINT_WEB_URL", "http://10.0.57.16:5200/api/print/alert")
_buffer_lock = threading.Lock()
_buffer = {} # fingerprint -> {"alert": dict, "first_seen": float, "last_seen": float}
_last_flush_time = time.time()
_stats = {"webhooks_received": 0, "irc_sent": 0, "print_immediate": 0,
"digest_flushed": 0, "buffer_dedup": 0, "buffer_added": 0,
"buffer_resolved": 0, "started_at": time.time()}
IRC_HOST = "unrealircd.irc.svc" # short name: CoreDNS ndots:5 + iamworkin.lan template hijacks full .cluster.local (see memory)
IRC_PORT = 6667
IRC_NICK = "grafana-bot"
IRC_CHANNEL = "#alerts"
PRINT_WEB_URL = "http://10.0.57.16:5200/api/print/alert"
PRINT_ENABLED = True
def send_irc(message):
"""Connect, handle PING, join, send, quit."""
try:
sock = socket.create_connection((IRC_HOST, IRC_PORT), timeout=15)
sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
@@ -1355,137 +1323,52 @@ data:
time.sleep(0.5)
sock.sendall(b"QUIT :alert delivered\r\n")
sock.close()
_stats["irc_sent"] += 1
return True
except Exception as e:
print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
return False
def post_thermal(payload, kind):
if not THERMAL_PRINT_ENABLED:
print(f"[irc-notify] thermal disabled; skip {kind} ({payload.get('title','?')[:40]})", file=sys.stderr)
return False
def send_thermal_print(alert):
if not PRINT_ENABLED: return
labels = alert.get("labels", {})
annotations = alert.get("annotations", {})
status = alert.get("status", "firing").upper()
summary = annotations.get("summary", "")
description = annotations.get("description", "")
runbook = annotations.get("runbook", "")
# Build a useful message: summary + description + runbook steps
parts = []
if summary: parts.append(summary)
if description and description != summary: parts.append(description)
if runbook: parts.append("STEPS: " + runbook)
message = " | ".join(parts) if parts else labels.get("alertname", "Unknown alert")
payload = {
"title": labels.get("alertname", "Unknown"),
"severity": labels.get("severity", "warning").capitalize(),
"host": labels.get("instance", labels.get("host", "unknown")),
"message": message,
"eventId": alert.get("fingerprint", ""),
"source": "Grafana",
"status": "RESOLVED" if status == "RESOLVED" else "PROBLEM",
"acknowledged": False
}
try:
req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
headers={"Content-Type": "application/json"}, method="POST")
resp = urlopen(req, timeout=10)
if kind == "immediate": _stats["print_immediate"] += 1
print(f"[irc-notify] thermal {kind} sent: {payload.get('title','?')[:50]}", file=sys.stderr)
return True
print(f"[irc-notify] Thermal print sent: {resp.read().decode()}", file=sys.stderr)
except Exception as e:
print(f"[irc-notify] thermal {kind} failed: {e}", file=sys.stderr)
return False
print(f"[irc-notify] Thermal print failed: {e}", file=sys.stderr)
def fingerprint_of(alert):
fp = alert.get("fingerprint", "")
if fp: return fp
def should_print(alert):
labels = alert.get("labels", {})
target = labels.get("pod") or labels.get("instance") or labels.get("deployment") or labels.get("statefulset") or labels.get("namespace") or ""
return f"{labels.get('alertname','?')}/{labels.get('namespace','')}/{target}"
def is_critical(alert):
return alert.get("labels", {}).get("severity", "").lower() in ("critical", "disaster", "page")
def is_immediate_label(alert):
return alert.get("labels", {}).get("alert_channel") == "thermal_print_immediate"
def is_batched_label(alert):
return alert.get("labels", {}).get("alert_channel") == "thermal_print"
def add_to_digest(alert):
"""Add an alert to the digest buffer. Returns True if the buffer GREW
(new fingerprint), False if it was a dedup, resolution, or no-op.
"""
if not THERMAL_PRINT_ENABLED: return False
fp = fingerprint_of(alert)
status = alert.get("status", "firing").lower()
with _buffer_lock:
if status == "resolved":
if fp in _buffer:
del _buffer[fp]
_stats["buffer_resolved"] += 1
if labels.get("alert_channel") == "thermal_print": return True
if labels.get("severity", "").lower() in ("critical", "disaster"): return True
if alert.get("status", "").upper() == "RESOLVED": return False
return False
if fp in _buffer:
_buffer[fp]["last_seen"] = time.time()
_buffer[fp]["alert"] = alert
_stats["buffer_dedup"] += 1
return False
_buffer[fp] = {"alert": alert, "first_seen": time.time(), "last_seen": time.time()}
_stats["buffer_added"] += 1
return True
def build_digest_payload():
with _buffer_lock:
items = list(_buffer.values())
if not items: return None
by_name = defaultdict(list)
for item in items:
labels = item["alert"].get("labels", {})
by_name[labels.get("alertname", "Unknown")].append(item)
lines = []
for name, group in sorted(by_name.items()):
targets = []
for it in group[:5]:
labels = it["alert"].get("labels", {})
t = (labels.get("pod") or labels.get("instance") or labels.get("deployment")
or labels.get("statefulset") or labels.get("namespace") or "?")
targets.append(t)
more = f" (+{len(group)-5})" if len(group) > 5 else ""
sevs = sorted({it["alert"].get("labels", {}).get("severity", "warning") for it in group})
lines.append(f"[{'/'.join(sevs)}] {name} x{len(group)}: {', '.join(targets)}{more}")
now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
title = f"Alert digest: {len(items)} firing"
body = "\n".join([
f"=== {title} ===",
f"as of {now}",
"",
*lines,
"",
"Stream: #alerts (IRC) | Triage: grafana-noc1.iamworkin.lan",
"Force-flush: POST irc-notify.monitoring.svc:9119/flush",
])
return {"title": title, "severity": "Warning", "host": "monitoring",
"message": body, "eventId": f"digest-{int(time.time())}",
"source": "Grafana digest", "status": "PROBLEM", "acknowledged": False}
def flush_digest():
payload = build_digest_payload()
if payload is None:
print("[irc-notify] flush: buffer empty, no digest sent", file=sys.stderr)
return False
sent = post_thermal(payload, "digest")
with _buffer_lock:
_buffer.clear()
if sent: _stats["digest_flushed"] += 1
return sent
def digest_loop():
global _last_flush_time
while True:
try:
now = time.time()
elapsed = now - _last_flush_time
if elapsed >= BATCH_INTERVAL_MIN * 60:
print(f"[irc-notify] digest tick: interval reached ({BATCH_INTERVAL_MIN}m); buffer={len(_buffer)}", file=sys.stderr)
flush_digest()
_last_flush_time = now
elif len(_buffer) >= BATCH_MAX_PENDING:
print(f"[irc-notify] digest tick: buffer full ({len(_buffer)}); force flush", file=sys.stderr)
flush_digest()
_last_flush_time = now
time.sleep(15)
except Exception as e:
print(f"[irc-notify] digest loop error: {e}", file=sys.stderr)
time.sleep(60)
class Handler(BaseHTTPRequestHandler):
def do_POST(self):
if self.path == "/flush":
ok = flush_digest()
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
self.wfile.write(json.dumps({"flushed": ok, "buffer_after": len(_buffer)}).encode())
return
_stats["webhooks_received"] += 1
length = int(self.headers.get("Content-Length", 0))
body = json.loads(self.rfile.read(length)) if length else {}
for alert in body.get("alerts", []):
@@ -1500,56 +1383,22 @@ data:
msg = f"{icon}{sev_tag} {name}: {summary}"
if desc: msg += f"\n {desc}"
send_irc(msg)
# Thermal routing — EVERYTHING (including criticals) goes into
# the hourly digest. Only the explicit `alert_channel=thermal_print_immediate`
# label bypasses, and even that flushes-the-current-digest rather
# than printing a standalone job, so the same fingerprint can't
# spam the printer per webhook cycle.
if status == "RESOLVED":
add_to_digest(alert) # removes from buffer
continue
if is_immediate_label(alert):
# Explicit opt-in for "paper this NOW" — first arrival of a
# new fingerprint triggers an immediate digest flush; repeat
# webhooks for the same fingerprint dedupe in the buffer
# until the next interval or until the alert resolves.
new_in_buffer = add_to_digest(alert)
if new_in_buffer:
global _last_flush_time
flush_digest()
_last_flush_time = time.time()
elif is_critical(alert) or is_batched_label(alert):
add_to_digest(alert)
# else: IRC-only (warnings without thermal_print label)
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
if should_print(alert): send_thermal_print(alert)
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(b'{"status":"ok"}')
def do_GET(self):
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
with _buffer_lock:
alertnames = sorted({it["alert"].get("labels", {}).get("alertname", "?") for it in _buffer.values()})
depth = len(_buffer)
info = {
"service": "irc-notify",
"config": {"thermal_print_enabled": THERMAL_PRINT_ENABLED,
"batch_interval_min": BATCH_INTERVAL_MIN,
"batch_max_pending": BATCH_MAX_PENDING,
"irc_target": f"{IRC_HOST}:{IRC_PORT} {IRC_CHANNEL}",
"print_web_url": PRINT_WEB_URL},
"buffer": {"depth": depth, "alertnames": alertnames,
"seconds_since_last_flush": int(time.time() - _last_flush_time),
"seconds_until_next_flush": max(0, int(BATCH_INTERVAL_MIN*60 - (time.time() - _last_flush_time)))},
"stats": _stats,
}
self.wfile.write(json.dumps(info, indent=2).encode())
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(json.dumps({"service":"irc-notify","thermal_print":PRINT_ENABLED}).encode())
def log_message(self, format, *args):
print(f"[irc-notify] {args[0]}", file=sys.stderr)
if __name__ == "__main__":
threading.Thread(target=digest_loop, daemon=True).start()
server = HTTPServer(("0.0.0.0", 9119), Handler)
print(f"[irc-notify] :9119 -> IRC {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} | thermal={'ON' if THERMAL_PRINT_ENABLED else 'OFF'} | digest={BATCH_INTERVAL_MIN}m max={BATCH_MAX_PENDING}", file=sys.stderr)
print(f"IRC alert relay :9119 -> {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} (thermal: {PRINT_ENABLED})")
server.serve_forever()
# =============================================================================
@@ -3660,7 +3509,7 @@ data:
- refId: A
relativeTimeRange: {from: 300, to: 0}
datasourceUid: prometheus
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == 0', instant: true, refId: A}
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-.+)"} == 0', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__

View File

@@ -1,285 +0,0 @@
using FluentAssertions;
using YamlDotNet.RepresentationModel;
using Xunit;
namespace BluejayInfraLint.Tests;
[Trait("Category", "Unit")]
public sealed class FcDesktopCapacityPolicyTests
{
private static readonly ManifestInventory Inventory = ManifestInventory.Load();
[Fact]
public void FcDesktop_AppDirectoryMustExist()
{
Directory.Exists(Path.Combine(Inventory.BluejayRoot, "apps", "fc-desktop"))
.Should()
.BeTrue();
}
[Fact]
public void FcDesktop_MustHaveExactlyOneResourceQuota()
{
FcDesktopDocuments()
.Where(document => document.Kind == "ResourceQuota")
.Should()
.ContainSingle();
}
[Fact]
public void FcDesktop_ResourceQuotaMustAdoptLiveSessionCapObject()
{
var quota = ResourceQuota();
quota.RelativePath.Should().Be("fc-desktop/resourcequota.yaml");
quota.Name.Should().Be("fc-desktop-session-cap");
quota.Namespace.Should().Be("fc-desktop");
}
[Theory]
[InlineData("count/pods", "15")]
[InlineData("requests.cpu", "8")]
[InlineData("requests.memory", "16Gi")]
[InlineData("requests.storage", "500Gi")]
[InlineData("persistentvolumeclaims", "30")]
public void FcDesktop_ResourceQuotaMustDeclarePhaseOneHardLimits(string key, string value)
{
ResourceQuota().Scalar("spec", "hard", key).Should().Be(value);
}
[Fact]
public void FcDesktop_ResourceQuotaMustCarryTraceableLabels()
{
ResourceQuotaLabels()
.Should()
.Contain(new Dictionary<string, string>
{
["app.kubernetes.io/name"] = "fc-desktop",
["app.kubernetes.io/part-of"] = "remotedesktop",
["app.kubernetes.io/component"] = "capacity-guard",
["app.kubernetes.io/managed-by"] = "argocd",
["flowercore.io/owner"] = "infra",
});
}
[Fact]
public void FcDesktop_ResourceQuotaMustUseRequestsKeysForComputeCap()
{
var hardKeys = HardLimitKeys(ResourceQuota());
hardKeys.Should().Contain(new[] { "requests.cpu", "requests.memory" });
hardKeys.Should().NotContain(new[] { "cpu", "memory" });
}
[Fact]
public void FcDesktop_ResourceQuotaMustAvoidDestructiveArgoAnnotations()
{
var quota = ResourceQuota();
quota.Scalar("metadata", "annotations", "argocd.argoproj.io/hook").Should().BeNull();
quota.Scalar("metadata", "annotations", "argocd.argoproj.io/hook-delete-policy").Should().BeNull();
var syncOptions = quota.Scalar("metadata", "annotations", "argocd.argoproj.io/sync-options") ?? string.Empty;
syncOptions.Should().NotContain("Force=true");
syncOptions.Should().NotContain("Replace=true");
}
[Fact]
public void FcDesktop_ResourceQuotaMustRecordPhaseAInfraOnlyScope()
{
ResourceQuota().Scalar("metadata", "annotations", "flowercore.io/phase")
.Should()
.Be("sprint-44-cx-9-phase-a");
}
[Fact]
public void FcDesktop_MustHaveExactlyOneLimitRange()
{
FcDesktopDocuments()
.Where(document => document.Kind == "LimitRange")
.Should()
.ContainSingle();
}
[Fact]
public void FcDesktop_LimitRangeMustLiveBesideResourceQuota()
{
var limitRange = LimitRange();
limitRange.RelativePath.Should().Be("fc-desktop/limitrange.yaml");
limitRange.Name.Should().Be("fc-desktop-pod-defaults");
limitRange.Namespace.Should().Be("fc-desktop");
}
[Fact]
public void FcDesktop_LimitRangeMustHaveSingleContainerRule()
{
var limit = LimitRangeRule();
LimitRange().MappingSequence("spec", "limits").Should().ContainSingle();
ManifestNodeExtensions.Scalar(limit, "type").Should().Be("Container");
}
[Theory]
[InlineData("default", "cpu", "1.0")]
[InlineData("default", "memory", "2Gi")]
[InlineData("defaultRequest", "cpu", "500m")]
[InlineData("defaultRequest", "memory", "1Gi")]
[InlineData("max", "cpu", "2.0")]
[InlineData("max", "memory", "4Gi")]
[InlineData("min", "cpu", "100m")]
[InlineData("min", "memory", "128Mi")]
public void FcDesktop_LimitRangeMustDeclarePerPodShape(string section, string key, string value)
{
ManifestNodeExtensions.Scalar(LimitRangeRule(), section, key).Should().Be(value);
}
[Fact]
public void FcDesktop_LimitRangeMustCarryTraceableLabels()
{
LimitRangeLabels()
.Should()
.Contain(new Dictionary<string, string>
{
["app.kubernetes.io/name"] = "fc-desktop",
["app.kubernetes.io/part-of"] = "remotedesktop",
["app.kubernetes.io/component"] = "capacity-guard",
["app.kubernetes.io/managed-by"] = "argocd",
["flowercore.io/owner"] = "infra",
});
}
[Fact]
public void FcDesktop_LimitRangeMustAvoidDestructiveArgoAnnotations()
{
var limitRange = LimitRange();
limitRange.Scalar("metadata", "annotations", "argocd.argoproj.io/hook").Should().BeNull();
limitRange.Scalar("metadata", "annotations", "argocd.argoproj.io/hook-delete-policy").Should().BeNull();
var syncOptions = limitRange.Scalar("metadata", "annotations", "argocd.argoproj.io/sync-options") ?? string.Empty;
syncOptions.Should().NotContain("Force=true");
syncOptions.Should().NotContain("Replace=true");
}
[Fact]
public void FcDesktop_LimitRangeMustRecordPhaseAInfraOnlyScope()
{
LimitRange().Scalar("metadata", "annotations", "flowercore.io/phase")
.Should()
.Be("sprint-44-cx-9-phase-a");
}
[Fact]
public void FcDesktop_BluejayInfraMustNotOwnDeploymentOrService()
{
FcDesktopDocuments()
.Select(document => document.Kind)
.Should()
.NotContain(new[] { "Deployment", "Service" });
}
[Fact]
public void FcDesktop_BluejayInfraMustOnlyOwnInfraResourceKinds()
{
var allowedKinds = new HashSet<string>(StringComparer.Ordinal)
{
"Certificate",
"IngressRoute",
"NetworkPolicy",
"ResourceQuota",
"LimitRange",
};
FcDesktopDocuments()
.Select(document => document.Kind)
.Should()
.OnlyContain(kind => allowedKinds.Contains(kind));
}
[Fact]
public void FcDesktop_NetworkPolicySetMustRemainPresent()
{
FcDesktopDocuments()
.Where(document => document.Kind == "NetworkPolicy")
.Select(document => document.Name)
.Should()
.BeEquivalentTo(
"desktop-isolation",
"fc-desktop-default-deny",
"remotedesktop-web-isolation",
"cm-acme-http-solver-allow");
}
[Fact]
public void FcDesktop_TlsIngressMustRemainOwnedByInfra()
{
FcDesktopDocuments()
.Should()
.Contain(document => document.Kind == "Certificate" && document.Name == "remotedesktop-web-tls")
.And
.Contain(document => document.Kind == "IngressRoute" && document.Name == "remotedesktop-web");
}
private static IReadOnlyList<ManifestDocument> FcDesktopDocuments()
{
return Inventory.Documents
.Where(document => document.RelativePath.StartsWith("fc-desktop/", StringComparison.Ordinal))
.ToList();
}
private static ManifestDocument ResourceQuota()
{
return FcDesktopDocuments()
.Single(document => document.Kind == "ResourceQuota");
}
private static ManifestDocument LimitRange()
{
return FcDesktopDocuments()
.Single(document => document.Kind == "LimitRange");
}
private static YamlMappingNode LimitRangeRule()
{
return LimitRange()
.MappingSequence("spec", "limits")
.Single();
}
private static IReadOnlySet<string> HardLimitKeys(ManifestDocument document)
{
var hard = ManifestNodeExtensions.Mapping(document.Root, "spec", "hard")
?? throw new InvalidOperationException($"{document.Descriptor} is missing spec.hard.");
return hard.Children.Keys
.OfType<YamlScalarNode>()
.Select(key => key.Value)
.Where(value => !string.IsNullOrWhiteSpace(value))
.Cast<string>()
.ToHashSet(StringComparer.Ordinal);
}
private static IReadOnlyDictionary<string, string> ResourceQuotaLabels()
{
return Labels(ResourceQuota());
}
private static IReadOnlyDictionary<string, string> LimitRangeLabels()
{
return Labels(LimitRange());
}
private static IReadOnlyDictionary<string, string> Labels(ManifestDocument document)
{
var labels = ManifestNodeExtensions.Mapping(document.Root, "metadata", "labels")
?? throw new InvalidOperationException($"{document.Descriptor} is missing metadata.labels.");
return labels.Children
.Where(entry => entry.Key is YamlScalarNode && entry.Value is YamlScalarNode)
.ToDictionary(
entry => ((YamlScalarNode)entry.Key).Value ?? string.Empty,
entry => ((YamlScalarNode)entry.Value).Value ?? string.Empty,
StringComparer.Ordinal);
}
}

View File

@@ -67,6 +67,8 @@ public sealed class FleetManifestLintTests
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
["github-runner-devicemgmt"] = "https://github.com/astoltz/FlowerCore.DeviceManagement",
["github-runner-worldbuilder"] = "https://github.com/astoltz/FlowerCore.WorldBuilder",
};
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
@@ -80,6 +82,8 @@ public sealed class FleetManifestLintTests
"github-runner-chat",
"github-runner-mysql",
"github-runner-kiosk-linux",
"github-runner-devicemgmt",
"github-runner-worldbuilder",
};
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
@@ -311,7 +315,7 @@ public sealed class FleetManifestLintTests
monitoring.Should().Contain("MacMiniRunnerOffline");
monitoring.Should().Contain("LinuxRunnerOffline");
monitoring.Should().Contain("kube_deployment_status_replicas_ready");
monitoring.Should().Contain("github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))");
monitoring.Should().Contain("github-runner(|-.+)");
monitoring.Should().Contain("folder: CI Alerts");
monitoring.Should().Contain("uid: linux-runner-offline");
monitoring.Should().Contain("alert_channel: irc");
@@ -430,6 +434,7 @@ public sealed class FleetManifestLintTests
var expectedFiles = new[]
{
"1password-item.yaml",
"argocd-application.yaml",
"certificate-web.yaml",
"clusterrole-operator.yaml",
"clusterrolebinding-operator.yaml",
@@ -585,15 +590,17 @@ public sealed class FleetManifestLintTests
}
[Fact]
public void FcDeviceManagement_MustRelyOnApplicationSetDiscovery()
public void FcDeviceManagement_ArgocdApplicationMustMatchApplicationSetDiscoveryConventions()
{
FcDeviceManagementDocuments()
.Should()
.NotContain(document => document.Kind == "Application", "the root ApplicationSet owns apps/fc-devicemgmt discovery");
var application = FcDeviceManagementDocuments()
.Single(document => document.Kind == "Application" && document.Name == "infra-fc-devicemgmt");
FcDeviceManagementDocuments()
application.Namespace.Should().Be("argocd");
application.Scalar("spec", "source", "repoURL")
.Should()
.Contain(document => document.Kind == "Namespace" && document.Name == "fc-devicemgmt");
.Be("http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git");
application.Scalar("spec", "source", "path").Should().Be("apps/fc-devicemgmt");
application.Scalar("spec", "destination", "namespace").Should().Be("fc-devicemgmt");
}
private static IEnumerable<string> ProbeViolations(
@@ -628,12 +635,6 @@ public sealed class FleetManifestLintTests
.ToDictionary(document => document.Name, StringComparer.Ordinal);
}
private static YamlMappingNode RunnerContainer(ManifestDocument deployment)
{
return deployment.ContainerMappings()
.Single(container => string.Equals(ManifestNodeExtensions.Scalar(container, "name"), "runner", StringComparison.Ordinal));
}
private static int ReplicaCount(ManifestDocument document)
{
return int.TryParse(document.Scalar("spec", "replicas"), out var replicas) ? replicas : 1;
@@ -644,6 +645,15 @@ public sealed class FleetManifestLintTests
return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null;
}
private static YamlMappingNode RunnerContainer(ManifestDocument deployment)
{
return deployment.ContainerMappings()
.Where(container => string.Equals(ManifestNodeExtensions.Scalar(container, "name"), "runner", StringComparison.Ordinal))
.Should()
.ContainSingle($"{deployment.Name} must keep exactly one main runner container")
.Subject;
}
private static string? EnvSecretName(YamlMappingNode container, string name)
{
return EnvMapping(container, name) is { } env