Compare commits

..

28 Commits

Author SHA1 Message Date
Andrew Stoltz
9a4a8264d9 github-runner: add DM and WorldBuilder runners 2026-05-18 17:44:29 -05:00
Andrew Stoltz
200aeab032 ttsreader: deploy study mode repair image 2026-05-18 16:33:08 -05:00
Andrew Stoltz
8182616d4c ttsreader: point render piper to edge1 demo endpoint 2026-05-18 16:06:37 -05:00
Andrew Stoltz
f0862ac03c ttsreader: deploy sprint36 demo audio image 2026-05-18 16:04:59 -05:00
Andrew Stoltz
46c392605e monitoring: mirror PuppetServiceFailed alert from Notes (Sprint 33 Cx-7 Phase B)
Mirrors the live `puppet` alert group from
FlowerCore.Notes/scripts/monitoring/alerts.yml into the K8s ConfigMap so a
future in-cluster Prometheus inherits the ruleset automatically.

Source of truth remains the Notes file (live Podman Prometheus on noc1).
See feedback_monitoring_k8s_target_vs_live_podman.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 11:11:07 -05:00
89b147bbdd docs(openvox): document quadlet durability smoke (#12) 2026-05-18 04:53:02 +00:00
d7238a5e3b feat(brochure): add public brochure GitOps app (#13) 2026-05-18 04:52:37 +00:00
fc444a02a1 feat(chat): add public twin ingress (#11) 2026-05-18 04:52:20 +00:00
83d4883d55 feat(worldbuilder): pin k8s demo to fake backend (#10) 2026-05-18 04:52:11 +00:00
f8fe3b2688 feat(github-runner): add final long-tail runners (#9) 2026-05-18 04:52:01 +00:00
f2ab892ebc feat(github-runner): add Marquee + TtsReader per-repo runners (#8) 2026-05-18 03:27:14 +00:00
fef68a9560 feat(fc-devicemgmt): add Kubernetes deployment manifests (#1)
Sprint 8 IMPL lane Cx-5: fc-devicemgmt K8s manifests (rebased onto main 2026-05-18; 13 files, +944).

Namespace + Web Deployment (replicas:2, MySQL backend) + Operator Deployment (replicas:1, KubeOps leader-elect) + Service + Certificate (step-ca-acme ClusterIssuer) + Traefik IngressRoute (devices.iamworkin.lan internal) + ServiceAccount + ClusterRole + ClusterRoleBinding + NetworkPolicy (CNI DNAT-aware backend ports) + OnePasswordItem (5-field consolidated) + ArgoCD Application bootstrap shape + lint coverage.

Follow-ups (not merge blockers):
- localhost/fc-devicemgmt-{web,operator}:v20260512-cx5 must be imported to all 3 RKE2 nodes; pods will ErrImageNeverPull until imported.
- 1Password vault item 'FlowerCore DeviceManagement Runtime' must be created with 5 fields before pods can start.
- DNS devices.iamworkin.lan -> 10.0.56.200 already present.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 02:56:23 +00:00
Andrew Stoltz
6fe77225ae fix(github-runner): dedupe DOTNET_INSTALL_DIR+NUGET_PACKAGES on base+sharedpos
PR #5 rebase concatenated PR #5 env additions onto PR #7 env additions on
the base + sharedpos Deployments, producing duplicate-key validation
errors in ArgoCD's structured merge. The DOTNET_INSTALL_DIR and
NUGET_PACKAGES values are identical between PR #5 and PR #7; keep the
PR #7 originals and retain only the unique new env vars from PR #5
(DOTNET_CLI_TELEMETRY_OPTOUT, DOTNET_NOLOGO, DOTNET_GENERATE_ASPNET_CERTIFICATE).

No behavioral change — same final env var set, no duplicates.
2026-05-17 21:53:05 -05:00
634b9c4169 feat(github-runner): harden Linux runner fleet (#5) 2026-05-18 02:51:02 +00:00
b8c7e59005 Tighten Pi signage HDMI settle coverage (#3) 2026-05-18 02:35:17 +00:00
65ac8d6f01 feat(github-runner): pod-env DOTNET_INSTALL_DIR + initContainer for non-root runner (#7) 2026-05-18 02:25:18 +00:00
35844e0dbd chore(github-runner): un-park github-runner-sharedpos (Shared.Pos non-root build fixed) (#6) 2026-05-18 02:20:00 +00:00
b1e307151e chore(github-runner): un-park github-runner-sharedpos (replicas 1) after Shared.Pos CI fix merged 2026-05-17 21:54:16 +00:00
12b07219c7 chore(github-runner): park github-runner-sharedpos (replicas 0) until Cx-1 non-root fix
Shared.Pos build fails on non-root runner (setup-dotnet /usr/share/dotnet denied); churning runner drove HighCPU on rke2-agent2. Re-enable in the Sprint 30+ Cx-1 Linux-runner-fleet lane (DOTNET_INSTALL_DIR on pod env).
2026-05-17 21:50:35 +00:00
9fd32c4415 feat(monitoring): MacMiniRunnerOffline alert (Sprint 28 reconcile) 2026-05-17 19:50:29 +00:00
ad670fb344 feat(github-runner): add Shared.Pos repo-scoped Linux runner (unstick stuck publish) 2026-05-17 19:50:23 +00:00
Codex
6f6ca50987 fix(github-runner): switch RUNNER_TOKEN -> ACCESS_TOKEN; set RUN_AS_ROOT=false
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 22:08:56 +00:00
Codex
c7be58c1f7 chore(github-runner): bump replicas 0 -> 1 (PAT provisioned)
Operator provisioned GitHub PAT (Runner Registration) 1P item. OnePasswordItem CRD already materialized the secret. Unblocks CI fleet-wide.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 22:04:03 +00:00
Codex
a1f5a393cd chore(github-runner): rename 1P item to GitHub PAT (Runner Registration)
Renames the OnePasswordItem.itemPath from "GitHub Runner Registration
Token" to "GitHub PAT (Runner Registration)" so the runner 1P entry
sits next to its siblings — GitHub PAT (Gitea Mirrors) and GitHub PAT
(NuGet Packages) — under a consistent "GitHub PAT (...)" naming pattern
and API_CREDENTIAL category.

Existing field "credential" remains the consumer (RUNNER_TOKEN env).
Comment block clarified to require Administration:read/write fine-grained
PAT scope on target repos.

Old 1P item renamed to "[DEPRECATED 2026-05-16] GitHub Runner
Registration" — kept as recovery backup; can be hard-deleted after the
first successful runner pod start against the new item path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 22:01:41 +00:00
Codex
710340d8be chore(github-runner): rename 1P item to GitHub PAT (Runner Registration)
Renames the OnePasswordItem.itemPath from "GitHub Runner Registration
Token" to "GitHub PAT (Runner Registration)" so the runner 1P entry
sits next to its siblings — GitHub PAT (Gitea Mirrors) and GitHub PAT
(NuGet Packages) — under a consistent "GitHub PAT (...)" naming pattern
and API_CREDENTIAL category.

Existing field "credential" remains the consumer (RUNNER_TOKEN env).
Comment block clarified to require Administration:read/write fine-grained
PAT scope on target repos.

Old 1P item renamed to "[DEPRECATED 2026-05-16] GitHub Runner
Registration" — kept as recovery backup; can be hard-deleted after the
first successful runner pod start against the new item path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 10:27:58 -05:00
Andrew Stoltz
7d2daaa4f8 chore(github-runner): replicas 1 → 0 until 1Password token provisioned
github-runner-token OnePasswordItem exists but the underlying 1Password
vault item hasn't been created yet, so the operator can't mint the K8s
Secret. Pod stuck in CreateContainerConfigError → DeploymentReplicasMismatch
alert fires.

Scaling to 0 keeps the manifest infrastructure intact but stops trying
to schedule until operator:
1. Creates "GitHub Runner Registration Token" item in IAmWorkin vault
2. Generates a token at github.com/astoltz/<repo>/settings/actions/runners/new
3. Updates the OnePasswordItem itemPath to point at it
4. Bumps replicas back to 1 via PR
2026-05-15 16:18:19 -05:00
Andrew Stoltz
e50e103ba0 fix(zabbix): bump web probe timeouts 5s→15s + add failureThreshold
zabbix-web nginx+PHP-FPM container serves / at ~3-5s baseline with
occasional 6-7s spikes (probe path renders full dashboard via PHP).
kube-probe was killing the container after 3 consecutive 5s-timeout
499s, producing CrashLoopBackOff alert noise even though the app
was serving real traffic fine.

15s timeout absorbs the natural variance; explicit failureThreshold=3
documents the policy (was implicit default).

Closes the firing PodCrashLoopBackOff (zabbix-web) + pending
HTTPServiceSlow/HTTPServiceDegraded alerts. zabbix.iamworkin.lan
remains slow at the application layer (separate work — PHP-FPM
warm-up + Zabbix server "host not found" agent lookup spam need
their own fixes) but the pod restart loop stops.
2026-05-15 15:59:04 -05:00
Codex
e8094eb0bd ci(github-runner): add Phase 2 ephemeral Linux runner K8s manifest
Namespace github-runner with myoung34/github-runner:latest Deployment,
5Gi Longhorn RWO NuGet cache PVC, zero-privilege ServiceAccount, and
OnePasswordItem CRD for the registration token. EPHEMERAL=true mode
re-registers after each job; Recreate strategy avoids RWO multi-attach.
Targets fc-build-linux label; single replica pinned to rke2-server node.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-14 12:46:25 -05:00
33 changed files with 6070 additions and 22 deletions

View File

@@ -118,6 +118,7 @@ That test project sweeps `bluejay-infra/apps/**` plus the canonical sibling `Flo
## References
- OpenVox noc1 durability runbook: `docs/runbooks/openvoxserver-quadlet-durability.md`
- Cert-manager recovery playbook: `FlowerCore.Notes/memory/project_cert_manager_recovery_2026_04_22.md`
- Why pfSense DNS is required: `FlowerCore.Notes/memory/feedback_pfsense_dns_required_for_acme.md`
- Public DNS operator host: `https://dns.iamworkin.lan`

27
apps/brochure/README.md Normal file
View File

@@ -0,0 +1,27 @@
# FlowerCore Brochure
`apps/brochure` hosts the public brochure split from `FlowerCore.Intranet.Web`.
ArgoCD's `apps/*` ApplicationSet will create `infra-brochure` after this
directory lands on `main`.
## Runtime
- Host: `https://brochure.flowercore.io`
- Namespace: `brochure`
- Deployment: `brochure-web`
- Image: `localhost/fc-brochure-web:v20260524-sprint32`
- Port: `8080`
- Public route method allowlist: `GET` and `HEAD`
## Operator Actions
1. Publish and import `localhost/fc-brochure-web:v20260524-sprint32` to every
RKE2 node before sync, using the same podman save + `ctr images import`
flow as the Intranet deployment.
2. Create the Cloudflare DNS record for `brochure.flowercore.io` pointing at
the FlowerCore public edge.
3. Verify `infra-brochure` appears in ArgoCD, the certificate becomes Ready,
and `GET https://brochure.flowercore.io/` returns `200`.
The route intentionally does not expose `/ops/*` or `/admin/*`; the Brochure
web app returns `404` for those paths and Traefik only forwards read methods.

131
apps/brochure/brochure.yaml Normal file
View File

@@ -0,0 +1,131 @@
# FlowerCore Brochure public host
#
# Thin Blazor host for public What's New, walkthrough, and gallery content
# carved out of FlowerCore.Intranet.Web. The ApplicationSet creates
# infra-brochure from this directory after merge.
---
apiVersion: v1
kind: Namespace
metadata:
name: brochure
labels:
app.kubernetes.io/part-of: flowercore
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: brochure-web
namespace: brochure
labels:
app: brochure-web
app.kubernetes.io/name: brochure-web
app.kubernetes.io/part-of: flowercore
spec:
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app: brochure-web
template:
metadata:
labels:
app: brochure-web
app.kubernetes.io/name: brochure-web
app.kubernetes.io/part-of: flowercore
spec:
containers:
- name: brochure-web
image: localhost/fc-brochure-web:v20260524-sprint32
imagePullPolicy: Never
ports:
- containerPort: 8080
name: http
env:
- name: ASPNETCORE_ENVIRONMENT
value: Production
- name: ASPNETCORE_URLS
value: "http://+:8080"
resources:
requests:
cpu: "25m"
memory: "128Mi"
limits:
cpu: "500m"
memory: "512Mi"
readinessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 30
periodSeconds: 30
securityContext:
runAsNonRoot: true
runAsUser: 1654
runAsGroup: 1654
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop:
- ALL
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
name: brochure-web
namespace: brochure
labels:
app: brochure-web
app.kubernetes.io/name: brochure-web
app.kubernetes.io/part-of: flowercore
spec:
type: ClusterIP
selector:
app: brochure-web
ports:
- name: http
port: 8080
targetPort: http
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: brochure-web-tls
namespace: brochure
spec:
secretName: brochure-web-tls
issuerRef:
name: step-ca-acme
kind: ClusterIssuer
dnsNames:
- brochure.flowercore.io
duration: 720h
renewBefore: 240h
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: brochure-web-public
namespace: brochure
spec:
entryPoints:
- websecure
routes:
- match: Host(`brochure.flowercore.io`) && (Method(`GET`) || Method(`HEAD`))
kind: Rule
services:
- name: brochure-web
port: 8080
tls:
secretName: brochure-web-tls

View File

@@ -30,3 +30,41 @@ spec:
port: 80
tls:
secretName: chat-web-tls
---
# Public host profile marker. The app treats this header as authoritative for
# the public twin, while the internal chat.iamworkin.lan route does not attach
# it and keeps the operator-oriented UI.
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
name: chat-public-profile-header
namespace: fc-chat
spec:
headers:
customRequestHeaders:
X-FC-Chat-Host-Profile: "public"
---
# Public Cloudflare-fronted twin for the anonymous chat surface. Operator
# paths are intentionally absent from the allowlist below, so /admin,
# /operator, /console, /ops, /api/operator, and /operatorhub miss this route
# and return Traefik 404 before reaching the pod. Operator action still needed:
# create/verify Cloudflare DNS chat.flowercore.io -> public Traefik endpoint
# and mirror the cf-origin-flowercore-io TLS secret into namespace fc-chat.
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: chat-web-public
namespace: fc-chat
spec:
entryPoints:
- websecure
routes:
- match: Host(`chat.flowercore.io`) && (Path(`/`) || Path(`/chat`) || PathPrefix(`/_blazor`) || PathPrefix(`/_framework`) || PathPrefix(`/_content`) || PathPrefix(`/avatars`) || PathPrefix(`/css`) || PathPrefix(`/js`) || PathPrefix(`/favicon`) || PathPrefix(`/chathub`)) && (Method(`GET`) || Method(`HEAD`) || Method(`POST`) || Method(`OPTIONS`))
kind: Rule
middlewares:
- name: chat-public-profile-header
services:
- name: chat-web
port: 80
tls:
secretName: cf-origin-flowercore-io

View File

@@ -0,0 +1,26 @@
# Runtime secrets for FlowerCore.DeviceManagement.
#
# OnePasswordItem operator syncs this item into a Kubernetes Secret with the
# same name. Expected fields:
# DB-Password
# mtls-ca.pem
# mtls-client.crt
# mtls-client.key
# mtls-chain.pem
#
# Do not add literal secret values to this repo. Runtime pods consume the
# synced Secret through env vars and read-only mounts.
apiVersion: onepassword.com/v1
kind: OnePasswordItem
metadata:
name: fc-devicemgmt-runtime
namespace: fc-devicemgmt
labels:
app.kubernetes.io/name: fc-devicemgmt
app.kubernetes.io/component: secrets
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
itemPath: "vaults/IAmWorkin/items/FlowerCore DeviceManagement Runtime"

View File

@@ -0,0 +1,33 @@
# Explicit ArgoCD Application shape for bootstrap/review.
#
# The live bluejay-infra ApplicationSet already discovers apps/* directories
# and creates this same Application name (`infra-fc-devicemgmt`) automatically.
# Keep repoURL on the internal Gitea ClusterIP URL; ArgoCD does not trust the
# external step-ca HTTPS endpoint.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: infra-fc-devicemgmt
namespace: argocd
labels:
app.kubernetes.io/name: fc-devicemgmt
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
project: default
source:
repoURL: http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git
targetRevision: main
path: apps/fc-devicemgmt
destination:
server: https://kubernetes.default.svc
namespace: fc-devicemgmt
syncPolicy:
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=true
- ServerSideApply=true

View File

@@ -0,0 +1,30 @@
# Certificate for devices.iamworkin.lan.
#
# Preflight gate: FlowerCore.DNS / pfSense must contain an explicit A record:
# devices.iamworkin.lan -> 10.0.56.200
# before this Certificate is synced. step-ca ACME cannot see the CoreDNS
# wildcard, so missing pfSense DNS produces cert-manager HTTP-01 backoff
# (feedback_pfsense_dns_required_for_acme).
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: fc-devicemgmt-web-tls
namespace: fc-devicemgmt
labels:
app.kubernetes.io/name: fc-devicemgmt-web
app.kubernetes.io/component: web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
annotations:
flowercore.io/dns-preflight: "devices.iamworkin.lan must resolve to 10.0.56.200 before ACME sync"
spec:
secretName: fc-devicemgmt-web-tls
issuerRef:
name: step-ca-acme
kind: ClusterIssuer
dnsNames:
- devices.iamworkin.lan
duration: 720h
renewBefore: 240h

View File

@@ -0,0 +1,81 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: fc-devicemgmt-operator
labels:
app.kubernetes.io/name: fc-devicemgmt-operator
app.kubernetes.io/component: operator
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
rules:
- apiGroups:
- devices.flowercore.io
resources:
- '*'
verbs:
- get
- list
- watch
- create
- update
- patch
- delete
- apiGroups:
- devices.flowercore.io
resources:
- devices/status
- devices/finalizers
- devicegroups/status
- devicegroups/finalizers
- devicepolicies/status
- devicepolicies/finalizers
- remotecommands/status
- remotecommands/finalizers
verbs:
- get
- update
- patch
- apiGroups:
- apps
resources:
- deployments
verbs:
- get
- apiGroups:
- ""
resources:
- pods
- services
- configmaps
- secrets
- events
verbs:
- get
- list
- watch
- create
- update
- patch
- delete
- apiGroups:
- batch
resources:
- jobs
verbs:
- get
- list
- watch
- create
- update
- patch
- delete
- apiGroups:
- networking.k8s.io
resources:
- networkpolicies
verbs:
- get
- list
- watch

View File

@@ -0,0 +1,19 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: fc-devicemgmt-operator
labels:
app.kubernetes.io/name: fc-devicemgmt-operator
app.kubernetes.io/component: operator
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: fc-devicemgmt-operator
subjects:
- kind: ServiceAccount
name: fc-devicemgmt-operator
namespace: fc-devicemgmt

View File

@@ -0,0 +1,109 @@
# FlowerCore.DeviceManagement Operator.
#
# KubeOps controller for devices.flowercore.io resources. Operator-created
# children must set OwnerReferences + traceability labels/annotations per
# k8s-pod-ownership-and-traceability-standard.md. RBAC below grants
# apps/deployments/get so the process can resolve its own Deployment UID.
apiVersion: apps/v1
kind: Deployment
metadata:
name: fc-devicemgmt-operator
namespace: fc-devicemgmt
labels:
app: fc-devicemgmt-operator
app.kubernetes.io/name: fc-devicemgmt-operator
app.kubernetes.io/component: operator
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
annotations:
flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
spec:
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app: fc-devicemgmt-operator
template:
metadata:
labels:
app: fc-devicemgmt-operator
app.kubernetes.io/name: fc-devicemgmt-operator
app.kubernetes.io/component: operator
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
flowercore.io/audit-trace-id: "runtime-activity-trace"
spec:
serviceAccountName: fc-devicemgmt-operator
securityContext:
fsGroup: 1654
fsGroupChangePolicy: OnRootMismatch
containers:
- name: operator
image: localhost/fc-devicemgmt-operator:v20260512-cx5
imagePullPolicy: Never
ports:
- name: metrics
containerPort: 8080
env:
- name: ASPNETCORE_ENVIRONMENT
value: "Production"
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
value: "false"
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: FLOWERCORE_KUBERNETES_OWNER_DEPLOYMENT
value: "fc-devicemgmt-operator"
- name: FlowerCore__Service__Name
value: "FlowerCore.DeviceManagement.Operator"
- name: FlowerCore__DeviceManagement__DefaultTenantId
value: "system"
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
readinessProbe:
tcpSocket:
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
livenessProbe:
tcpSocket:
port: 8080
initialDelaySeconds: 20
periodSeconds: 30
securityContext:
runAsNonRoot: true
runAsUser: 1654
runAsGroup: 1654
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop:
- ALL
volumeMounts:
- name: tmp
mountPath: /tmp
- name: logs
mountPath: /app/logs
volumes:
- name: tmp
emptyDir: {}
- name: logs
emptyDir: {}

View File

@@ -0,0 +1,135 @@
# FlowerCore.DeviceManagement Web.
#
# Source repo is expected to ship FlowerCore.DeviceManagement.Web in a later
# Sprint 9+ lane. This manifest is static-valid without requiring the image to
# exist yet; import localhost/fc-devicemgmt-web:<tag> to all schedulable RKE2
# nodes before letting ArgoCD sync a live rollout.
apiVersion: apps/v1
kind: Deployment
metadata:
name: fc-devicemgmt-web
namespace: fc-devicemgmt
labels:
app: fc-devicemgmt-web
app.kubernetes.io/name: fc-devicemgmt-web
app.kubernetes.io/component: web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
annotations:
flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
spec:
replicas: 2
revisionHistoryLimit: 3
selector:
matchLabels:
app: fc-devicemgmt-web
template:
metadata:
labels:
app: fc-devicemgmt-web
app.kubernetes.io/name: fc-devicemgmt-web
app.kubernetes.io/component: web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
flowercore.io/audit-trace-id: "runtime-activity-trace"
spec:
securityContext:
fsGroup: 1654
fsGroupChangePolicy: OnRootMismatch
containers:
- name: web
image: localhost/fc-devicemgmt-web:v20260512-cx5
imagePullPolicy: Never
ports:
- name: http
containerPort: 8080
env:
- name: ASPNETCORE_URLS
value: "http://+:8080"
- name: ASPNETCORE_ENVIRONMENT
value: "Production"
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
value: "false"
- name: FlowerCore__Service__Name
value: "FlowerCore.DeviceManagement.Web"
- name: FlowerCore__DeviceManagement__DefaultTenantId
value: "system"
- name: FlowerCore__Database__Provider
value: "MySql"
- name: FlowerCore__Database__Host
value: "mysql.fc-mysql.svc"
- name: FlowerCore__Database__Database
value: "flowercore_devicemgmt"
- name: FlowerCore__Database__User
value: "fc_devicemgmt"
- name: FlowerCore__Database__Password
valueFrom:
secretKeyRef:
name: fc-devicemgmt-runtime
key: DB-Password
- name: FlowerCore__DeviceManagement__AgentMtls__CaPath
value: "/secrets/devicemgmt-mtls/mtls-ca.pem"
- name: FlowerCore__DeviceManagement__AgentMtls__ClientCertificatePath
value: "/secrets/devicemgmt-mtls/mtls-client.crt"
- name: FlowerCore__DeviceManagement__AgentMtls__ClientKeyPath
value: "/secrets/devicemgmt-mtls/mtls-client.key"
- name: FlowerCore__EventBus__Redis__Configuration
value: "redis.fc-redis.svc:6379"
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 1000m
memory: 768Mi
startupProbe:
tcpSocket:
port: 8080
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 30
readinessProbe:
tcpSocket:
port: 8080
periodSeconds: 10
failureThreshold: 3
livenessProbe:
tcpSocket:
port: 8080
initialDelaySeconds: 30
periodSeconds: 30
failureThreshold: 3
securityContext:
runAsNonRoot: true
runAsUser: 1654
runAsGroup: 1654
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop:
- ALL
volumeMounts:
- name: tmp
mountPath: /tmp
- name: logs
mountPath: /app/logs
- name: devicemgmt-mtls
mountPath: /secrets/devicemgmt-mtls
readOnly: true
volumes:
- name: tmp
emptyDir: {}
- name: logs
emptyDir: {}
- name: devicemgmt-mtls
secret:
secretName: fc-devicemgmt-runtime
defaultMode: 0400

View File

@@ -0,0 +1,55 @@
# LAN ingress for FlowerCore.DeviceManagement Web.
#
# RKE2 Traefik has no built-in ACME resolver configured. Keep TLS certificate
# ownership in cert-manager Certificate/fc-devicemgmt-web-tls.
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: fc-devicemgmt-web
namespace: fc-devicemgmt
labels:
app.kubernetes.io/name: fc-devicemgmt-web
app.kubernetes.io/component: web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
entryPoints:
- websecure
routes:
- match: Host(`devices.iamworkin.lan`)
kind: Rule
services:
- name: fc-devicemgmt-web
port: 80
tls:
secretName: fc-devicemgmt-web-tls
# Future public agent/update host gate (OFF by default):
#
# Do not enable `update.flowercore.io` here until Authentik OIDC Q-OIDC-1
# resolves the public-device-management auth model and route ownership with
# UpdateCenter. When enabled, use a separate public IngressRoute with an
# explicit Method allowlist, public-host auth middleware, and public TLS
# certificate strategy. Leaving this as comments keeps ArgoCD from stealing
# live UpdateCenter traffic.
#
# apiVersion: traefik.io/v1alpha1
# kind: IngressRoute
# metadata:
# name: fc-devicemgmt-web-public
# namespace: fc-devicemgmt
# annotations:
# flowercore.io/public-host-gate: "disabled-until-Q-OIDC-1"
# spec:
# entryPoints:
# - websecure
# routes:
# - match: Host(`update.flowercore.io`) && (Method(`GET`) || Method(`HEAD`) || Method(`POST`) || Method(`OPTIONS`))
# kind: Rule
# services:
# - name: fc-devicemgmt-web
# port: 80
# tls:
# secretName: fc-devicemgmt-public-tls

View File

@@ -0,0 +1,13 @@
# FlowerCore.DeviceManagement namespace.
#
# ArgoCD discovers this directory as Application `infra-fc-devicemgmt`.
apiVersion: v1
kind: Namespace
metadata:
name: fc-devicemgmt
labels:
app.kubernetes.io/name: fc-devicemgmt
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra

View File

@@ -0,0 +1,224 @@
# FlowerCore.DeviceManagement NetworkPolicies.
#
# NetworkPolicies belong in bluejay-infra so ArgoCD owns rebuild state.
# Rules include Traefik post-DNAT backend ports per
# feedback_netpol_dnat_backend_port and Synology NFS egress for the requested
# cold-tier / future artifact path.
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: fc-devicemgmt-web-isolation
namespace: fc-devicemgmt
labels:
app.kubernetes.io/name: fc-devicemgmt-web
app.kubernetes.io/component: web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
podSelector:
matchLabels:
app: fc-devicemgmt-web
policyTypes:
- Ingress
- Egress
ingress:
# LAN edge: only cluster Traefik should reach the Web pod for
# devices.iamworkin.lan.
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: traefik-system
podSelector:
matchLabels:
app.kubernetes.io/name: traefik
ports:
- port: 8080
protocol: TCP
# Direct LAN diagnostics are allowed only from FlowerCore LAN/VPN ranges.
- from:
- ipBlock:
cidr: 10.0.56.0/24
- ipBlock:
cidr: 10.0.57.0/24
- ipBlock:
cidr: 10.0.58.0/24
- ipBlock:
cidr: 10.0.68.0/27
ports:
- port: 8080
protocol: TCP
egress:
# CoreDNS.
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
podSelector:
matchLabels:
k8s-app: kube-dns
ports:
- port: 53
protocol: UDP
- port: 53
protocol: TCP
# Database namespace.
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: fc-mysql
ports:
- port: 3306
protocol: TCP
# Redis backplane for multi-replica SignalR / live-status fan-out.
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: fc-redis
ports:
- port: 6379
protocol: TCP
# Traefik VIP / in-cluster Traefik for self-callbacks and public URL
# generation tests. Include post-DNAT backend ports 8443 + 8080.
- to:
- ipBlock:
cidr: 10.0.56.200/32
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: traefik-system
podSelector:
matchLabels:
app.kubernetes.io/name: traefik
ports:
- port: 80
protocol: TCP
- port: 443
protocol: TCP
- port: 8080
protocol: TCP
- port: 8443
protocol: TCP
# Agent egress: LAN/VPN devices may run DM Agent in Generic, Kiosk, Pi,
# ThinClient, or Server mode. Keep this private-range only.
- to:
- ipBlock:
cidr: 10.0.56.0/24
- ipBlock:
cidr: 10.0.57.0/24
- ipBlock:
cidr: 10.0.58.0/24
- ipBlock:
cidr: 10.0.68.0/27
ports:
- port: 80
protocol: TCP
- port: 443
protocol: TCP
- port: 8080
protocol: TCP
- port: 8443
protocol: TCP
- port: 5000
protocol: TCP
- port: 5001
protocol: TCP
# Synology NFS cold-tier / artifact mount allowance.
- to:
- ipBlock:
cidr: 10.0.58.3/32
ports:
- port: 2049
protocol: TCP
- port: 2049
protocol: UDP
- port: 111
protocol: TCP
- port: 111
protocol: UDP
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: fc-devicemgmt-operator-isolation
namespace: fc-devicemgmt
labels:
app.kubernetes.io/name: fc-devicemgmt-operator
app.kubernetes.io/component: operator
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
podSelector:
matchLabels:
app: fc-devicemgmt-operator
policyTypes:
- Ingress
- Egress
ingress:
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: monitoring
ports:
- port: 8080
protocol: TCP
egress:
# CoreDNS.
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
podSelector:
matchLabels:
k8s-app: kube-dns
ports:
- port: 53
protocol: UDP
- port: 53
protocol: TCP
# Kubernetes API for KubeOps reconciliation and Deployment UID lookup.
- to: []
ports:
- port: 443
protocol: TCP
- port: 6443
protocol: TCP
# Agent egress for operator-initiated probes / fallback command dispatch.
- to:
- ipBlock:
cidr: 10.0.56.0/24
- ipBlock:
cidr: 10.0.57.0/24
- ipBlock:
cidr: 10.0.58.0/24
- ipBlock:
cidr: 10.0.68.0/27
ports:
- port: 80
protocol: TCP
- port: 443
protocol: TCP
- port: 8080
protocol: TCP
- port: 8443
protocol: TCP
- port: 5000
protocol: TCP
- port: 5001
protocol: TCP
# Synology NFS allowance for future cold-tier/audit archival jobs.
- to:
- ipBlock:
cidr: 10.0.58.3/32
ports:
- port: 2049
protocol: TCP
- port: 2049
protocol: UDP
- port: 111
protocol: TCP
- port: 111
protocol: UDP

View File

@@ -0,0 +1,22 @@
apiVersion: v1
kind: Service
metadata:
name: fc-devicemgmt-web
namespace: fc-devicemgmt
labels:
app: fc-devicemgmt-web
app.kubernetes.io/name: fc-devicemgmt-web
app.kubernetes.io/component: web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
type: ClusterIP
selector:
app: fc-devicemgmt-web
ports:
- name: http
port: 80
targetPort: 8080
protocol: TCP

View File

@@ -0,0 +1,12 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: fc-devicemgmt-operator
namespace: fc-devicemgmt
labels:
app.kubernetes.io/name: fc-devicemgmt-operator
app.kubernetes.io/component: operator
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra

View File

@@ -1,3 +1,2 @@
# Restart kiosk and redeclare capabilities when HDMI connect/disconnect changes DRM state.
SUBSYSTEM=="drm", KERNEL=="card?-HDMI-A-?", ACTION=="change", RUN+="/usr/bin/systemctl restart flowercore-signage-player-pi.service"
SUBSYSTEM=="drm", KERNEL=="card?-HDMI-A-?", ACTION=="change", RUN+="/usr/bin/systemctl start flowercore-signage-detect-display.service"
# Settle DRM for 2s before restarting Chromium, then redeclare capabilities.
SUBSYSTEM=="drm", KERNEL=="card?-HDMI-A-?", ACTION=="change", RUN+="/usr/bin/systemctl start flowercore-signage-player-pi-hdmi.service"

View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bats
setup() {
APP_ROOT="$(cd "$BATS_TEST_DIRNAME/.." && pwd)"
DETECT="$APP_ROOT/scripts/fc-signage-detect-display"
}
@test "display detection emits graceful disconnected profile when no hdmi connector is present" {
script="$(cat "$DETECT")"
[[ "$script" == *"displayConnected: false"* ]]
[[ "$script" == *"No HDMI display detected"* ]]
}
@test "display detection parses edid, falls back to kmsprint, and logs endpoint failures locally" {
script="$(cat "$DETECT")"
[[ "$script" == *"edid-decode"* ]]
[[ "$script" == *"HDR (Static|Dynamic) Metadata Block"* ]]
[[ "$script" == *"kmsprint"* ]]
[[ "$script" == *"/api/v1/nodes/\${NODE_ID}/capabilities"* ]]
[[ "$script" == *"/api/v1/displays/\${NODE_ID}/capability-profile"* ]]
[[ "$script" == *"capabilities.log"* ]]
}

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env bats
setup() {
APP_ROOT="$(cd "$BATS_TEST_DIRNAME/.." && pwd)"
BOOTSTRAP="$APP_ROOT/scripts/flowercore-signage-bootstrap.sh"
RENEW="$APP_ROOT/scripts/flowercore-signage-renew-cert.sh"
}
@test "bootstrap is idempotent when node is already enrolled" {
script="$(cat "$BOOTSTRAP")"
[[ "$script" == *'[[ -s "$NODE_JSON" && -s "$CERT_DIR/client.p12" ]]'* ]]
[[ "$script" == *"already enrolled"* ]]
[[ "$script" == *"exit 0"* ]]
}
@test "bootstrap generates a stable node uuid and machine id" {
script="$(cat "$BOOTSTRAP")"
[[ "$script" == *"uuidgen"* ]]
[[ "$script" == *"nodeUuid"* ]]
[[ "$script" == *"machineId"* ]]
[[ "$script" == *"cut -c1-16"* ]]
}
@test "bootstrap posts to the canonical register endpoint" {
grep -q '/api/v1/nodes/register' "$BOOTSTRAP"
grep -q '"linux-arm64-pi"' "$BOOTSTRAP"
}
@test "bootstrap retries registration once for first-call races" {
script="$(cat "$BOOTSTRAP")"
[[ "$script" == *"for attempt in 1 2"* ]]
[[ "$script" == *"register attempt \$attempt returned"* ]]
[[ "$script" == *"sleep 5"* ]]
}
@test "bootstrap supports setup-code approval with manual polling fallback" {
script="$(cat "$BOOTSTRAP")"
[[ "$script" == *"signage-setup-code"* ]]
[[ "$script" == *"approve-via-setup-code"* ]]
[[ "$script" == *"+ 1800"* ]]
[[ "$script" == *"sleep 15"* ]]
}
@test "bootstrap generates an ecdsa p256 csr for the signage pi subject" {
script="$(cat "$BOOTSTRAP")"
[[ "$script" == *"ecparam -genkey -name prime256v1"* ]]
[[ "$script" == *'/CN=${NODE_ID}/O=FlowerCore/OU=SignagePlayer-Pi'* ]]
}
@test "bootstrap writes pkcs12 bundle with restrictive permissions" {
script="$(cat "$BOOTSTRAP")"
[[ "$script" == *"openssl pkcs12 -export"* ]]
[[ "$script" == *"client.p12.pass"* ]]
[[ "$script" == *"chmod 0640"* ]]
[[ "$script" == *"chmod 0600"* ]]
}
@test "renewal only calls renew endpoint inside the thirty-day window and swaps atomically" {
script="$(cat "$RENEW")"
[[ "$script" == *'-checkend $((30*24*3600))'* ]]
[[ "$script" == *"/api/v1/nodes/\${NODE_ID}/renew"* ]]
[[ "$script" == *"client.key.new"* ]]
[[ "$script" == *'mv "$CERT_DIR/client.p12.new" "$CERT_DIR/client.p12"'* ]]
}

View File

@@ -0,0 +1,68 @@
#!/usr/bin/env bats
setup() {
APP_ROOT="$(cd "$BATS_TEST_DIRNAME/.." && pwd)"
}
@test "player unit exists" {
[ -f "$APP_ROOT/systemd/flowercore-signage-player-pi.service" ]
}
@test "player unit uses simple chromium service with restart backoff" {
unit="$(cat "$APP_ROOT/systemd/flowercore-signage-player-pi.service")"
[[ "$unit" == *"Type=simple"* ]]
[[ "$unit" == *"Restart=always"* ]]
[[ "$unit" == *"RestartSec=10s"* ]]
[[ "$unit" == *"StartLimitBurst=5"* ]]
[[ "$unit" == *"StartLimitIntervalSec=300s"* ]]
}
@test "player unit caps chromium memory at two gigabytes" {
grep -q '^MemoryMax=2G$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
grep -q '^MemoryHigh=1500M$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
}
@test "player unit condition-gates startup on identity and p12 certificate" {
grep -q '^ConditionPathExists=/etc/flowercore/signage-node.json$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
grep -q '^ConditionPathExists=/etc/fc-signage-player/client.p12$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
}
@test "player unit runs prelaunch checks before chromium" {
grep -q '^ExecStartPre=/usr/local/bin/flowercore-signage-prelaunch.sh$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
grep -q '^ExecStart=/usr/local/bin/flowercore-signage-launch.sh$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
}
@test "hdmi udev rule routes through the two-second settle service" {
rule="$(cat "$APP_ROOT/systemd/99-flowercore-signage-hdmi.rules")"
[[ "$rule" == *'KERNEL=="card?-HDMI-A-?"'* ]]
[[ "$rule" == *"systemctl start flowercore-signage-player-pi-hdmi.service"* ]]
[[ "$rule" != *"systemctl restart flowercore-signage-player-pi.service"* ]]
}
@test "hdmi responder settles, declares display, then restarts chromium" {
responder="$(cat "$APP_ROOT/scripts/flowercore-signage-hdmi-respond.sh")"
[[ "$responder" == *"sleep 2"* ]]
[[ "$responder" == *"systemctl start flowercore-signage-detect-display.service"* ]]
[[ "$responder" == *"systemctl restart flowercore-signage-player-pi.service"* ]]
}
@test "chromium policy json is valid and disables credential prompts" {
command -v jq >/dev/null || skip "jq not installed"
jq -e '.AutofillAddressEnabled == false and .AutofillCreditCardEnabled == false and .PasswordManagerEnabled == false' \
"$APP_ROOT/chromium-policies/flowercore-signage.json" >/dev/null
}
@test "launch script tries embed URL and logs bare-player fallback" {
launch="$(cat "$APP_ROOT/scripts/flowercore-signage-launch.sh")"
[[ "$launch" == *'/player/${NODE_ID}/embed?token=${CERT_THUMB}'* ]]
[[ "$launch" == *"url-divergence.log"* ]]
[[ "$launch" == *'/player/${NODE_ID}?token=${CERT_THUMB}'* ]]
}
@test "prelaunch script validates required node and cert files" {
prelaunch="$(cat "$APP_ROOT/scripts/flowercore-signage-prelaunch.sh")"
[[ "$prelaunch" == *"/etc/flowercore/signage-node.json"* ]]
[[ "$prelaunch" == *"/etc/fc-signage-player/client.p12"* ]]
[[ "$prelaunch" == *"/etc/fc-signage-player/client.p12.pass"* ]]
[[ "$prelaunch" == *"exit 1"* ]]
}

View File

@@ -532,7 +532,7 @@ spec:
fsGroupChangePolicy: OnRootMismatch
containers:
- name: web
image: localhost/fc-ttsreader-web:v20260506-phase6
image: localhost/fc-ttsreader-web:v20260518-sprint36-demo-finish-b132cbf
imagePullPolicy: Never
ports:
- containerPort: 5217
@@ -555,9 +555,13 @@ spec:
- name: TtsReader__Jobs__Root
value: "/data/jobs"
- name: TtsReader__Piper__Host
value: "ttsreader-piper.fc-ttsreader.svc.cluster.local."
value: "10.0.57.17"
- name: TtsReader__Piper__Port
value: "10200"
value: "8500"
- name: TtsReader__Piper__Transport
value: "http"
- name: TtsReader__Piper__HttpPath
value: "/tts"
- name: TtsReader__Kokoro__Enabled
value: "true"
- name: TtsReader__Kokoro__BaseUrl

View File

@@ -0,0 +1,94 @@
# GitHub Runner Fleet
ArgoCD owns `apps/github-runner/github-runner.yaml`. Do not patch live runner
Deployments with `kubectl`; update this manifest and let ArgoCD reconcile.
## Runner Shape
All repo-scoped Linux runners use:
- `ACCESS_TOKEN` from the `github-runner-token` Secret
- `RUN_AS_ROOT=false`
- `EPHEMERAL=true`
- `LABELS=self-hosted,linux,fc-build-linux`
- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
Actions tool cache
`github-runner` for `FlowerCore.Common` is single-replica because it retains the
original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
two replicas with per-pod `emptyDir` caches. That is the safe backlog-drain
strategy: no two pods share one RWO PVC.
Sprint 32 final long-tail wave adds 16 two-replica Deployments:
`FlowerCore.Knowledge`, `FlowerCore.LlmBridge`, `FlowerCore.Media`,
`FlowerCore.Presentations`, `FlowerCore.RemoteDesktop`, `FlowerCore.DNS`,
`FlowerCore.Distribution`, `FlowerCore.Scoreboard`,
`FlowerCore.SegmentDisplay`, `FlowerCore.Signage.Contracts`,
`FlowerCore.SignalControl`, `FlowerCore.Intranet.Web`,
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
`FlowerCore.MenuBoard`.
Sprint 37 Cx-2 closes the audited Linux runner gaps for
`FlowerCore.DeviceManagement` and `FlowerCore.WorldBuilder` with the same
two-replica `emptyDir` pattern.
## Post-Merge Proof
After the PR is merged and ArgoCD syncs, verify the runner fleet:
```bash
kubectl -n github-runner get deploy,pods,pvc
```
Verify GitHub registration for the repo-scoped runners:
```bash
for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore.Signage \
FlowerCore.DMS FlowerCore.Telephony FlowerCore.Print.Web FlowerCore.Chat \
FlowerCore.MySQL FlowerCore.Kiosk.Linux FlowerCore.Marquee FlowerCore.TtsReader \
FlowerCore.Knowledge FlowerCore.LlmBridge FlowerCore.Media \
FlowerCore.Presentations FlowerCore.RemoteDesktop FlowerCore.DNS \
FlowerCore.Distribution FlowerCore.Scoreboard FlowerCore.SegmentDisplay \
FlowerCore.Signage.Contracts FlowerCore.SignalControl FlowerCore.Intranet.Web \
FlowerCore.Provisioning FlowerCore.Redis FlowerCore.MessageBoard \
FlowerCore.MenuBoard FlowerCore.DeviceManagement FlowerCore.WorldBuilder; do
echo "=== $repo ==="
gh api "/repos/astoltz/$repo/actions/runners" \
--jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}'
done
```
Shared.Pos publish proof after the runner pod is online:
```bash
gh run list --repo astoltz/FlowerCore.Shared.Pos \
--workflow "Build, Test & Publish" --branch main --limit 5
```
If the latest run is still queued after runner registration, rerun the workflow
from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
## Sprint 37 Cx-2 Gap Audit
The 2026-05-18 GitHub workflow scan found these remaining repos with
`runs-on: [self-hosted, linux, fc-build-linux]` but no K8s runner Deployment:
`FlowerCore.AiStation.Linux`, `FlowerCore.PHP`, `FlowerCore.PiManager`,
`FlowerCore.Shared.Barcodes`, `FlowerCore.Shared.Lookup`,
`FlowerCore.Shared.Nodes`, `FlowerCore.Shared.PrintClient`,
`FlowerCore.Shared.Relay`, `FlowerCore.Shared.ShowRunner`, and
`FlowerCore.Shared.Storage`.
Mixed/platform repos also have Linux workflow legs but need owner review before
adding Linux runner Deployments: `FlowerCore.Library.Mac`,
`FlowerCore.Signage.Agent.AppleTv`, and `FlowerCore.Signage.Player.Wpf`.
## Failure Notes
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
`DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
present on the runner pod.
- `404` during runner registration: the fine-grained PAT is valid but missing
repository access for that repo. Add the repo to the PAT access list; the PAT
value does not change.
- `Multi-Attach` volume error: only the Common runner uses a RWO PVC and it must
stay single-replica. New multi-replica runners use `emptyDir`.

File diff suppressed because it is too large Load Diff

View File

@@ -75,6 +75,20 @@ data:
cluster: "rke2"
role: "agent"
# Mac mini macOS runner node (INFRA VLAN)
- job_name: "macmini-node"
scrape_timeout: 15s
static_configs:
- targets: ["10.0.56.115:9100"]
labels:
instance: "macmini"
host: "macmini.iamworkin.lan"
vlan: "infra"
arch: "arm64"
role: "macos-runner"
puppet_managed: "true"
puppet_server: "puppet.iamworkin.lan"
# In-cluster node-exporter DaemonSet
- job_name: "k8s-node-exporter"
kubernetes_sd_configs:
@@ -697,6 +711,36 @@ data:
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
- name: macmini-runners
rules:
- alert: MacMiniRunnerOffline
expr: (flowercore_github_runner_online{runner=~"macmini-.*"} == 0) or absent(flowercore_github_runner_online{runner=~"macmini-.*"})
for: 10m
labels:
severity: warning
service: github-runner
annotations:
summary: "Mac mini GitHub runner offline ({{ $labels.runner }})"
description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-<slug>.plist; runners survive reboot and do not require a GUI session."
- name: linux-runners
rules:
- alert: LinuxRunnerOffline
expr: |
kube_deployment_status_replicas_ready{
namespace="github-runner",
deployment=~"github-runner(|-.+)"
} == 0
for: 5m
labels:
severity: warning
alert_channel: irc
service: github-runner
team: ci
annotations:
summary: "Linux CI runner offline: {{ $labels.deployment }}"
description: "Deployment {{ $labels.deployment }} in namespace github-runner has 0 ready replicas for more than 5 minutes. CI jobs targeting this repo will queue until the runner pod restarts and re-registers with GitHub. Check pods with: kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }}. Check logs with: kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50. Common causes: PAT missing repo access, runner CrashLoopBackOff, or node/resource pressure."
- name: remote-desktop
rules:
- alert: RemoteDesktopWebDown
@@ -922,6 +966,52 @@ data:
annotations:
summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
# Puppet agent + service alerts.
# Mirror of FlowerCore.Notes/scripts/monitoring/alerts.yml `puppet` group
# so a future migration to in-cluster Prometheus inherits the ruleset.
# Source-of-truth for the live Podman Prometheus on noc1 is the Notes file.
# See feedback_monitoring_k8s_target_vs_live_podman.
- name: puppet
rules:
- alert: PuppetAgentReportStale
expr: puppet_last_run_age_seconds > 7200
for: 30m
labels:
severity: warning
alert_channel: irc
annotations:
summary: "Puppet agent {{ $labels.instance }} hasn't reported in over 2h"
description: "Last run age: {{ $value | humanizeDuration }}. The puppet agent on {{ $labels.instance }} may be stopped, the node may be powered off, or noc1 may be unreachable from this node."
runbook: "1. SSH to node (via noc1 jumpbox if needed) 2. sudo systemctl status puppet 3. sudo puppet agent -t --noop to force a run 4. Check r10k: ssh fcadmin@10.0.56.10 'sudo podman logs openvoxserver --tail 50' 5. Verify noc1 reachability: ping puppet.iamworkin.lan"
- alert: PuppetAgentReportCritical
expr: puppet_last_run_age_seconds > 86400
for: 1h
labels:
severity: critical
alert_channel: irc
annotations:
summary: "Puppet agent {{ $labels.instance }} silent for over 24h — node is unmanaged"
description: "Last run age: {{ $value | humanizeDuration }}. Node {{ $labels.instance }} has not submitted a Puppet report in over 24 hours. Config drift is accumulating — investigate immediately. If intentional (maintenance), add to the exclusion filter or silence in Grafana."
runbook: "URGENT: 1. Check node power state 2. SSH via noc1 jumpbox: ssh fcadmin@10.0.56.10 then ssh <node> 3. sudo systemctl status puppet 4. sudo systemctl start puppet + sudo puppet agent -t 5. Check for network partitions (VLAN connectivity to 10.0.56.10) 6. If node was recently reimaged: sudo puppet agent -t to re-register with new SSL cert"
# Sprint 33 Cx-7 Phase B (2026-05-25 postmortem follow-up):
# Detects puppet.service in failed state — distinct from PuppetAgentReportStale
# which catches "agent hasn't run." This catches "systemd gave up restarting it"
# (CA-verify loop or other fatal exit). Requires node-exporter systemd collector
# enabled with --collector.systemd. If `node_systemd_unit_state` has no series
# for a node, the collector is disabled there — flag in postmortem follow-up.
- alert: PuppetServiceFailed
expr: node_systemd_unit_state{name="puppet.service",state="failed"} == 1
for: 5m
labels:
severity: warning
alert_channel: irc
annotations:
summary: "Puppet service failed on {{ $labels.instance }}"
description: "puppet.service on {{ $labels.instance }} has been in failed state for 5+ minutes. systemd has stopped auto-restarting (CA-verify-loop or other exit). Manual `systemctl status puppet` confirms. Run `sudo systemctl start puppet` to recover; investigate journal for root cause."
runbook_url: "https://github.com/astoltz/FlowerCore.Notes/blob/master/memory/feedback_puppet_service_dead_after_ca_loop_alert_misreads.md"
# K8s pod-state alerts. Require kube-state-metrics scrape (added
# 2026-04-26 — see scrape_configs above). Would have surfaced the
# agent-zero ollama-proxy 172x crash-loop instead of letting it
@@ -3395,6 +3485,39 @@ data:
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
- orgId: 1
name: CI Runners
folder: CI Alerts
interval: 1m
rules:
- uid: linux-runner-offline
title: LinuxRunnerOffline
condition: C
for: 5m
noDataState: OK
execErrState: Error
annotations:
summary: "Linux CI runner offline: {{ $labels.deployment }}"
description: "A github-runner namespace Deployment has 0 ready replicas for more than 5 minutes. CI jobs targeting that repo will queue until the runner pod restarts and re-registers."
runbook: "1. kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }} 2. kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50 3. Verify PAT repo access if registration returns 404 4. Verify no RWO PVC is shared by scaled runners"
labels:
severity: warning
service: github-runner
alert_channel: irc
team: ci
data:
- refId: A
relativeTimeRange: {from: 300, to: 0}
datasourceUid: prometheus
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-.+)"} == 0', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__
model: {type: reduce, expression: A, reducer: last, refId: B}
- refId: C
relativeTimeRange: {from: 300, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
- orgId: 1
name: Infrastructure
folder: AI Stack Alerts
@@ -3427,6 +3550,32 @@ data:
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
- uid: macmini-runner-offline
title: MacMiniRunnerOffline
condition: C
for: 10m
noDataState: Alerting
execErrState: OK
annotations:
summary: Mac mini GitHub runner offline
description: "One or more macmini-* GitHub Actions runners have not reported online for more than 10 minutes. LaunchDaemons survive reboot and do not require the bluejay GUI session."
runbook: "1. ssh fcadmin@macmini.iamworkin.lan 2. launchctl print system/io.flowercore.github-runner-<slug> 3. Check /Users/fcadmin/Library/Logs/github-runners/<slug>/stderr.log 4. Re-register the repo runner if .runner is missing"
labels:
severity: warning
service: github-runner
data:
- refId: A
relativeTimeRange: {from: 600, to: 0}
datasourceUid: prometheus
model: {expr: 'min(flowercore_github_runner_online{runner=~"macmini-.*"} or vector(0))', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 600, to: 0}
datasourceUid: __expr__
model: {type: reduce, expression: A, reducer: last, refId: B}
- refId: C
relativeTimeRange: {from: 600, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
- uid: high-cpu
title: High CPU (>85%)
condition: C

View File

@@ -28,9 +28,12 @@ Source: `D:\git\FlowerCore\FlowerCore.WorldBuilder` (master)
Memory: `feedback_rke2_image_import_per_node_scp`.
3. **Bump image tag** in `worldbuilder.yaml` and git push.
ArgoCD ApplicationSet picks up within ~3 minutes.
4. **First production render** — open `https://worldbuilder.iamworkin.lan`,
create World → Character → Storyboard → ExportJob, confirm artifact
downloads. ComfyUI lives on BLUEJAY-WS at `http://10.0.56.20:8188`.
4. **First production render** — open
`https://worldbuilder.iamworkin.lan/studio/c32e0000-0000-4000-8000-000000000004`
and confirm the Cyberpunk Blue Jay demo prompt loads with five seeded fake
generated images. This Sprint 32 visitor-safe profile uses
`ClientMode=fake`; switch the image-generation env vars back to ComfyUI only
for an operator-owned GPU render lane.
## Health probes
@@ -53,8 +56,13 @@ Source: `D:\git\FlowerCore\FlowerCore.WorldBuilder` (master)
## Image generation backend
`FlowerCore:WorldBuilder:ImageGeneration:BaseUrl=http://10.0.56.20:8188` —
ComfyUI runs on BLUEJAY-WS Windows (R9700 / gfx1201 / ROCm 7.2.1). Pod reaches
the workstation directly across the 10.0.56.0/24 VLAN (no Podman-style host-
filter issues — K8s pods route via Calico, which is L3-routed across the
VLAN).
Sprint 32 pins the Kubernetes profile to
`FlowerCore:WorldBuilder:ImageGeneration:ClientMode=fake` with
`BaseUrl=http://127.0.0.1:1`. That keeps the public/internal visitor demo
deterministic, avoids GPU exposure, and still exercises the studio/gallery
surface with persisted generated-image metadata.
The previous ComfyUI backend target was `http://10.0.56.20:8188` on
BLUEJAY-WS (R9700 / gfx1201 / ROCm 7.2.1). Re-enable it only in an
operator-owned follow-up that also verifies workstation reachability and image
import freshness.

View File

@@ -16,7 +16,11 @@ kind: Namespace
metadata:
name: fc-worldbuilder
labels:
app.kubernetes.io/name: fc-worldbuilder
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
---
# SQLite DB + generated image gallery + PDF/PNG exports.
# Longhorn RWO — single replica with `Recreate` rollout strategy keeps it safe.
@@ -25,6 +29,13 @@ kind: PersistentVolumeClaim
metadata:
name: worldbuilder-data
namespace: fc-worldbuilder
labels:
app.kubernetes.io/name: worldbuilder-data
app.kubernetes.io/component: storage
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
accessModes:
- ReadWriteOnce
@@ -40,7 +51,13 @@ metadata:
namespace: fc-worldbuilder
labels:
app.kubernetes.io/name: worldbuilder-web
app.kubernetes.io/component: web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
annotations:
flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
spec:
replicas: 1
revisionHistoryLimit: 3
@@ -54,11 +71,16 @@ spec:
metadata:
labels:
app.kubernetes.io/name: worldbuilder-web
app.kubernetes.io/component: web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics/prometheus"
flowercore.io/audit-trace-id: "worldbuilder-runtime-demo"
spec:
securityContext:
fsGroup: 1654
@@ -92,11 +114,14 @@ spec:
value: "/data/gallery"
- name: FlowerCore__WorldBuilder__Export__RootPath
value: "/data/exports"
# ComfyUI on BLUEJAY-WS (R9700 / gfx1201 / ROCm 7.2.1).
# Visitor-safe Sprint 32 profile: fake backend keeps public demo
# rendering deterministic and avoids exposing BLUEJAY-WS GPU.
- name: FlowerCore__WorldBuilder__ImageGeneration__BaseUrl
value: "http://10.0.56.20:8188"
value: "http://127.0.0.1:1"
- name: FlowerCore__WorldBuilder__ImageGeneration__ClientMode
value: "comfyui"
value: "fake"
- name: FlowerCore__WorldBuilder__ImageGeneration__BackendId
value: "fake"
resources:
# Cluster CPU-request budget runs hot (99% on all 3 nodes at deploy
# time) while actual CPU usage is well below capacity. Idle Blazor
@@ -165,7 +190,11 @@ metadata:
namespace: fc-worldbuilder
labels:
app.kubernetes.io/name: worldbuilder-web
app.kubernetes.io/component: web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
type: ClusterIP
selector:
@@ -180,6 +209,13 @@ kind: Certificate
metadata:
name: worldbuilder-web-tls
namespace: fc-worldbuilder
labels:
app.kubernetes.io/name: worldbuilder-web-tls
app.kubernetes.io/component: ingress
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
secretName: worldbuilder-web-tls
issuerRef:
@@ -200,6 +236,13 @@ kind: IngressRoute
metadata:
name: worldbuilder-web
namespace: fc-worldbuilder
labels:
app.kubernetes.io/name: worldbuilder-web
app.kubernetes.io/component: ingress
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
entryPoints:
- websecure

View File

@@ -305,15 +305,17 @@ spec:
path: /
port: 8080
initialDelaySeconds: 60
timeoutSeconds: 5
timeoutSeconds: 15
periodSeconds: 10
failureThreshold: 3
readinessProbe:
httpGet:
path: /
port: 8080
initialDelaySeconds: 30
periodSeconds: 5
timeoutSeconds: 5
timeoutSeconds: 15
failureThreshold: 3
---
apiVersion: v1
kind: Service

View File

@@ -0,0 +1,84 @@
# openvoxserver Quadlet Durability
This runbook documents the noc1 `openvoxserver` durability fix for the Puppet control-repo deploy path. The service is a noc1 host artifact, not an ArgoCD application, so discovery always starts on noc1 rather than in `apps/*`.
## Current State
As of the Sprint 32 Cx-12 apply on 2026-05-17:
- `/etc/containers/systemd/openvoxserver.container` has a `GIT_SSH_COMMAND` environment entry that points at the persisted serverdata deploy key.
- `/etc/systemd/system/openvoxserver-safeconfig.service` is enabled and active, and reapplies `git config --global --add safe.directory *` inside the running container.
- `/opt/puppet/r10k-deploy.sh` self-heals before each fetch by setting `safe.directory`, the repo-local `core.sshCommand`, and the persisted `known_hosts` file when needed.
- `puppet-deploy.service` exits `0/SUCCESS` after the apply and the control repo reports `HEAD == origin/master`.
- `systemctl cat openvoxserver` does not currently resolve to a generated unit on noc1. The container is running through Podman with `restart=always`, so destructive recreate smoke must not run until the generated unit is present.
## Discovery
Run every command through noc1 as `fcadmin`; do not assume BLUEJAY-WS can reach container-local surfaces directly.
```bash
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "hostname && sudo -n true"
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo find /etc/containers/systemd /usr/share/containers/systemd /etc/systemd/system -name 'openvoxserver*' 2>/dev/null"
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo sed -n '1,220p' /etc/containers/systemd/openvoxserver.container"
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo systemctl cat puppet-deploy.service"
```
If a future noc1 profile manages these files, update the Puppet control repo and let `puppet-deploy.service` apply the change. On 2026-05-17, host `puppet` was not installed, so Cx-12 used a direct noc1 host edit.
## Durable Fix Shape
The Quadlet keeps the deploy key as a path reference only:
```ini
Environment=GIT_SSH_COMMAND=ssh -i /opt/puppetlabs/server/data/puppetserver/.puppet-deploy-key -o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o UserKnownHostsFile=/opt/puppetlabs/server/data/puppetserver/.known_hosts
```
The safeconfig service is intentionally independent of `openvoxserver.service` until the generated unit exists. It waits for the `openvoxserver` container name and then runs:
```bash
/usr/bin/podman exec openvoxserver git config --global --add safe.directory *
```
The deploy script self-heals inside the container before it fetches the control repo:
```bash
git config --global --add safe.directory "*" 2>/dev/null || true
DEPLOY_KEY="/opt/puppetlabs/server/data/puppetserver/.puppet-deploy-key"
KNOWN_HOSTS="/opt/puppetlabs/server/data/puppetserver/.known_hosts"
REPO="/etc/puppetlabs/code/environments/production"
export GIT_SSH_COMMAND="ssh -i $DEPLOY_KEY -o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o UserKnownHostsFile=$KNOWN_HOSTS"
git -C "$REPO" config core.sshCommand "$GIT_SSH_COMMAND" 2>/dev/null || true
```
## Validation
Non-destructive validation:
```bash
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo grep -n 'GIT_SSH_COMMAND' /etc/containers/systemd/openvoxserver.container"
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo systemctl status openvoxserver-safeconfig.service --no-pager -l"
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo systemctl start puppet-deploy.service && sudo systemctl status puppet-deploy.service --no-pager -l"
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo podman exec openvoxserver git -C /etc/puppetlabs/code/environments/production config --get core.sshCommand"
```
Destructive recreate smoke is opt-in only:
```bash
scp scripts/monitoring/openvox-recreate-smoke.sh fcadmin@10.0.56.10:/tmp/openvox-recreate-smoke.sh
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "chmod +x /tmp/openvox-recreate-smoke.sh && sudo OPENVOX_RECREATE_SMOKE=1 /tmp/openvox-recreate-smoke.sh"
```
Do not run the smoke during normal sprint work. It stops and removes the production container before starting it again through systemd, and it now refuses to continue unless `systemctl cat openvoxserver` succeeds.
## Credential Rotation Note
When rotating the Puppet deploy key, update the persisted serverdata copy on noc1:
```bash
sudo install -m 0600 -o root -g root <new-deploy-key> /opt/puppet/serverdata/.puppet-deploy-key
sudo podman exec openvoxserver sh -c "ssh-keyscan github.com > /opt/puppetlabs/server/data/puppetserver/.known_hosts"
sudo systemctl start openvoxserver-safeconfig.service
sudo systemctl start puppet-deploy.service
```
Never commit the deploy key or print it in logs.

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env bash
set -euo pipefail
if [ "${OPENVOX_RECREATE_SMOKE:-}" != "1" ]; then
echo "SKIP: set OPENVOX_RECREATE_SMOKE=1 to run the destructive openvoxserver recreate smoke." >&2
exit 64
fi
SUDO="${SUDO:-sudo}"
REPO="/etc/puppetlabs/code/environments/production"
CORE_SSH_COMMAND_FRAGMENT=".puppet-deploy-key"
if ! $SUDO systemctl cat openvoxserver >/dev/null 2>&1; then
echo "SKIP: systemctl cat openvoxserver failed; refusing to remove a container without a verified systemd recreate path." >&2
exit 65
fi
before="$($SUDO podman exec openvoxserver git -C "$REPO" rev-parse --short HEAD)"
echo "Before recreate: $before"
$SUDO systemctl stop openvoxserver
$SUDO podman rm openvoxserver 2>/dev/null || true
$SUDO systemctl start openvoxserver
sleep 50
$SUDO systemctl start puppet-deploy.service
sleep 5
$SUDO systemctl status puppet-deploy.service --no-pager -l
after="$($SUDO podman exec openvoxserver git -C "$REPO" rev-parse --short origin/master)"
echo "After recreate origin/master: $after"
$SUDO test -d /opt/puppet/code/environments/production/site-modules/profile/manifests
core_ssh="$($SUDO podman exec openvoxserver git -C "$REPO" config --get core.sshCommand)"
case "$core_ssh" in
*"$CORE_SSH_COMMAND_FRAGMENT"*) ;;
*)
echo "FAIL: core.sshCommand does not reference the persisted deploy key." >&2
exit 1
;;
esac
$SUDO podman exec openvoxserver git -C "$REPO" status --short --branch
echo "PASS: openvoxserver recreate smoke completed without git safety or deploy-key failure."

View File

@@ -13,6 +13,7 @@ public sealed class FleetManifestLintTests
private static readonly HashSet<string> PublicReadOnlyHosts = new(StringComparer.Ordinal)
{
"brochure.flowercore.io",
"dist.flowercore.io",
"dns.iamworkin.lan",
};
@@ -54,6 +55,47 @@ public sealed class FleetManifestLintTests
"ttsreader-piper",
};
private static readonly IReadOnlyDictionary<string, string> LinuxRunnerRepos = new Dictionary<string, string>(StringComparer.Ordinal)
{
["github-runner"] = "https://github.com/astoltz/FlowerCore.Common",
["github-runner-sharedpos"] = "https://github.com/astoltz/FlowerCore.Shared.Pos",
["github-runner-puppet"] = "https://github.com/astoltz/FlowerCore.Puppet",
["github-runner-signage"] = "https://github.com/astoltz/FlowerCore.Signage",
["github-runner-dms"] = "https://github.com/astoltz/FlowerCore.DMS",
["github-runner-telephony"] = "https://github.com/astoltz/FlowerCore.Telephony",
["github-runner-print-web"] = "https://github.com/astoltz/FlowerCore.Print.Web",
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
["github-runner-devicemgmt"] = "https://github.com/astoltz/FlowerCore.DeviceManagement",
["github-runner-worldbuilder"] = "https://github.com/astoltz/FlowerCore.WorldBuilder",
};
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
{
"github-runner-sharedpos",
"github-runner-puppet",
"github-runner-signage",
"github-runner-dms",
"github-runner-telephony",
"github-runner-print-web",
"github-runner-chat",
"github-runner-mysql",
"github-runner-kiosk-linux",
"github-runner-devicemgmt",
"github-runner-worldbuilder",
};
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
{
["HOME"] = "/home/runner",
["DOTNET_INSTALL_DIR"] = "/home/runner/.dotnet",
["DOTNET_CLI_HOME"] = "/home/runner",
["NUGET_PACKAGES"] = "/home/runner/.nuget/packages",
["XDG_CACHE_HOME"] = "/home/runner/.cache",
["RUNNER_TOOL_CACHE"] = "/home/runner/_tool",
};
[Fact]
public void IngressRoutes_MustKeepServiceReferencesInTheSameNamespace()
{
@@ -187,6 +229,98 @@ public sealed class FleetManifestLintTests
violations.Should().BeEmpty();
}
[Fact]
public void GitHubRunnerFleet_MustRegisterRequiredReposAsRepoScopedDeployments()
{
var deployments = GitHubRunnerDeployments();
foreach (var expectedRunner in LinuxRunnerRepos)
{
deployments.Should().ContainKey(expectedRunner.Key);
var container = RunnerContainer(deployments[expectedRunner.Key]);
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
EnvValue(container, "EPHEMERAL").Should().Be("true");
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
EnvValue(container, "RUN_AS_ROOT").Should().Be("false");
EnvValue(container, "ACCESS_TOKEN").Should().BeNull("ACCESS_TOKEN must come from github-runner-token Secret, not a literal");
EnvSecretName(container, "ACCESS_TOKEN").Should().Be("github-runner-token");
EnvSecretKey(container, "ACCESS_TOKEN").Should().Be("credential");
}
}
[Fact]
public void GitHubRunnerFleet_MustSetWritableNonRootDotnetAndCachePaths()
{
foreach (var deployment in GitHubRunnerDeployments().Values)
{
var container = RunnerContainer(deployment);
foreach (var expectedEnv in WritableRunnerEnv)
{
EnvValue(container, expectedEnv.Key).Should().Be(expectedEnv.Value, $"{deployment.Name} must keep .NET paths writable for uid 1001");
}
var mounts = ManifestNodeExtensions.MappingSequence(container, "volumeMounts")
.ToDictionary(
mount => ManifestNodeExtensions.Scalar(mount, "name") ?? string.Empty,
mount => ManifestNodeExtensions.Scalar(mount, "mountPath") ?? string.Empty,
StringComparer.Ordinal);
mounts.Should().Contain("runner-home", "/home/runner");
mounts.Should().Contain("nuget-cache", "/home/runner/.nuget/packages");
mounts.Should().Contain("tmp", "/tmp");
}
}
[Fact]
public void GitHubRunnerFleet_MustAvoidRwoMultiAttachForScaledDeployments()
{
var deployments = GitHubRunnerDeployments();
foreach (var deploymentName in ScaledLinuxRunnerDeployments)
{
var deployment = deployments[deploymentName];
ReplicaCount(deployment).Should().Be(2);
var volumes = deployment.MappingSequence("spec", "template", "spec", "volumes");
var claimNames = volumes
.Select(volume => ManifestNodeExtensions.Scalar(volume, "persistentVolumeClaim", "claimName"))
.Where(value => !string.IsNullOrWhiteSpace(value))
.ToList();
claimNames.Should().BeEmpty($"{deploymentName} is scaled and must not share a RWO PVC");
volumes.Should().Contain(volume =>
string.Equals(ManifestNodeExtensions.Scalar(volume, "name"), "nuget-cache", StringComparison.Ordinal)
&& ManifestNodeExtensions.Mapping(volume, "emptyDir") != null);
}
var common = deployments["github-runner"];
ReplicaCount(common).Should().Be(1);
common.MappingSequence("spec", "template", "spec", "volumes")
.Select(volume => ManifestNodeExtensions.Scalar(volume, "persistentVolumeClaim", "claimName"))
.Where(value => !string.IsNullOrWhiteSpace(value))
.Should()
.ContainSingle()
.Which
.Should()
.Be("github-runner-nuget-cache");
}
[Fact]
public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable()
{
var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml"));
monitoring.Should().Contain("MacMiniRunnerOffline");
monitoring.Should().Contain("LinuxRunnerOffline");
monitoring.Should().Contain("kube_deployment_status_replicas_ready");
monitoring.Should().Contain("github-runner(|-.+)");
monitoring.Should().Contain("folder: CI Alerts");
monitoring.Should().Contain("uid: linux-runner-offline");
monitoring.Should().Contain("alert_channel: irc");
}
[Fact]
public void StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults()
{
@@ -291,6 +425,184 @@ public sealed class FleetManifestLintTests
violations.Should().BeEmpty();
}
[Fact]
public void FcDeviceManagement_MustShipExpectedManifestSet()
{
var appRoot = Path.Combine(Inventory.BluejayRoot, "apps", "fc-devicemgmt");
Directory.Exists(appRoot).Should().BeTrue("Sprint 8 Cx-5 owns apps/fc-devicemgmt.");
var expectedFiles = new[]
{
"1password-item.yaml",
"argocd-application.yaml",
"certificate-web.yaml",
"clusterrole-operator.yaml",
"clusterrolebinding-operator.yaml",
"deployment-operator.yaml",
"deployment-web.yaml",
"ingressroute-web.yaml",
"namespace.yaml",
"network-policy.yaml",
"service-web.yaml",
"serviceaccount-operator.yaml",
};
Directory.GetFiles(appRoot, "*.yaml")
.Select(Path.GetFileName)
.Should()
.BeEquivalentTo(expectedFiles);
foreach (var expectedFile in expectedFiles)
{
FcDeviceManagementDocuments()
.Should()
.Contain(document => document.RelativePath == $"fc-devicemgmt/{expectedFile}");
}
}
[Fact]
public void FcDeviceManagement_ObjectsMustCarryStandardTraceabilityLabels()
{
var requiredLabels = new[]
{
"app.kubernetes.io/name",
"app.kubernetes.io/part-of",
"app.kubernetes.io/managed-by",
"flowercore.io/tenant-id",
"flowercore.io/created-by",
};
var violations = FcDeviceManagementDocuments()
.SelectMany(document => requiredLabels
.Where(label => string.IsNullOrWhiteSpace(document.Scalar("metadata", "labels", label)))
.Select(label => $"{document.Descriptor} is missing metadata.labels['{label}']."))
.Concat(FcDeviceManagementDocuments()
.Where(document => document.Kind == "Deployment")
.SelectMany(document => requiredLabels
.Where(label => string.IsNullOrWhiteSpace(document.Scalar("spec", "template", "metadata", "labels", label)))
.Select(label => $"{document.Descriptor} pod template is missing metadata.labels['{label}'].")))
.Concat(FcDeviceManagementDocuments()
.Where(document => document.Kind == "Deployment")
.Where(document => string.IsNullOrWhiteSpace(document.Scalar("spec", "template", "metadata", "annotations", "flowercore.io/audit-trace-id")))
.Select(document => $"{document.Descriptor} pod template is missing flowercore.io/audit-trace-id."))
.ToList();
violations.Should().BeEmpty();
}
[Fact]
public void FcDeviceManagement_IngressMustUseCertManagerAndKeepPublicHostDisabled()
{
var appText = string.Join(
Environment.NewLine,
Directory.GetFiles(Path.Combine(Inventory.BluejayRoot, "apps", "fc-devicemgmt"), "*.yaml")
.Select(File.ReadAllText));
appText.Should().NotContain("certResolver");
appText.Should().Contain("update.flowercore.io");
appText.Should().Contain("disabled-until-Q-OIDC-1");
FcDeviceManagementDocuments()
.Where(document => document.Kind == "IngressRoute")
.SelectMany(document => document.MappingSequence("spec", "routes"))
.Select(route => ManifestNodeExtensions.Scalar(route, "match") ?? string.Empty)
.Should()
.Contain(match => match.Contains("Host(`devices.iamworkin.lan`)", StringComparison.Ordinal))
.And.NotContain(match => match.Contains("Host(`update.flowercore.io`)", StringComparison.Ordinal));
var certificate = FcDeviceManagementDocuments()
.Single(document => document.Kind == "Certificate" && document.Name == "fc-devicemgmt-web-tls");
certificate.Scalar("spec", "issuerRef", "name").Should().Be("step-ca-acme");
certificate.Scalar("spec", "issuerRef", "kind").Should().Be("ClusterIssuer");
ManifestNodeExtensions.ScalarSequence(certificate.Root, "spec", "dnsNames")
.Should()
.ContainSingle("devices.iamworkin.lan");
}
[Fact]
public void FcDeviceManagement_OperatorRbacMustCoverDevicesAndOwnerLookup()
{
var clusterRole = FcDeviceManagementDocuments()
.Single(document => document.Kind == "ClusterRole" && document.Name == "fc-devicemgmt-operator");
var allScalars = clusterRole.AllScalars().ToList();
allScalars.Should().Contain("devices.flowercore.io");
allScalars.Should().Contain("*");
allScalars.Should().Contain("deployments");
allScalars.Should().Contain("get");
var operatorDeployment = FcDeviceManagementDocuments()
.Single(document => document.Kind == "Deployment" && document.Name == "fc-devicemgmt-operator");
operatorDeployment.AllScalars().Should().Contain("FLOWERCORE_KUBERNETES_OWNER_DEPLOYMENT");
operatorDeployment.AllScalars().Should().Contain("fc-devicemgmt-operator");
}
[Fact]
public void FcDeviceManagement_RuntimeSecretsMustUseOnePasswordItemPattern()
{
var item = FcDeviceManagementDocuments()
.Single(document => document.Kind == "OnePasswordItem" && document.Name == "fc-devicemgmt-runtime");
item.Scalar("spec", "itemPath")
.Should()
.Be("vaults/IAmWorkin/items/FlowerCore DeviceManagement Runtime");
var appText = string.Join(
Environment.NewLine,
Directory.GetFiles(Path.Combine(Inventory.BluejayRoot, "apps", "fc-devicemgmt"), "*.yaml")
.Select(File.ReadAllText));
FcDeviceManagementDocuments().Should().NotContain(document => document.Kind == "Secret");
appText.Should().Contain("secretKeyRef:");
appText.Should().Contain("secretName: fc-devicemgmt-runtime");
appText.Should().NotContain("stringData:");
appText.Should().NotContain("from-literal");
appText.Should().NotContain("tls.key:");
}
[Fact]
public void FcDeviceManagement_NetworkPoliciesMustAllowLanAgentsSynologyAndDnatPorts()
{
var policies = FcDeviceManagementDocuments()
.Where(document => document.Kind == "NetworkPolicy")
.ToList();
policies.Should().HaveCount(2);
var combinedScalars = policies.SelectMany(policy => policy.AllScalars()).ToList();
combinedScalars.Should().Contain("10.0.56.0/24");
combinedScalars.Should().Contain("10.0.57.0/24");
combinedScalars.Should().Contain("10.0.58.0/24");
combinedScalars.Should().Contain("10.0.68.0/27");
combinedScalars.Should().Contain("10.0.58.3/32");
var combinedEgressPorts = policies.SelectMany(policy => policy.EgressPorts()).ToHashSet(StringComparer.Ordinal);
combinedEgressPorts.Should().Contain(new[] { "80", "443", "8080", "8443", "2049", "111" });
var traefikVipPolicies = policies
.Where(policy => policy.AllScalars().Any(value => value.Contains("10.0.56.200", StringComparison.Ordinal)))
.ToList();
traefikVipPolicies.Should().ContainSingle();
traefikVipPolicies[0].EgressPorts().Should().Contain(new[] { "80", "443", "8080", "8443" });
}
[Fact]
public void FcDeviceManagement_ArgocdApplicationMustMatchApplicationSetDiscoveryConventions()
{
var application = FcDeviceManagementDocuments()
.Single(document => document.Kind == "Application" && document.Name == "infra-fc-devicemgmt");
application.Namespace.Should().Be("argocd");
application.Scalar("spec", "source", "repoURL")
.Should()
.Be("http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git");
application.Scalar("spec", "source", "path").Should().Be("apps/fc-devicemgmt");
application.Scalar("spec", "destination", "namespace").Should().Be("fc-devicemgmt");
}
private static IEnumerable<string> ProbeViolations(
ManifestDocument document,
YamlMappingNode container,
@@ -314,6 +626,60 @@ public sealed class FleetManifestLintTests
$"{document.Descriptor} container '{containerName}' still uses {probeKey}.httpGet on /health.",
};
}
private static IReadOnlyDictionary<string, ManifestDocument> GitHubRunnerDeployments()
{
return Inventory.Documents
.Where(document => document.Kind == "Deployment")
.Where(document => document.Namespace == "github-runner")
.ToDictionary(document => document.Name, StringComparer.Ordinal);
}
private static int ReplicaCount(ManifestDocument document)
{
return int.TryParse(document.Scalar("spec", "replicas"), out var replicas) ? replicas : 1;
}
private static string? EnvValue(YamlMappingNode container, string name)
{
return EnvMapping(container, name) is { } env ? ManifestNodeExtensions.Scalar(env, "value") : null;
}
private static YamlMappingNode RunnerContainer(ManifestDocument deployment)
{
return deployment.ContainerMappings()
.Where(container => string.Equals(ManifestNodeExtensions.Scalar(container, "name"), "runner", StringComparison.Ordinal))
.Should()
.ContainSingle($"{deployment.Name} must keep exactly one main runner container")
.Subject;
}
private static string? EnvSecretName(YamlMappingNode container, string name)
{
return EnvMapping(container, name) is { } env
? ManifestNodeExtensions.Scalar(env, "valueFrom", "secretKeyRef", "name")
: null;
}
private static string? EnvSecretKey(YamlMappingNode container, string name)
{
return EnvMapping(container, name) is { } env
? ManifestNodeExtensions.Scalar(env, "valueFrom", "secretKeyRef", "key")
: null;
}
private static YamlMappingNode? EnvMapping(YamlMappingNode container, string name)
{
return ManifestNodeExtensions.MappingSequence(container, "env")
.SingleOrDefault(env => string.Equals(ManifestNodeExtensions.Scalar(env, "name"), name, StringComparison.Ordinal));
}
private static IReadOnlyList<ManifestDocument> FcDeviceManagementDocuments()
{
return Inventory.Documents
.Where(document => document.RelativePath.StartsWith("fc-devicemgmt/", StringComparison.Ordinal))
.ToList();
}
}
internal sealed class ManifestInventory

View File

@@ -0,0 +1,99 @@
using FluentAssertions;
using Xunit;
namespace BluejayInfraLint.Tests;
[Trait("Category", "Unit")]
public sealed class OpenVoxServerDurabilityTests
{
private static readonly string Root = FindRepoRoot();
private static readonly string RunbookPath = Path.Combine(Root, "docs", "runbooks", "openvoxserver-quadlet-durability.md");
private static readonly string SmokePath = Path.Combine(Root, "scripts", "monitoring", "openvox-recreate-smoke.sh");
[Fact]
public void Runbook_DocumentsHostArtifactAndNonArgoPath()
{
var runbook = File.ReadAllText(RunbookPath);
runbook.Should().Contain("noc1 host artifact");
runbook.Should().Contain("not an ArgoCD application");
runbook.Should().Contain("systemctl cat openvoxserver");
runbook.Should().Contain("/etc/containers/systemd/openvoxserver.container");
}
[Fact]
public void Runbook_DocumentsCx12LiveApplyState()
{
var runbook = File.ReadAllText(RunbookPath);
runbook.Should().Contain("Sprint 32 Cx-12");
runbook.Should().Contain("openvoxserver-safeconfig.service");
runbook.Should().Contain("/opt/puppet/r10k-deploy.sh");
runbook.Should().Contain("HEAD == origin/master");
}
[Fact]
public void SmokeScript_IsExplicitlyOptIn()
{
var smoke = File.ReadAllText(SmokePath);
smoke.Should().Contain("OPENVOX_RECREATE_SMOKE");
smoke.Should().Contain("exit 64");
smoke.IndexOf("OPENVOX_RECREATE_SMOKE", StringComparison.Ordinal)
.Should().BeLessThan(smoke.IndexOf("systemctl stop openvoxserver", StringComparison.Ordinal));
}
[Fact]
public void SmokeScript_RequiresGeneratedSystemdUnitBeforeRemovingContainer()
{
var smoke = File.ReadAllText(SmokePath);
smoke.Should().Contain("systemctl cat openvoxserver");
smoke.Should().Contain("refusing to remove a container without a verified systemd recreate path");
smoke.IndexOf("systemctl cat openvoxserver", StringComparison.Ordinal)
.Should().BeLessThan(smoke.IndexOf("podman rm openvoxserver", StringComparison.Ordinal));
}
[Fact]
public void Artifacts_DoNotStoreSecretsOrPaidRunnerLabels()
{
var forbidden = new[]
{
"BEGIN OPENSSH PRIVATE KEY",
"BEGIN RSA PRIVATE KEY",
"ubuntu-latest",
"windows-latest",
"macos-latest",
};
var violations = new[] { RunbookPath, SmokePath }
.SelectMany(path =>
{
var text = File.ReadAllText(path);
return forbidden
.Where(token => text.Contains(token, StringComparison.OrdinalIgnoreCase))
.Select(token => $"{Path.GetRelativePath(Root, path)} contains forbidden token {token}");
})
.ToList();
violations.Should().BeEmpty();
}
private static string FindRepoRoot()
{
var current = new DirectoryInfo(AppContext.BaseDirectory);
while (current is not null)
{
if (Directory.Exists(Path.Combine(current.FullName, "apps"))
&& Directory.Exists(Path.Combine(current.FullName, "scripts"))
&& File.Exists(Path.Combine(current.FullName, "README.md")))
{
return current.FullName;
}
current = current.Parent;
}
throw new DirectoryNotFoundException("Could not find bluejay-infra root.");
}
}

View File

@@ -174,10 +174,13 @@ public sealed class PiSignagePlayerArtifactTests
public void HdmiRule_RestartsPlayerAndRunsCapabilityDetection()
{
var rule = Read("systemd/99-flowercore-signage-hdmi.rules");
var responder = Read("scripts/flowercore-signage-hdmi-respond.sh");
rule.Should().Contain("KERNEL==\"card?-HDMI-A-?\"");
rule.Should().Contain("restart flowercore-signage-player-pi.service");
rule.Should().Contain("start flowercore-signage-detect-display.service");
rule.Should().Contain("start flowercore-signage-player-pi-hdmi.service");
responder.Should().Contain("sleep 2");
responder.Should().Contain("start flowercore-signage-detect-display.service");
responder.Should().Contain("restart flowercore-signage-player-pi.service");
}
[Fact]

View File

@@ -1,6 +1,6 @@
package bluejayinfra.public_method_allowlist
public_hosts := {"dist.flowercore.io", "dns.iamworkin.lan"}
public_hosts := {"brochure.flowercore.io", "dist.flowercore.io", "dns.iamworkin.lan"}
deny[msg] {
input.kind == "IngressRoute"