Compare commits
1 Commits
74333cc26b
...
claude/git
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a0f8fd1790 |
@@ -118,7 +118,6 @@ That test project sweeps `bluejay-infra/apps/**` plus the canonical sibling `Flo
|
|||||||
|
|
||||||
## References
|
## References
|
||||||
|
|
||||||
- OpenVox noc1 durability runbook: `docs/runbooks/openvoxserver-quadlet-durability.md`
|
|
||||||
- Cert-manager recovery playbook: `FlowerCore.Notes/memory/project_cert_manager_recovery_2026_04_22.md`
|
- Cert-manager recovery playbook: `FlowerCore.Notes/memory/project_cert_manager_recovery_2026_04_22.md`
|
||||||
- Why pfSense DNS is required: `FlowerCore.Notes/memory/feedback_pfsense_dns_required_for_acme.md`
|
- Why pfSense DNS is required: `FlowerCore.Notes/memory/feedback_pfsense_dns_required_for_acme.md`
|
||||||
- Public DNS operator host: `https://dns.iamworkin.lan`
|
- Public DNS operator host: `https://dns.iamworkin.lan`
|
||||||
|
|||||||
@@ -1,448 +0,0 @@
|
|||||||
# Authentik OIDC backend
|
|
||||||
# ArgoCD-managed. BlueJay Lab.
|
|
||||||
#
|
|
||||||
# Stack:
|
|
||||||
# - PostgreSQL 16 StatefulSet (single replica, Longhorn RWO 5Gi)
|
|
||||||
# - Redis 7 Deployment (no persistence — session/cache + Celery broker only)
|
|
||||||
# - Authentik server + worker Deployments (image ghcr.io/goauthentik/server:2024.12.3)
|
|
||||||
# - Media PVC shared between server + worker (Longhorn RWO 2Gi)
|
|
||||||
# - Certificate via step-ca-acme ClusterIssuer
|
|
||||||
# - Traefik IngressRoute at id.iamworkin.lan
|
|
||||||
#
|
|
||||||
# Secrets come from 1Password item "authentik-credentials" (IAmWorkin vault, id y6i74ch22q5wvm7znquq4nhhcu)
|
|
||||||
# via the OnePasswordItem CRD, materialized into k8s Secret authentik/authentik-credentials.
|
|
||||||
#
|
|
||||||
# Why the discovery URL is /application/o/pimanager/: Authentik issues a separate OIDC provider per application.
|
|
||||||
# The pimanager OIDC application/provider is created after the cluster pods are healthy (manually or
|
|
||||||
# via API once the bootstrap token is available — see Notes substrate).
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Namespace
|
|
||||||
metadata:
|
|
||||||
name: authentik
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/part-of: bluejay-infra
|
|
||||||
|
|
||||||
---
|
|
||||||
# 1Password operator pulls the authentik-credentials item into a k8s Secret of the same name.
|
|
||||||
# Field labels in 1P become Secret keys: AUTHENTIK_SECRET_KEY, POSTGRES_PASSWORD, REDIS_PASSWORD,
|
|
||||||
# BOOTSTRAP_ADMIN_PASSWORD, BOOTSTRAP_ADMIN_TOKEN, BOOTSTRAP_ADMIN_EMAIL.
|
|
||||||
apiVersion: onepassword.com/v1
|
|
||||||
kind: OnePasswordItem
|
|
||||||
metadata:
|
|
||||||
name: authentik-credentials
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
itemPath: "vaults/IAmWorkin/items/authentik-credentials"
|
|
||||||
|
|
||||||
---
|
|
||||||
# Shared media volume for server + worker pods.
|
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolumeClaim
|
|
||||||
metadata:
|
|
||||||
name: authentik-media
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
storageClassName: longhorn
|
|
||||||
accessModes: [ReadWriteOnce]
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: 2Gi
|
|
||||||
|
|
||||||
---
|
|
||||||
# PostgreSQL 16 StatefulSet — Authentik's primary store.
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: StatefulSet
|
|
||||||
metadata:
|
|
||||||
name: authentik-postgres
|
|
||||||
namespace: authentik
|
|
||||||
labels:
|
|
||||||
app: authentik-postgres
|
|
||||||
argocd.argoproj.io/instance: infra-authentik
|
|
||||||
spec:
|
|
||||||
persistentVolumeClaimRetentionPolicy:
|
|
||||||
whenDeleted: Retain
|
|
||||||
whenScaled: Retain
|
|
||||||
podManagementPolicy: OrderedReady
|
|
||||||
serviceName: authentik-postgres
|
|
||||||
replicas: 1
|
|
||||||
revisionHistoryLimit: 10
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: authentik-postgres
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: authentik-postgres
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: postgres
|
|
||||||
image: postgres:16-alpine
|
|
||||||
ports:
|
|
||||||
- containerPort: 5432
|
|
||||||
name: postgres
|
|
||||||
env:
|
|
||||||
- name: POSTGRES_USER
|
|
||||||
value: authentik
|
|
||||||
- name: POSTGRES_PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: POSTGRES_PASSWORD
|
|
||||||
- name: POSTGRES_DB
|
|
||||||
value: authentik
|
|
||||||
- name: POSTGRES_INITDB_ARGS
|
|
||||||
value: "--encoding=UTF-8 --lc-collate=C --lc-ctype=C"
|
|
||||||
- name: PGDATA
|
|
||||||
value: /var/lib/postgresql/data/pgdata
|
|
||||||
readinessProbe:
|
|
||||||
exec:
|
|
||||||
command: ["pg_isready", "-U", "authentik"]
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
livenessProbe:
|
|
||||||
exec:
|
|
||||||
command: ["pg_isready", "-U", "authentik"]
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 30
|
|
||||||
resources:
|
|
||||||
requests: { cpu: 100m, memory: 256Mi }
|
|
||||||
limits: { cpu: 1000m, memory: 1Gi }
|
|
||||||
volumeMounts:
|
|
||||||
- name: pgdata
|
|
||||||
mountPath: /var/lib/postgresql/data
|
|
||||||
volumeClaimTemplates:
|
|
||||||
- metadata:
|
|
||||||
name: pgdata
|
|
||||||
spec:
|
|
||||||
storageClassName: longhorn
|
|
||||||
accessModes: [ReadWriteOnce]
|
|
||||||
volumeMode: Filesystem
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: 5Gi
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: authentik-postgres
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
clusterIP: None
|
|
||||||
selector:
|
|
||||||
app: authentik-postgres
|
|
||||||
ports:
|
|
||||||
- name: postgres
|
|
||||||
port: 5432
|
|
||||||
targetPort: 5432
|
|
||||||
|
|
||||||
---
|
|
||||||
# Redis 7 — session storage + Celery broker. No persistence needed (cache).
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: authentik-redis
|
|
||||||
namespace: authentik
|
|
||||||
labels:
|
|
||||||
app: authentik-redis
|
|
||||||
argocd.argoproj.io/instance: infra-authentik
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
strategy:
|
|
||||||
type: Recreate
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: authentik-redis
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: authentik-redis
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: redis
|
|
||||||
image: redis:7-alpine
|
|
||||||
args:
|
|
||||||
- "--save"
|
|
||||||
- ""
|
|
||||||
- "--appendonly"
|
|
||||||
- "no"
|
|
||||||
- "--requirepass"
|
|
||||||
- "$(REDIS_PASSWORD)"
|
|
||||||
env:
|
|
||||||
- name: REDIS_PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: REDIS_PASSWORD
|
|
||||||
ports:
|
|
||||||
- containerPort: 6379
|
|
||||||
name: redis
|
|
||||||
readinessProbe:
|
|
||||||
tcpSocket: { port: 6379 }
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
livenessProbe:
|
|
||||||
tcpSocket: { port: 6379 }
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 30
|
|
||||||
resources:
|
|
||||||
requests: { cpu: 50m, memory: 64Mi }
|
|
||||||
limits: { cpu: 500m, memory: 256Mi }
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: authentik-redis
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
selector:
|
|
||||||
app: authentik-redis
|
|
||||||
ports:
|
|
||||||
- name: redis
|
|
||||||
port: 6379
|
|
||||||
targetPort: 6379
|
|
||||||
|
|
||||||
---
|
|
||||||
# Authentik server Deployment — HTTP frontend on :9000.
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: authentik-server
|
|
||||||
namespace: authentik
|
|
||||||
labels:
|
|
||||||
app: authentik-server
|
|
||||||
argocd.argoproj.io/instance: infra-authentik
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
strategy:
|
|
||||||
type: Recreate # shares /media RWO PVC with worker
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: authentik-server
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: authentik-server
|
|
||||||
spec:
|
|
||||||
securityContext:
|
|
||||||
# Authentik image runs as uid 1000 "authentik" but the Longhorn PVC mounts
|
|
||||||
# root:root by default. Setting fsGroup recursively applies chgrp + chmod g+rwx so the
|
|
||||||
# non-root container can mkdir /media/public during the tenant_files migration.
|
|
||||||
fsGroup: 1000
|
|
||||||
containers:
|
|
||||||
- name: server
|
|
||||||
image: ghcr.io/goauthentik/server:2024.12.3
|
|
||||||
args: ["server"]
|
|
||||||
ports:
|
|
||||||
- containerPort: 9000
|
|
||||||
name: http
|
|
||||||
- containerPort: 9443
|
|
||||||
name: https
|
|
||||||
env:
|
|
||||||
- name: AUTHENTIK_SECRET_KEY
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: AUTHENTIK_SECRET_KEY
|
|
||||||
- name: AUTHENTIK_REDIS__HOST
|
|
||||||
value: authentik-redis
|
|
||||||
- name: AUTHENTIK_REDIS__PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: REDIS_PASSWORD
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__HOST
|
|
||||||
value: authentik-postgres
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__NAME
|
|
||||||
value: authentik
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__USER
|
|
||||||
value: authentik
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: POSTGRES_PASSWORD
|
|
||||||
- name: AUTHENTIK_BOOTSTRAP_PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: BOOTSTRAP_ADMIN_PASSWORD
|
|
||||||
- name: AUTHENTIK_BOOTSTRAP_TOKEN
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: BOOTSTRAP_ADMIN_TOKEN
|
|
||||||
- name: AUTHENTIK_BOOTSTRAP_EMAIL
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: BOOTSTRAP_ADMIN_EMAIL
|
|
||||||
- name: AUTHENTIK_DISABLE_UPDATE_CHECK
|
|
||||||
value: "true"
|
|
||||||
- name: AUTHENTIK_ERROR_REPORTING__ENABLED
|
|
||||||
value: "false"
|
|
||||||
- name: AUTHENTIK_LOG_LEVEL
|
|
||||||
value: info
|
|
||||||
# First-boot Authentik can take 3+ min on the migration phase
|
|
||||||
# (waiting on DB lock while worker also runs migrations). Initial
|
|
||||||
# delays are generous so kubelet doesn't kill the pod mid-migration;
|
|
||||||
# periodSeconds keeps post-startup probing responsive.
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /-/health/ready/
|
|
||||||
port: 9000
|
|
||||||
initialDelaySeconds: 60
|
|
||||||
periodSeconds: 10
|
|
||||||
timeoutSeconds: 5
|
|
||||||
failureThreshold: 12
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /-/health/live/
|
|
||||||
port: 9000
|
|
||||||
initialDelaySeconds: 300
|
|
||||||
periodSeconds: 30
|
|
||||||
timeoutSeconds: 10
|
|
||||||
failureThreshold: 3
|
|
||||||
startupProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /-/health/live/
|
|
||||||
port: 9000
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 15
|
|
||||||
timeoutSeconds: 10
|
|
||||||
failureThreshold: 40 # 30s + 40*15s = 10.5 min budget
|
|
||||||
resources:
|
|
||||||
requests: { cpu: 150m, memory: 512Mi }
|
|
||||||
limits: { cpu: 1500m, memory: 1Gi }
|
|
||||||
volumeMounts:
|
|
||||||
- name: media
|
|
||||||
mountPath: /media
|
|
||||||
volumes:
|
|
||||||
- name: media
|
|
||||||
persistentVolumeClaim:
|
|
||||||
claimName: authentik-media
|
|
||||||
|
|
||||||
---
|
|
||||||
# Authentik worker Deployment — runs Celery background tasks.
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: authentik-worker
|
|
||||||
namespace: authentik
|
|
||||||
labels:
|
|
||||||
app: authentik-worker
|
|
||||||
argocd.argoproj.io/instance: infra-authentik
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
strategy:
|
|
||||||
type: Recreate # shares /media RWO PVC with server
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: authentik-worker
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: authentik-worker
|
|
||||||
spec:
|
|
||||||
securityContext:
|
|
||||||
# Same as server pod — non-root uid 1000 needs PVC group write.
|
|
||||||
fsGroup: 1000
|
|
||||||
containers:
|
|
||||||
- name: worker
|
|
||||||
image: ghcr.io/goauthentik/server:2024.12.3
|
|
||||||
args: ["worker"]
|
|
||||||
env:
|
|
||||||
- name: AUTHENTIK_SECRET_KEY
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: AUTHENTIK_SECRET_KEY
|
|
||||||
- name: AUTHENTIK_REDIS__HOST
|
|
||||||
value: authentik-redis
|
|
||||||
- name: AUTHENTIK_REDIS__PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: REDIS_PASSWORD
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__HOST
|
|
||||||
value: authentik-postgres
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__NAME
|
|
||||||
value: authentik
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__USER
|
|
||||||
value: authentik
|
|
||||||
- name: AUTHENTIK_POSTGRESQL__PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: authentik-credentials
|
|
||||||
key: POSTGRES_PASSWORD
|
|
||||||
- name: AUTHENTIK_DISABLE_UPDATE_CHECK
|
|
||||||
value: "true"
|
|
||||||
- name: AUTHENTIK_ERROR_REPORTING__ENABLED
|
|
||||||
value: "false"
|
|
||||||
- name: AUTHENTIK_LOG_LEVEL
|
|
||||||
value: info
|
|
||||||
resources:
|
|
||||||
requests: { cpu: 100m, memory: 256Mi }
|
|
||||||
limits: { cpu: 1000m, memory: 768Mi }
|
|
||||||
volumeMounts:
|
|
||||||
- name: media
|
|
||||||
mountPath: /media
|
|
||||||
volumes:
|
|
||||||
- name: media
|
|
||||||
persistentVolumeClaim:
|
|
||||||
claimName: authentik-media
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: authentik-server
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
selector:
|
|
||||||
app: authentik-server
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
port: 9000
|
|
||||||
targetPort: 9000
|
|
||||||
- name: https
|
|
||||||
port: 9443
|
|
||||||
targetPort: 9443
|
|
||||||
|
|
||||||
---
|
|
||||||
# step-ca leaf certificate for id.iamworkin.lan.
|
|
||||||
# The step-ca container's resolver uses pfSense Unbound, so the public A record for id.iamworkin.lan
|
|
||||||
# MUST exist before this Certificate is applied (cert-manager HTTP-01 will silently back off for 2h
|
|
||||||
# otherwise). Added 2026-05-25 via scripts/pfsense-add-id-host.py.
|
|
||||||
apiVersion: cert-manager.io/v1
|
|
||||||
kind: Certificate
|
|
||||||
metadata:
|
|
||||||
name: authentik-tls
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
secretName: authentik-tls
|
|
||||||
dnsNames:
|
|
||||||
- id.iamworkin.lan
|
|
||||||
issuerRef:
|
|
||||||
name: step-ca-acme
|
|
||||||
kind: ClusterIssuer
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: traefik.io/v1alpha1
|
|
||||||
kind: IngressRoute
|
|
||||||
metadata:
|
|
||||||
name: authentik
|
|
||||||
namespace: authentik
|
|
||||||
spec:
|
|
||||||
entryPoints: [websecure]
|
|
||||||
routes:
|
|
||||||
- match: Host(`id.iamworkin.lan`)
|
|
||||||
kind: Rule
|
|
||||||
services:
|
|
||||||
- name: authentik-server
|
|
||||||
port: 9000
|
|
||||||
tls:
|
|
||||||
secretName: authentik-tls
|
|
||||||
@@ -30,41 +30,3 @@ spec:
|
|||||||
port: 80
|
port: 80
|
||||||
tls:
|
tls:
|
||||||
secretName: chat-web-tls
|
secretName: chat-web-tls
|
||||||
---
|
|
||||||
# Public host profile marker. The app treats this header as authoritative for
|
|
||||||
# the public twin, while the internal chat.iamworkin.lan route does not attach
|
|
||||||
# it and keeps the operator-oriented UI.
|
|
||||||
apiVersion: traefik.io/v1alpha1
|
|
||||||
kind: Middleware
|
|
||||||
metadata:
|
|
||||||
name: chat-public-profile-header
|
|
||||||
namespace: fc-chat
|
|
||||||
spec:
|
|
||||||
headers:
|
|
||||||
customRequestHeaders:
|
|
||||||
X-FC-Chat-Host-Profile: "public"
|
|
||||||
---
|
|
||||||
# Public Cloudflare-fronted twin for the anonymous chat surface. Operator
|
|
||||||
# paths are intentionally absent from the allowlist below, so /admin,
|
|
||||||
# /operator, /console, /ops, /api/operator, and /operatorhub miss this route
|
|
||||||
# and return Traefik 404 before reaching the pod. Operator action still needed:
|
|
||||||
# create/verify Cloudflare DNS chat.flowercore.io -> public Traefik endpoint
|
|
||||||
# and mirror the cf-origin-flowercore-io TLS secret into namespace fc-chat.
|
|
||||||
apiVersion: traefik.io/v1alpha1
|
|
||||||
kind: IngressRoute
|
|
||||||
metadata:
|
|
||||||
name: chat-web-public
|
|
||||||
namespace: fc-chat
|
|
||||||
spec:
|
|
||||||
entryPoints:
|
|
||||||
- websecure
|
|
||||||
routes:
|
|
||||||
- match: Host(`chat.flowercore.io`) && (Path(`/`) || Path(`/chat`) || PathPrefix(`/_blazor`) || PathPrefix(`/_framework`) || PathPrefix(`/_content`) || PathPrefix(`/avatars`) || PathPrefix(`/css`) || PathPrefix(`/js`) || PathPrefix(`/favicon`) || PathPrefix(`/chathub`)) && (Method(`GET`) || Method(`HEAD`) || Method(`POST`) || Method(`OPTIONS`))
|
|
||||||
kind: Rule
|
|
||||||
middlewares:
|
|
||||||
- name: chat-public-profile-header
|
|
||||||
services:
|
|
||||||
- name: chat-web
|
|
||||||
port: 80
|
|
||||||
tls:
|
|
||||||
secretName: cf-origin-flowercore-io
|
|
||||||
|
|||||||
@@ -1,26 +0,0 @@
|
|||||||
# Runtime secrets for FlowerCore.DeviceManagement.
|
|
||||||
#
|
|
||||||
# OnePasswordItem operator syncs this item into a Kubernetes Secret with the
|
|
||||||
# same name. Expected fields:
|
|
||||||
# DB-Password
|
|
||||||
# mtls-ca.pem
|
|
||||||
# mtls-client.crt
|
|
||||||
# mtls-client.key
|
|
||||||
# mtls-chain.pem
|
|
||||||
#
|
|
||||||
# Do not add literal secret values to this repo. Runtime pods consume the
|
|
||||||
# synced Secret through env vars and read-only mounts.
|
|
||||||
apiVersion: onepassword.com/v1
|
|
||||||
kind: OnePasswordItem
|
|
||||||
metadata:
|
|
||||||
name: fc-devicemgmt-runtime
|
|
||||||
namespace: fc-devicemgmt
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt
|
|
||||||
app.kubernetes.io/component: secrets
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
spec:
|
|
||||||
itemPath: "vaults/IAmWorkin/items/FlowerCore DeviceManagement Runtime"
|
|
||||||
@@ -1,30 +0,0 @@
|
|||||||
# Certificate for devices.iamworkin.lan.
|
|
||||||
#
|
|
||||||
# Preflight gate: FlowerCore.DNS / pfSense must contain an explicit A record:
|
|
||||||
# devices.iamworkin.lan -> 10.0.56.200
|
|
||||||
# before this Certificate is synced. step-ca ACME cannot see the CoreDNS
|
|
||||||
# wildcard, so missing pfSense DNS produces cert-manager HTTP-01 backoff
|
|
||||||
# (feedback_pfsense_dns_required_for_acme).
|
|
||||||
apiVersion: cert-manager.io/v1
|
|
||||||
kind: Certificate
|
|
||||||
metadata:
|
|
||||||
name: fc-devicemgmt-web-tls
|
|
||||||
namespace: fc-devicemgmt
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt-web
|
|
||||||
app.kubernetes.io/component: web
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
annotations:
|
|
||||||
flowercore.io/dns-preflight: "devices.iamworkin.lan must resolve to 10.0.56.200 before ACME sync"
|
|
||||||
spec:
|
|
||||||
secretName: fc-devicemgmt-web-tls
|
|
||||||
issuerRef:
|
|
||||||
name: step-ca-acme
|
|
||||||
kind: ClusterIssuer
|
|
||||||
dnsNames:
|
|
||||||
- devices.iamworkin.lan
|
|
||||||
duration: 720h
|
|
||||||
renewBefore: 240h
|
|
||||||
@@ -1,81 +0,0 @@
|
|||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: ClusterRole
|
|
||||||
metadata:
|
|
||||||
name: fc-devicemgmt-operator
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt-operator
|
|
||||||
app.kubernetes.io/component: operator
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
rules:
|
|
||||||
- apiGroups:
|
|
||||||
- devices.flowercore.io
|
|
||||||
resources:
|
|
||||||
- '*'
|
|
||||||
verbs:
|
|
||||||
- get
|
|
||||||
- list
|
|
||||||
- watch
|
|
||||||
- create
|
|
||||||
- update
|
|
||||||
- patch
|
|
||||||
- delete
|
|
||||||
- apiGroups:
|
|
||||||
- devices.flowercore.io
|
|
||||||
resources:
|
|
||||||
- devices/status
|
|
||||||
- devices/finalizers
|
|
||||||
- devicegroups/status
|
|
||||||
- devicegroups/finalizers
|
|
||||||
- devicepolicies/status
|
|
||||||
- devicepolicies/finalizers
|
|
||||||
- remotecommands/status
|
|
||||||
- remotecommands/finalizers
|
|
||||||
verbs:
|
|
||||||
- get
|
|
||||||
- update
|
|
||||||
- patch
|
|
||||||
- apiGroups:
|
|
||||||
- apps
|
|
||||||
resources:
|
|
||||||
- deployments
|
|
||||||
verbs:
|
|
||||||
- get
|
|
||||||
- apiGroups:
|
|
||||||
- ""
|
|
||||||
resources:
|
|
||||||
- pods
|
|
||||||
- services
|
|
||||||
- configmaps
|
|
||||||
- secrets
|
|
||||||
- events
|
|
||||||
verbs:
|
|
||||||
- get
|
|
||||||
- list
|
|
||||||
- watch
|
|
||||||
- create
|
|
||||||
- update
|
|
||||||
- patch
|
|
||||||
- delete
|
|
||||||
- apiGroups:
|
|
||||||
- batch
|
|
||||||
resources:
|
|
||||||
- jobs
|
|
||||||
verbs:
|
|
||||||
- get
|
|
||||||
- list
|
|
||||||
- watch
|
|
||||||
- create
|
|
||||||
- update
|
|
||||||
- patch
|
|
||||||
- delete
|
|
||||||
- apiGroups:
|
|
||||||
- networking.k8s.io
|
|
||||||
resources:
|
|
||||||
- networkpolicies
|
|
||||||
verbs:
|
|
||||||
- get
|
|
||||||
- list
|
|
||||||
- watch
|
|
||||||
@@ -1,19 +0,0 @@
|
|||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: ClusterRoleBinding
|
|
||||||
metadata:
|
|
||||||
name: fc-devicemgmt-operator
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt-operator
|
|
||||||
app.kubernetes.io/component: operator
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
roleRef:
|
|
||||||
apiGroup: rbac.authorization.k8s.io
|
|
||||||
kind: ClusterRole
|
|
||||||
name: fc-devicemgmt-operator
|
|
||||||
subjects:
|
|
||||||
- kind: ServiceAccount
|
|
||||||
name: fc-devicemgmt-operator
|
|
||||||
namespace: fc-devicemgmt
|
|
||||||
@@ -1,109 +0,0 @@
|
|||||||
# FlowerCore.DeviceManagement Operator.
|
|
||||||
#
|
|
||||||
# KubeOps controller for devices.flowercore.io resources. Operator-created
|
|
||||||
# children must set OwnerReferences + traceability labels/annotations per
|
|
||||||
# k8s-pod-ownership-and-traceability-standard.md. The fc-devicemgmt-operator ClusterRole grants
|
|
||||||
# apps/deployments/get so the process can resolve its own Deployment UID.
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: fc-devicemgmt-operator
|
|
||||||
namespace: fc-devicemgmt
|
|
||||||
labels:
|
|
||||||
app: fc-devicemgmt-operator
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt-operator
|
|
||||||
app.kubernetes.io/component: operator
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
annotations:
|
|
||||||
flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
revisionHistoryLimit: 3
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: fc-devicemgmt-operator
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: fc-devicemgmt-operator
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt-operator
|
|
||||||
app.kubernetes.io/component: operator
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
annotations:
|
|
||||||
prometheus.io/scrape: "true"
|
|
||||||
prometheus.io/port: "8080"
|
|
||||||
prometheus.io/path: "/metrics"
|
|
||||||
flowercore.io/audit-trace-id: "runtime-activity-trace"
|
|
||||||
spec:
|
|
||||||
serviceAccountName: fc-devicemgmt-operator
|
|
||||||
securityContext:
|
|
||||||
fsGroup: 1654
|
|
||||||
fsGroupChangePolicy: OnRootMismatch
|
|
||||||
containers:
|
|
||||||
- name: operator
|
|
||||||
image: localhost/fc-devicemgmt-operator:v20260519-sp34cl3-fix
|
|
||||||
imagePullPolicy: Never
|
|
||||||
ports:
|
|
||||||
- name: metrics
|
|
||||||
containerPort: 8080
|
|
||||||
env:
|
|
||||||
- name: ASPNETCORE_ENVIRONMENT
|
|
||||||
value: "Production"
|
|
||||||
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
|
||||||
value: "false"
|
|
||||||
- name: POD_NAME
|
|
||||||
valueFrom:
|
|
||||||
fieldRef:
|
|
||||||
fieldPath: metadata.name
|
|
||||||
- name: POD_NAMESPACE
|
|
||||||
valueFrom:
|
|
||||||
fieldRef:
|
|
||||||
fieldPath: metadata.namespace
|
|
||||||
- name: FLOWERCORE_KUBERNETES_OWNER_DEPLOYMENT
|
|
||||||
value: "fc-devicemgmt-operator"
|
|
||||||
- name: FlowerCore__Service__Name
|
|
||||||
value: "FlowerCore.DeviceManagement.Operator"
|
|
||||||
- name: FlowerCore__DeviceManagement__DefaultTenantId
|
|
||||||
value: "system"
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 50m
|
|
||||||
memory: 128Mi
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 512Mi
|
|
||||||
readinessProbe:
|
|
||||||
tcpSocket:
|
|
||||||
port: 8080
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 10
|
|
||||||
livenessProbe:
|
|
||||||
tcpSocket:
|
|
||||||
port: 8080
|
|
||||||
initialDelaySeconds: 20
|
|
||||||
periodSeconds: 30
|
|
||||||
securityContext:
|
|
||||||
runAsNonRoot: true
|
|
||||||
runAsUser: 1654
|
|
||||||
runAsGroup: 1654
|
|
||||||
allowPrivilegeEscalation: false
|
|
||||||
readOnlyRootFilesystem: true
|
|
||||||
capabilities:
|
|
||||||
drop:
|
|
||||||
- ALL
|
|
||||||
volumeMounts:
|
|
||||||
- name: tmp
|
|
||||||
mountPath: /tmp
|
|
||||||
- name: logs
|
|
||||||
mountPath: /app/logs
|
|
||||||
volumes:
|
|
||||||
- name: tmp
|
|
||||||
emptyDir: {}
|
|
||||||
- name: logs
|
|
||||||
emptyDir: {}
|
|
||||||
@@ -1,151 +0,0 @@
|
|||||||
# FlowerCore.DeviceManagement Web.
|
|
||||||
#
|
|
||||||
# Source repo is expected to ship FlowerCore.DeviceManagement.Web in a later
|
|
||||||
# Sprint 9+ lane. This manifest is static-valid without requiring the image to
|
|
||||||
# exist yet; import localhost/fc-devicemgmt-web:<tag> to all schedulable RKE2
|
|
||||||
# nodes before letting ArgoCD sync a live rollout.
|
|
||||||
#
|
|
||||||
# SCALED TO 0 — 2026-05-19 morning-routine cleanup.
|
|
||||||
# The Web pod cannot start until TWO upstream gaps close:
|
|
||||||
# 1. MySQL DB instance `flowercore_devicemgmt` (user `fc_devicemgmt`) is
|
|
||||||
# provisioned via fc-mysql Manager. The cluster currently has ZERO
|
|
||||||
# MySqlInstanceCrds and no `mysql.fc-mysql.svc:3306` Service, so the
|
|
||||||
# deployment-web container env `FlowerCore__Database__Host=mysql.fc-mysql.svc`
|
|
||||||
# points at nothing. Provision via the fc-mysql Manager UI/REST/MCP.
|
|
||||||
# 2. 1Password vault item `IAmWorkin/FlowerCore DeviceManagement Runtime`
|
|
||||||
# with 5 fields (DB-Password, mtls-ca.pem, mtls-client.crt, mtls-client.key,
|
|
||||||
# mtls-chain.pem) — see apps/fc-devicemgmt/1password-item.yaml. Mint mTLS
|
|
||||||
# from step-ca-agent ClusterIssuer per ADR-126; DB-Password must match the
|
|
||||||
# password configured for the MySQL user.
|
|
||||||
# Re-enable: change replicas back to 2 after both gaps close. The image tag
|
|
||||||
# in this file (v20260512-cx5) MAY also need a refresh — it predates the
|
|
||||||
# Sprint 34 Cl-3 operator fix; Web may have an analogous bug.
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: fc-devicemgmt-web
|
|
||||||
namespace: fc-devicemgmt
|
|
||||||
labels:
|
|
||||||
app: fc-devicemgmt-web
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt-web
|
|
||||||
app.kubernetes.io/component: web
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
annotations:
|
|
||||||
flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
|
|
||||||
spec:
|
|
||||||
replicas: 0
|
|
||||||
revisionHistoryLimit: 3
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: fc-devicemgmt-web
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: fc-devicemgmt-web
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt-web
|
|
||||||
app.kubernetes.io/component: web
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
annotations:
|
|
||||||
prometheus.io/scrape: "true"
|
|
||||||
prometheus.io/port: "8080"
|
|
||||||
prometheus.io/path: "/metrics"
|
|
||||||
flowercore.io/audit-trace-id: "runtime-activity-trace"
|
|
||||||
spec:
|
|
||||||
securityContext:
|
|
||||||
fsGroup: 1654
|
|
||||||
fsGroupChangePolicy: OnRootMismatch
|
|
||||||
containers:
|
|
||||||
- name: web
|
|
||||||
image: localhost/fc-devicemgmt-web:v20260512-cx5
|
|
||||||
imagePullPolicy: Never
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
containerPort: 8080
|
|
||||||
env:
|
|
||||||
- name: ASPNETCORE_URLS
|
|
||||||
value: "http://+:8080"
|
|
||||||
- name: ASPNETCORE_ENVIRONMENT
|
|
||||||
value: "Production"
|
|
||||||
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
|
||||||
value: "false"
|
|
||||||
- name: FlowerCore__Service__Name
|
|
||||||
value: "FlowerCore.DeviceManagement.Web"
|
|
||||||
- name: FlowerCore__DeviceManagement__DefaultTenantId
|
|
||||||
value: "system"
|
|
||||||
- name: FlowerCore__Database__Provider
|
|
||||||
value: "MySql"
|
|
||||||
- name: FlowerCore__Database__Host
|
|
||||||
value: "mysql.fc-mysql.svc"
|
|
||||||
- name: FlowerCore__Database__Database
|
|
||||||
value: "flowercore_devicemgmt"
|
|
||||||
- name: FlowerCore__Database__User
|
|
||||||
value: "fc_devicemgmt"
|
|
||||||
- name: FlowerCore__Database__Password
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: fc-devicemgmt-runtime
|
|
||||||
key: DB-Password
|
|
||||||
- name: FlowerCore__DeviceManagement__AgentMtls__CaPath
|
|
||||||
value: "/secrets/devicemgmt-mtls/mtls-ca.pem"
|
|
||||||
- name: FlowerCore__DeviceManagement__AgentMtls__ClientCertificatePath
|
|
||||||
value: "/secrets/devicemgmt-mtls/mtls-client.crt"
|
|
||||||
- name: FlowerCore__DeviceManagement__AgentMtls__ClientKeyPath
|
|
||||||
value: "/secrets/devicemgmt-mtls/mtls-client.key"
|
|
||||||
- name: FlowerCore__EventBus__Redis__Configuration
|
|
||||||
value: "redis.fc-redis.svc:6379"
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 100m
|
|
||||||
memory: 256Mi
|
|
||||||
limits:
|
|
||||||
cpu: 1000m
|
|
||||||
memory: 768Mi
|
|
||||||
startupProbe:
|
|
||||||
tcpSocket:
|
|
||||||
port: 8080
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
failureThreshold: 30
|
|
||||||
readinessProbe:
|
|
||||||
tcpSocket:
|
|
||||||
port: 8080
|
|
||||||
periodSeconds: 10
|
|
||||||
failureThreshold: 3
|
|
||||||
livenessProbe:
|
|
||||||
tcpSocket:
|
|
||||||
port: 8080
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 30
|
|
||||||
failureThreshold: 3
|
|
||||||
securityContext:
|
|
||||||
runAsNonRoot: true
|
|
||||||
runAsUser: 1654
|
|
||||||
runAsGroup: 1654
|
|
||||||
allowPrivilegeEscalation: false
|
|
||||||
readOnlyRootFilesystem: true
|
|
||||||
capabilities:
|
|
||||||
drop:
|
|
||||||
- ALL
|
|
||||||
volumeMounts:
|
|
||||||
- name: tmp
|
|
||||||
mountPath: /tmp
|
|
||||||
- name: logs
|
|
||||||
mountPath: /app/logs
|
|
||||||
- name: devicemgmt-mtls
|
|
||||||
mountPath: /secrets/devicemgmt-mtls
|
|
||||||
readOnly: true
|
|
||||||
volumes:
|
|
||||||
- name: tmp
|
|
||||||
emptyDir: {}
|
|
||||||
- name: logs
|
|
||||||
emptyDir: {}
|
|
||||||
- name: devicemgmt-mtls
|
|
||||||
secret:
|
|
||||||
secretName: fc-devicemgmt-runtime
|
|
||||||
defaultMode: 0400
|
|
||||||
@@ -1,55 +0,0 @@
|
|||||||
# LAN ingress for FlowerCore.DeviceManagement Web.
|
|
||||||
#
|
|
||||||
# RKE2 Traefik has no built-in ACME resolver configured. Keep TLS certificate
|
|
||||||
# ownership in cert-manager Certificate/fc-devicemgmt-web-tls.
|
|
||||||
apiVersion: traefik.io/v1alpha1
|
|
||||||
kind: IngressRoute
|
|
||||||
metadata:
|
|
||||||
name: fc-devicemgmt-web
|
|
||||||
namespace: fc-devicemgmt
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt-web
|
|
||||||
app.kubernetes.io/component: web
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
spec:
|
|
||||||
entryPoints:
|
|
||||||
- websecure
|
|
||||||
routes:
|
|
||||||
- match: Host(`devices.iamworkin.lan`)
|
|
||||||
kind: Rule
|
|
||||||
services:
|
|
||||||
- name: fc-devicemgmt-web
|
|
||||||
port: 80
|
|
||||||
tls:
|
|
||||||
secretName: fc-devicemgmt-web-tls
|
|
||||||
|
|
||||||
# Future public agent/update host gate (OFF by default):
|
|
||||||
#
|
|
||||||
# Do not enable `update.flowercore.io` here until Authentik OIDC Q-OIDC-1
|
|
||||||
# resolves the public-device-management auth model and route ownership with
|
|
||||||
# UpdateCenter. When enabled, use a separate public IngressRoute with an
|
|
||||||
# explicit Method allowlist, public-host auth middleware, and public TLS
|
|
||||||
# certificate strategy. Leaving this as comments keeps ArgoCD from stealing
|
|
||||||
# live UpdateCenter traffic.
|
|
||||||
#
|
|
||||||
# apiVersion: traefik.io/v1alpha1
|
|
||||||
# kind: IngressRoute
|
|
||||||
# metadata:
|
|
||||||
# name: fc-devicemgmt-web-public
|
|
||||||
# namespace: fc-devicemgmt
|
|
||||||
# annotations:
|
|
||||||
# flowercore.io/public-host-gate: "disabled-until-Q-OIDC-1"
|
|
||||||
# spec:
|
|
||||||
# entryPoints:
|
|
||||||
# - websecure
|
|
||||||
# routes:
|
|
||||||
# - match: Host(`update.flowercore.io`) && (Method(`GET`) || Method(`HEAD`) || Method(`POST`) || Method(`OPTIONS`))
|
|
||||||
# kind: Rule
|
|
||||||
# services:
|
|
||||||
# - name: fc-devicemgmt-web
|
|
||||||
# port: 80
|
|
||||||
# tls:
|
|
||||||
# secretName: fc-devicemgmt-public-tls
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
# FlowerCore.DeviceManagement namespace.
|
|
||||||
#
|
|
||||||
# ArgoCD discovers this directory as Application `infra-fc-devicemgmt`.
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Namespace
|
|
||||||
metadata:
|
|
||||||
name: fc-devicemgmt
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
@@ -1,224 +0,0 @@
|
|||||||
# FlowerCore.DeviceManagement NetworkPolicies.
|
|
||||||
#
|
|
||||||
# NetworkPolicies belong in bluejay-infra so ArgoCD owns rebuild state.
|
|
||||||
# Rules include Traefik post-DNAT backend ports per
|
|
||||||
# feedback_netpol_dnat_backend_port and Synology NFS egress for the requested
|
|
||||||
# cold-tier / future artifact path.
|
|
||||||
---
|
|
||||||
apiVersion: networking.k8s.io/v1
|
|
||||||
kind: NetworkPolicy
|
|
||||||
metadata:
|
|
||||||
name: fc-devicemgmt-web-isolation
|
|
||||||
namespace: fc-devicemgmt
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt-web
|
|
||||||
app.kubernetes.io/component: web
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
spec:
|
|
||||||
podSelector:
|
|
||||||
matchLabels:
|
|
||||||
app: fc-devicemgmt-web
|
|
||||||
policyTypes:
|
|
||||||
- Ingress
|
|
||||||
- Egress
|
|
||||||
ingress:
|
|
||||||
# LAN edge: only cluster Traefik should reach the Web pod for
|
|
||||||
# devices.iamworkin.lan.
|
|
||||||
- from:
|
|
||||||
- namespaceSelector:
|
|
||||||
matchLabels:
|
|
||||||
kubernetes.io/metadata.name: traefik-system
|
|
||||||
podSelector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: traefik
|
|
||||||
ports:
|
|
||||||
- port: 8080
|
|
||||||
protocol: TCP
|
|
||||||
# Direct LAN diagnostics are allowed only from FlowerCore LAN/VPN ranges.
|
|
||||||
- from:
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.56.0/24
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.57.0/24
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.58.0/24
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.68.0/27
|
|
||||||
ports:
|
|
||||||
- port: 8080
|
|
||||||
protocol: TCP
|
|
||||||
egress:
|
|
||||||
# CoreDNS.
|
|
||||||
- to:
|
|
||||||
- namespaceSelector:
|
|
||||||
matchLabels:
|
|
||||||
kubernetes.io/metadata.name: kube-system
|
|
||||||
podSelector:
|
|
||||||
matchLabels:
|
|
||||||
k8s-app: kube-dns
|
|
||||||
ports:
|
|
||||||
- port: 53
|
|
||||||
protocol: UDP
|
|
||||||
- port: 53
|
|
||||||
protocol: TCP
|
|
||||||
# Database namespace.
|
|
||||||
- to:
|
|
||||||
- namespaceSelector:
|
|
||||||
matchLabels:
|
|
||||||
kubernetes.io/metadata.name: fc-mysql
|
|
||||||
ports:
|
|
||||||
- port: 3306
|
|
||||||
protocol: TCP
|
|
||||||
# Redis backplane for multi-replica SignalR / live-status fan-out.
|
|
||||||
- to:
|
|
||||||
- namespaceSelector:
|
|
||||||
matchLabels:
|
|
||||||
kubernetes.io/metadata.name: fc-redis
|
|
||||||
ports:
|
|
||||||
- port: 6379
|
|
||||||
protocol: TCP
|
|
||||||
# Traefik VIP / in-cluster Traefik for self-callbacks and public URL
|
|
||||||
# generation tests. Include post-DNAT backend ports 8443 + 8080.
|
|
||||||
- to:
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.56.200/32
|
|
||||||
- namespaceSelector:
|
|
||||||
matchLabels:
|
|
||||||
kubernetes.io/metadata.name: traefik-system
|
|
||||||
podSelector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: traefik
|
|
||||||
ports:
|
|
||||||
- port: 80
|
|
||||||
protocol: TCP
|
|
||||||
- port: 443
|
|
||||||
protocol: TCP
|
|
||||||
- port: 8080
|
|
||||||
protocol: TCP
|
|
||||||
- port: 8443
|
|
||||||
protocol: TCP
|
|
||||||
# Agent egress: LAN/VPN devices may run DM Agent in Generic, Kiosk, Pi,
|
|
||||||
# ThinClient, or Server mode. Keep this private-range only.
|
|
||||||
- to:
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.56.0/24
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.57.0/24
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.58.0/24
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.68.0/27
|
|
||||||
ports:
|
|
||||||
- port: 80
|
|
||||||
protocol: TCP
|
|
||||||
- port: 443
|
|
||||||
protocol: TCP
|
|
||||||
- port: 8080
|
|
||||||
protocol: TCP
|
|
||||||
- port: 8443
|
|
||||||
protocol: TCP
|
|
||||||
- port: 5000
|
|
||||||
protocol: TCP
|
|
||||||
- port: 5001
|
|
||||||
protocol: TCP
|
|
||||||
# Synology NFS cold-tier / artifact mount allowance.
|
|
||||||
- to:
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.58.3/32
|
|
||||||
ports:
|
|
||||||
- port: 2049
|
|
||||||
protocol: TCP
|
|
||||||
- port: 2049
|
|
||||||
protocol: UDP
|
|
||||||
- port: 111
|
|
||||||
protocol: TCP
|
|
||||||
- port: 111
|
|
||||||
protocol: UDP
|
|
||||||
---
|
|
||||||
apiVersion: networking.k8s.io/v1
|
|
||||||
kind: NetworkPolicy
|
|
||||||
metadata:
|
|
||||||
name: fc-devicemgmt-operator-isolation
|
|
||||||
namespace: fc-devicemgmt
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt-operator
|
|
||||||
app.kubernetes.io/component: operator
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
spec:
|
|
||||||
podSelector:
|
|
||||||
matchLabels:
|
|
||||||
app: fc-devicemgmt-operator
|
|
||||||
policyTypes:
|
|
||||||
- Ingress
|
|
||||||
- Egress
|
|
||||||
ingress:
|
|
||||||
- from:
|
|
||||||
- namespaceSelector:
|
|
||||||
matchLabels:
|
|
||||||
kubernetes.io/metadata.name: monitoring
|
|
||||||
ports:
|
|
||||||
- port: 8080
|
|
||||||
protocol: TCP
|
|
||||||
egress:
|
|
||||||
# CoreDNS.
|
|
||||||
- to:
|
|
||||||
- namespaceSelector:
|
|
||||||
matchLabels:
|
|
||||||
kubernetes.io/metadata.name: kube-system
|
|
||||||
podSelector:
|
|
||||||
matchLabels:
|
|
||||||
k8s-app: kube-dns
|
|
||||||
ports:
|
|
||||||
- port: 53
|
|
||||||
protocol: UDP
|
|
||||||
- port: 53
|
|
||||||
protocol: TCP
|
|
||||||
# Kubernetes API for KubeOps reconciliation and Deployment UID lookup.
|
|
||||||
- to: []
|
|
||||||
ports:
|
|
||||||
- port: 443
|
|
||||||
protocol: TCP
|
|
||||||
- port: 6443
|
|
||||||
protocol: TCP
|
|
||||||
# Agent egress for operator-initiated probes / fallback command dispatch.
|
|
||||||
- to:
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.56.0/24
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.57.0/24
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.58.0/24
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.68.0/27
|
|
||||||
ports:
|
|
||||||
- port: 80
|
|
||||||
protocol: TCP
|
|
||||||
- port: 443
|
|
||||||
protocol: TCP
|
|
||||||
- port: 8080
|
|
||||||
protocol: TCP
|
|
||||||
- port: 8443
|
|
||||||
protocol: TCP
|
|
||||||
- port: 5000
|
|
||||||
protocol: TCP
|
|
||||||
- port: 5001
|
|
||||||
protocol: TCP
|
|
||||||
# Synology NFS allowance for future cold-tier/audit archival jobs.
|
|
||||||
- to:
|
|
||||||
- ipBlock:
|
|
||||||
cidr: 10.0.58.3/32
|
|
||||||
ports:
|
|
||||||
- port: 2049
|
|
||||||
protocol: TCP
|
|
||||||
- port: 2049
|
|
||||||
protocol: UDP
|
|
||||||
- port: 111
|
|
||||||
protocol: TCP
|
|
||||||
- port: 111
|
|
||||||
protocol: UDP
|
|
||||||
@@ -1,22 +0,0 @@
|
|||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: fc-devicemgmt-web
|
|
||||||
namespace: fc-devicemgmt
|
|
||||||
labels:
|
|
||||||
app: fc-devicemgmt-web
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt-web
|
|
||||||
app.kubernetes.io/component: web
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
spec:
|
|
||||||
type: ClusterIP
|
|
||||||
selector:
|
|
||||||
app: fc-devicemgmt-web
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
port: 80
|
|
||||||
targetPort: 8080
|
|
||||||
protocol: TCP
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
apiVersion: v1
|
|
||||||
kind: ServiceAccount
|
|
||||||
metadata:
|
|
||||||
name: fc-devicemgmt-operator
|
|
||||||
namespace: fc-devicemgmt
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt-operator
|
|
||||||
app.kubernetes.io/component: operator
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
@@ -1,2 +1,3 @@
|
|||||||
# Settle DRM for 2s before restarting Chromium, then redeclare capabilities.
|
# Restart kiosk and redeclare capabilities when HDMI connect/disconnect changes DRM state.
|
||||||
SUBSYSTEM=="drm", KERNEL=="card?-HDMI-A-?", ACTION=="change", RUN+="/usr/bin/systemctl start flowercore-signage-player-pi-hdmi.service"
|
SUBSYSTEM=="drm", KERNEL=="card?-HDMI-A-?", ACTION=="change", RUN+="/usr/bin/systemctl restart flowercore-signage-player-pi.service"
|
||||||
|
SUBSYSTEM=="drm", KERNEL=="card?-HDMI-A-?", ACTION=="change", RUN+="/usr/bin/systemctl start flowercore-signage-detect-display.service"
|
||||||
|
|||||||
@@ -1,22 +0,0 @@
|
|||||||
#!/usr/bin/env bats
|
|
||||||
|
|
||||||
setup() {
|
|
||||||
APP_ROOT="$(cd "$BATS_TEST_DIRNAME/.." && pwd)"
|
|
||||||
DETECT="$APP_ROOT/scripts/fc-signage-detect-display"
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "display detection emits graceful disconnected profile when no hdmi connector is present" {
|
|
||||||
script="$(cat "$DETECT")"
|
|
||||||
[[ "$script" == *"displayConnected: false"* ]]
|
|
||||||
[[ "$script" == *"No HDMI display detected"* ]]
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "display detection parses edid, falls back to kmsprint, and logs endpoint failures locally" {
|
|
||||||
script="$(cat "$DETECT")"
|
|
||||||
[[ "$script" == *"edid-decode"* ]]
|
|
||||||
[[ "$script" == *"HDR (Static|Dynamic) Metadata Block"* ]]
|
|
||||||
[[ "$script" == *"kmsprint"* ]]
|
|
||||||
[[ "$script" == *"/api/v1/nodes/\${NODE_ID}/capabilities"* ]]
|
|
||||||
[[ "$script" == *"/api/v1/displays/\${NODE_ID}/capability-profile"* ]]
|
|
||||||
[[ "$script" == *"capabilities.log"* ]]
|
|
||||||
}
|
|
||||||
@@ -1,64 +0,0 @@
|
|||||||
#!/usr/bin/env bats
|
|
||||||
|
|
||||||
setup() {
|
|
||||||
APP_ROOT="$(cd "$BATS_TEST_DIRNAME/.." && pwd)"
|
|
||||||
BOOTSTRAP="$APP_ROOT/scripts/flowercore-signage-bootstrap.sh"
|
|
||||||
RENEW="$APP_ROOT/scripts/flowercore-signage-renew-cert.sh"
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "bootstrap is idempotent when node is already enrolled" {
|
|
||||||
script="$(cat "$BOOTSTRAP")"
|
|
||||||
[[ "$script" == *'[[ -s "$NODE_JSON" && -s "$CERT_DIR/client.p12" ]]'* ]]
|
|
||||||
[[ "$script" == *"already enrolled"* ]]
|
|
||||||
[[ "$script" == *"exit 0"* ]]
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "bootstrap generates a stable node uuid and machine id" {
|
|
||||||
script="$(cat "$BOOTSTRAP")"
|
|
||||||
[[ "$script" == *"uuidgen"* ]]
|
|
||||||
[[ "$script" == *"nodeUuid"* ]]
|
|
||||||
[[ "$script" == *"machineId"* ]]
|
|
||||||
[[ "$script" == *"cut -c1-16"* ]]
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "bootstrap posts to the canonical register endpoint" {
|
|
||||||
grep -q '/api/v1/nodes/register' "$BOOTSTRAP"
|
|
||||||
grep -q '"linux-arm64-pi"' "$BOOTSTRAP"
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "bootstrap retries registration once for first-call races" {
|
|
||||||
script="$(cat "$BOOTSTRAP")"
|
|
||||||
[[ "$script" == *"for attempt in 1 2"* ]]
|
|
||||||
[[ "$script" == *"register attempt \$attempt returned"* ]]
|
|
||||||
[[ "$script" == *"sleep 5"* ]]
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "bootstrap supports setup-code approval with manual polling fallback" {
|
|
||||||
script="$(cat "$BOOTSTRAP")"
|
|
||||||
[[ "$script" == *"signage-setup-code"* ]]
|
|
||||||
[[ "$script" == *"approve-via-setup-code"* ]]
|
|
||||||
[[ "$script" == *"+ 1800"* ]]
|
|
||||||
[[ "$script" == *"sleep 15"* ]]
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "bootstrap generates an ecdsa p256 csr for the signage pi subject" {
|
|
||||||
script="$(cat "$BOOTSTRAP")"
|
|
||||||
[[ "$script" == *"ecparam -genkey -name prime256v1"* ]]
|
|
||||||
[[ "$script" == *'/CN=${NODE_ID}/O=FlowerCore/OU=SignagePlayer-Pi'* ]]
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "bootstrap writes pkcs12 bundle with restrictive permissions" {
|
|
||||||
script="$(cat "$BOOTSTRAP")"
|
|
||||||
[[ "$script" == *"openssl pkcs12 -export"* ]]
|
|
||||||
[[ "$script" == *"client.p12.pass"* ]]
|
|
||||||
[[ "$script" == *"chmod 0640"* ]]
|
|
||||||
[[ "$script" == *"chmod 0600"* ]]
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "renewal only calls renew endpoint inside the thirty-day window and swaps atomically" {
|
|
||||||
script="$(cat "$RENEW")"
|
|
||||||
[[ "$script" == *'-checkend $((30*24*3600))'* ]]
|
|
||||||
[[ "$script" == *"/api/v1/nodes/\${NODE_ID}/renew"* ]]
|
|
||||||
[[ "$script" == *"client.key.new"* ]]
|
|
||||||
[[ "$script" == *'mv "$CERT_DIR/client.p12.new" "$CERT_DIR/client.p12"'* ]]
|
|
||||||
}
|
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
#!/usr/bin/env bats
|
|
||||||
|
|
||||||
setup() {
|
|
||||||
APP_ROOT="$(cd "$BATS_TEST_DIRNAME/.." && pwd)"
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "player unit exists" {
|
|
||||||
[ -f "$APP_ROOT/systemd/flowercore-signage-player-pi.service" ]
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "player unit uses simple chromium service with restart backoff" {
|
|
||||||
unit="$(cat "$APP_ROOT/systemd/flowercore-signage-player-pi.service")"
|
|
||||||
[[ "$unit" == *"Type=simple"* ]]
|
|
||||||
[[ "$unit" == *"Restart=always"* ]]
|
|
||||||
[[ "$unit" == *"RestartSec=10s"* ]]
|
|
||||||
[[ "$unit" == *"StartLimitBurst=5"* ]]
|
|
||||||
[[ "$unit" == *"StartLimitIntervalSec=300s"* ]]
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "player unit caps chromium memory at two gigabytes" {
|
|
||||||
grep -q '^MemoryMax=2G$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
|
|
||||||
grep -q '^MemoryHigh=1500M$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "player unit condition-gates startup on identity and p12 certificate" {
|
|
||||||
grep -q '^ConditionPathExists=/etc/flowercore/signage-node.json$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
|
|
||||||
grep -q '^ConditionPathExists=/etc/fc-signage-player/client.p12$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "player unit runs prelaunch checks before chromium" {
|
|
||||||
grep -q '^ExecStartPre=/usr/local/bin/flowercore-signage-prelaunch.sh$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
|
|
||||||
grep -q '^ExecStart=/usr/local/bin/flowercore-signage-launch.sh$' "$APP_ROOT/systemd/flowercore-signage-player-pi.service"
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "hdmi udev rule routes through the two-second settle service" {
|
|
||||||
rule="$(cat "$APP_ROOT/systemd/99-flowercore-signage-hdmi.rules")"
|
|
||||||
[[ "$rule" == *'KERNEL=="card?-HDMI-A-?"'* ]]
|
|
||||||
[[ "$rule" == *"systemctl start flowercore-signage-player-pi-hdmi.service"* ]]
|
|
||||||
[[ "$rule" != *"systemctl restart flowercore-signage-player-pi.service"* ]]
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "hdmi responder settles, declares display, then restarts chromium" {
|
|
||||||
responder="$(cat "$APP_ROOT/scripts/flowercore-signage-hdmi-respond.sh")"
|
|
||||||
[[ "$responder" == *"sleep 2"* ]]
|
|
||||||
[[ "$responder" == *"systemctl start flowercore-signage-detect-display.service"* ]]
|
|
||||||
[[ "$responder" == *"systemctl restart flowercore-signage-player-pi.service"* ]]
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "chromium policy json is valid and disables credential prompts" {
|
|
||||||
command -v jq >/dev/null || skip "jq not installed"
|
|
||||||
jq -e '.AutofillAddressEnabled == false and .AutofillCreditCardEnabled == false and .PasswordManagerEnabled == false' \
|
|
||||||
"$APP_ROOT/chromium-policies/flowercore-signage.json" >/dev/null
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "launch script tries embed URL and logs bare-player fallback" {
|
|
||||||
launch="$(cat "$APP_ROOT/scripts/flowercore-signage-launch.sh")"
|
|
||||||
[[ "$launch" == *'/player/${NODE_ID}/embed?token=${CERT_THUMB}'* ]]
|
|
||||||
[[ "$launch" == *"url-divergence.log"* ]]
|
|
||||||
[[ "$launch" == *'/player/${NODE_ID}?token=${CERT_THUMB}'* ]]
|
|
||||||
}
|
|
||||||
|
|
||||||
@test "prelaunch script validates required node and cert files" {
|
|
||||||
prelaunch="$(cat "$APP_ROOT/scripts/flowercore-signage-prelaunch.sh")"
|
|
||||||
[[ "$prelaunch" == *"/etc/flowercore/signage-node.json"* ]]
|
|
||||||
[[ "$prelaunch" == *"/etc/fc-signage-player/client.p12"* ]]
|
|
||||||
[[ "$prelaunch" == *"/etc/fc-signage-player/client.p12.pass"* ]]
|
|
||||||
[[ "$prelaunch" == *"exit 1"* ]]
|
|
||||||
}
|
|
||||||
@@ -532,7 +532,7 @@ spec:
|
|||||||
fsGroupChangePolicy: OnRootMismatch
|
fsGroupChangePolicy: OnRootMismatch
|
||||||
containers:
|
containers:
|
||||||
- name: web
|
- name: web
|
||||||
image: localhost/fc-ttsreader-web:v20260518-sprint36-demo-finish-b132cbf
|
image: localhost/fc-ttsreader-web:v20260506-phase6
|
||||||
imagePullPolicy: Never
|
imagePullPolicy: Never
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 5217
|
- containerPort: 5217
|
||||||
@@ -555,13 +555,9 @@ spec:
|
|||||||
- name: TtsReader__Jobs__Root
|
- name: TtsReader__Jobs__Root
|
||||||
value: "/data/jobs"
|
value: "/data/jobs"
|
||||||
- name: TtsReader__Piper__Host
|
- name: TtsReader__Piper__Host
|
||||||
value: "10.0.57.17"
|
value: "ttsreader-piper.fc-ttsreader.svc.cluster.local."
|
||||||
- name: TtsReader__Piper__Port
|
- name: TtsReader__Piper__Port
|
||||||
value: "8500"
|
value: "10200"
|
||||||
- name: TtsReader__Piper__Transport
|
|
||||||
value: "http"
|
|
||||||
- name: TtsReader__Piper__HttpPath
|
|
||||||
value: "/tts"
|
|
||||||
- name: TtsReader__Kokoro__Enabled
|
- name: TtsReader__Kokoro__Enabled
|
||||||
value: "true"
|
value: "true"
|
||||||
- name: TtsReader__Kokoro__BaseUrl
|
- name: TtsReader__Kokoro__BaseUrl
|
||||||
|
|||||||
2
apps/github-runner/.gitattributes
vendored
2
apps/github-runner/.gitattributes
vendored
@@ -1,2 +0,0 @@
|
|||||||
*.sh text eol=lf
|
|
||||||
Dockerfile text eol=lf
|
|
||||||
@@ -1,54 +0,0 @@
|
|||||||
FROM myoung34/github-runner:latest
|
|
||||||
|
|
||||||
ARG RUBY_VERSION=3.3.11
|
|
||||||
ARG RUBY_MINOR=3.3
|
|
||||||
ARG RUBY_BUILD_VERSION=v20260326
|
|
||||||
ARG RUNNER_UID=1001
|
|
||||||
ARG RUNNER_GID=1001
|
|
||||||
|
|
||||||
ENV RUNNER_TOOL_CACHE=/home/runner/_tool
|
|
||||||
ENV RUNNER_RUBY_TOOLCACHE=/opt/runner-toolcache
|
|
||||||
ENV PATH="/home/runner/_tool/Ruby/${RUBY_MINOR}/x64/bin:/opt/runner-toolcache/Ruby/${RUBY_MINOR}/x64/bin:${PATH}"
|
|
||||||
|
|
||||||
USER root
|
|
||||||
|
|
||||||
# Bake the IAmWorkin step-ca root CA into the system trust store. Without
|
|
||||||
# this, .NET HttpClient calls from CI tests against *.iamworkin.lan
|
|
||||||
# (e.g. https://selenium.iamworkin.lan/session) fail with `PartialChain`
|
|
||||||
# because the runner image's default Ubuntu trust bundle doesn't include
|
|
||||||
# our internal Root CA. update-ca-certificates regenerates
|
|
||||||
# /etc/ssl/certs/ca-certificates.crt, which OpenSSL + .NET on Linux read
|
|
||||||
# automatically — no SSL_CERT_FILE env var needed.
|
|
||||||
COPY step-ca-root.crt /usr/local/share/ca-certificates/iamworkin-step-ca-root.crt
|
|
||||||
|
|
||||||
RUN apt-get update \
|
|
||||||
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
|
||||||
autoconf \
|
|
||||||
bison \
|
|
||||||
build-essential \
|
|
||||||
ca-certificates \
|
|
||||||
curl \
|
|
||||||
libdb-dev \
|
|
||||||
libffi-dev \
|
|
||||||
libgdbm-dev \
|
|
||||||
libgmp-dev \
|
|
||||||
libncurses-dev \
|
|
||||||
libreadline-dev \
|
|
||||||
libssl-dev \
|
|
||||||
libyaml-dev \
|
|
||||||
patch \
|
|
||||||
pkg-config \
|
|
||||||
uuid-dev \
|
|
||||||
zlib1g-dev \
|
|
||||||
&& update-ca-certificates \
|
|
||||||
&& curl -fsSL "https://github.com/rbenv/ruby-build/archive/refs/tags/${RUBY_BUILD_VERSION}.tar.gz" -o /tmp/ruby-build.tar.gz \
|
|
||||||
&& mkdir -p /tmp/ruby-build \
|
|
||||||
&& tar -xzf /tmp/ruby-build.tar.gz --strip-components=1 -C /tmp/ruby-build \
|
|
||||||
&& /tmp/ruby-build/install.sh \
|
|
||||||
&& rm -rf /tmp/ruby-build /tmp/ruby-build.tar.gz /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
COPY install-ruby-toolcache.sh /usr/local/bin/install-ruby-toolcache.sh
|
|
||||||
|
|
||||||
RUN chmod +x /usr/local/bin/install-ruby-toolcache.sh \
|
|
||||||
&& RUBY_VERSION="${RUBY_VERSION}" RUBY_MINOR="${RUBY_MINOR}" TOOLCACHE_ROOT="${RUNNER_RUBY_TOOLCACHE}" RUNNER_UID="${RUNNER_UID}" RUNNER_GID="${RUNNER_GID}" /usr/local/bin/install-ruby-toolcache.sh \
|
|
||||||
&& ruby -v
|
|
||||||
@@ -1,133 +0,0 @@
|
|||||||
# GitHub Runner Fleet
|
|
||||||
|
|
||||||
ArgoCD owns `apps/github-runner/github-runner.yaml`. Do not patch live runner
|
|
||||||
Deployments with `kubectl`; update this manifest and let ArgoCD reconcile.
|
|
||||||
|
|
||||||
## Runner Shape
|
|
||||||
|
|
||||||
All repo-scoped Linux runners use:
|
|
||||||
|
|
||||||
- `localhost/fc-github-runner:v20260525-ruby3.3.11-stepca`, derived from
|
|
||||||
`myoung34/github-runner:latest`
|
|
||||||
- `ACCESS_TOKEN` from the `github-runner-token` Secret
|
|
||||||
- `RUN_AS_ROOT=false`
|
|
||||||
- `EPHEMERAL=true`
|
|
||||||
- `LABELS=self-hosted,linux,fc-build-linux`
|
|
||||||
- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
|
|
||||||
Actions tool cache
|
|
||||||
- Ruby 3.3.11 seeded into `/home/runner/_tool/Ruby/3.3/x64` from the baked
|
|
||||||
`/opt/runner-toolcache` copy so `ruby/setup-ruby@v1` can discover it on
|
|
||||||
self-hosted `ubuntu-20.04-x64` runners
|
|
||||||
|
|
||||||
`github-runner` for `FlowerCore.Common` is single-replica because it retains the
|
|
||||||
original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
|
|
||||||
two replicas with per-pod `emptyDir` caches. That is the safe backlog-drain
|
|
||||||
strategy: no two pods share one RWO PVC.
|
|
||||||
|
|
||||||
Sprint 32 final long-tail wave adds 16 two-replica Deployments:
|
|
||||||
`FlowerCore.Knowledge`, `FlowerCore.LlmBridge`, `FlowerCore.Media`,
|
|
||||||
`FlowerCore.Presentations`, `FlowerCore.RemoteDesktop`, `FlowerCore.DNS`,
|
|
||||||
`FlowerCore.Distribution`, `FlowerCore.Scoreboard`,
|
|
||||||
`FlowerCore.SegmentDisplay`, `FlowerCore.Signage.Contracts`,
|
|
||||||
`FlowerCore.SignalControl`, `FlowerCore.Intranet.Web`,
|
|
||||||
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
|
|
||||||
`FlowerCore.MenuBoard`.
|
|
||||||
|
|
||||||
## Image Build
|
|
||||||
|
|
||||||
Ruby is baked with a pinned `ruby-build` release and Ruby patch version. The pod
|
|
||||||
still mounts an `emptyDir` over `/home/runner`, so the `setup-runner-home` init
|
|
||||||
container copies the baked toolcache from `/opt/runner-toolcache/Ruby` into
|
|
||||||
`/home/runner/_tool/Ruby` before the runner container starts.
|
|
||||||
|
|
||||||
The IAmWorkin step-ca root CA is also baked into the system trust store
|
|
||||||
(`/usr/local/share/ca-certificates/iamworkin-step-ca-root.crt`, registered by
|
|
||||||
`update-ca-certificates`). Without it, .NET HttpClient calls from CI tests
|
|
||||||
against `*.iamworkin.lan` (e.g. `https://selenium.iamworkin.lan/session`)
|
|
||||||
fail with `PartialChain`. To refresh the bundled cert when the root rotates,
|
|
||||||
re-extract from the cluster and overwrite `step-ca-root.crt`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
kubectl get secret -n cert-manager step-ca-root \
|
|
||||||
-o jsonpath='{.data.ca\.crt}' | base64 -d > step-ca-root.crt
|
|
||||||
```
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd apps/github-runner
|
|
||||||
podman build -t localhost/fc-github-runner:v20260525-ruby3.3.11-stepca .
|
|
||||||
podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca ruby -v
|
|
||||||
podman run --rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
|
|
||||||
test -f /opt/runner-toolcache/Ruby/3.3/x64.complete
|
|
||||||
podman save localhost/fc-github-runner:v20260525-ruby3.3.11-stepca \
|
|
||||||
-o fc-github-runner-v20260525-ruby3.3.11-stepca.tar
|
|
||||||
```
|
|
||||||
|
|
||||||
Import the saved image on every schedulable RKE2 node before ArgoCD rolls the
|
|
||||||
Deployments:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
for node in rke2-server rke2-agent1 rke2-agent2; do
|
|
||||||
scp fc-github-runner-v20260525-ruby3.3.11-stepca.tar "$node:/tmp/"
|
|
||||||
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images rm localhost/fc-github-runner:v20260525-ruby3.3.11-stepca || true'
|
|
||||||
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-github-runner-v20260525-ruby3.3.11-stepca.tar'
|
|
||||||
done
|
|
||||||
```
|
|
||||||
|
|
||||||
## Post-Merge Proof
|
|
||||||
|
|
||||||
After the PR is merged and ArgoCD syncs, verify the runner fleet:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
kubectl -n github-runner get deploy,pods,pvc
|
|
||||||
```
|
|
||||||
|
|
||||||
Verify the Ruby toolcache in a fresh pod:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- ruby -v
|
|
||||||
kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- sh -c \
|
|
||||||
'echo "$RUNNER_TOOL_CACHE" && test -f "$RUNNER_TOOL_CACHE/Ruby/3.3/x64.complete"'
|
|
||||||
```
|
|
||||||
|
|
||||||
Verify GitHub registration for the repo-scoped runners:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
for repo in FlowerCore.Common FlowerCore.Shared.Pos FlowerCore.Puppet FlowerCore.Signage \
|
|
||||||
FlowerCore.DMS FlowerCore.Telephony FlowerCore.Print.Web FlowerCore.Chat \
|
|
||||||
FlowerCore.MySQL FlowerCore.Kiosk.Linux FlowerCore.Marquee FlowerCore.TtsReader \
|
|
||||||
FlowerCore.Knowledge FlowerCore.LlmBridge FlowerCore.Media \
|
|
||||||
FlowerCore.Presentations FlowerCore.RemoteDesktop FlowerCore.DNS \
|
|
||||||
FlowerCore.Distribution FlowerCore.Scoreboard FlowerCore.SegmentDisplay \
|
|
||||||
FlowerCore.Signage.Contracts FlowerCore.SignalControl FlowerCore.Intranet.Web \
|
|
||||||
FlowerCore.Provisioning FlowerCore.Redis FlowerCore.MessageBoard \
|
|
||||||
FlowerCore.MenuBoard; do
|
|
||||||
echo "=== $repo ==="
|
|
||||||
gh api "/repos/astoltz/$repo/actions/runners" \
|
|
||||||
--jq '.runners[] | select(.labels[].name == "fc-build-linux") | {name,status,busy,labels:[.labels[].name]}'
|
|
||||||
done
|
|
||||||
```
|
|
||||||
|
|
||||||
Shared.Pos publish proof after the runner pod is online:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
gh run list --repo astoltz/FlowerCore.Shared.Pos \
|
|
||||||
--workflow "Build, Test & Publish" --branch main --limit 5
|
|
||||||
```
|
|
||||||
|
|
||||||
If the latest run is still queued after runner registration, rerun the workflow
|
|
||||||
from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
|
|
||||||
|
|
||||||
## Failure Notes
|
|
||||||
|
|
||||||
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
|
|
||||||
`DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
|
|
||||||
present on the runner pod.
|
|
||||||
- `ruby/setup-ruby@v1` says self-hosted runners must install Ruby in
|
|
||||||
`$RUNNER_TOOL_CACHE`: check that the init container copied
|
|
||||||
`/opt/runner-toolcache/Ruby` into `/home/runner/_tool/Ruby` and that
|
|
||||||
`/home/runner/_tool/Ruby/3.3/x64.complete` exists.
|
|
||||||
- `404` during runner registration: the fine-grained PAT is valid but missing
|
|
||||||
repository access for that repo. Add the repo to the PAT access list; the PAT
|
|
||||||
value does not change.
|
|
||||||
- `Multi-Attach` volume error: only the Common runner uses a RWO PVC and it must
|
|
||||||
stay single-replica. New multi-replica runners use `emptyDir`.
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,19 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
RUBY_VERSION="${RUBY_VERSION:-3.3.11}"
|
|
||||||
RUBY_MINOR="${RUBY_MINOR:-3.3}"
|
|
||||||
TOOLCACHE_ROOT="${TOOLCACHE_ROOT:-/opt/runner-toolcache}"
|
|
||||||
RUNNER_UID="${RUNNER_UID:-1001}"
|
|
||||||
RUNNER_GID="${RUNNER_GID:-1001}"
|
|
||||||
RUBY_PREFIX="${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64"
|
|
||||||
|
|
||||||
mkdir -p "${TOOLCACHE_ROOT}/Ruby"
|
|
||||||
RUBY_CONFIGURE_OPTS="${RUBY_CONFIGURE_OPTS:---disable-install-doc --disable-yjit}" ruby-build "${RUBY_VERSION}" "${RUBY_PREFIX}"
|
|
||||||
|
|
||||||
touch "${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64.complete"
|
|
||||||
ln -sfn "${RUBY_VERSION}" "${TOOLCACHE_ROOT}/Ruby/${RUBY_MINOR}"
|
|
||||||
|
|
||||||
"${RUBY_PREFIX}/bin/ruby" -v
|
|
||||||
chown -R "${RUNNER_UID}:${RUNNER_GID}" "${TOOLCACHE_ROOT}"
|
|
||||||
chmod -R a+rX "${TOOLCACHE_ROOT}"
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
-----BEGIN CERTIFICATE-----
|
|
||||||
MIIBxDCCAWqgAwIBAgIRAPY357G6ow6zMAL5+4bS2kkwCgYIKoZIzj0EAwIwQDEa
|
|
||||||
MBgGA1UEChMRSUFtV29ya2luIEFDTUUgQ0ExIjAgBgNVBAMTGUlBbVdvcmtpbiBB
|
|
||||||
Q01FIENBIFJvb3QgQ0EwHhcNMjYwMzA4MTgwNzExWhcNMzYwMzA1MTgwNzExWjBA
|
|
||||||
MRowGAYDVQQKExFJQW1Xb3JraW4gQUNNRSBDQTEiMCAGA1UEAxMZSUFtV29ya2lu
|
|
||||||
IEFDTUUgQ0EgUm9vdCBDQTBZMBMGByqGSM49AgEGCCqGSM49AwEHA0IABJ2n04X1
|
|
||||||
JZo5Zdq/i1Idv8+fqwZyAzBh7whbqj0SWsJL8UWRabCMqYCs7+dXO0xRSzqkwFDL
|
|
||||||
x+vooOai8RgRNhajRTBDMA4GA1UdDwEB/wQEAwIBBjASBgNVHRMBAf8ECDAGAQH/
|
|
||||||
AgEBMB0GA1UdDgQWBBRnuPPQR6iM/H6vOluiU3Sygayz8jAKBggqhkjOPQQDAgNI
|
|
||||||
ADBFAiEArQK9dYPGmAZsdYnjziuFVVE5NKZUcceYvGfGC+tLXUsCIAudF2zJrCRq
|
|
||||||
3mK50ZZET/fwTkJwiEF4824mjP8p1CKM
|
|
||||||
-----END CERTIFICATE-----
|
|
||||||
@@ -75,20 +75,6 @@ data:
|
|||||||
cluster: "rke2"
|
cluster: "rke2"
|
||||||
role: "agent"
|
role: "agent"
|
||||||
|
|
||||||
# Mac mini macOS runner node (INFRA VLAN)
|
|
||||||
- job_name: "macmini-node"
|
|
||||||
scrape_timeout: 15s
|
|
||||||
static_configs:
|
|
||||||
- targets: ["10.0.56.115:9100"]
|
|
||||||
labels:
|
|
||||||
instance: "macmini"
|
|
||||||
host: "macmini.iamworkin.lan"
|
|
||||||
vlan: "infra"
|
|
||||||
arch: "arm64"
|
|
||||||
role: "macos-runner"
|
|
||||||
puppet_managed: "true"
|
|
||||||
puppet_server: "puppet.iamworkin.lan"
|
|
||||||
|
|
||||||
# In-cluster node-exporter DaemonSet
|
# In-cluster node-exporter DaemonSet
|
||||||
- job_name: "k8s-node-exporter"
|
- job_name: "k8s-node-exporter"
|
||||||
kubernetes_sd_configs:
|
kubernetes_sd_configs:
|
||||||
@@ -280,14 +266,13 @@ data:
|
|||||||
printer_model: "NuPrint 210"
|
printer_model: "NuPrint 210"
|
||||||
|
|
||||||
# Print.Web health (Blazor app on edge2:5200)
|
# Print.Web health (Blazor app on edge2:5200)
|
||||||
# Target `/health` (anonymous) — root path requires API key auth and returns 401.
|
|
||||||
- job_name: "probe-printweb"
|
- job_name: "probe-printweb"
|
||||||
metrics_path: /probe
|
metrics_path: /probe
|
||||||
params:
|
params:
|
||||||
module: [http_2xx]
|
module: [http_2xx]
|
||||||
scrape_interval: 30s
|
scrape_interval: 30s
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ["http://10.0.57.16:5200/health"]
|
- targets: ["http://10.0.57.16:5200/"]
|
||||||
labels:
|
labels:
|
||||||
instance: "print-web"
|
instance: "print-web"
|
||||||
service: "print-web"
|
service: "print-web"
|
||||||
@@ -712,36 +697,6 @@ data:
|
|||||||
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
|
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
|
||||||
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
|
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
|
||||||
|
|
||||||
- name: macmini-runners
|
|
||||||
rules:
|
|
||||||
- alert: MacMiniRunnerOffline
|
|
||||||
expr: (flowercore_github_runner_online{runner=~"macmini-.*"} == 0) or absent(flowercore_github_runner_online{runner=~"macmini-.*"})
|
|
||||||
for: 10m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
service: github-runner
|
|
||||||
annotations:
|
|
||||||
summary: "Mac mini GitHub runner offline ({{ $labels.runner }})"
|
|
||||||
description: "A macmini-* GitHub Actions runner has not reported online for more than 10 minutes. Puppet manages its LaunchDaemon under /Library/LaunchDaemons/io.flowercore.github-runner-<slug>.plist; runners survive reboot and do not require a GUI session."
|
|
||||||
|
|
||||||
- name: linux-runners
|
|
||||||
rules:
|
|
||||||
- alert: LinuxRunnerOffline
|
|
||||||
expr: |
|
|
||||||
kube_deployment_status_replicas_ready{
|
|
||||||
namespace="github-runner",
|
|
||||||
deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"
|
|
||||||
} == 0
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
alert_channel: irc
|
|
||||||
service: github-runner
|
|
||||||
team: ci
|
|
||||||
annotations:
|
|
||||||
summary: "Linux CI runner offline: {{ $labels.deployment }}"
|
|
||||||
description: "Deployment {{ $labels.deployment }} in namespace github-runner has 0 ready replicas for more than 5 minutes. CI jobs targeting this repo will queue until the runner pod restarts and re-registers with GitHub. Check pods with: kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }}. Check logs with: kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50. Common causes: PAT missing repo access, runner CrashLoopBackOff, or node/resource pressure."
|
|
||||||
|
|
||||||
- name: remote-desktop
|
- name: remote-desktop
|
||||||
rules:
|
rules:
|
||||||
- alert: RemoteDesktopWebDown
|
- alert: RemoteDesktopWebDown
|
||||||
@@ -967,52 +922,6 @@ data:
|
|||||||
annotations:
|
annotations:
|
||||||
summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
||||||
|
|
||||||
# Puppet agent + service alerts.
|
|
||||||
# Mirror of FlowerCore.Notes/scripts/monitoring/alerts.yml `puppet` group
|
|
||||||
# so a future migration to in-cluster Prometheus inherits the ruleset.
|
|
||||||
# Source-of-truth for the live Podman Prometheus on noc1 is the Notes file.
|
|
||||||
# See feedback_monitoring_k8s_target_vs_live_podman.
|
|
||||||
- name: puppet
|
|
||||||
rules:
|
|
||||||
- alert: PuppetAgentReportStale
|
|
||||||
expr: puppet_last_run_age_seconds > 7200
|
|
||||||
for: 30m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
alert_channel: irc
|
|
||||||
annotations:
|
|
||||||
summary: "Puppet agent {{ $labels.instance }} hasn't reported in over 2h"
|
|
||||||
description: "Last run age: {{ $value | humanizeDuration }}. The puppet agent on {{ $labels.instance }} may be stopped, the node may be powered off, or noc1 may be unreachable from this node."
|
|
||||||
runbook: "1. SSH to node (via noc1 jumpbox if needed) 2. sudo systemctl status puppet 3. sudo puppet agent -t --noop to force a run 4. Check r10k: ssh fcadmin@10.0.56.10 'sudo podman logs openvoxserver --tail 50' 5. Verify noc1 reachability: ping puppet.iamworkin.lan"
|
|
||||||
|
|
||||||
- alert: PuppetAgentReportCritical
|
|
||||||
expr: puppet_last_run_age_seconds > 86400
|
|
||||||
for: 1h
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
alert_channel: irc
|
|
||||||
annotations:
|
|
||||||
summary: "Puppet agent {{ $labels.instance }} silent for over 24h — node is unmanaged"
|
|
||||||
description: "Last run age: {{ $value | humanizeDuration }}. Node {{ $labels.instance }} has not submitted a Puppet report in over 24 hours. Config drift is accumulating — investigate immediately. If intentional (maintenance), add to the exclusion filter or silence in Grafana."
|
|
||||||
runbook: "URGENT: 1. Check node power state 2. SSH via noc1 jumpbox: ssh fcadmin@10.0.56.10 then ssh <node> 3. sudo systemctl status puppet 4. sudo systemctl start puppet + sudo puppet agent -t 5. Check for network partitions (VLAN connectivity to 10.0.56.10) 6. If node was recently reimaged: sudo puppet agent -t to re-register with new SSL cert"
|
|
||||||
|
|
||||||
# Sprint 33 Cx-7 Phase B (2026-05-25 postmortem follow-up):
|
|
||||||
# Detects puppet.service in failed state — distinct from PuppetAgentReportStale
|
|
||||||
# which catches "agent hasn't run." This catches "systemd gave up restarting it"
|
|
||||||
# (CA-verify loop or other fatal exit). Requires node-exporter systemd collector
|
|
||||||
# enabled with --collector.systemd. If `node_systemd_unit_state` has no series
|
|
||||||
# for a node, the collector is disabled there — flag in postmortem follow-up.
|
|
||||||
- alert: PuppetServiceFailed
|
|
||||||
expr: node_systemd_unit_state{name="puppet.service",state="failed"} == 1
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
alert_channel: irc
|
|
||||||
annotations:
|
|
||||||
summary: "Puppet service failed on {{ $labels.instance }}"
|
|
||||||
description: "puppet.service on {{ $labels.instance }} has been in failed state for 5+ minutes. systemd has stopped auto-restarting (CA-verify-loop or other exit). Manual `systemctl status puppet` confirms. Run `sudo systemctl start puppet` to recover; investigate journal for root cause."
|
|
||||||
runbook_url: "https://github.com/astoltz/FlowerCore.Notes/blob/master/memory/feedback_puppet_service_dead_after_ca_loop_alert_misreads.md"
|
|
||||||
|
|
||||||
# K8s pod-state alerts. Require kube-state-metrics scrape (added
|
# K8s pod-state alerts. Require kube-state-metrics scrape (added
|
||||||
# 2026-04-26 — see scrape_configs above). Would have surfaced the
|
# 2026-04-26 — see scrape_configs above). Would have surfaced the
|
||||||
# agent-zero ollama-proxy 172x crash-loop instead of letting it
|
# agent-zero ollama-proxy 172x crash-loop instead of letting it
|
||||||
@@ -1274,55 +1183,24 @@ metadata:
|
|||||||
data:
|
data:
|
||||||
notify.py: |
|
notify.py: |
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""HTTP->IRC alert relay with thermal-printer DIGEST forwarding.
|
"""HTTP->IRC alert relay with thermal printer forwarding for Grafana webhooks.
|
||||||
|
Listens on :9119, posts to #alerts on UnrealIRCd via raw IRC protocol.
|
||||||
Listens on :9119, posts to #alerts on UnrealIRCd, forwards to Print.Web
|
Alerts tagged alert_channel=thermal_print also POST to Print.Web /api/print/alert.
|
||||||
/api/print/alert. Thermal printing is BATCHED into hourly digests by
|
|
||||||
default so the printer no longer spam-fires per Grafana webhook.
|
|
||||||
|
|
||||||
Routing (per Grafana webhook alert):
|
|
||||||
- IRC: always per-event (operator likes the stream)
|
|
||||||
- Thermal printer:
|
|
||||||
* severity in {critical,disaster,page} OR
|
|
||||||
label alert_channel=thermal_print_immediate -> print NOW
|
|
||||||
* label alert_channel=thermal_print -> enqueue into hourly digest
|
|
||||||
* everything else -> IRC only
|
|
||||||
- RESOLVED webhooks remove the alert from the digest buffer
|
|
||||||
|
|
||||||
Env vars (defaults preserve old behavior on first deploy):
|
|
||||||
THERMAL_PRINT_ENABLED default "true" - master kill switch
|
|
||||||
BATCH_INTERVAL_MIN default "60" - minutes between digest prints
|
|
||||||
BATCH_MAX_PENDING default "50" - force-flush threshold
|
|
||||||
|
|
||||||
HTTP surface:
|
|
||||||
POST / - Grafana webhook entry
|
|
||||||
POST /flush - manual digest flush (idempotent)
|
|
||||||
GET / - status + config + buffer depth + stats
|
|
||||||
"""
|
"""
|
||||||
import json, os, socket, sys, threading, time
|
import json, socket, sys, time
|
||||||
from collections import defaultdict
|
|
||||||
from datetime import datetime, timezone
|
|
||||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import Request, urlopen
|
||||||
|
from urllib.error import URLError
|
||||||
|
|
||||||
THERMAL_PRINT_ENABLED = os.environ.get("THERMAL_PRINT_ENABLED", "true").lower() == "true"
|
IRC_HOST = "unrealircd.irc.svc" # short name: CoreDNS ndots:5 + iamworkin.lan template hijacks full .cluster.local (see memory)
|
||||||
BATCH_INTERVAL_MIN = int(os.environ.get("BATCH_INTERVAL_MIN", "60"))
|
IRC_PORT = 6667
|
||||||
BATCH_MAX_PENDING = int(os.environ.get("BATCH_MAX_PENDING", "50"))
|
IRC_NICK = "grafana-bot"
|
||||||
|
IRC_CHANNEL = "#alerts"
|
||||||
IRC_HOST = os.environ.get("IRC_HOST", "unrealircd.irc.svc")
|
PRINT_WEB_URL = "http://10.0.57.16:5200/api/print/alert"
|
||||||
IRC_PORT = int(os.environ.get("IRC_PORT", "6667"))
|
PRINT_ENABLED = True
|
||||||
IRC_NICK = os.environ.get("IRC_NICK", "grafana-bot")
|
|
||||||
IRC_CHANNEL = os.environ.get("IRC_CHANNEL", "#alerts")
|
|
||||||
PRINT_WEB_URL = os.environ.get("PRINT_WEB_URL", "http://10.0.57.16:5200/api/print/alert")
|
|
||||||
|
|
||||||
_buffer_lock = threading.Lock()
|
|
||||||
_buffer = {} # fingerprint -> {"alert": dict, "first_seen": float, "last_seen": float}
|
|
||||||
_last_flush_time = time.time()
|
|
||||||
_stats = {"webhooks_received": 0, "irc_sent": 0, "print_immediate": 0,
|
|
||||||
"digest_flushed": 0, "buffer_dedup": 0, "buffer_added": 0,
|
|
||||||
"buffer_resolved": 0, "started_at": time.time()}
|
|
||||||
|
|
||||||
def send_irc(message):
|
def send_irc(message):
|
||||||
|
"""Connect, handle PING, join, send, quit."""
|
||||||
try:
|
try:
|
||||||
sock = socket.create_connection((IRC_HOST, IRC_PORT), timeout=15)
|
sock = socket.create_connection((IRC_HOST, IRC_PORT), timeout=15)
|
||||||
sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
|
sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
|
||||||
@@ -1355,137 +1233,52 @@ data:
|
|||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
sock.sendall(b"QUIT :alert delivered\r\n")
|
sock.sendall(b"QUIT :alert delivered\r\n")
|
||||||
sock.close()
|
sock.close()
|
||||||
_stats["irc_sent"] += 1
|
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
|
print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def post_thermal(payload, kind):
|
def send_thermal_print(alert):
|
||||||
if not THERMAL_PRINT_ENABLED:
|
if not PRINT_ENABLED: return
|
||||||
print(f"[irc-notify] thermal disabled; skip {kind} ({payload.get('title','?')[:40]})", file=sys.stderr)
|
labels = alert.get("labels", {})
|
||||||
return False
|
annotations = alert.get("annotations", {})
|
||||||
|
status = alert.get("status", "firing").upper()
|
||||||
|
summary = annotations.get("summary", "")
|
||||||
|
description = annotations.get("description", "")
|
||||||
|
runbook = annotations.get("runbook", "")
|
||||||
|
# Build a useful message: summary + description + runbook steps
|
||||||
|
parts = []
|
||||||
|
if summary: parts.append(summary)
|
||||||
|
if description and description != summary: parts.append(description)
|
||||||
|
if runbook: parts.append("STEPS: " + runbook)
|
||||||
|
message = " | ".join(parts) if parts else labels.get("alertname", "Unknown alert")
|
||||||
|
payload = {
|
||||||
|
"title": labels.get("alertname", "Unknown"),
|
||||||
|
"severity": labels.get("severity", "warning").capitalize(),
|
||||||
|
"host": labels.get("instance", labels.get("host", "unknown")),
|
||||||
|
"message": message,
|
||||||
|
"eventId": alert.get("fingerprint", ""),
|
||||||
|
"source": "Grafana",
|
||||||
|
"status": "RESOLVED" if status == "RESOLVED" else "PROBLEM",
|
||||||
|
"acknowledged": False
|
||||||
|
}
|
||||||
try:
|
try:
|
||||||
req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
|
req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
|
||||||
headers={"Content-Type": "application/json"}, method="POST")
|
headers={"Content-Type": "application/json"}, method="POST")
|
||||||
resp = urlopen(req, timeout=10)
|
resp = urlopen(req, timeout=10)
|
||||||
if kind == "immediate": _stats["print_immediate"] += 1
|
print(f"[irc-notify] Thermal print sent: {resp.read().decode()}", file=sys.stderr)
|
||||||
print(f"[irc-notify] thermal {kind} sent: {payload.get('title','?')[:50]}", file=sys.stderr)
|
|
||||||
return True
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[irc-notify] thermal {kind} failed: {e}", file=sys.stderr)
|
print(f"[irc-notify] Thermal print failed: {e}", file=sys.stderr)
|
||||||
return False
|
|
||||||
|
|
||||||
def fingerprint_of(alert):
|
def should_print(alert):
|
||||||
fp = alert.get("fingerprint", "")
|
|
||||||
if fp: return fp
|
|
||||||
labels = alert.get("labels", {})
|
labels = alert.get("labels", {})
|
||||||
target = labels.get("pod") or labels.get("instance") or labels.get("deployment") or labels.get("statefulset") or labels.get("namespace") or ""
|
if labels.get("alert_channel") == "thermal_print": return True
|
||||||
return f"{labels.get('alertname','?')}/{labels.get('namespace','')}/{target}"
|
if labels.get("severity", "").lower() in ("critical", "disaster"): return True
|
||||||
|
if alert.get("status", "").upper() == "RESOLVED": return False
|
||||||
def is_critical(alert):
|
return False
|
||||||
return alert.get("labels", {}).get("severity", "").lower() in ("critical", "disaster", "page")
|
|
||||||
|
|
||||||
def is_immediate_label(alert):
|
|
||||||
return alert.get("labels", {}).get("alert_channel") == "thermal_print_immediate"
|
|
||||||
|
|
||||||
def is_batched_label(alert):
|
|
||||||
return alert.get("labels", {}).get("alert_channel") == "thermal_print"
|
|
||||||
|
|
||||||
def add_to_digest(alert):
|
|
||||||
"""Add an alert to the digest buffer. Returns True if the buffer GREW
|
|
||||||
(new fingerprint), False if it was a dedup, resolution, or no-op.
|
|
||||||
"""
|
|
||||||
if not THERMAL_PRINT_ENABLED: return False
|
|
||||||
fp = fingerprint_of(alert)
|
|
||||||
status = alert.get("status", "firing").lower()
|
|
||||||
with _buffer_lock:
|
|
||||||
if status == "resolved":
|
|
||||||
if fp in _buffer:
|
|
||||||
del _buffer[fp]
|
|
||||||
_stats["buffer_resolved"] += 1
|
|
||||||
return False
|
|
||||||
if fp in _buffer:
|
|
||||||
_buffer[fp]["last_seen"] = time.time()
|
|
||||||
_buffer[fp]["alert"] = alert
|
|
||||||
_stats["buffer_dedup"] += 1
|
|
||||||
return False
|
|
||||||
_buffer[fp] = {"alert": alert, "first_seen": time.time(), "last_seen": time.time()}
|
|
||||||
_stats["buffer_added"] += 1
|
|
||||||
return True
|
|
||||||
|
|
||||||
def build_digest_payload():
|
|
||||||
with _buffer_lock:
|
|
||||||
items = list(_buffer.values())
|
|
||||||
if not items: return None
|
|
||||||
by_name = defaultdict(list)
|
|
||||||
for item in items:
|
|
||||||
labels = item["alert"].get("labels", {})
|
|
||||||
by_name[labels.get("alertname", "Unknown")].append(item)
|
|
||||||
lines = []
|
|
||||||
for name, group in sorted(by_name.items()):
|
|
||||||
targets = []
|
|
||||||
for it in group[:5]:
|
|
||||||
labels = it["alert"].get("labels", {})
|
|
||||||
t = (labels.get("pod") or labels.get("instance") or labels.get("deployment")
|
|
||||||
or labels.get("statefulset") or labels.get("namespace") or "?")
|
|
||||||
targets.append(t)
|
|
||||||
more = f" (+{len(group)-5})" if len(group) > 5 else ""
|
|
||||||
sevs = sorted({it["alert"].get("labels", {}).get("severity", "warning") for it in group})
|
|
||||||
lines.append(f"[{'/'.join(sevs)}] {name} x{len(group)}: {', '.join(targets)}{more}")
|
|
||||||
now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
|
|
||||||
title = f"Alert digest: {len(items)} firing"
|
|
||||||
body = "\n".join([
|
|
||||||
f"=== {title} ===",
|
|
||||||
f"as of {now}",
|
|
||||||
"",
|
|
||||||
*lines,
|
|
||||||
"",
|
|
||||||
"Stream: #alerts (IRC) | Triage: grafana-noc1.iamworkin.lan",
|
|
||||||
"Force-flush: POST irc-notify.monitoring.svc:9119/flush",
|
|
||||||
])
|
|
||||||
return {"title": title, "severity": "Warning", "host": "monitoring",
|
|
||||||
"message": body, "eventId": f"digest-{int(time.time())}",
|
|
||||||
"source": "Grafana digest", "status": "PROBLEM", "acknowledged": False}
|
|
||||||
|
|
||||||
def flush_digest():
|
|
||||||
payload = build_digest_payload()
|
|
||||||
if payload is None:
|
|
||||||
print("[irc-notify] flush: buffer empty, no digest sent", file=sys.stderr)
|
|
||||||
return False
|
|
||||||
sent = post_thermal(payload, "digest")
|
|
||||||
with _buffer_lock:
|
|
||||||
_buffer.clear()
|
|
||||||
if sent: _stats["digest_flushed"] += 1
|
|
||||||
return sent
|
|
||||||
|
|
||||||
def digest_loop():
|
|
||||||
global _last_flush_time
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
now = time.time()
|
|
||||||
elapsed = now - _last_flush_time
|
|
||||||
if elapsed >= BATCH_INTERVAL_MIN * 60:
|
|
||||||
print(f"[irc-notify] digest tick: interval reached ({BATCH_INTERVAL_MIN}m); buffer={len(_buffer)}", file=sys.stderr)
|
|
||||||
flush_digest()
|
|
||||||
_last_flush_time = now
|
|
||||||
elif len(_buffer) >= BATCH_MAX_PENDING:
|
|
||||||
print(f"[irc-notify] digest tick: buffer full ({len(_buffer)}); force flush", file=sys.stderr)
|
|
||||||
flush_digest()
|
|
||||||
_last_flush_time = now
|
|
||||||
time.sleep(15)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[irc-notify] digest loop error: {e}", file=sys.stderr)
|
|
||||||
time.sleep(60)
|
|
||||||
|
|
||||||
class Handler(BaseHTTPRequestHandler):
|
class Handler(BaseHTTPRequestHandler):
|
||||||
def do_POST(self):
|
def do_POST(self):
|
||||||
if self.path == "/flush":
|
|
||||||
ok = flush_digest()
|
|
||||||
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
|
|
||||||
self.wfile.write(json.dumps({"flushed": ok, "buffer_after": len(_buffer)}).encode())
|
|
||||||
return
|
|
||||||
_stats["webhooks_received"] += 1
|
|
||||||
length = int(self.headers.get("Content-Length", 0))
|
length = int(self.headers.get("Content-Length", 0))
|
||||||
body = json.loads(self.rfile.read(length)) if length else {}
|
body = json.loads(self.rfile.read(length)) if length else {}
|
||||||
for alert in body.get("alerts", []):
|
for alert in body.get("alerts", []):
|
||||||
@@ -1500,56 +1293,22 @@ data:
|
|||||||
msg = f"{icon}{sev_tag} {name}: {summary}"
|
msg = f"{icon}{sev_tag} {name}: {summary}"
|
||||||
if desc: msg += f"\n {desc}"
|
if desc: msg += f"\n {desc}"
|
||||||
send_irc(msg)
|
send_irc(msg)
|
||||||
# Thermal routing — EVERYTHING (including criticals) goes into
|
if should_print(alert): send_thermal_print(alert)
|
||||||
# the hourly digest. Only the explicit `alert_channel=thermal_print_immediate`
|
self.send_response(200)
|
||||||
# label bypasses, and even that flushes-the-current-digest rather
|
self.send_header("Content-Type", "application/json")
|
||||||
# than printing a standalone job, so the same fingerprint can't
|
self.end_headers()
|
||||||
# spam the printer per webhook cycle.
|
|
||||||
if status == "RESOLVED":
|
|
||||||
add_to_digest(alert) # removes from buffer
|
|
||||||
continue
|
|
||||||
if is_immediate_label(alert):
|
|
||||||
# Explicit opt-in for "paper this NOW" — first arrival of a
|
|
||||||
# new fingerprint triggers an immediate digest flush; repeat
|
|
||||||
# webhooks for the same fingerprint dedupe in the buffer
|
|
||||||
# until the next interval or until the alert resolves.
|
|
||||||
new_in_buffer = add_to_digest(alert)
|
|
||||||
if new_in_buffer:
|
|
||||||
global _last_flush_time
|
|
||||||
flush_digest()
|
|
||||||
_last_flush_time = time.time()
|
|
||||||
elif is_critical(alert) or is_batched_label(alert):
|
|
||||||
add_to_digest(alert)
|
|
||||||
# else: IRC-only (warnings without thermal_print label)
|
|
||||||
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
|
|
||||||
self.wfile.write(b'{"status":"ok"}')
|
self.wfile.write(b'{"status":"ok"}')
|
||||||
|
|
||||||
def do_GET(self):
|
def do_GET(self):
|
||||||
self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers()
|
self.send_response(200)
|
||||||
with _buffer_lock:
|
self.send_header("Content-Type", "application/json")
|
||||||
alertnames = sorted({it["alert"].get("labels", {}).get("alertname", "?") for it in _buffer.values()})
|
self.end_headers()
|
||||||
depth = len(_buffer)
|
self.wfile.write(json.dumps({"service":"irc-notify","thermal_print":PRINT_ENABLED}).encode())
|
||||||
info = {
|
|
||||||
"service": "irc-notify",
|
|
||||||
"config": {"thermal_print_enabled": THERMAL_PRINT_ENABLED,
|
|
||||||
"batch_interval_min": BATCH_INTERVAL_MIN,
|
|
||||||
"batch_max_pending": BATCH_MAX_PENDING,
|
|
||||||
"irc_target": f"{IRC_HOST}:{IRC_PORT} {IRC_CHANNEL}",
|
|
||||||
"print_web_url": PRINT_WEB_URL},
|
|
||||||
"buffer": {"depth": depth, "alertnames": alertnames,
|
|
||||||
"seconds_since_last_flush": int(time.time() - _last_flush_time),
|
|
||||||
"seconds_until_next_flush": max(0, int(BATCH_INTERVAL_MIN*60 - (time.time() - _last_flush_time)))},
|
|
||||||
"stats": _stats,
|
|
||||||
}
|
|
||||||
self.wfile.write(json.dumps(info, indent=2).encode())
|
|
||||||
|
|
||||||
def log_message(self, format, *args):
|
def log_message(self, format, *args):
|
||||||
print(f"[irc-notify] {args[0]}", file=sys.stderr)
|
print(f"[irc-notify] {args[0]}", file=sys.stderr)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
threading.Thread(target=digest_loop, daemon=True).start()
|
|
||||||
server = HTTPServer(("0.0.0.0", 9119), Handler)
|
server = HTTPServer(("0.0.0.0", 9119), Handler)
|
||||||
print(f"[irc-notify] :9119 -> IRC {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} | thermal={'ON' if THERMAL_PRINT_ENABLED else 'OFF'} | digest={BATCH_INTERVAL_MIN}m max={BATCH_MAX_PENDING}", file=sys.stderr)
|
print(f"IRC alert relay :9119 -> {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} (thermal: {PRINT_ENABLED})")
|
||||||
server.serve_forever()
|
server.serve_forever()
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -3636,39 +3395,6 @@ data:
|
|||||||
relativeTimeRange: {from: 120, to: 0}
|
relativeTimeRange: {from: 120, to: 0}
|
||||||
datasourceUid: __expr__
|
datasourceUid: __expr__
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
||||||
- orgId: 1
|
|
||||||
name: CI Runners
|
|
||||||
folder: CI Alerts
|
|
||||||
interval: 1m
|
|
||||||
rules:
|
|
||||||
- uid: linux-runner-offline
|
|
||||||
title: LinuxRunnerOffline
|
|
||||||
condition: C
|
|
||||||
for: 5m
|
|
||||||
noDataState: OK
|
|
||||||
execErrState: Error
|
|
||||||
annotations:
|
|
||||||
summary: "Linux CI runner offline: {{ $labels.deployment }}"
|
|
||||||
description: "A github-runner namespace Deployment has 0 ready replicas for more than 5 minutes. CI jobs targeting that repo will queue until the runner pod restarts and re-registers."
|
|
||||||
runbook: "1. kubectl -n github-runner get pods -l app.kubernetes.io/name={{ $labels.deployment }} 2. kubectl -n github-runner logs -l app.kubernetes.io/name={{ $labels.deployment }} --tail=50 3. Verify PAT repo access if registration returns 404 4. Verify no RWO PVC is shared by scaled runners"
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
service: github-runner
|
|
||||||
alert_channel: irc
|
|
||||||
team: ci
|
|
||||||
data:
|
|
||||||
- refId: A
|
|
||||||
relativeTimeRange: {from: 300, to: 0}
|
|
||||||
datasourceUid: prometheus
|
|
||||||
model: {expr: 'kube_deployment_status_replicas_ready{namespace="github-runner",deployment=~"github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))"} == 0', instant: true, refId: A}
|
|
||||||
- refId: B
|
|
||||||
relativeTimeRange: {from: 300, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
|
||||||
- refId: C
|
|
||||||
relativeTimeRange: {from: 300, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
|
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
name: Infrastructure
|
name: Infrastructure
|
||||||
folder: AI Stack Alerts
|
folder: AI Stack Alerts
|
||||||
@@ -3701,32 +3427,6 @@ data:
|
|||||||
relativeTimeRange: {from: 120, to: 0}
|
relativeTimeRange: {from: 120, to: 0}
|
||||||
datasourceUid: __expr__
|
datasourceUid: __expr__
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||||
- uid: macmini-runner-offline
|
|
||||||
title: MacMiniRunnerOffline
|
|
||||||
condition: C
|
|
||||||
for: 10m
|
|
||||||
noDataState: Alerting
|
|
||||||
execErrState: OK
|
|
||||||
annotations:
|
|
||||||
summary: Mac mini GitHub runner offline
|
|
||||||
description: "One or more macmini-* GitHub Actions runners have not reported online for more than 10 minutes. LaunchDaemons survive reboot and do not require the bluejay GUI session."
|
|
||||||
runbook: "1. ssh fcadmin@macmini.iamworkin.lan 2. launchctl print system/io.flowercore.github-runner-<slug> 3. Check /Users/fcadmin/Library/Logs/github-runners/<slug>/stderr.log 4. Re-register the repo runner if .runner is missing"
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
service: github-runner
|
|
||||||
data:
|
|
||||||
- refId: A
|
|
||||||
relativeTimeRange: {from: 600, to: 0}
|
|
||||||
datasourceUid: prometheus
|
|
||||||
model: {expr: 'min(flowercore_github_runner_online{runner=~"macmini-.*"} or vector(0))', instant: true, refId: A}
|
|
||||||
- refId: B
|
|
||||||
relativeTimeRange: {from: 600, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
|
||||||
- refId: C
|
|
||||||
relativeTimeRange: {from: 600, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
|
||||||
- uid: high-cpu
|
- uid: high-cpu
|
||||||
title: High CPU (>85%)
|
title: High CPU (>85%)
|
||||||
condition: C
|
condition: C
|
||||||
|
|||||||
@@ -24,16 +24,7 @@
|
|||||||
# (10.0.57.16:5200), public internet 80/443 (excluding RFC1918), and
|
# (10.0.57.16:5200), public internet 80/443 (excluding RFC1918), and
|
||||||
# fc-signage:5190 for the signage AAT lane.
|
# fc-signage:5190 for the signage AAT lane.
|
||||||
# - Ingress: Traefik (4444 + 8089 ACME-solver-style), intra-pod,
|
# - Ingress: Traefik (4444 + 8089 ACME-solver-style), intra-pod,
|
||||||
# telephony / gitea / fc-system / fc-signage / github-runner namespaces
|
# telephony / gitea / fc-system / fc-signage namespaces on 4444.
|
||||||
# on 4444.
|
|
||||||
#
|
|
||||||
# 2026-05-25: added github-runner ingress on 4444 so CI jobs running in
|
|
||||||
# self-hosted runner pods (e.g. FlowerCore.Print.Web `help-screenshots`)
|
|
||||||
# can reach the grid. Without this allow, the session POST to
|
|
||||||
# `selenium-hub.selenium.svc.cluster.local:4444` was DNAT'd to the hub
|
|
||||||
# pod IP and then dropped at the Calico ingress hook — Selenium UI showed
|
|
||||||
# 0/4 sessions while the .NET HTTP client timed out at 60s. Same family
|
|
||||||
# as `feedback_netpol_dnat_backend_port`, wrong-source-namespace flavor.
|
|
||||||
apiVersion: networking.k8s.io/v1
|
apiVersion: networking.k8s.io/v1
|
||||||
kind: NetworkPolicy
|
kind: NetworkPolicy
|
||||||
metadata:
|
metadata:
|
||||||
@@ -212,13 +203,6 @@ spec:
|
|||||||
ports:
|
ports:
|
||||||
- port: 4444
|
- port: 4444
|
||||||
protocol: TCP
|
protocol: TCP
|
||||||
- from:
|
|
||||||
- namespaceSelector:
|
|
||||||
matchLabels:
|
|
||||||
kubernetes.io/metadata.name: github-runner
|
|
||||||
ports:
|
|
||||||
- port: 4444
|
|
||||||
protocol: TCP
|
|
||||||
podSelector: {}
|
podSelector: {}
|
||||||
policyTypes:
|
policyTypes:
|
||||||
- Ingress
|
- Ingress
|
||||||
|
|||||||
@@ -1,427 +0,0 @@
|
|||||||
# Selenium Grid 4 — RKE2 deployment
|
|
||||||
#
|
|
||||||
# Hub + chrome + firefox + edge browser nodes serving fleet-wide AAT runs from
|
|
||||||
# the GitHub Actions self-hosted runners. ArgoCD owns this namespace from
|
|
||||||
# 2026-05-25 (`infra-selenium` Application; previously these resources were
|
|
||||||
# orphan kubectl-applied since 2026-03-15).
|
|
||||||
#
|
|
||||||
# Endpoints:
|
|
||||||
# - Internal cluster: http://selenium-hub.selenium.svc.cluster.local:4444
|
|
||||||
# - LAN LoadBalancer (MetalLB): http://10.0.56.208:4444
|
|
||||||
# - Traefik public: https://selenium.iamworkin.lan
|
|
||||||
#
|
|
||||||
# Browser maxSessions:
|
|
||||||
# - chrome 2 (bumped from 1 on 2026-05-25 morning-routine — AAT-heavy
|
|
||||||
# Print.Web help-screenshots was the global bottleneck;
|
|
||||||
# see commit history for ops/runner-replica-rightsize)
|
|
||||||
# - firefox 1
|
|
||||||
# - edge 1
|
|
||||||
#
|
|
||||||
# Screenshots + video recording write to NFS via the chrome video sidecar.
|
|
||||||
# See: CLAUDE.md "Selenium Grid & Visual AAT Testing" + bluejay-infra ADR notes.
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-hub
|
|
||||||
app.kubernetes.io/name: selenium-hub
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
name: selenium-hub
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
ports:
|
|
||||||
- name: web
|
|
||||||
port: 4444
|
|
||||||
targetPort: 4444
|
|
||||||
- name: publish
|
|
||||||
port: 4442
|
|
||||||
targetPort: 4442
|
|
||||||
- name: subscribe
|
|
||||||
port: 4443
|
|
||||||
targetPort: 4443
|
|
||||||
selector:
|
|
||||||
app: selenium-hub
|
|
||||||
type: ClusterIP
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
annotations:
|
|
||||||
metallb.io/ip-allocated-from-pool: bluejay-pool
|
|
||||||
metallb.universe.tf/loadBalancerIPs: 10.0.56.208
|
|
||||||
labels:
|
|
||||||
app: selenium-hub
|
|
||||||
component: external-access
|
|
||||||
name: selenium-hub-external
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
clusterIP: 10.43.90.147
|
|
||||||
clusterIPs:
|
|
||||||
- 10.43.90.147
|
|
||||||
externalTrafficPolicy: Local
|
|
||||||
healthCheckNodePort: 32213
|
|
||||||
ports:
|
|
||||||
- name: web
|
|
||||||
nodePort: 32411
|
|
||||||
port: 4444
|
|
||||||
targetPort: 4444
|
|
||||||
- name: publish
|
|
||||||
nodePort: 32068
|
|
||||||
port: 4442
|
|
||||||
targetPort: 4442
|
|
||||||
- name: subscribe
|
|
||||||
nodePort: 31000
|
|
||||||
port: 4443
|
|
||||||
targetPort: 4443
|
|
||||||
selector:
|
|
||||||
app: selenium-hub
|
|
||||||
type: LoadBalancer
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-hub
|
|
||||||
app.kubernetes.io/name: selenium-hub
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
name: selenium-hub
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: selenium-hub
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-hub
|
|
||||||
app.kubernetes.io/name: selenium-hub
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- env:
|
|
||||||
- name: SE_NODE_SESSION_TIMEOUT
|
|
||||||
value: '300'
|
|
||||||
- name: SE_SESSION_REQUEST_TIMEOUT
|
|
||||||
value: '300'
|
|
||||||
- name: SE_SESSION_RETRY_INTERVAL
|
|
||||||
value: '5'
|
|
||||||
- name: JAVA_OPTS
|
|
||||||
value: -Xmx512m
|
|
||||||
image: selenium/hub:4.27.0
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /wd/hub/status
|
|
||||||
port: 4444
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 15
|
|
||||||
timeoutSeconds: 5
|
|
||||||
name: selenium-hub
|
|
||||||
ports:
|
|
||||||
- containerPort: 4444
|
|
||||||
name: web
|
|
||||||
- containerPort: 4442
|
|
||||||
name: publish
|
|
||||||
- containerPort: 4443
|
|
||||||
name: subscribe
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /wd/hub/status
|
|
||||||
port: 4444
|
|
||||||
initialDelaySeconds: 10
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 5
|
|
||||||
# Hub baseline working set ~766Mi on 2026-05-25 (75% of prior 1Gi
|
|
||||||
# limit). Bump to 1.5Gi / 1Gi to keep ~50% headroom; matches the
|
|
||||||
# stampede-buffer pattern documented for multus
|
|
||||||
# (feedback_k8s_cni_multus_sizing). CPU left alone — observed 54m
|
|
||||||
# against a 500m limit, no contention.
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 1536Mi
|
|
||||||
requests:
|
|
||||||
cpu: 250m
|
|
||||||
memory: 1Gi
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-node-chrome
|
|
||||||
app.kubernetes.io/name: selenium-node-chrome
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
name: selenium-node-chrome
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: selenium-node-chrome
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-node-chrome
|
|
||||||
app.kubernetes.io/name: selenium-node-chrome
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- env:
|
|
||||||
- name: SE_EVENT_BUS_HOST
|
|
||||||
value: selenium-hub
|
|
||||||
- name: SE_EVENT_BUS_PUBLISH_PORT
|
|
||||||
value: '4442'
|
|
||||||
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
|
|
||||||
value: '4443'
|
|
||||||
- name: SE_NODE_MAX_SESSIONS
|
|
||||||
value: '2'
|
|
||||||
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
|
|
||||||
value: 'false'
|
|
||||||
- name: SE_VNC_NO_PASSWORD
|
|
||||||
value: '1'
|
|
||||||
- name: SE_SCREEN_WIDTH
|
|
||||||
value: '1920'
|
|
||||||
- name: SE_SCREEN_HEIGHT
|
|
||||||
value: '1080'
|
|
||||||
- name: SE_NODE_SESSION_TIMEOUT
|
|
||||||
value: '300'
|
|
||||||
image: selenium/node-chrome:4.27.0
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /status
|
|
||||||
port: 5555
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 15
|
|
||||||
name: selenium-chrome
|
|
||||||
ports:
|
|
||||||
- containerPort: 5555
|
|
||||||
name: node
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /status
|
|
||||||
port: 5555
|
|
||||||
initialDelaySeconds: 15
|
|
||||||
periodSeconds: 5
|
|
||||||
# Chromium-based browser node. Bumped from 1Gi -> 2Gi (req 512Mi
|
|
||||||
# -> 1Gi) on 2026-05-25 — Edge had 51 OOMKills in 5d on the
|
|
||||||
# original 1Gi cap (~1 OOM every 2.4h), and Chrome at maxSessions=2
|
|
||||||
# was running 684Mi idle on the same cap. Matches the Firefox node's
|
|
||||||
# tested-stable 2Gi limit. CPU unchanged.
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: '1'
|
|
||||||
memory: 2Gi
|
|
||||||
requests:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 1Gi
|
|
||||||
volumeMounts:
|
|
||||||
- mountPath: /dev/shm
|
|
||||||
name: dshm
|
|
||||||
- env:
|
|
||||||
- name: DISPLAY_CONTAINER_NAME
|
|
||||||
value: localhost
|
|
||||||
- name: SE_SCREEN_WIDTH
|
|
||||||
value: '1920'
|
|
||||||
- name: SE_SCREEN_HEIGHT
|
|
||||||
value: '1080'
|
|
||||||
- name: SE_VIDEO_FILE_NAME
|
|
||||||
value: auto
|
|
||||||
- name: SE_VIDEO_UPLOAD_ENABLED
|
|
||||||
value: 'false'
|
|
||||||
image: selenium/video:ffmpeg-7.1-20250101
|
|
||||||
name: video
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 768Mi
|
|
||||||
requests:
|
|
||||||
cpu: 250m
|
|
||||||
memory: 384Mi
|
|
||||||
volumeMounts:
|
|
||||||
- mountPath: /videos
|
|
||||||
name: selenium-videos
|
|
||||||
volumes:
|
|
||||||
- emptyDir:
|
|
||||||
medium: Memory
|
|
||||||
sizeLimit: 2Gi
|
|
||||||
name: dshm
|
|
||||||
- emptyDir:
|
|
||||||
sizeLimit: 5Gi
|
|
||||||
name: selenium-videos
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-node-firefox
|
|
||||||
app.kubernetes.io/name: selenium-node-firefox
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
name: selenium-node-firefox
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: selenium-node-firefox
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-node-firefox
|
|
||||||
app.kubernetes.io/name: selenium-node-firefox
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- env:
|
|
||||||
- name: SE_EVENT_BUS_HOST
|
|
||||||
value: selenium-hub
|
|
||||||
- name: SE_EVENT_BUS_PUBLISH_PORT
|
|
||||||
value: '4442'
|
|
||||||
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
|
|
||||||
value: '4443'
|
|
||||||
- name: SE_NODE_MAX_SESSIONS
|
|
||||||
value: '1'
|
|
||||||
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
|
|
||||||
value: 'true'
|
|
||||||
- name: SE_VNC_NO_PASSWORD
|
|
||||||
value: '1'
|
|
||||||
- name: SE_START_VNC
|
|
||||||
value: 'false'
|
|
||||||
- name: SE_SCREEN_WIDTH
|
|
||||||
value: '1920'
|
|
||||||
- name: SE_SCREEN_HEIGHT
|
|
||||||
value: '1080'
|
|
||||||
- name: SE_NODE_SESSION_TIMEOUT
|
|
||||||
value: '300'
|
|
||||||
image: selenium/node-firefox:4.27.0
|
|
||||||
livenessProbe:
|
|
||||||
failureThreshold: 5
|
|
||||||
httpGet:
|
|
||||||
path: /status
|
|
||||||
port: 5555
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 15
|
|
||||||
timeoutSeconds: 5
|
|
||||||
name: selenium-firefox
|
|
||||||
ports:
|
|
||||||
- containerPort: 5555
|
|
||||||
name: node
|
|
||||||
readinessProbe:
|
|
||||||
failureThreshold: 5
|
|
||||||
httpGet:
|
|
||||||
path: /status
|
|
||||||
port: 5555
|
|
||||||
initialDelaySeconds: 15
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 5
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: '1'
|
|
||||||
memory: 2Gi
|
|
||||||
requests:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 1Gi
|
|
||||||
volumeMounts:
|
|
||||||
- mountPath: /dev/shm
|
|
||||||
name: dshm
|
|
||||||
volumes:
|
|
||||||
- emptyDir:
|
|
||||||
medium: Memory
|
|
||||||
sizeLimit: 2Gi
|
|
||||||
name: dshm
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-node-edge
|
|
||||||
app.kubernetes.io/name: selenium-node-edge
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
name: selenium-node-edge
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: selenium-node-edge
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: selenium-node-edge
|
|
||||||
app.kubernetes.io/name: selenium-node-edge
|
|
||||||
app.kubernetes.io/part-of: selenium-grid
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- env:
|
|
||||||
- name: SE_EVENT_BUS_HOST
|
|
||||||
value: selenium-hub
|
|
||||||
- name: SE_EVENT_BUS_PUBLISH_PORT
|
|
||||||
value: '4442'
|
|
||||||
- name: SE_EVENT_BUS_SUBSCRIBE_PORT
|
|
||||||
value: '4443'
|
|
||||||
- name: SE_NODE_MAX_SESSIONS
|
|
||||||
value: '1'
|
|
||||||
- name: SE_NODE_OVERRIDE_MAX_SESSIONS
|
|
||||||
value: 'true'
|
|
||||||
- name: SE_VNC_NO_PASSWORD
|
|
||||||
value: '1'
|
|
||||||
- name: SE_SCREEN_WIDTH
|
|
||||||
value: '1920'
|
|
||||||
- name: SE_SCREEN_HEIGHT
|
|
||||||
value: '1080'
|
|
||||||
- name: SE_NODE_SESSION_TIMEOUT
|
|
||||||
value: '300'
|
|
||||||
image: selenium/node-edge:4.27.0
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /status
|
|
||||||
port: 5555
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 15
|
|
||||||
name: selenium-edge
|
|
||||||
ports:
|
|
||||||
- containerPort: 5555
|
|
||||||
name: node
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /status
|
|
||||||
port: 5555
|
|
||||||
initialDelaySeconds: 15
|
|
||||||
periodSeconds: 5
|
|
||||||
# Chromium-based browser node. Bumped from 1Gi -> 2Gi (req 512Mi
|
|
||||||
# -> 1Gi) on 2026-05-25 — Edge had 51 OOMKills in 5d on the
|
|
||||||
# original 1Gi cap (~1 OOM every 2.4h), and Chrome at maxSessions=2
|
|
||||||
# was running 684Mi idle on the same cap. Matches the Firefox node's
|
|
||||||
# tested-stable 2Gi limit. CPU unchanged.
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: '1'
|
|
||||||
memory: 2Gi
|
|
||||||
requests:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 1Gi
|
|
||||||
volumeMounts:
|
|
||||||
- mountPath: /dev/shm
|
|
||||||
name: dshm
|
|
||||||
volumes:
|
|
||||||
- emptyDir:
|
|
||||||
medium: Memory
|
|
||||||
sizeLimit: 2Gi
|
|
||||||
name: dshm
|
|
||||||
---
|
|
||||||
apiVersion: traefik.io/v1alpha1
|
|
||||||
kind: IngressRoute
|
|
||||||
metadata:
|
|
||||||
name: selenium-hub
|
|
||||||
namespace: selenium
|
|
||||||
spec:
|
|
||||||
entryPoints:
|
|
||||||
- websecure
|
|
||||||
routes:
|
|
||||||
- kind: Rule
|
|
||||||
match: Host(`selenium.iamworkin.lan`)
|
|
||||||
services:
|
|
||||||
- name: selenium-hub
|
|
||||||
port: 4444
|
|
||||||
tls:
|
|
||||||
secretName: selenium-tls
|
|
||||||
@@ -28,12 +28,9 @@ Source: `D:\git\FlowerCore\FlowerCore.WorldBuilder` (master)
|
|||||||
Memory: `feedback_rke2_image_import_per_node_scp`.
|
Memory: `feedback_rke2_image_import_per_node_scp`.
|
||||||
3. **Bump image tag** in `worldbuilder.yaml` and git push.
|
3. **Bump image tag** in `worldbuilder.yaml` and git push.
|
||||||
ArgoCD ApplicationSet picks up within ~3 minutes.
|
ArgoCD ApplicationSet picks up within ~3 minutes.
|
||||||
4. **First production render** — open
|
4. **First production render** — open `https://worldbuilder.iamworkin.lan`,
|
||||||
`https://worldbuilder.iamworkin.lan/studio/c32e0000-0000-4000-8000-000000000004`
|
create World → Character → Storyboard → ExportJob, confirm artifact
|
||||||
and confirm the Cyberpunk Blue Jay demo prompt loads with five seeded fake
|
downloads. ComfyUI lives on BLUEJAY-WS at `http://10.0.56.20:8188`.
|
||||||
generated images. This Sprint 32 visitor-safe profile uses
|
|
||||||
`ClientMode=fake`; switch the image-generation env vars back to ComfyUI only
|
|
||||||
for an operator-owned GPU render lane.
|
|
||||||
|
|
||||||
## Health probes
|
## Health probes
|
||||||
|
|
||||||
@@ -56,13 +53,8 @@ Source: `D:\git\FlowerCore\FlowerCore.WorldBuilder` (master)
|
|||||||
|
|
||||||
## Image generation backend
|
## Image generation backend
|
||||||
|
|
||||||
Sprint 32 pins the Kubernetes profile to
|
`FlowerCore:WorldBuilder:ImageGeneration:BaseUrl=http://10.0.56.20:8188` —
|
||||||
`FlowerCore:WorldBuilder:ImageGeneration:ClientMode=fake` with
|
ComfyUI runs on BLUEJAY-WS Windows (R9700 / gfx1201 / ROCm 7.2.1). Pod reaches
|
||||||
`BaseUrl=http://127.0.0.1:1`. That keeps the public/internal visitor demo
|
the workstation directly across the 10.0.56.0/24 VLAN (no Podman-style host-
|
||||||
deterministic, avoids GPU exposure, and still exercises the studio/gallery
|
filter issues — K8s pods route via Calico, which is L3-routed across the
|
||||||
surface with persisted generated-image metadata.
|
VLAN).
|
||||||
|
|
||||||
The previous ComfyUI backend target was `http://10.0.56.20:8188` on
|
|
||||||
BLUEJAY-WS (R9700 / gfx1201 / ROCm 7.2.1). Re-enable it only in an
|
|
||||||
operator-owned follow-up that also verifies workstation reachability and image
|
|
||||||
import freshness.
|
|
||||||
|
|||||||
@@ -16,11 +16,7 @@ kind: Namespace
|
|||||||
metadata:
|
metadata:
|
||||||
name: fc-worldbuilder
|
name: fc-worldbuilder
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/name: fc-worldbuilder
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
app.kubernetes.io/part-of: flowercore
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
---
|
---
|
||||||
# SQLite DB + generated image gallery + PDF/PNG exports.
|
# SQLite DB + generated image gallery + PDF/PNG exports.
|
||||||
# Longhorn RWO — single replica with `Recreate` rollout strategy keeps it safe.
|
# Longhorn RWO — single replica with `Recreate` rollout strategy keeps it safe.
|
||||||
@@ -29,13 +25,6 @@ kind: PersistentVolumeClaim
|
|||||||
metadata:
|
metadata:
|
||||||
name: worldbuilder-data
|
name: worldbuilder-data
|
||||||
namespace: fc-worldbuilder
|
namespace: fc-worldbuilder
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: worldbuilder-data
|
|
||||||
app.kubernetes.io/component: storage
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
spec:
|
spec:
|
||||||
accessModes:
|
accessModes:
|
||||||
- ReadWriteOnce
|
- ReadWriteOnce
|
||||||
@@ -51,13 +40,7 @@ metadata:
|
|||||||
namespace: fc-worldbuilder
|
namespace: fc-worldbuilder
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/name: worldbuilder-web
|
app.kubernetes.io/name: worldbuilder-web
|
||||||
app.kubernetes.io/component: web
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
app.kubernetes.io/part-of: flowercore
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
annotations:
|
|
||||||
flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
|
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
revisionHistoryLimit: 3
|
revisionHistoryLimit: 3
|
||||||
@@ -71,16 +54,11 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/name: worldbuilder-web
|
app.kubernetes.io/name: worldbuilder-web
|
||||||
app.kubernetes.io/component: web
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
app.kubernetes.io/part-of: flowercore
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
annotations:
|
annotations:
|
||||||
prometheus.io/scrape: "true"
|
prometheus.io/scrape: "true"
|
||||||
prometheus.io/port: "8080"
|
prometheus.io/port: "8080"
|
||||||
prometheus.io/path: "/metrics/prometheus"
|
prometheus.io/path: "/metrics/prometheus"
|
||||||
flowercore.io/audit-trace-id: "worldbuilder-runtime-demo"
|
|
||||||
spec:
|
spec:
|
||||||
securityContext:
|
securityContext:
|
||||||
fsGroup: 1654
|
fsGroup: 1654
|
||||||
@@ -114,14 +92,11 @@ spec:
|
|||||||
value: "/data/gallery"
|
value: "/data/gallery"
|
||||||
- name: FlowerCore__WorldBuilder__Export__RootPath
|
- name: FlowerCore__WorldBuilder__Export__RootPath
|
||||||
value: "/data/exports"
|
value: "/data/exports"
|
||||||
# Visitor-safe Sprint 32 profile: fake backend keeps public demo
|
# ComfyUI on BLUEJAY-WS (R9700 / gfx1201 / ROCm 7.2.1).
|
||||||
# rendering deterministic and avoids exposing BLUEJAY-WS GPU.
|
|
||||||
- name: FlowerCore__WorldBuilder__ImageGeneration__BaseUrl
|
- name: FlowerCore__WorldBuilder__ImageGeneration__BaseUrl
|
||||||
value: "http://127.0.0.1:1"
|
value: "http://10.0.56.20:8188"
|
||||||
- name: FlowerCore__WorldBuilder__ImageGeneration__ClientMode
|
- name: FlowerCore__WorldBuilder__ImageGeneration__ClientMode
|
||||||
value: "fake"
|
value: "comfyui"
|
||||||
- name: FlowerCore__WorldBuilder__ImageGeneration__BackendId
|
|
||||||
value: "fake"
|
|
||||||
resources:
|
resources:
|
||||||
# Cluster CPU-request budget runs hot (99% on all 3 nodes at deploy
|
# Cluster CPU-request budget runs hot (99% on all 3 nodes at deploy
|
||||||
# time) while actual CPU usage is well below capacity. Idle Blazor
|
# time) while actual CPU usage is well below capacity. Idle Blazor
|
||||||
@@ -190,11 +165,7 @@ metadata:
|
|||||||
namespace: fc-worldbuilder
|
namespace: fc-worldbuilder
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/name: worldbuilder-web
|
app.kubernetes.io/name: worldbuilder-web
|
||||||
app.kubernetes.io/component: web
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
app.kubernetes.io/part-of: flowercore
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
spec:
|
spec:
|
||||||
type: ClusterIP
|
type: ClusterIP
|
||||||
selector:
|
selector:
|
||||||
@@ -209,13 +180,6 @@ kind: Certificate
|
|||||||
metadata:
|
metadata:
|
||||||
name: worldbuilder-web-tls
|
name: worldbuilder-web-tls
|
||||||
namespace: fc-worldbuilder
|
namespace: fc-worldbuilder
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: worldbuilder-web-tls
|
|
||||||
app.kubernetes.io/component: ingress
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
spec:
|
spec:
|
||||||
secretName: worldbuilder-web-tls
|
secretName: worldbuilder-web-tls
|
||||||
issuerRef:
|
issuerRef:
|
||||||
@@ -236,13 +200,6 @@ kind: IngressRoute
|
|||||||
metadata:
|
metadata:
|
||||||
name: worldbuilder-web
|
name: worldbuilder-web
|
||||||
namespace: fc-worldbuilder
|
namespace: fc-worldbuilder
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: worldbuilder-web
|
|
||||||
app.kubernetes.io/component: ingress
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
spec:
|
spec:
|
||||||
entryPoints:
|
entryPoints:
|
||||||
- websecure
|
- websecure
|
||||||
|
|||||||
@@ -1,84 +0,0 @@
|
|||||||
# openvoxserver Quadlet Durability
|
|
||||||
|
|
||||||
This runbook documents the noc1 `openvoxserver` durability fix for the Puppet control-repo deploy path. The service is a noc1 host artifact, not an ArgoCD application, so discovery always starts on noc1 rather than in `apps/*`.
|
|
||||||
|
|
||||||
## Current State
|
|
||||||
|
|
||||||
As of the Sprint 32 Cx-12 apply on 2026-05-17:
|
|
||||||
|
|
||||||
- `/etc/containers/systemd/openvoxserver.container` has a `GIT_SSH_COMMAND` environment entry that points at the persisted serverdata deploy key.
|
|
||||||
- `/etc/systemd/system/openvoxserver-safeconfig.service` is enabled and active, and reapplies `git config --global --add safe.directory *` inside the running container.
|
|
||||||
- `/opt/puppet/r10k-deploy.sh` self-heals before each fetch by setting `safe.directory`, the repo-local `core.sshCommand`, and the persisted `known_hosts` file when needed.
|
|
||||||
- `puppet-deploy.service` exits `0/SUCCESS` after the apply and the control repo reports `HEAD == origin/master`.
|
|
||||||
- `systemctl cat openvoxserver` does not currently resolve to a generated unit on noc1. The container is running through Podman with `restart=always`, so destructive recreate smoke must not run until the generated unit is present.
|
|
||||||
|
|
||||||
## Discovery
|
|
||||||
|
|
||||||
Run every command through noc1 as `fcadmin`; do not assume BLUEJAY-WS can reach container-local surfaces directly.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "hostname && sudo -n true"
|
|
||||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo find /etc/containers/systemd /usr/share/containers/systemd /etc/systemd/system -name 'openvoxserver*' 2>/dev/null"
|
|
||||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo sed -n '1,220p' /etc/containers/systemd/openvoxserver.container"
|
|
||||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo systemctl cat puppet-deploy.service"
|
|
||||||
```
|
|
||||||
|
|
||||||
If a future noc1 profile manages these files, update the Puppet control repo and let `puppet-deploy.service` apply the change. On 2026-05-17, host `puppet` was not installed, so Cx-12 used a direct noc1 host edit.
|
|
||||||
|
|
||||||
## Durable Fix Shape
|
|
||||||
|
|
||||||
The Quadlet keeps the deploy key as a path reference only:
|
|
||||||
|
|
||||||
```ini
|
|
||||||
Environment=GIT_SSH_COMMAND=ssh -i /opt/puppetlabs/server/data/puppetserver/.puppet-deploy-key -o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o UserKnownHostsFile=/opt/puppetlabs/server/data/puppetserver/.known_hosts
|
|
||||||
```
|
|
||||||
|
|
||||||
The safeconfig service is intentionally independent of `openvoxserver.service` until the generated unit exists. It waits for the `openvoxserver` container name and then runs:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
/usr/bin/podman exec openvoxserver git config --global --add safe.directory *
|
|
||||||
```
|
|
||||||
|
|
||||||
The deploy script self-heals inside the container before it fetches the control repo:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git config --global --add safe.directory "*" 2>/dev/null || true
|
|
||||||
DEPLOY_KEY="/opt/puppetlabs/server/data/puppetserver/.puppet-deploy-key"
|
|
||||||
KNOWN_HOSTS="/opt/puppetlabs/server/data/puppetserver/.known_hosts"
|
|
||||||
REPO="/etc/puppetlabs/code/environments/production"
|
|
||||||
export GIT_SSH_COMMAND="ssh -i $DEPLOY_KEY -o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o UserKnownHostsFile=$KNOWN_HOSTS"
|
|
||||||
git -C "$REPO" config core.sshCommand "$GIT_SSH_COMMAND" 2>/dev/null || true
|
|
||||||
```
|
|
||||||
|
|
||||||
## Validation
|
|
||||||
|
|
||||||
Non-destructive validation:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo grep -n 'GIT_SSH_COMMAND' /etc/containers/systemd/openvoxserver.container"
|
|
||||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo systemctl status openvoxserver-safeconfig.service --no-pager -l"
|
|
||||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo systemctl start puppet-deploy.service && sudo systemctl status puppet-deploy.service --no-pager -l"
|
|
||||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo podman exec openvoxserver git -C /etc/puppetlabs/code/environments/production config --get core.sshCommand"
|
|
||||||
```
|
|
||||||
|
|
||||||
Destructive recreate smoke is opt-in only:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
scp scripts/monitoring/openvox-recreate-smoke.sh fcadmin@10.0.56.10:/tmp/openvox-recreate-smoke.sh
|
|
||||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "chmod +x /tmp/openvox-recreate-smoke.sh && sudo OPENVOX_RECREATE_SMOKE=1 /tmp/openvox-recreate-smoke.sh"
|
|
||||||
```
|
|
||||||
|
|
||||||
Do not run the smoke during normal sprint work. It stops and removes the production container before starting it again through systemd, and it now refuses to continue unless `systemctl cat openvoxserver` succeeds.
|
|
||||||
|
|
||||||
## Credential Rotation Note
|
|
||||||
|
|
||||||
When rotating the Puppet deploy key, update the persisted serverdata copy on noc1:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
sudo install -m 0600 -o root -g root <new-deploy-key> /opt/puppet/serverdata/.puppet-deploy-key
|
|
||||||
sudo podman exec openvoxserver sh -c "ssh-keyscan github.com > /opt/puppetlabs/server/data/puppetserver/.known_hosts"
|
|
||||||
sudo systemctl start openvoxserver-safeconfig.service
|
|
||||||
sudo systemctl start puppet-deploy.service
|
|
||||||
```
|
|
||||||
|
|
||||||
Never commit the deploy key or print it in logs.
|
|
||||||
@@ -1,48 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
if [ "${OPENVOX_RECREATE_SMOKE:-}" != "1" ]; then
|
|
||||||
echo "SKIP: set OPENVOX_RECREATE_SMOKE=1 to run the destructive openvoxserver recreate smoke." >&2
|
|
||||||
exit 64
|
|
||||||
fi
|
|
||||||
|
|
||||||
SUDO="${SUDO:-sudo}"
|
|
||||||
REPO="/etc/puppetlabs/code/environments/production"
|
|
||||||
CORE_SSH_COMMAND_FRAGMENT=".puppet-deploy-key"
|
|
||||||
|
|
||||||
if ! $SUDO systemctl cat openvoxserver >/dev/null 2>&1; then
|
|
||||||
echo "SKIP: systemctl cat openvoxserver failed; refusing to remove a container without a verified systemd recreate path." >&2
|
|
||||||
exit 65
|
|
||||||
fi
|
|
||||||
|
|
||||||
before="$($SUDO podman exec openvoxserver git -C "$REPO" rev-parse --short HEAD)"
|
|
||||||
echo "Before recreate: $before"
|
|
||||||
|
|
||||||
$SUDO systemctl stop openvoxserver
|
|
||||||
$SUDO podman rm openvoxserver 2>/dev/null || true
|
|
||||||
$SUDO systemctl start openvoxserver
|
|
||||||
|
|
||||||
sleep 50
|
|
||||||
|
|
||||||
$SUDO systemctl start puppet-deploy.service
|
|
||||||
sleep 5
|
|
||||||
|
|
||||||
$SUDO systemctl status puppet-deploy.service --no-pager -l
|
|
||||||
|
|
||||||
after="$($SUDO podman exec openvoxserver git -C "$REPO" rev-parse --short origin/master)"
|
|
||||||
echo "After recreate origin/master: $after"
|
|
||||||
|
|
||||||
$SUDO test -d /opt/puppet/code/environments/production/site-modules/profile/manifests
|
|
||||||
|
|
||||||
core_ssh="$($SUDO podman exec openvoxserver git -C "$REPO" config --get core.sshCommand)"
|
|
||||||
case "$core_ssh" in
|
|
||||||
*"$CORE_SSH_COMMAND_FRAGMENT"*) ;;
|
|
||||||
*)
|
|
||||||
echo "FAIL: core.sshCommand does not reference the persisted deploy key." >&2
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
$SUDO podman exec openvoxserver git -C "$REPO" status --short --branch
|
|
||||||
|
|
||||||
echo "PASS: openvoxserver recreate smoke completed without git safety or deploy-key failure."
|
|
||||||
@@ -13,7 +13,6 @@ public sealed class FleetManifestLintTests
|
|||||||
|
|
||||||
private static readonly HashSet<string> PublicReadOnlyHosts = new(StringComparer.Ordinal)
|
private static readonly HashSet<string> PublicReadOnlyHosts = new(StringComparer.Ordinal)
|
||||||
{
|
{
|
||||||
"brochure.flowercore.io",
|
|
||||||
"dist.flowercore.io",
|
"dist.flowercore.io",
|
||||||
"dns.iamworkin.lan",
|
"dns.iamworkin.lan",
|
||||||
};
|
};
|
||||||
@@ -55,43 +54,6 @@ public sealed class FleetManifestLintTests
|
|||||||
"ttsreader-piper",
|
"ttsreader-piper",
|
||||||
};
|
};
|
||||||
|
|
||||||
private static readonly IReadOnlyDictionary<string, string> LinuxRunnerRepos = new Dictionary<string, string>(StringComparer.Ordinal)
|
|
||||||
{
|
|
||||||
["github-runner"] = "https://github.com/astoltz/FlowerCore.Common",
|
|
||||||
["github-runner-sharedpos"] = "https://github.com/astoltz/FlowerCore.Shared.Pos",
|
|
||||||
["github-runner-puppet"] = "https://github.com/astoltz/FlowerCore.Puppet",
|
|
||||||
["github-runner-signage"] = "https://github.com/astoltz/FlowerCore.Signage",
|
|
||||||
["github-runner-dms"] = "https://github.com/astoltz/FlowerCore.DMS",
|
|
||||||
["github-runner-telephony"] = "https://github.com/astoltz/FlowerCore.Telephony",
|
|
||||||
["github-runner-print-web"] = "https://github.com/astoltz/FlowerCore.Print.Web",
|
|
||||||
["github-runner-chat"] = "https://github.com/astoltz/FlowerCore.Chat",
|
|
||||||
["github-runner-mysql"] = "https://github.com/astoltz/FlowerCore.MySQL",
|
|
||||||
["github-runner-kiosk-linux"] = "https://github.com/astoltz/FlowerCore.Kiosk.Linux",
|
|
||||||
};
|
|
||||||
|
|
||||||
private static readonly HashSet<string> ScaledLinuxRunnerDeployments = new(StringComparer.Ordinal)
|
|
||||||
{
|
|
||||||
"github-runner-sharedpos",
|
|
||||||
"github-runner-puppet",
|
|
||||||
"github-runner-signage",
|
|
||||||
"github-runner-dms",
|
|
||||||
"github-runner-telephony",
|
|
||||||
"github-runner-print-web",
|
|
||||||
"github-runner-chat",
|
|
||||||
"github-runner-mysql",
|
|
||||||
"github-runner-kiosk-linux",
|
|
||||||
};
|
|
||||||
|
|
||||||
private static readonly IReadOnlyDictionary<string, string> WritableRunnerEnv = new Dictionary<string, string>(StringComparer.Ordinal)
|
|
||||||
{
|
|
||||||
["HOME"] = "/home/runner",
|
|
||||||
["DOTNET_INSTALL_DIR"] = "/home/runner/.dotnet",
|
|
||||||
["DOTNET_CLI_HOME"] = "/home/runner",
|
|
||||||
["NUGET_PACKAGES"] = "/home/runner/.nuget/packages",
|
|
||||||
["XDG_CACHE_HOME"] = "/home/runner/.cache",
|
|
||||||
["RUNNER_TOOL_CACHE"] = "/home/runner/_tool",
|
|
||||||
};
|
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void IngressRoutes_MustKeepServiceReferencesInTheSameNamespace()
|
public void IngressRoutes_MustKeepServiceReferencesInTheSameNamespace()
|
||||||
{
|
{
|
||||||
@@ -225,98 +187,6 @@ public sealed class FleetManifestLintTests
|
|||||||
violations.Should().BeEmpty();
|
violations.Should().BeEmpty();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
/// Each runner named in <see cref="LinuxRunnerRepos"/> must exist as a Deployment with exactly one
/// container whose env targets the expected repository and whose ACCESS_TOKEN is a Secret
/// reference rather than a literal value.
/// </summary>
[Fact]
public void GitHubRunnerFleet_MustRegisterRequiredReposAsRepoScopedDeployments()
{
    var runnerDeployments = GitHubRunnerDeployments();

    foreach (var (runnerName, repoUrl) in LinuxRunnerRepos)
    {
        runnerDeployments.Should().ContainKey(runnerName);

        var runnerContainer = runnerDeployments[runnerName]
            .ContainerMappings()
            .Should()
            .ContainSingle()
            .Subject;

        // Literal env values, checked in the same order as they are listed here.
        var expectedLiterals = new (string Name, string Value)[]
        {
            ("REPO_URL", repoUrl),
            ("EPHEMERAL", "true"),
            ("LABELS", "self-hosted,linux,fc-build-linux"),
            ("RUN_AS_ROOT", "false"),
        };

        foreach (var (envName, envValue) in expectedLiterals)
        {
            EnvValue(runnerContainer, envName).Should().Be(envValue);
        }

        // The token must be wired through valueFrom.secretKeyRef, never through "value".
        EnvValue(runnerContainer, "ACCESS_TOKEN")
            .Should()
            .BeNull("ACCESS_TOKEN must come from github-runner-token Secret, not a literal");
        EnvSecretName(runnerContainer, "ACCESS_TOKEN").Should().Be("github-runner-token");
        EnvSecretKey(runnerContainer, "ACCESS_TOKEN").Should().Be("credential");
    }
}
|
|
||||||
|
|
||||||
/// <summary>
/// Every Deployment returned by <see cref="GitHubRunnerDeployments"/> must set each variable in
/// <see cref="WritableRunnerEnv"/> and mount the runner-home, nuget-cache and tmp volumes at the
/// expected paths.
/// </summary>
[Fact]
public void GitHubRunnerFleet_MustSetWritableNonRootDotnetAndCachePaths()
{
    foreach (var runner in GitHubRunnerDeployments().Values)
    {
        var runnerContainer = runner.ContainerMappings().Should().ContainSingle().Subject;

        foreach (var (envName, envPath) in WritableRunnerEnv)
        {
            EnvValue(runnerContainer, envName)
                .Should()
                .Be(envPath, $"{runner.Name} must keep .NET paths writable for uid 1001");
        }

        // Volume name -> mountPath; a missing name or path is recorded as an empty string.
        var mountPathsByVolume = new Dictionary<string, string>(StringComparer.Ordinal);
        foreach (var volumeMount in ManifestNodeExtensions.MappingSequence(runnerContainer, "volumeMounts"))
        {
            mountPathsByVolume.Add(
                ManifestNodeExtensions.Scalar(volumeMount, "name") ?? string.Empty,
                ManifestNodeExtensions.Scalar(volumeMount, "mountPath") ?? string.Empty);
        }

        mountPathsByVolume.Should().Contain("runner-home", "/home/runner");
        mountPathsByVolume.Should().Contain("nuget-cache", "/home/runner/.nuget/packages");
        mountPathsByVolume.Should().Contain("tmp", "/tmp");
    }
}
|
|
||||||
|
|
||||||
/// <summary>
/// Deployments listed in <see cref="ScaledLinuxRunnerDeployments"/> must run two replicas, must not
/// reference any PersistentVolumeClaim, and must back the "nuget-cache" volume with an emptyDir.
/// The "github-runner" Deployment must run one replica and reference exactly one claim,
/// "github-runner-nuget-cache".
/// </summary>
[Fact]
public void GitHubRunnerFleet_MustAvoidRwoMultiAttachForScaledDeployments()
{
    var deployments = GitHubRunnerDeployments();

    foreach (var deploymentName in ScaledLinuxRunnerDeployments)
    {
        // Assert presence first so a missing manifest fails as an assertion,
        // not as a KeyNotFoundException from the indexer.
        deployments.Should().ContainKey(deploymentName);

        var deployment = deployments[deploymentName];
        ReplicaCount(deployment).Should().Be(2);

        PersistentVolumeClaimNames(deployment)
            .Should()
            .BeEmpty($"{deploymentName} is scaled and must not share a RWO PVC");

        deployment.MappingSequence("spec", "template", "spec", "volumes")
            .Should()
            .Contain(volume =>
                string.Equals(ManifestNodeExtensions.Scalar(volume, "name"), "nuget-cache", StringComparison.Ordinal)
                && ManifestNodeExtensions.Mapping(volume, "emptyDir") != null);
    }

    deployments.Should().ContainKey("github-runner");

    var common = deployments["github-runner"];
    ReplicaCount(common).Should().Be(1);
    PersistentVolumeClaimNames(common)
        .Should()
        .ContainSingle()
        .Which
        .Should()
        .Be("github-runner-nuget-cache");

    // Non-blank persistentVolumeClaim.claimName values declared in the pod template's volumes.
    static List<string?> PersistentVolumeClaimNames(ManifestDocument document) =>
        document.MappingSequence("spec", "template", "spec", "volumes")
            .Select(volume => ManifestNodeExtensions.Scalar(volume, "persistentVolumeClaim", "claimName"))
            .Where(claimName => !string.IsNullOrWhiteSpace(claimName))
            .ToList();
}
|
|
||||||
|
|
||||||
/// <summary>
/// Reads apps/monitoring/noc-monitoring.yaml as plain text and requires every runner-alert
/// fragment listed below to appear in it verbatim.
/// </summary>
[Fact]
public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable()
{
    var monitoringPath = Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml");
    var monitoringYaml = File.ReadAllText(monitoringPath);

    var requiredFragments = new[]
    {
        "MacMiniRunnerOffline",
        "LinuxRunnerOffline",
        "kube_deployment_status_replicas_ready",
        "github-runner(|-(sharedpos|puppet|signage|dms|telephony|print-web|chat|mysql|kiosk-linux))",
        "folder: CI Alerts",
        "uid: linux-runner-offline",
        "alert_channel: irc",
    };

    foreach (var fragment in requiredFragments)
    {
        monitoringYaml.Should().Contain(fragment);
    }
}
|
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults()
|
public void StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults()
|
||||||
{
|
{
|
||||||
@@ -421,184 +291,6 @@ public sealed class FleetManifestLintTests
|
|||||||
violations.Should().BeEmpty();
|
violations.Should().BeEmpty();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
/// apps/fc-devicemgmt must contain exactly the expected *.yaml files, and each of them must be
/// present in the inventory under the relative path "fc-devicemgmt/&lt;file&gt;".
/// </summary>
[Fact]
public void FcDeviceManagement_MustShipExpectedManifestSet()
{
    var appRoot = Path.Combine(Inventory.BluejayRoot, "apps", "fc-devicemgmt");
    Directory.Exists(appRoot).Should().BeTrue("Sprint 8 Cx-5 owns apps/fc-devicemgmt.");

    var expectedFiles = new[]
    {
        "1password-item.yaml",
        "argocd-application.yaml",
        "certificate-web.yaml",
        "clusterrole-operator.yaml",
        "clusterrolebinding-operator.yaml",
        "deployment-operator.yaml",
        "deployment-web.yaml",
        "ingressroute-web.yaml",
        "namespace.yaml",
        "network-policy.yaml",
        "service-web.yaml",
        "serviceaccount-operator.yaml",
    };

    Directory.GetFiles(appRoot, "*.yaml")
        .Select(Path.GetFileName)
        .Should()
        .BeEquivalentTo(expectedFiles);

    // Collect the inventory paths once instead of re-filtering the inventory per expected file.
    var inventoryPaths = FcDeviceManagementDocuments()
        .Select(document => document.RelativePath)
        .ToHashSet(StringComparer.Ordinal);

    foreach (var expectedFile in expectedFiles)
    {
        inventoryPaths.Should().Contain($"fc-devicemgmt/{expectedFile}");
    }
}
|
|
||||||
|
|
||||||
/// <summary>
/// Every fc-devicemgmt document must carry the required labels under metadata.labels.
/// Deployments must repeat them on the pod template and also set the
/// flowercore.io/audit-trace-id pod-template annotation.
/// </summary>
[Fact]
public void FcDeviceManagement_ObjectsMustCarryStandardTraceabilityLabels()
{
    var requiredLabels = new[]
    {
        "app.kubernetes.io/name",
        "app.kubernetes.io/part-of",
        "app.kubernetes.io/managed-by",
        "flowercore.io/tenant-id",
        "flowercore.io/created-by",
    };

    var documents = FcDeviceManagementDocuments();
    var deployments = documents.Where(document => document.Kind == "Deployment").ToList();

    // Labels missing on the object itself.
    var objectLabelGaps =
        from document in documents
        from label in requiredLabels
        where string.IsNullOrWhiteSpace(document.Scalar("metadata", "labels", label))
        select $"{document.Descriptor} is missing metadata.labels['{label}'].";

    // Labels missing on a Deployment's pod template.
    var podTemplateLabelGaps =
        from document in deployments
        from label in requiredLabels
        where string.IsNullOrWhiteSpace(document.Scalar("spec", "template", "metadata", "labels", label))
        select $"{document.Descriptor} pod template is missing metadata.labels['{label}'].";

    // Deployments whose pod template lacks the audit trace annotation.
    var auditTraceGaps =
        from document in deployments
        where string.IsNullOrWhiteSpace(
            document.Scalar("spec", "template", "metadata", "annotations", "flowercore.io/audit-trace-id"))
        select $"{document.Descriptor} pod template is missing flowercore.io/audit-trace-id.";

    var violations = objectLabelGaps
        .Concat(podTemplateLabelGaps)
        .Concat(auditTraceGaps)
        .ToList();

    violations.Should().BeEmpty();
}
|
|
||||||
|
|
||||||
/// <summary>
/// The fc-devicemgmt manifests must not use a Traefik certResolver, must mention the public host
/// together with the "disabled-until-Q-OIDC-1" marker, must route the LAN host only, and must
/// request the TLS certificate for that LAN host from the step-ca-acme ClusterIssuer.
/// </summary>
[Fact]
public void FcDeviceManagement_IngressMustUseCertManagerAndKeepPublicHostDisabled()
{
    var appText = string.Join(
        Environment.NewLine,
        Directory.GetFiles(Path.Combine(Inventory.BluejayRoot, "apps", "fc-devicemgmt"), "*.yaml")
            .Select(File.ReadAllText));

    appText.Should().NotContain("certResolver");
    appText.Should().Contain("update.flowercore.io");
    appText.Should().Contain("disabled-until-Q-OIDC-1");

    FcDeviceManagementDocuments()
        .Where(document => document.Kind == "IngressRoute")
        .SelectMany(document => document.MappingSequence("spec", "routes"))
        .Select(route => ManifestNodeExtensions.Scalar(route, "match") ?? string.Empty)
        .Should()
        .Contain(match => match.Contains("Host(`devices.iamworkin.lan`)", StringComparison.Ordinal))
        .And.NotContain(match => match.Contains("Host(`update.flowercore.io`)", StringComparison.Ordinal));

    var certificate = FcDeviceManagementDocuments()
        .Single(document => document.Kind == "Certificate" && document.Name == "fc-devicemgmt-web-tls");

    certificate.Scalar("spec", "issuerRef", "name").Should().Be("step-ca-acme");
    certificate.Scalar("spec", "issuerRef", "kind").Should().Be("ClusterIssuer");

    // ContainSingle(string) treats its argument as the failure reason, not as an expected value,
    // so the single entry has to be compared explicitly.
    ManifestNodeExtensions.ScalarSequence(certificate.Root, "spec", "dnsNames")
        .Should()
        .ContainSingle("the certificate must cover only the LAN host")
        .Which
        .Should()
        .Be("devices.iamworkin.lan");
}
|
|
||||||
|
|
||||||
/// <summary>
/// The fc-devicemgmt-operator ClusterRole must mention the devices.flowercore.io group, a
/// wildcard, "deployments" and "get" somewhere among its scalar values. The operator Deployment
/// must mention FLOWERCORE_KUBERNETES_OWNER_DEPLOYMENT and its own name.
/// </summary>
[Fact]
public void FcDeviceManagement_OperatorRbacMustCoverDevicesAndOwnerLookup()
{
    var documents = FcDeviceManagementDocuments();

    var roleScalars = documents
        .Where(document => document.Kind == "ClusterRole" && document.Name == "fc-devicemgmt-operator")
        .Single()
        .AllScalars()
        .ToList();

    foreach (var requiredRoleScalar in new[] { "devices.flowercore.io", "*", "deployments", "get" })
    {
        roleScalars.Should().Contain(requiredRoleScalar);
    }

    var operatorScalars = documents
        .Where(document => document.Kind == "Deployment" && document.Name == "fc-devicemgmt-operator")
        .Single()
        .AllScalars()
        .ToList();

    operatorScalars.Should().Contain("FLOWERCORE_KUBERNETES_OWNER_DEPLOYMENT");
    operatorScalars.Should().Contain("fc-devicemgmt-operator");
}
|
|
||||||
|
|
||||||
/// <summary>
/// Runtime secrets must come from the fc-devicemgmt-runtime OnePasswordItem: no Secret documents
/// and no inline secret material in the YAML text, only secretKeyRef / secretName references.
/// </summary>
[Fact]
public void FcDeviceManagement_RuntimeSecretsMustUseOnePasswordItemPattern()
{
    var onePasswordItem = FcDeviceManagementDocuments()
        .Where(document => document.Kind == "OnePasswordItem" && document.Name == "fc-devicemgmt-runtime")
        .Single();

    onePasswordItem.Scalar("spec", "itemPath")
        .Should()
        .Be("vaults/IAmWorkin/items/FlowerCore DeviceManagement Runtime");

    var manifestDirectory = Path.Combine(Inventory.BluejayRoot, "apps", "fc-devicemgmt");
    var manifestTexts = Directory.GetFiles(manifestDirectory, "*.yaml").Select(File.ReadAllText);
    var appText = string.Join(Environment.NewLine, manifestTexts);

    FcDeviceManagementDocuments().Should().NotContain(document => document.Kind == "Secret");

    foreach (var requiredFragment in new[] { "secretKeyRef:", "secretName: fc-devicemgmt-runtime" })
    {
        appText.Should().Contain(requiredFragment);
    }

    foreach (var forbiddenFragment in new[] { "stringData:", "from-literal", "tls.key:" })
    {
        appText.Should().NotContain(forbiddenFragment);
    }
}
|
|
||||||
|
|
||||||
/// <summary>
/// Exactly two NetworkPolicy documents must exist. Together they must mention the expected CIDRs
/// and egress ports, and exactly one of them must mention 10.0.56.200 while allowing egress on
/// ports 80, 443, 8080 and 8443.
/// </summary>
[Fact]
public void FcDeviceManagement_NetworkPoliciesMustAllowLanAgentsSynologyAndDnatPorts()
{
    var networkPolicies = new List<ManifestDocument>();
    foreach (var document in FcDeviceManagementDocuments())
    {
        if (document.Kind == "NetworkPolicy")
        {
            networkPolicies.Add(document);
        }
    }

    networkPolicies.Should().HaveCount(2);

    var policyScalars = networkPolicies.SelectMany(policy => policy.AllScalars()).ToList();
    var requiredCidrs = new[]
    {
        "10.0.56.0/24",
        "10.0.57.0/24",
        "10.0.58.0/24",
        "10.0.68.0/27",
        "10.0.58.3/32",
    };

    foreach (var cidr in requiredCidrs)
    {
        policyScalars.Should().Contain(cidr);
    }

    var egressPorts = new HashSet<string>(
        networkPolicies.SelectMany(policy => policy.EgressPorts()),
        StringComparer.Ordinal);
    egressPorts.Should().Contain(new[] { "80", "443", "8080", "8443", "2049", "111" });

    // Policies that mention 10.0.56.200 anywhere among their scalar values.
    var vipPolicies = networkPolicies
        .Where(policy => policy.AllScalars().Any(value => value.Contains("10.0.56.200", StringComparison.Ordinal)))
        .ToList();

    vipPolicies.Should().ContainSingle();
    vipPolicies[0].EgressPorts().Should().Contain(new[] { "80", "443", "8080", "8443" });
}
|
|
||||||
|
|
||||||
/// <summary>
/// The infra-fc-devicemgmt Application must live in the argocd namespace, pull apps/fc-devicemgmt
/// from the in-cluster Gitea repository URL, and deploy into the fc-devicemgmt namespace.
/// </summary>
[Fact]
public void FcDeviceManagement_ArgocdApplicationMustMatchApplicationSetDiscoveryConventions()
{
    var argoApplication = FcDeviceManagementDocuments()
        .Where(document => document.Kind == "Application" && document.Name == "infra-fc-devicemgmt")
        .Single();

    argoApplication.Namespace.Should().Be("argocd");

    var repoUrl = argoApplication.Scalar("spec", "source", "repoURL");
    repoUrl.Should().Be("http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git");

    var sourcePath = argoApplication.Scalar("spec", "source", "path");
    sourcePath.Should().Be("apps/fc-devicemgmt");

    var destinationNamespace = argoApplication.Scalar("spec", "destination", "namespace");
    destinationNamespace.Should().Be("fc-devicemgmt");
}
|
|
||||||
|
|
||||||
private static IEnumerable<string> ProbeViolations(
|
private static IEnumerable<string> ProbeViolations(
|
||||||
ManifestDocument document,
|
ManifestDocument document,
|
||||||
YamlMappingNode container,
|
YamlMappingNode container,
|
||||||
@@ -622,51 +314,6 @@ public sealed class FleetManifestLintTests
|
|||||||
$"{document.Descriptor} container '{containerName}' still uses {probeKey}.httpGet on /health.",
|
$"{document.Descriptor} container '{containerName}' still uses {probeKey}.httpGet on /health.",
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
/// Collects every inventory document of kind Deployment in the github-runner namespace.
/// </summary>
/// <returns>The matching documents keyed by name, using ordinal key comparison.</returns>
private static IReadOnlyDictionary<string, ManifestDocument> GitHubRunnerDeployments()
{
    var deploymentsByName = new Dictionary<string, ManifestDocument>(StringComparer.Ordinal);

    foreach (var document in Inventory.Documents)
    {
        if (document.Kind == "Deployment" && document.Namespace == "github-runner")
        {
            // Add (not the indexer) so a duplicate Deployment name still throws.
            deploymentsByName.Add(document.Name, document);
        }
    }

    return deploymentsByName;
}
|
|
||||||
|
|
||||||
/// <summary>
/// Reads spec.replicas from a manifest document.
/// </summary>
/// <returns>The parsed replica count, or 1 when the value is absent or not an integer.</returns>
private static int ReplicaCount(ManifestDocument document)
{
    var rawReplicas = document.Scalar("spec", "replicas");

    if (int.TryParse(rawReplicas, out var parsedReplicas))
    {
        return parsedReplicas;
    }

    return 1;
}
|
|
||||||
|
|
||||||
/// <summary>
/// Returns the literal <c>value</c> of the named env entry on a container.
/// </summary>
/// <returns>The value scalar, or null when the container has no env entry with that name.</returns>
private static string? EnvValue(YamlMappingNode container, string name)
{
    var envEntry = EnvMapping(container, name);
    if (envEntry is null)
    {
        return null;
    }

    return ManifestNodeExtensions.Scalar(envEntry, "value");
}
|
|
||||||
|
|
||||||
/// <summary>
/// Returns <c>valueFrom.secretKeyRef.name</c> of the named env entry on a container.
/// </summary>
/// <returns>The Secret name scalar, or null when the container has no env entry with that name.</returns>
private static string? EnvSecretName(YamlMappingNode container, string name)
{
    var envEntry = EnvMapping(container, name);
    if (envEntry is null)
    {
        return null;
    }

    return ManifestNodeExtensions.Scalar(envEntry, "valueFrom", "secretKeyRef", "name");
}
|
|
||||||
|
|
||||||
/// <summary>
/// Returns <c>valueFrom.secretKeyRef.key</c> of the named env entry on a container.
/// </summary>
/// <returns>The Secret key scalar, or null when the container has no env entry with that name.</returns>
private static string? EnvSecretKey(YamlMappingNode container, string name)
{
    var envEntry = EnvMapping(container, name);
    if (envEntry is null)
    {
        return null;
    }

    return ManifestNodeExtensions.Scalar(envEntry, "valueFrom", "secretKeyRef", "key");
}
|
|
||||||
|
|
||||||
/// <summary>
/// Finds the env entry whose <c>name</c> equals <paramref name="name"/> (ordinal comparison).
/// </summary>
/// <returns>The matching entry, or null when none matches.</returns>
/// <exception cref="InvalidOperationException">More than one entry has the requested name.</exception>
private static YamlMappingNode? EnvMapping(YamlMappingNode container, string name)
{
    var envEntries = ManifestNodeExtensions.MappingSequence(container, "env");

    return envEntries
        .Where(entry => string.Equals(ManifestNodeExtensions.Scalar(entry, "name"), name, StringComparison.Ordinal))
        .SingleOrDefault();
}
|
|
||||||
|
|
||||||
/// <summary>
/// Returns the inventory documents whose relative path starts with "fc-devicemgmt/"
/// (ordinal comparison), in inventory order.
/// </summary>
private static IReadOnlyList<ManifestDocument> FcDeviceManagementDocuments()
{
    var matches = new List<ManifestDocument>();

    foreach (var document in Inventory.Documents)
    {
        if (document.RelativePath.StartsWith("fc-devicemgmt/", StringComparison.Ordinal))
        {
            matches.Add(document);
        }
    }

    return matches;
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
internal sealed class ManifestInventory
|
internal sealed class ManifestInventory
|
||||||
|
|||||||
@@ -1,99 +0,0 @@
|
|||||||
using FluentAssertions;
|
|
||||||
using Xunit;
|
|
||||||
|
|
||||||
namespace BluejayInfraLint.Tests;
|
|
||||||
|
|
||||||
/// <summary>
/// Text-level checks on the OpenVox server durability runbook and the recreate smoke script.
/// Both files are located relative to the repository root found by <see cref="FindRepoRoot"/>.
/// </summary>
[Trait("Category", "Unit")]
public sealed class OpenVoxServerDurabilityTests
{
    // Root must be initialised before the two paths below; static initialisers run in textual order.
    private static readonly string Root = FindRepoRoot();
    private static readonly string RunbookPath = Path.Combine(Root, "docs", "runbooks", "openvoxserver-quadlet-durability.md");
    private static readonly string SmokePath = Path.Combine(Root, "scripts", "monitoring", "openvox-recreate-smoke.sh");

    /// <summary>The runbook must contain the host-artifact and non-ArgoCD phrases checked below.</summary>
    [Fact]
    public void Runbook_DocumentsHostArtifactAndNonArgoPath()
    {
        var runbook = File.ReadAllText(RunbookPath);

        runbook.Should().Contain("noc1 host artifact");
        runbook.Should().Contain("not an ArgoCD application");
        runbook.Should().Contain("systemctl cat openvoxserver");
        runbook.Should().Contain("/etc/containers/systemd/openvoxserver.container");
    }

    /// <summary>The runbook must contain the Sprint 32 Cx-12 live-apply markers checked below.</summary>
    [Fact]
    public void Runbook_DocumentsCx12LiveApplyState()
    {
        var runbook = File.ReadAllText(RunbookPath);

        runbook.Should().Contain("Sprint 32 Cx-12");
        runbook.Should().Contain("openvoxserver-safeconfig.service");
        runbook.Should().Contain("/opt/puppet/r10k-deploy.sh");
        runbook.Should().Contain("HEAD == origin/master");
    }

    /// <summary>
    /// The smoke script must reference OPENVOX_RECREATE_SMOKE and "exit 64", and the first
    /// OPENVOX_RECREATE_SMOKE reference must precede the first "systemctl stop openvoxserver".
    /// </summary>
    [Fact]
    public void SmokeScript_IsExplicitlyOptIn()
    {
        var smoke = File.ReadAllText(SmokePath);

        smoke.Should().Contain("OPENVOX_RECREATE_SMOKE");
        smoke.Should().Contain("exit 64");
        AssertFirstOccurrencePrecedes(smoke, "OPENVOX_RECREATE_SMOKE", "systemctl stop openvoxserver");
    }

    /// <summary>
    /// The smoke script must contain the refusal message, and the first
    /// "systemctl cat openvoxserver" must precede the first "podman rm openvoxserver".
    /// </summary>
    [Fact]
    public void SmokeScript_RequiresGeneratedSystemdUnitBeforeRemovingContainer()
    {
        var smoke = File.ReadAllText(SmokePath);

        smoke.Should().Contain("systemctl cat openvoxserver");
        smoke.Should().Contain("refusing to remove a container without a verified systemd recreate path");
        AssertFirstOccurrencePrecedes(smoke, "systemctl cat openvoxserver", "podman rm openvoxserver");
    }

    /// <summary>
    /// Neither the runbook nor the smoke script may contain any of the forbidden tokens
    /// (matched case-insensitively).
    /// </summary>
    [Fact]
    public void Artifacts_DoNotStoreSecretsOrPaidRunnerLabels()
    {
        var forbidden = new[]
        {
            "BEGIN OPENSSH PRIVATE KEY",
            "BEGIN RSA PRIVATE KEY",
            "ubuntu-latest",
            "windows-latest",
            "macos-latest",
        };

        var violations = new[] { RunbookPath, SmokePath }
            .SelectMany(path =>
            {
                var text = File.ReadAllText(path);
                return forbidden
                    .Where(token => text.Contains(token, StringComparison.OrdinalIgnoreCase))
                    .Select(token => $"{Path.GetRelativePath(Root, path)} contains forbidden token {token}");
            })
            .ToList();

        violations.Should().BeEmpty();
    }

    /// <summary>
    /// Asserts that both fragments occur in <paramref name="text"/> and that the first occurrence of
    /// <paramref name="earlier"/> comes before the first occurrence of <paramref name="later"/>.
    /// </summary>
    private static void AssertFirstOccurrencePrecedes(string text, string earlier, string later)
    {
        // IndexOf returns -1 for a missing fragment. Check presence explicitly so an absent
        // fragment is reported as missing rather than as an index compared against -1.
        text.Should().Contain(earlier);
        text.Should().Contain(later);

        text.IndexOf(earlier, StringComparison.Ordinal)
            .Should()
            .BeLessThan(text.IndexOf(later, StringComparison.Ordinal));
    }

    /// <summary>
    /// Walks up from <see cref="AppContext.BaseDirectory"/> until a directory is found that
    /// contains an "apps" directory, a "scripts" directory and a README.md file.
    /// </summary>
    /// <exception cref="DirectoryNotFoundException">No ancestor directory matches.</exception>
    private static string FindRepoRoot()
    {
        var current = new DirectoryInfo(AppContext.BaseDirectory);
        while (current is not null)
        {
            if (Directory.Exists(Path.Combine(current.FullName, "apps"))
                && Directory.Exists(Path.Combine(current.FullName, "scripts"))
                && File.Exists(Path.Combine(current.FullName, "README.md")))
            {
                return current.FullName;
            }

            current = current.Parent;
        }

        throw new DirectoryNotFoundException("Could not find bluejay-infra root.");
    }
}
|
|
||||||
@@ -174,13 +174,10 @@ public sealed class PiSignagePlayerArtifactTests
|
|||||||
public void HdmiRule_RestartsPlayerAndRunsCapabilityDetection()
|
public void HdmiRule_RestartsPlayerAndRunsCapabilityDetection()
|
||||||
{
|
{
|
||||||
var rule = Read("systemd/99-flowercore-signage-hdmi.rules");
|
var rule = Read("systemd/99-flowercore-signage-hdmi.rules");
|
||||||
var responder = Read("scripts/flowercore-signage-hdmi-respond.sh");
|
|
||||||
|
|
||||||
rule.Should().Contain("KERNEL==\"card?-HDMI-A-?\"");
|
rule.Should().Contain("KERNEL==\"card?-HDMI-A-?\"");
|
||||||
rule.Should().Contain("start flowercore-signage-player-pi-hdmi.service");
|
rule.Should().Contain("restart flowercore-signage-player-pi.service");
|
||||||
responder.Should().Contain("sleep 2");
|
rule.Should().Contain("start flowercore-signage-detect-display.service");
|
||||||
responder.Should().Contain("start flowercore-signage-detect-display.service");
|
|
||||||
responder.Should().Contain("restart flowercore-signage-player-pi.service");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
package bluejayinfra.public_method_allowlist
|
package bluejayinfra.public_method_allowlist
|
||||||
|
|
||||||
public_hosts := {"brochure.flowercore.io", "dist.flowercore.io", "dns.iamworkin.lan"}
|
public_hosts := {"dist.flowercore.io", "dns.iamworkin.lan"}
|
||||||
|
|
||||||
deny[msg] {
|
deny[msg] {
|
||||||
input.kind == "IngressRoute"
|
input.kind == "IngressRoute"
|
||||||
|
|||||||
Reference in New Issue
Block a user