Compare commits

..

2 Commits

Author SHA1 Message Date
Andrew Stoltz
34dda0c99c feat(infra): prestage broader app exposure hardening 2026-06-04 15:55:07 -05:00
Andrew Stoltz
e1e0159b06 test(lint): reconcile baseline infra assertions 2026-06-04 15:40:57 -05:00
10 changed files with 459 additions and 707 deletions

View File

@@ -2,22 +2,6 @@
Infrastructure manifests for ArgoCD. An `ApplicationSet` in `argocd` namespace watches the `apps/*` directories in this repo and creates one `Application` per subdir (prefixed `infra-<name>`). Infrastructure manifests for ArgoCD. An `ApplicationSet` in `argocd` namespace watches the `apps/*` directories in this repo and creates one `Application` per subdir (prefixed `infra-<name>`).
## Root GitOps ApplicationSet
`argocd/applicationset-bluejay-infra.yaml` is the root of this GitOps tree, but
it is **NOT self-managed** by ArgoCD. Apply it manually when the root generator
or sync policy changes:
```bash
kubectl -n argocd apply -f argocd/applicationset-bluejay-infra.yaml
```
Keep the per-StatefulSet `ignoreDifferences` entries in that file synced with
the live ApplicationSet. They intentionally cover `zabbix-postgres`,
`guac-mysql`, `matrix-postgres`, and `authentik-postgres` so ArgoCD does not
loop forever on server-side-apply `volumeClaimTemplates` status drift. Every new
StatefulSet with `volumeClaimTemplates` needs its own entry appended.
## Adding a new service to the cluster ## Adding a new service to the cluster
Follow these steps in order. **Step 1 must run before step 3** — if you skip it, cert-manager HTTP-01 will silently fail for ~2h per cert (exponential backoff) until someone diagnoses the DNS. Follow these steps in order. **Step 1 must run before step 3** — if you skip it, cert-manager HTTP-01 will silently fail for ~2h per cert (exponential backoff) until someone diagnoses the DNS.

View File

@@ -1,453 +1,448 @@
# Authentik OIDC backend # Authentik OIDC backend
# ArgoCD-managed. BlueJay Lab. # ArgoCD-managed. BlueJay Lab.
# #
# Stack: # Stack:
# - PostgreSQL 16 StatefulSet (single replica, Longhorn RWO 5Gi) # - PostgreSQL 16 StatefulSet (single replica, Longhorn RWO 5Gi)
# - Redis 7 Deployment (no persistence — session/cache only) # - Redis 7 Deployment (no persistence — session/cache only)
# - Authentik server + worker Deployments (image ghcr.io/goauthentik/server:2024.12.3) # - Authentik server + worker Deployments (image ghcr.io/goauthentik/server:2024.12.3)
# - Media PVC shared between server + worker (Longhorn RWO 2Gi) # - Media PVC shared between server + worker (Longhorn RWO 2Gi)
# - Certificate via step-ca-acme ClusterIssuer # - Certificate via step-ca-acme ClusterIssuer
# - Traefik IngressRoute at id.iamworkin.lan # - Traefik IngressRoute at id.iamworkin.lan
# #
# Secrets come from 1Password item "authentik-credentials" (IAmWorkin vault, id y6i74ch22q5wvm7znquq4nhhcu) # Secrets come from 1Password item "authentik-credentials" (IAmWorkin vault, id y6i74ch22q5wvm7znquq4nhhcu)
# via the OnePasswordItem CRD, materialized into k8s Secret authentik/authentik-credentials. # via the OnePasswordItem CRD, materialized into k8s Secret authentik/authentik-credentials.
# #
# Why the discovery URL is /application/o/pimanager/ : Authentik issues per-application OIDC providers. # Why the discovery URL is /application/o/pimanager/ : Authentik issues per-application OIDC providers.
# The pimanager OIDC application/provider is created after the cluster pods are healthy (manual or # The pimanager OIDC application/provider is created after the cluster pods are healthy (manual or
# via API once the bootstrap token is available — see Notes substrate). # via API once the bootstrap token is available — see Notes substrate).
--- ---
apiVersion: v1 apiVersion: v1
kind: Namespace kind: Namespace
metadata: metadata:
name: authentik name: authentik
labels: labels:
app.kubernetes.io/part-of: bluejay-infra app.kubernetes.io/part-of: bluejay-infra
--- ---
# 1Password operator pulls the authentik-credentials item into a k8s Secret of the same name. # 1Password operator pulls the authentik-credentials item into a k8s Secret of the same name.
# Field labels in 1P become Secret keys: AUTHENTIK_SECRET_KEY, POSTGRES_PASSWORD, REDIS_PASSWORD, # Field labels in 1P become Secret keys: AUTHENTIK_SECRET_KEY, POSTGRES_PASSWORD, REDIS_PASSWORD,
# BOOTSTRAP_ADMIN_PASSWORD, BOOTSTRAP_ADMIN_TOKEN, BOOTSTRAP_ADMIN_EMAIL. # BOOTSTRAP_ADMIN_PASSWORD, BOOTSTRAP_ADMIN_TOKEN, BOOTSTRAP_ADMIN_EMAIL.
apiVersion: onepassword.com/v1 apiVersion: onepassword.com/v1
kind: OnePasswordItem kind: OnePasswordItem
metadata: metadata:
name: authentik-credentials name: authentik-credentials
namespace: authentik namespace: authentik
spec: spec:
itemPath: "vaults/IAmWorkin/items/authentik-credentials" itemPath: "vaults/IAmWorkin/items/authentik-credentials"
--- ---
# Shared media volume for server + worker pods. # Shared media volume for server + worker pods.
apiVersion: v1 apiVersion: v1
kind: PersistentVolumeClaim kind: PersistentVolumeClaim
metadata: metadata:
name: authentik-media name: authentik-media
namespace: authentik namespace: authentik
spec: spec:
storageClassName: longhorn storageClassName: longhorn
accessModes: [ReadWriteOnce] accessModes: [ReadWriteOnce]
resources: resources:
requests: requests:
storage: 2Gi storage: 2Gi
--- ---
# PostgreSQL 16 StatefulSet — Authentik's primary store. # PostgreSQL 16 StatefulSet — Authentik's primary store.
apiVersion: apps/v1 apiVersion: apps/v1
kind: StatefulSet kind: StatefulSet
metadata: metadata:
name: authentik-postgres name: authentik-postgres
namespace: authentik namespace: authentik
labels: labels:
app: authentik-postgres app: authentik-postgres
argocd.argoproj.io/instance: infra-authentik argocd.argoproj.io/instance: infra-authentik
spec: spec:
persistentVolumeClaimRetentionPolicy: persistentVolumeClaimRetentionPolicy:
whenDeleted: Retain whenDeleted: Retain
whenScaled: Retain whenScaled: Retain
podManagementPolicy: OrderedReady podManagementPolicy: OrderedReady
serviceName: authentik-postgres serviceName: authentik-postgres
replicas: 1 replicas: 1
revisionHistoryLimit: 10 revisionHistoryLimit: 10
selector: selector:
matchLabels: matchLabels:
app: authentik-postgres app: authentik-postgres
template: template:
metadata: metadata:
labels: labels:
app: authentik-postgres app: authentik-postgres
spec: spec:
containers: containers:
- name: postgres - name: postgres
image: postgres:16-alpine image: postgres:16-alpine
ports: ports:
- containerPort: 5432 - containerPort: 5432
name: postgres name: postgres
env: env:
- name: POSTGRES_USER - name: POSTGRES_USER
value: authentik value: authentik
- name: POSTGRES_PASSWORD - name: POSTGRES_PASSWORD
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: authentik-credentials name: authentik-credentials
key: POSTGRES_PASSWORD key: POSTGRES_PASSWORD
- name: POSTGRES_DB - name: POSTGRES_DB
value: authentik value: authentik
- name: POSTGRES_INITDB_ARGS - name: POSTGRES_INITDB_ARGS
value: "--encoding=UTF-8 --lc-collate=C --lc-ctype=C" value: "--encoding=UTF-8 --lc-collate=C --lc-ctype=C"
- name: PGDATA - name: PGDATA
value: /var/lib/postgresql/data/pgdata value: /var/lib/postgresql/data/pgdata
readinessProbe: readinessProbe:
exec: exec:
command: ["pg_isready", "-U", "authentik"] command: ["pg_isready", "-U", "authentik"]
initialDelaySeconds: 5 initialDelaySeconds: 5
periodSeconds: 5 periodSeconds: 5
livenessProbe: livenessProbe:
exec: exec:
command: ["pg_isready", "-U", "authentik"] command: ["pg_isready", "-U", "authentik"]
initialDelaySeconds: 30 initialDelaySeconds: 30
periodSeconds: 30 periodSeconds: 30
resources: resources:
requests: { cpu: 100m, memory: 256Mi } requests: { cpu: 100m, memory: 256Mi }
limits: { cpu: 1000m, memory: 1Gi } limits: { cpu: 1000m, memory: 1Gi }
volumeMounts: volumeMounts:
- name: pgdata - name: pgdata
mountPath: /var/lib/postgresql/data mountPath: /var/lib/postgresql/data
volumeClaimTemplates: volumeClaimTemplates:
# apiVersion/kind included deliberately: this STS was created via ArgoCD ServerSideApply, - metadata:
# so the live object carries PVC TypeMeta inside volumeClaimTemplates; omitting it here name: pgdata
# leaves the app eternally OutOfSync even though kubectl SSA dry-run shows no change. spec:
- apiVersion: v1 storageClassName: longhorn
kind: PersistentVolumeClaim accessModes: [ReadWriteOnce]
metadata: volumeMode: Filesystem
name: pgdata resources:
spec: requests:
storageClassName: longhorn storage: 5Gi
accessModes: [ReadWriteOnce]
volumeMode: Filesystem ---
resources: apiVersion: v1
requests: kind: Service
storage: 5Gi metadata:
name: authentik-postgres
--- namespace: authentik
apiVersion: v1 spec:
kind: Service clusterIP: None
metadata: selector:
name: authentik-postgres app: authentik-postgres
namespace: authentik ports:
spec: - name: postgres
clusterIP: None port: 5432
selector: targetPort: 5432
app: authentik-postgres
ports: ---
- name: postgres # Redis 7 — session storage + Celery broker. No persistence needed (cache).
port: 5432 apiVersion: apps/v1
targetPort: 5432 kind: Deployment
metadata:
--- name: authentik-redis
# Redis 7 — session storage + Celery broker. No persistence needed (cache). namespace: authentik
apiVersion: apps/v1 labels:
kind: Deployment app: authentik-redis
metadata: argocd.argoproj.io/instance: infra-authentik
name: authentik-redis spec:
namespace: authentik replicas: 1
labels: strategy:
app: authentik-redis type: Recreate
argocd.argoproj.io/instance: infra-authentik selector:
spec: matchLabels:
replicas: 1 app: authentik-redis
strategy: template:
type: Recreate metadata:
selector: labels:
matchLabels: app: authentik-redis
app: authentik-redis spec:
template: containers:
metadata: - name: redis
labels: image: redis:7-alpine
app: authentik-redis args:
spec: - "--save"
containers: - ""
- name: redis - "--appendonly"
image: redis:7-alpine - "no"
args: - "--requirepass"
- "--save" - "$(REDIS_PASSWORD)"
- "" env:
- "--appendonly" - name: REDIS_PASSWORD
- "no" valueFrom:
- "--requirepass" secretKeyRef:
- "$(REDIS_PASSWORD)" name: authentik-credentials
env: key: REDIS_PASSWORD
- name: REDIS_PASSWORD ports:
valueFrom: - containerPort: 6379
secretKeyRef: name: redis
name: authentik-credentials readinessProbe:
key: REDIS_PASSWORD tcpSocket: { port: 6379 }
ports: initialDelaySeconds: 5
- containerPort: 6379 periodSeconds: 5
name: redis livenessProbe:
readinessProbe: tcpSocket: { port: 6379 }
tcpSocket: { port: 6379 } initialDelaySeconds: 30
initialDelaySeconds: 5 periodSeconds: 30
periodSeconds: 5 resources:
livenessProbe: requests: { cpu: 50m, memory: 64Mi }
tcpSocket: { port: 6379 } limits: { cpu: 500m, memory: 256Mi }
initialDelaySeconds: 30
periodSeconds: 30 ---
resources: apiVersion: v1
requests: { cpu: 50m, memory: 64Mi } kind: Service
limits: { cpu: 500m, memory: 256Mi } metadata:
name: authentik-redis
--- namespace: authentik
apiVersion: v1 spec:
kind: Service selector:
metadata: app: authentik-redis
name: authentik-redis ports:
namespace: authentik - name: redis
spec: port: 6379
selector: targetPort: 6379
app: authentik-redis
ports: ---
- name: redis # Authentik server Deployment — HTTP frontend on :9000.
port: 6379 apiVersion: apps/v1
targetPort: 6379 kind: Deployment
metadata:
--- name: authentik-server
# Authentik server Deployment — HTTP frontend on :9000. namespace: authentik
apiVersion: apps/v1 labels:
kind: Deployment app: authentik-server
metadata: argocd.argoproj.io/instance: infra-authentik
name: authentik-server spec:
namespace: authentik replicas: 1
labels: strategy:
app: authentik-server type: Recreate # shares /media RWO PVC with worker
argocd.argoproj.io/instance: infra-authentik selector:
spec: matchLabels:
replicas: 1 app: authentik-server
strategy: template:
type: Recreate # shares /media RWO PVC with worker metadata:
selector: labels:
matchLabels: app: authentik-server
app: authentik-server spec:
template: securityContext:
metadata: # Authentik image runs as uid 1000 "authentik" but the Longhorn PVC mounts
labels: # root:root by default. fsGroup recursively chgrp + chmod g+rwx so the
app: authentik-server # non-root container can mkdir /media/public during the tenant_files migration.
spec: fsGroup: 1000
securityContext: containers:
# Authentik image runs as uid 1000 "authentik" but the Longhorn PVC mounts - name: server
# root:root by default. fsGroup recursively chgrp + chmod g+rwx so the image: ghcr.io/goauthentik/server:2024.12.3
# non-root container can mkdir /media/public during the tenant_files migration. args: ["server"]
fsGroup: 1000 ports:
containers: - containerPort: 9000
- name: server name: http
image: ghcr.io/goauthentik/server:2024.12.3 - containerPort: 9443
args: ["server"] name: https
ports: env:
- containerPort: 9000 - name: AUTHENTIK_SECRET_KEY
name: http valueFrom:
- containerPort: 9443 secretKeyRef:
name: https name: authentik-credentials
env: key: AUTHENTIK_SECRET_KEY
- name: AUTHENTIK_SECRET_KEY - name: AUTHENTIK_REDIS__HOST
valueFrom: value: authentik-redis
secretKeyRef: - name: AUTHENTIK_REDIS__PASSWORD
name: authentik-credentials valueFrom:
key: AUTHENTIK_SECRET_KEY secretKeyRef:
- name: AUTHENTIK_REDIS__HOST name: authentik-credentials
value: authentik-redis key: REDIS_PASSWORD
- name: AUTHENTIK_REDIS__PASSWORD - name: AUTHENTIK_POSTGRESQL__HOST
valueFrom: value: authentik-postgres
secretKeyRef: - name: AUTHENTIK_POSTGRESQL__NAME
name: authentik-credentials value: authentik
key: REDIS_PASSWORD - name: AUTHENTIK_POSTGRESQL__USER
- name: AUTHENTIK_POSTGRESQL__HOST value: authentik
value: authentik-postgres - name: AUTHENTIK_POSTGRESQL__PASSWORD
- name: AUTHENTIK_POSTGRESQL__NAME valueFrom:
value: authentik secretKeyRef:
- name: AUTHENTIK_POSTGRESQL__USER name: authentik-credentials
value: authentik key: POSTGRES_PASSWORD
- name: AUTHENTIK_POSTGRESQL__PASSWORD - name: AUTHENTIK_BOOTSTRAP_PASSWORD
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: authentik-credentials name: authentik-credentials
key: POSTGRES_PASSWORD key: BOOTSTRAP_ADMIN_PASSWORD
- name: AUTHENTIK_BOOTSTRAP_PASSWORD - name: AUTHENTIK_BOOTSTRAP_TOKEN
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: authentik-credentials name: authentik-credentials
key: BOOTSTRAP_ADMIN_PASSWORD key: BOOTSTRAP_ADMIN_TOKEN
- name: AUTHENTIK_BOOTSTRAP_TOKEN - name: AUTHENTIK_BOOTSTRAP_EMAIL
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: authentik-credentials name: authentik-credentials
key: BOOTSTRAP_ADMIN_TOKEN key: BOOTSTRAP_ADMIN_EMAIL
- name: AUTHENTIK_BOOTSTRAP_EMAIL - name: AUTHENTIK_DISABLE_UPDATE_CHECK
valueFrom: value: "true"
secretKeyRef: - name: AUTHENTIK_ERROR_REPORTING__ENABLED
name: authentik-credentials value: "false"
key: BOOTSTRAP_ADMIN_EMAIL - name: AUTHENTIK_LOG_LEVEL
- name: AUTHENTIK_DISABLE_UPDATE_CHECK value: info
value: "true" # First-boot Authentik can take 3+ min on the migration phase
- name: AUTHENTIK_ERROR_REPORTING__ENABLED # (waiting on DB lock while worker also runs migrations). Initial
value: "false" # delays are generous so kubelet doesn't kill the pod mid-migration;
- name: AUTHENTIK_LOG_LEVEL # periodSeconds keeps post-startup probing responsive.
value: info readinessProbe:
# First-boot Authentik can take 3+ min on the migration phase httpGet:
# (waiting on DB lock while worker also runs migrations). Initial path: /-/health/ready/
# delays are generous so kubelet doesn't kill the pod mid-migration; port: 9000
# periodSeconds keeps post-startup probing responsive. initialDelaySeconds: 60
readinessProbe: periodSeconds: 10
httpGet: timeoutSeconds: 5
path: /-/health/ready/ failureThreshold: 12
port: 9000 livenessProbe:
initialDelaySeconds: 60 httpGet:
periodSeconds: 10 path: /-/health/live/
timeoutSeconds: 5 port: 9000
failureThreshold: 12 initialDelaySeconds: 300
livenessProbe: periodSeconds: 30
httpGet: timeoutSeconds: 10
path: /-/health/live/ failureThreshold: 3
port: 9000 startupProbe:
initialDelaySeconds: 300 httpGet:
periodSeconds: 30 path: /-/health/live/
timeoutSeconds: 10 port: 9000
failureThreshold: 3 initialDelaySeconds: 30
startupProbe: periodSeconds: 15
httpGet: timeoutSeconds: 10
path: /-/health/live/ failureThreshold: 40 # 30s + 40*15s = 10.5 min budget
port: 9000 resources:
initialDelaySeconds: 30 requests: { cpu: 150m, memory: 512Mi }
periodSeconds: 15 limits: { cpu: 1500m, memory: 1Gi }
timeoutSeconds: 10 volumeMounts:
failureThreshold: 40 # 30s + 40*15s = 10.5 min budget - name: media
resources: mountPath: /media
requests: { cpu: 150m, memory: 512Mi } volumes:
limits: { cpu: 1500m, memory: 1Gi } - name: media
volumeMounts: persistentVolumeClaim:
- name: media claimName: authentik-media
mountPath: /media
volumes: ---
- name: media # Authentik worker Deployment — runs Celery background tasks.
persistentVolumeClaim: apiVersion: apps/v1
claimName: authentik-media kind: Deployment
metadata:
--- name: authentik-worker
# Authentik worker Deployment — runs Celery background tasks. namespace: authentik
apiVersion: apps/v1 labels:
kind: Deployment app: authentik-worker
metadata: argocd.argoproj.io/instance: infra-authentik
name: authentik-worker spec:
namespace: authentik replicas: 1
labels: strategy:
app: authentik-worker type: Recreate # shares /media RWO PVC with server
argocd.argoproj.io/instance: infra-authentik selector:
spec: matchLabels:
replicas: 1 app: authentik-worker
strategy: template:
type: Recreate # shares /media RWO PVC with server metadata:
selector: labels:
matchLabels: app: authentik-worker
app: authentik-worker spec:
template: securityContext:
metadata: # Same as server pod — non-root uid 1000 needs PVC group write.
labels: fsGroup: 1000
app: authentik-worker containers:
spec: - name: worker
securityContext: image: ghcr.io/goauthentik/server:2024.12.3
# Same as server pod — non-root uid 1000 needs PVC group write. args: ["worker"]
fsGroup: 1000 env:
containers: - name: AUTHENTIK_SECRET_KEY
- name: worker valueFrom:
image: ghcr.io/goauthentik/server:2024.12.3 secretKeyRef:
args: ["worker"] name: authentik-credentials
env: key: AUTHENTIK_SECRET_KEY
- name: AUTHENTIK_SECRET_KEY - name: AUTHENTIK_REDIS__HOST
valueFrom: value: authentik-redis
secretKeyRef: - name: AUTHENTIK_REDIS__PASSWORD
name: authentik-credentials valueFrom:
key: AUTHENTIK_SECRET_KEY secretKeyRef:
- name: AUTHENTIK_REDIS__HOST name: authentik-credentials
value: authentik-redis key: REDIS_PASSWORD
- name: AUTHENTIK_REDIS__PASSWORD - name: AUTHENTIK_POSTGRESQL__HOST
valueFrom: value: authentik-postgres
secretKeyRef: - name: AUTHENTIK_POSTGRESQL__NAME
name: authentik-credentials value: authentik
key: REDIS_PASSWORD - name: AUTHENTIK_POSTGRESQL__USER
- name: AUTHENTIK_POSTGRESQL__HOST value: authentik
value: authentik-postgres - name: AUTHENTIK_POSTGRESQL__PASSWORD
- name: AUTHENTIK_POSTGRESQL__NAME valueFrom:
value: authentik secretKeyRef:
- name: AUTHENTIK_POSTGRESQL__USER name: authentik-credentials
value: authentik key: POSTGRES_PASSWORD
- name: AUTHENTIK_POSTGRESQL__PASSWORD - name: AUTHENTIK_DISABLE_UPDATE_CHECK
valueFrom: value: "true"
secretKeyRef: - name: AUTHENTIK_ERROR_REPORTING__ENABLED
name: authentik-credentials value: "false"
key: POSTGRES_PASSWORD - name: AUTHENTIK_LOG_LEVEL
- name: AUTHENTIK_DISABLE_UPDATE_CHECK value: info
value: "true" resources:
- name: AUTHENTIK_ERROR_REPORTING__ENABLED requests: { cpu: 100m, memory: 256Mi }
value: "false" limits: { cpu: 1000m, memory: 768Mi }
- name: AUTHENTIK_LOG_LEVEL volumeMounts:
value: info - name: media
resources: mountPath: /media
requests: { cpu: 100m, memory: 256Mi } volumes:
limits: { cpu: 1000m, memory: 768Mi } - name: media
volumeMounts: persistentVolumeClaim:
- name: media claimName: authentik-media
mountPath: /media
volumes: ---
- name: media apiVersion: v1
persistentVolumeClaim: kind: Service
claimName: authentik-media metadata:
name: authentik-server
--- namespace: authentik
apiVersion: v1 spec:
kind: Service selector:
metadata: app: authentik-server
name: authentik-server ports:
namespace: authentik - name: http
spec: port: 9000
selector: targetPort: 9000
app: authentik-server - name: https
ports: port: 9443
- name: http targetPort: 9443
port: 9000
targetPort: 9000 ---
- name: https # step-ca leaf certificate for id.iamworkin.lan.
port: 9443 # step-ca container resolver uses pfSense Unbound, so the public A record for id.iamworkin.lan
targetPort: 9443 # MUST exist before this Certificate is applied (cert-manager HTTP-01 will silently 2h-backoff
# otherwise). Added 2026-05-25 via scripts/pfsense-add-id-host.py.
--- apiVersion: cert-manager.io/v1
# step-ca leaf certificate for id.iamworkin.lan. kind: Certificate
# step-ca container resolver uses pfSense Unbound, so the public A record for id.iamworkin.lan metadata:
# MUST exist before this Certificate is applied (cert-manager HTTP-01 will silently 2h-backoff name: authentik-tls
# otherwise). Added 2026-05-25 via scripts/pfsense-add-id-host.py. namespace: authentik
apiVersion: cert-manager.io/v1 spec:
kind: Certificate secretName: authentik-tls
metadata: dnsNames:
name: authentik-tls - id.iamworkin.lan
namespace: authentik issuerRef:
spec: name: step-ca-acme
secretName: authentik-tls kind: ClusterIssuer
dnsNames:
- id.iamworkin.lan ---
issuerRef: apiVersion: traefik.io/v1alpha1
name: step-ca-acme kind: IngressRoute
kind: ClusterIssuer metadata:
name: authentik
--- namespace: authentik
apiVersion: traefik.io/v1alpha1 spec:
kind: IngressRoute entryPoints: [websecure]
metadata: routes:
name: authentik - match: Host(`id.iamworkin.lan`)
namespace: authentik kind: Rule
spec: services:
entryPoints: [websecure] - name: authentik-server
routes: port: 9000
- match: Host(`id.iamworkin.lan`) tls:
kind: Rule secretName: authentik-tls
services:
- name: authentik-server
port: 9000
tls:
secretName: authentik-tls

View File

@@ -42,7 +42,7 @@ spec:
app: messageboard-web app: messageboard-web
annotations: annotations:
fc.flowercore.io/healthz-anon: "true" fc.flowercore.io/healthz-anon: "true"
fc.flowercore.io/probe-path: "/health" fc.flowercore.io/probe-path: "/healthz"
prometheus.io/scrape: "true" prometheus.io/scrape: "true"
prometheus.io/port: "8080" prometheus.io/port: "8080"
prometheus.io/path: "/metrics/prometheus" prometheus.io/path: "/metrics/prometheus"

View File

@@ -525,7 +525,7 @@ spec:
app.kubernetes.io/part-of: flowercore app.kubernetes.io/part-of: flowercore
annotations: annotations:
fc.flowercore.io/healthz-anon: "true" fc.flowercore.io/healthz-anon: "true"
fc.flowercore.io/probe-path: "/health" fc.flowercore.io/probe-path: "/healthz"
prometheus.io/scrape: "true" prometheus.io/scrape: "true"
prometheus.io/port: "5217" prometheus.io/port: "5217"
prometheus.io/path: "/metrics" prometheus.io/path: "/metrics"

View File

@@ -54,7 +54,7 @@ spec:
metadata: metadata:
annotations: annotations:
fc.flowercore.io/healthz-anon: "true" fc.flowercore.io/healthz-anon: "true"
fc.flowercore.io/probe-path: "/" fc.flowercore.io/probe-path: "/healthz"
labels: labels:
app: updatecenter-web app: updatecenter-web
spec: spec:

View File

@@ -24,12 +24,6 @@ original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
two replicas with per-pod `emptyDir` caches. That is the safe backlog-drain two replicas with per-pod `emptyDir` caches. That is the safe backlog-drain
strategy: no two pods share one RWO PVC. strategy: no two pods share one RWO PVC.
Ephemeral runner pods are expected to register, run one job, deregister, and
exit so the Deployment starts a fresh pod for the next registration token. A
small amount of exit-1/restart churn from token-expiry or no-work windows is
accepted operational noise as long as jobs are not stuck queued and the
repo-scoped runner-offline alerts stay quiet.
Sprint 32 final long-tail wave adds 16 two-replica Deployments: Sprint 32 final long-tail wave adds 16 two-replica Deployments:
`FlowerCore.Knowledge`, `FlowerCore.LlmBridge`, `FlowerCore.Media`, `FlowerCore.Knowledge`, `FlowerCore.LlmBridge`, `FlowerCore.Media`,
`FlowerCore.Presentations`, `FlowerCore.RemoteDesktop`, `FlowerCore.DNS`, `FlowerCore.Presentations`, `FlowerCore.RemoteDesktop`, `FlowerCore.DNS`,

View File

@@ -843,9 +843,7 @@ data:
rules: rules:
- alert: PiManagerDown - alert: PiManagerDown
expr: up{job="pimanager-app"} == 0 expr: up{job="pimanager-app"} == 0
# Sprint 67: delayed behind NodeDown's critical page so a powered-off for: 3m
# Pi does not create the first duplicate page for the same host.
for: 8m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@@ -1244,58 +1242,6 @@ data:
summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})" summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug." description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
# ============================================================
# Update Center public-edge probes
# Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
# This K8s ConfigMap is the future migration target; live Prometheus
# still reads the canonical Notes file from noc1 Podman.
# ============================================================
- name: update_center
rules:
# Critical only when the edge is genuinely unreachable. A Cloudflare
# HTTP 429 means the prober hit a rate-limit, not that real clients
# are down, so the warning rule below owns that signal.
- alert: UpdateCenterPublicEdgeDown
expr: |
(probe_success{job="probe-update-center-public-edge"} == 0)
unless on(instance)
(probe_http_status_code{job="probe-update-center-public-edge"} == 429)
for: 10m
labels:
severity: critical
service: update-center
alert_channel: irc
annotations:
summary: "Update Center public edge probe failed for {{ $labels.instance }}"
description: >-
The external probe for {{ $labels.instance }} failed for 10 minutes with a
non-2xx status that is not a rate-limit. Public Update Center clients may be
unable to fetch manifest schema metadata through Cloudflare.
runbook: >-
1. curl -sk https://{{ $labels.instance }}/api/v1/manifests/_schema
2. Verify Cloudflare DNS record is proxied and targets the current public edge IP
3. kubectl -n fc-updater get ingressroute updatecenter-web-public secret cf-origin-flowercore-io
4. Check Traefik logs for Method() or TLS secret errors
- alert: UpdateCenterPublicEdgeRateLimited
expr: probe_http_status_code{job="probe-update-center-public-edge"} == 429
for: 15m
labels:
severity: warning
service: update-center
alert_channel: irc
annotations:
summary: "Cloudflare is rate-limiting (HTTP 429) the public-edge probe for {{ $labels.instance }}"
description: >-
The blackbox prober receives HTTP 429 from Cloudflare for {{ $labels.instance }}
while the origin is healthy. This is a Cloudflare rate-limit / WAF condition on
the public hostname, not an outage.
runbook: >-
1. curl -sk https://{{ $labels.instance }}/api/v1/manifests/_schema (expect 200 from a normal client)
2. Review Cloudflare rate-limit / WAF rules for the hostname; the 5m-cadence prober is tripping a 429
3. Add a Cloudflare rate-limit exception for the prober source IP or the /api/v1/manifests/_schema path
4. Confirm whether the singular host update.flowercore.io is still required, or only updates.flowercore.io
# ============================================================================= # =============================================================================
# ConfigMap: Blackbox Exporter Configuration # ConfigMap: Blackbox Exporter Configuration
# ============================================================================= # =============================================================================

View File

@@ -114,9 +114,9 @@ spec:
app: telephony-web app: telephony-web
template: template:
metadata: metadata:
annotations: annotations:
fc.flowercore.io/healthz-anon: "true" fc.flowercore.io/healthz-anon: "true"
fc.flowercore.io/probe-path: "/health" fc.flowercore.io/probe-path: "/health"
labels: labels:
app: telephony-web app: telephony-web
spec: spec:
@@ -164,7 +164,7 @@ spec:
ports: ports:
- containerPort: 5100 - containerPort: 5100
name: http name: http
# fc-safe-to-expose: X-Forwarded-Proto handled by AddFlowerCoreWebAuth (ADR-178) before any future public/OIDC flip. # fc-safe-to-expose: X-Forwarded-Proto handled by AddFlowerCoreWebAuth (ADR-178) before any future public/OIDC flip.
env: env:
- name: Telephony__Twilio__AccountSid - name: Telephony__Twilio__AccountSid
valueFrom: valueFrom:

View File

@@ -1,74 +0,0 @@
# Root ApplicationSet of the bluejay-infra GitOps tree.
# It is NOT self-managed by ArgoCD; apply it by hand when the generator or sync policy changes:
#   kubectl -n argocd apply -f argocd/applicationset-bluejay-infra.yaml
# NOTE(review): nesting indentation is not visible in this view — confirm structure against the committed file.
apiVersion: argoproj.io/v1alpha1
kind: ApplicationSet
metadata:
annotations:
argocd.argoproj.io/refresh: "true"
name: bluejay-infra
namespace: argocd
spec:
# Git directory generator: watches the apps/* directories of this repo on main.
generators:
- git:
directories:
- path: apps/*
repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git
revision: main
template:
metadata: {}
spec:
destination: {}
project: ""
goTemplate: true
goTemplateOptions:
- missingkey=error
template:
metadata:
# One Application per apps/ subdirectory, named after the directory with an "infra-" prefix.
name: infra-{{.path.basename}}
spec:
destination:
server: https://kubernetes.default.svc
# Per-StatefulSet entries so ArgoCD does not loop forever on server-side-apply
# volumeClaimTemplates status drift. Keep this list synced with the live ApplicationSet;
# every new StatefulSet with volumeClaimTemplates needs its own entry appended.
ignoreDifferences:
- group: apps
jqPathExpressions:
- .spec.volumeClaimTemplates[]?.status
jsonPointers:
- /spec/volumeClaimTemplates
kind: StatefulSet
name: zabbix-postgres
namespace: zabbix
- group: apps
jqPathExpressions:
- .spec.volumeClaimTemplates[]?.status
jsonPointers:
- /spec/volumeClaimTemplates
kind: StatefulSet
name: guac-mysql
namespace: guacamole
- group: apps
jqPathExpressions:
- .spec.volumeClaimTemplates[]?.status
jsonPointers:
- /spec/volumeClaimTemplates
kind: StatefulSet
name: matrix-postgres
namespace: matrix
- group: apps
jqPathExpressions:
- .spec.volumeClaimTemplates[]?.status
jsonPointers:
- /spec/volumeClaimTemplates
kind: StatefulSet
name: authentik-postgres
namespace: authentik
project: default
# Each generated Application syncs its own matched directory from main.
source:
path: '{{.path.path}}'
repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git
targetRevision: main
syncPolicy:
automated:
prune: true
selfHeal: true
# NOTE(review): RespectIgnoreDifferences presumably makes sync honor the ignoreDifferences
# entries above — confirm against the Argo CD sync-options documentation.
syncOptions:
- CreateNamespace=true
- ServerSideApply=true
- RespectIgnoreDifferences=true

View File

@@ -87,10 +87,10 @@ public sealed class FleetManifestLintTests
["fc-devicemgmt"] = ("fc-devicemgmt-web", "/healthz"), ["fc-devicemgmt"] = ("fc-devicemgmt-web", "/healthz"),
["fc-library"] = ("library-web", "/health"), ["fc-library"] = ("library-web", "/health"),
["fc-llm-bridge"] = ("fc-llm-bridge", "/healthz"), ["fc-llm-bridge"] = ("fc-llm-bridge", "/healthz"),
["fc-messageboard"] = ("messageboard-web", "/health"), ["fc-messageboard"] = ("messageboard-web", "/healthz"),
["fc-retail"] = ("retail-web", "/healthz"), ["fc-retail"] = ("retail-web", "/healthz"),
["fc-ttsreader"] = ("ttsreader-web", "/health"), ["fc-ttsreader"] = ("ttsreader-web", "/healthz"),
["fc-updater"] = ("updatecenter-web", "/"), ["fc-updater"] = ("updatecenter-web", "/healthz"),
["knowledge"] = ("knowledge-web", "/healthz"), ["knowledge"] = ("knowledge-web", "/healthz"),
["telephony"] = ("telephony-web", "/health"), ["telephony"] = ("telephony-web", "/health"),
["worldbuilder"] = ("worldbuilder-web", "/healthz"), ["worldbuilder"] = ("worldbuilder-web", "/healthz"),
@@ -468,99 +468,6 @@ public sealed class FleetManifestLintTests
monitoring.Should().Contain("dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts"); monitoring.Should().Contain("dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts");
} }
/// <summary>
/// Guards the github-runner README wording that describes ephemeral-runner exit churn
/// as accepted noise. Each phrase is a verbatim substring check, evaluated in order.
/// </summary>
[Fact]
public void GithubRunnerReadme_DocumentsAcceptedEphemeralExitChurn()
{
    var readmePath = Path.Combine(Inventory.BluejayRoot, "apps", "github-runner", "README.md");
    var readme = File.ReadAllText(readmePath);

    // Phrases the README must keep; the first missing one fails the test.
    var requiredPhrases = new[]
    {
        "Ephemeral runner pods",
        "exit-1/restart churn",
        "accepted operational noise",
        "repo-scoped runner-offline alerts stay quiet",
    };

    foreach (var phrase in requiredPhrases)
    {
        readme.Should().Contain(phrase);
    }
}
/// <summary>
/// Verifies that the Notes alerts.yml and the K8s monitoring manifest both carry the
/// PiManagerDown delay markers and the UpdateCenterPublicEdgeRateLimited rule markers.
/// All checks are plain substring checks over the whole file text; they are not scoped
/// to a particular rule.
/// </summary>
[Fact]
public void Monitoring_PiManagerDownDelayAndUpdateCenterRateLimit_MatchCanonicalAlerts()
{
    var notesAlertsPath = Path.Combine(
        Inventory.WorkspaceRoot,
        "FlowerCore.Notes",
        "scripts",
        "monitoring",
        "alerts.yml");
    var notesAlerts = File.ReadAllText(notesAlertsPath);

    var monitoringPath = Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml");
    var monitoring = File.ReadAllText(monitoringPath);

    // The same rate-limit expression text is expected in both files.
    const string rateLimitExpr = "expr: probe_http_status_code{job=\"probe-update-center-public-edge\"} == 429";

    // PiManagerDown: Sprint 67 comment, alert name and the 8m delay.
    notesAlerts.Should()
        .Contain("# Sprint 67: keep this warning behind NodeDown's 5m critical page")
        .And.Contain("- alert: PiManagerDown")
        .And.Contain("for: 8m");
    monitoring.Should()
        .Contain("# Sprint 67: delayed behind NodeDown's critical page")
        .And.Contain("- alert: PiManagerDown")
        .And.Contain("for: 8m");

    // Update Center HTTP 429 warning rule.
    notesAlerts.Should()
        .Contain("- alert: UpdateCenterPublicEdgeRateLimited")
        .And.Contain(rateLimitExpr)
        .And.Contain("for: 15m");
    monitoring.Should()
        .Contain("- alert: UpdateCenterPublicEdgeRateLimited")
        .And.Contain(rateLimitExpr)
        .And.Contain("for: 15m")
        .And.Contain("severity: warning");
}
/// <summary>
/// Ensures the exported ApplicationSet manifest exists, looks like a clean export
/// (no status block, no managedFields), and that the repo README documents it as the
/// manually applied root of the GitOps tree.
/// </summary>
[Fact]
public void ApplicationSetExport_MustRemainManualRootOfGitOpsTree()
{
    // The README is read first, so a missing README surfaces before any manifest check.
    var readmePath = Path.Combine(Inventory.BluejayRoot, "README.md");
    var readme = File.ReadAllText(readmePath);

    var appsetPath = Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml");
    File.Exists(appsetPath).Should().BeTrue();
    var appset = File.ReadAllText(appsetPath);

    // Manifest identity, plus absence of live-cluster residue.
    appset.Should()
        .Contain("kind: ApplicationSet")
        .And.Contain("name: bluejay-infra")
        .And.NotContain("\nstatus:")
        .And.NotContain("managedFields:");

    // README must keep the manual-apply instructions.
    readme.Should()
        .Contain("root of this GitOps tree")
        .And.Contain("NOT self-managed")
        .And.Contain("kubectl -n argocd apply -f argocd/applicationset-bluejay-infra.yaml");
}
/// <summary>
/// Checks that the exported ApplicationSet still contains the generator, source and
/// sync-option fragments listed below. Each is a verbatim substring check.
/// </summary>
[Fact]
public void ApplicationSetExport_MustDiscoverAppsDirectoryOnMain()
{
    var appsetPath = Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml");
    var appset = File.ReadAllText(appsetPath);

    // Checked in this order; the first missing fragment fails the test.
    var requiredFragments = new[]
    {
        "path: apps/*",
        "revision: main",
        "repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git",
        "path: '{{.path.path}}'",
        "targetRevision: main",
        "ServerSideApply=true",
        "RespectIgnoreDifferences=true",
    };

    foreach (var fragment in requiredFragments)
    {
        appset.Should().Contain(fragment);
    }
}
/// <summary>
/// Checks that the exported ApplicationSet keeps its volumeClaimTemplates
/// ignoreDifferences fragments, contains exactly four "kind: StatefulSet" occurrences,
/// and mentions every expected StatefulSet name and namespace.
/// </summary>
[Fact]
public void ApplicationSetExport_MustPreserveStatefulSetIgnoreDifferences()
{
    var appsetPath = Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml");
    var appset = File.ReadAllText(appsetPath);

    appset.Should()
        .Contain("jsonPointers:")
        .And.Contain("- /spec/volumeClaimTemplates")
        .And.Contain(".spec.volumeClaimTemplates[]?.status");

    // Exactly one "kind: StatefulSet" occurrence per expected entry below.
    Regex.Matches(appset, "kind: StatefulSet").Should().HaveCount(4);

    var expectedStatefulSets = new[]
    {
        (Name: "zabbix-postgres", Namespace: "zabbix"),
        (Name: "guac-mysql", Namespace: "guacamole"),
        (Name: "matrix-postgres", Namespace: "matrix"),
        (Name: "authentik-postgres", Namespace: "authentik"),
    };

    // Name and namespace are independent substring checks; they are not tied to the same entry.
    foreach (var entry in expectedStatefulSets)
    {
        appset.Should()
            .Contain($"name: {entry.Name}")
            .And.Contain($"namespace: {entry.Namespace}");
    }
}
[Fact] [Fact]
public void Monitoring_BlackboxTargetsForOidcSensitiveServices_MustUseAnonymousHealthRoutesWhenAvailable() public void Monitoring_BlackboxTargetsForOidcSensitiveServices_MustUseAnonymousHealthRoutesWhenAvailable()
{ {