Compare commits

..

2 Commits

Author SHA1 Message Date
Andrew Stoltz
13d8ca8c1a infra: export appset and mirror alert polish 2026-06-10 16:36:18 -05:00
Andrew Stoltz
b0a3ef7448 monitoring: delay PiManagerDown duplicate pages 2026-06-10 16:23:49 -05:00
15 changed files with 337 additions and 473 deletions

View File

@@ -2,6 +2,22 @@
Infrastructure manifests for ArgoCD. An `ApplicationSet` in the `argocd` namespace watches the `apps/*` directories in this repo and creates one `Application` per subdir (prefixed `infra-<name>`). Infrastructure manifests for ArgoCD. An `ApplicationSet` in the `argocd` namespace watches the `apps/*` directories in this repo and creates one `Application` per subdir (prefixed `infra-<name>`).
## Root GitOps ApplicationSet
`argocd/applicationset-bluejay-infra.yaml` is the root of this GitOps tree, but
it is **NOT self-managed** by ArgoCD. Apply it manually when the root generator
or sync policy changes:
```bash
kubectl -n argocd apply -f argocd/applicationset-bluejay-infra.yaml
```
Keep the per-StatefulSet `ignoreDifferences` entries in that file in sync with
the live ApplicationSet. They intentionally cover `zabbix-postgres`,
`guac-mysql`, `matrix-postgres`, and `authentik-postgres` so ArgoCD does not
loop forever on server-side-apply `volumeClaimTemplates` status drift. Every new
StatefulSet with `volumeClaimTemplates` needs its own entry appended.
## Adding a new service to the cluster ## Adding a new service to the cluster
Follow these steps in order. **Step 1 must run before step 3** — if you skip it, cert-manager HTTP-01 will silently fail for ~2h per cert (exponential backoff) until someone diagnoses the DNS. Follow these steps in order. **Step 1 must run before step 3** — if you skip it, cert-manager HTTP-01 will silently fail for ~2h per cert (exponential backoff) until someone diagnoses the DNS.

View File

@@ -14,20 +14,6 @@
# cluster-rebuild repeatability. See # cluster-rebuild repeatability. See
# feedback_networkpolicies_belong_in_bluejay_infra.md. # feedback_networkpolicies_belong_in_bluejay_infra.md.
--- ---
# OIDC client secret for the RemoteDesktop end-user sign-in (fleet regroup L9,
# 2026-06-12). The Authentik provider `remotedesktop` already exists; the 1P item
# `remotedesktop-oidc-client` (vault IAmWorkin) carries issuer_url / client_id /
# client_secret, and the 1Password operator mints the same-named K8s Secret that
# k8s/web-deployment.yaml (FlowerCore.RemoteDesktop repo) consumes with
# optional:true. Gate stays OFF (Q-RD-16) — this is flip-READINESS only.
apiVersion: onepassword.com/v1
kind: OnePasswordItem
metadata:
name: remotedesktop-oidc-client
namespace: fc-desktop
spec:
itemPath: "vaults/IAmWorkin/items/remotedesktop-oidc-client"
---
apiVersion: cert-manager.io/v1 apiVersion: cert-manager.io/v1
kind: Certificate kind: Certificate
metadata: metadata:

View File

@@ -11,7 +11,7 @@ metadata:
flowercore.io/created-by: bluejay-infra flowercore.io/created-by: bluejay-infra
rules: rules:
- apiGroups: - apiGroups:
- flowercore.io - devices.flowercore.io
resources: resources:
- '*' - '*'
verbs: verbs:
@@ -23,7 +23,7 @@ rules:
- patch - patch
- delete - delete
- apiGroups: - apiGroups:
- flowercore.io - devices.flowercore.io
resources: resources:
- devices/status - devices/status
- devices/finalizers - devices/finalizers
@@ -33,8 +33,6 @@ rules:
- devicepolicies/finalizers - devicepolicies/finalizers
- remotecommands/status - remotecommands/status
- remotecommands/finalizers - remotecommands/finalizers
- desiredstatedocuments/status
- desiredstatedocuments/finalizers
verbs: verbs:
- get - get
- update - update

View File

@@ -1,186 +0,0 @@
# FlowerCore.DeviceManagement CRDs.
#
# These CRDs match the current operator annotations:
# [KubernetesEntity(Group = "flowercore.io", ApiVersion = "v1alpha1", ...)]
# Keep the schemas intentionally permissive until the DeviceManagement operator
# grows enforced CRD validation.
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: devices.flowercore.io
labels:
app.kubernetes.io/name: fc-devicemgmt-operator
app.kubernetes.io/component: operator
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
group: flowercore.io
scope: Namespaced
names:
plural: devices
singular: device
kind: Device
listKind: DeviceList
versions:
- name: v1alpha1
served: true
storage: true
subresources:
status: {}
schema:
openAPIV3Schema:
type: object
properties:
spec:
type: object
x-kubernetes-preserve-unknown-fields: true
status:
type: object
x-kubernetes-preserve-unknown-fields: true
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: devicegroups.flowercore.io
labels:
app.kubernetes.io/name: fc-devicemgmt-operator
app.kubernetes.io/component: operator
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
group: flowercore.io
scope: Namespaced
names:
plural: devicegroups
singular: devicegroup
kind: DeviceGroup
listKind: DeviceGroupList
versions:
- name: v1alpha1
served: true
storage: true
subresources:
status: {}
schema:
openAPIV3Schema:
type: object
properties:
spec:
type: object
x-kubernetes-preserve-unknown-fields: true
status:
type: object
x-kubernetes-preserve-unknown-fields: true
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: devicepolicies.flowercore.io
labels:
app.kubernetes.io/name: fc-devicemgmt-operator
app.kubernetes.io/component: operator
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
group: flowercore.io
scope: Namespaced
names:
plural: devicepolicies
singular: devicepolicy
kind: DevicePolicy
listKind: DevicePolicyList
versions:
- name: v1alpha1
served: true
storage: true
subresources:
status: {}
schema:
openAPIV3Schema:
type: object
properties:
spec:
type: object
x-kubernetes-preserve-unknown-fields: true
status:
type: object
x-kubernetes-preserve-unknown-fields: true
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: remotecommands.flowercore.io
labels:
app.kubernetes.io/name: fc-devicemgmt-operator
app.kubernetes.io/component: operator
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
group: flowercore.io
scope: Namespaced
names:
plural: remotecommands
singular: remotecommand
kind: RemoteCommand
listKind: RemoteCommandList
versions:
- name: v1alpha1
served: true
storage: true
subresources:
status: {}
schema:
openAPIV3Schema:
type: object
properties:
spec:
type: object
x-kubernetes-preserve-unknown-fields: true
status:
type: object
x-kubernetes-preserve-unknown-fields: true
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: desiredstatedocuments.flowercore.io
labels:
app.kubernetes.io/name: fc-devicemgmt-operator
app.kubernetes.io/component: operator
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
group: flowercore.io
scope: Namespaced
names:
plural: desiredstatedocuments
singular: desiredstatedocument
kind: DesiredStateDocument
listKind: DesiredStateDocumentList
versions:
- name: v1alpha1
served: true
storage: true
subresources:
status: {}
schema:
openAPIV3Schema:
type: object
properties:
spec:
type: object
x-kubernetes-preserve-unknown-fields: true
status:
type: object
x-kubernetes-preserve-unknown-fields: true

View File

@@ -5,35 +5,21 @@
# exist yet; import localhost/fc-devicemgmt-web:<tag> to all schedulable RKE2 # exist yet; import localhost/fc-devicemgmt-web:<tag> to all schedulable RKE2
# nodes before letting ArgoCD sync a live rollout. # nodes before letting ArgoCD sync a live rollout.
# #
# LIVE — 2026-06-11 DeviceManagement product-host enablement. # SCALED TO 0 — 2026-05-19 morning-routine cleanup.
# The current DeviceManagement Web source is SQLite-backed in Program.cs, so # The Web pod cannot start until TWO upstream gaps close:
# Phase 1 production uses a Longhorn RWO PVC at /data/devicemgmt.db. The # 1. MySQL DB instance `flowercore_devicemgmt` (user `fc_devicemgmt`) is
# 1Password runtime item stays mounted through env for future MySQL/API-key # provisioned via fc-mysql Manager. The cluster currently has ZERO
# cutover, but MySQL is not required for this first product-host rollout. # MySqlInstanceCrds and no `mysql.fc-mysql.svc:3306` Service, so the
# Image v20260611-healthz is built from FlowerCore.DeviceManagement master # deployment-web container env `FlowerCore__Database__Host=mysql.fc-mysql.svc`
# 3c15f3b, which adds the /healthz alias required by fleet monitoring. # points at nothing. Provision via the fc-mysql Manager UI/REST/MCP.
--- # 2. 1Password vault item `IAmWorkin/FlowerCore DeviceManagement Runtime`
apiVersion: v1 # with 5 fields (DB-Password, mtls-ca.pem, mtls-client.crt, mtls-client.key,
kind: PersistentVolumeClaim # mtls-chain.pem) — see apps/fc-devicemgmt/1password-item.yaml. Mint mTLS
metadata: # from step-ca-agent ClusterIssuer per ADR-126; DB-Password must match the
name: fc-devicemgmt-web-data # password configured for the MySQL user.
namespace: fc-devicemgmt # Re-enable: change replicas back to 2 after both gaps close. The image tag
labels: # in this file (v20260512-cx5) MAY also need a refresh — it predates the
app: fc-devicemgmt-web # Sprint 34 Cl-3 operator fix; Web may have an analogous bug.
app.kubernetes.io/name: fc-devicemgmt-web
app.kubernetes.io/component: web
app.kubernetes.io/part-of: flowercore
app.kubernetes.io/managed-by: argocd
flowercore.io/tenant-id: system
flowercore.io/created-by: bluejay-infra
spec:
accessModes:
- ReadWriteOnce
storageClassName: longhorn
resources:
requests:
storage: 1Gi
---
apiVersion: apps/v1 apiVersion: apps/v1
kind: Deployment kind: Deployment
metadata: metadata:
@@ -50,7 +36,7 @@ metadata:
annotations: annotations:
flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
spec: spec:
replicas: 1 replicas: 0
revisionHistoryLimit: 3 revisionHistoryLimit: 3
selector: selector:
matchLabels: matchLabels:
@@ -78,7 +64,7 @@ spec:
fsGroupChangePolicy: OnRootMismatch fsGroupChangePolicy: OnRootMismatch
containers: containers:
- name: web - name: web
image: localhost/fc-devicemgmt-web:v20260611-healthz image: localhost/fc-devicemgmt-web:v20260512-cx5
imagePullPolicy: Never imagePullPolicy: Never
ports: ports:
- name: http - name: http
@@ -91,21 +77,29 @@ spec:
value: "Production" value: "Production"
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT - name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
value: "false" value: "false"
- name: HOME
value: "/data"
- name: FlowerCore__Service__Name - name: FlowerCore__Service__Name
value: "FlowerCore.DeviceManagement.Web" value: "FlowerCore.DeviceManagement.Web"
- name: FlowerCore__DeviceManagement__DefaultTenantId - name: FlowerCore__DeviceManagement__DefaultTenantId
value: "system" value: "system"
- name: FlowerCore__Database__Provider - name: FlowerCore__Database__Provider
value: "Sqlite" value: "MySql"
- name: FlowerCore__Database__ConnectionStrings__Sqlite - name: FlowerCore__Database__Host
value: "Data Source=/data/devicemgmt.db" value: "mysql.fc-mysql.svc"
- name: FlowerCore__Database__Database
value: "flowercore_devicemgmt"
- name: FlowerCore__Database__User
value: "fc_devicemgmt"
- name: FlowerCore__Database__Password - name: FlowerCore__Database__Password
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: fc-devicemgmt-runtime name: fc-devicemgmt-runtime
key: DB-Password key: DB-Password
- name: FlowerCore__DeviceManagement__AgentMtls__CaPath
value: "/secrets/devicemgmt-mtls/mtls-ca.pem"
- name: FlowerCore__DeviceManagement__AgentMtls__ClientCertificatePath
value: "/secrets/devicemgmt-mtls/mtls-client.crt"
- name: FlowerCore__DeviceManagement__AgentMtls__ClientKeyPath
value: "/secrets/devicemgmt-mtls/mtls-client.key"
- name: FlowerCore__EventBus__Redis__Configuration - name: FlowerCore__EventBus__Redis__Configuration
value: "redis.fc-redis.svc:6379" value: "redis.fc-redis.svc:6379"
resources: resources:
@@ -142,17 +136,19 @@ spec:
drop: drop:
- ALL - ALL
volumeMounts: volumeMounts:
- name: data
mountPath: /data
- name: tmp - name: tmp
mountPath: /tmp mountPath: /tmp
- name: logs - name: logs
mountPath: /app/logs mountPath: /app/logs
- name: devicemgmt-mtls
mountPath: /secrets/devicemgmt-mtls
readOnly: true
volumes: volumes:
- name: data
persistentVolumeClaim:
claimName: fc-devicemgmt-web-data
- name: tmp - name: tmp
emptyDir: {} emptyDir: {}
- name: logs - name: logs
emptyDir: {} emptyDir: {}
- name: devicemgmt-mtls
secret:
secretName: fc-devicemgmt-runtime
defaultMode: 0400

View File

@@ -48,7 +48,7 @@ data:
{ {
"FlowerCore": { "FlowerCore": {
"Auth": { "Auth": {
"Enabled": false, "Enabled": true,
"Oidc": { "Oidc": {
"Enabled": true, "Enabled": true,
"Audience": "dns", "Audience": "dns",
@@ -111,7 +111,7 @@ spec:
fsGroup: 1654 fsGroup: 1654
containers: containers:
- name: dns-web - name: dns-web
image: localhost/fc-dns-web:v20260612-l4dns-a5d2849 image: localhost/fc-dns-web:v20260604-oidc-proper
imagePullPolicy: Never imagePullPolicy: Never
securityContext: securityContext:
readOnlyRootFilesystem: true readOnlyRootFilesystem: true
@@ -149,7 +149,7 @@ spec:
key: client_secret key: client_secret
optional: true optional: true
- name: FlowerCore__Auth__Enabled - name: FlowerCore__Auth__Enabled
value: "false" value: "true"
- name: FlowerCore__Auth__Oidc__Enabled - name: FlowerCore__Auth__Oidc__Enabled
value: "true" value: "true"
- name: FlowerCore__Auth__Oidc__Audience - name: FlowerCore__Auth__Oidc__Audience
@@ -303,7 +303,7 @@ spec:
fsGroup: 1654 fsGroup: 1654
containers: containers:
- name: dns-acme-webhook - name: dns-acme-webhook
image: localhost/fc-dns-acme-webhook:v20260612-l4dns-a5d2849 image: localhost/fc-dns-acme-webhook:v202604290845
imagePullPolicy: Never imagePullPolicy: Never
securityContext: securityContext:
readOnlyRootFilesystem: true readOnlyRootFilesystem: true

View File

@@ -535,7 +535,7 @@ spec:
fsGroupChangePolicy: OnRootMismatch fsGroupChangePolicy: OnRootMismatch
containers: containers:
- name: web - name: web
image: localhost/fc-ttsreader-web:v20260612-readalong-corrections image: localhost/fc-ttsreader-web:v20260603-s54cx14-pr29-schema
imagePullPolicy: Never imagePullPolicy: Never
ports: ports:
- containerPort: 5217 - containerPort: 5217

View File

@@ -12,8 +12,6 @@ All repo-scoped Linux runners use:
- `ACCESS_TOKEN` from the `github-runner-token` Secret - `ACCESS_TOKEN` from the `github-runner-token` Secret
- `RUN_AS_ROOT=false` - `RUN_AS_ROOT=false`
- `EPHEMERAL=true` - `EPHEMERAL=true`
- `DISABLE_AUTO_UPDATE=true` so the runner does not self-update and exit inside
the immutable Kubernetes pod
- `LABELS=self-hosted,linux,fc-build-linux` - `LABELS=self-hosted,linux,fc-build-linux`
- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and - writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
Actions tool cache Actions tool cache
@@ -26,6 +24,12 @@ original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
two replicas with per-pod `emptyDir` caches. That is the safe backlog-drain two replicas with per-pod `emptyDir` caches. That is the safe backlog-drain
strategy: no two pods share one RWO PVC. strategy: no two pods share one RWO PVC.
Ephemeral runner pods are expected to register, run one job, deregister, and
exit so the Deployment starts a fresh pod for the next registration token. A
small amount of exit-1/restart churn from token-expiry or no-work windows is
accepted operational noise as long as jobs are not stuck queued and the
repo-scoped runner-offline alerts stay quiet.
Sprint 32 final long-tail wave adds 16 two-replica Deployments: Sprint 32 final long-tail wave adds 16 two-replica Deployments:
`FlowerCore.Knowledge`, `FlowerCore.LlmBridge`, `FlowerCore.Media`, `FlowerCore.Knowledge`, `FlowerCore.LlmBridge`, `FlowerCore.Media`,
`FlowerCore.Presentations`, `FlowerCore.RemoteDesktop`, `FlowerCore.DNS`, `FlowerCore.Presentations`, `FlowerCore.RemoteDesktop`, `FlowerCore.DNS`,
@@ -133,7 +137,3 @@ from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
value does not change. value does not change.
- `Multi-Attach` volume error: only the Common runner uses a RWO PVC and it must - `Multi-Attach` volume error: only the Common runner uses a RWO PVC and it must
stay single-replica. New multi-replica runners use `emptyDir`. stay single-replica. New multi-replica runners use `emptyDir`.
- Runner pods repeatedly registering, downloading a newer Actions runner, then
exiting with code 4: verify `DISABLE_AUTO_UPDATE=true` is present. The image
translates that into `config.sh --disableupdate`; without it, the Deployment
controller sees the expected self-update exit as CrashLoopBackOff.

View File

@@ -195,11 +195,6 @@ spec:
# fresh registration occurs. Prevents stale runner accumulation. # fresh registration occurs. Prevents stale runner accumulation.
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
# Labels used by workflow files: runs-on: [self-hosted, linux, fc-build-linux] # Labels used by workflow files: runs-on: [self-hosted, linux, fc-build-linux]
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
@@ -371,11 +366,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -514,11 +504,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -651,11 +636,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -788,11 +768,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -925,11 +900,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -1065,11 +1035,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -1202,11 +1167,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -1339,11 +1299,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -1476,11 +1431,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -1615,11 +1565,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -1754,11 +1699,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -1898,11 +1838,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -2037,11 +1972,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -2176,11 +2106,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -2315,11 +2240,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -2453,11 +2373,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -2592,11 +2507,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -2730,11 +2640,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -2868,11 +2773,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -3006,11 +2906,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -3144,11 +3039,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -3282,11 +3172,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -3421,11 +3306,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -3560,11 +3440,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -3699,11 +3574,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -3838,11 +3708,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -3977,11 +3842,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -4115,11 +3975,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -4254,11 +4109,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -4397,11 +4247,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -4541,11 +4386,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME
@@ -4681,11 +4521,6 @@ spec:
value: "/tmp/runner/work" value: "/tmp/runner/work"
- name: EPHEMERAL - name: EPHEMERAL
value: "true" value: "true"
# The runner image must not self-update inside an immutable
# Kubernetes pod. Without this, GitHub runner auto-update exits
# with code 4 and the Deployment falls into CrashLoopBackOff.
- name: DISABLE_AUTO_UPDATE
value: "true"
- name: LABELS - name: LABELS
value: "self-hosted,linux,fc-build-linux" value: "self-hosted,linux,fc-build-linux"
- name: HOME - name: HOME

View File

@@ -46,7 +46,7 @@ spec:
spec: spec:
containers: containers:
- name: intranet-web - name: intranet-web
image: localhost/fc-intranet-web:v20260612-screenshot-metadata image: localhost/fc-intranet-web:v20260531-ttsreader-bridge
imagePullPolicy: Never imagePullPolicy: Never
ports: ports:
- containerPort: 5300 - containerPort: 5300
@@ -60,17 +60,14 @@ spec:
# ≈ 9 hours. BLUEJAY-WS GPU (R9700, 32GB VRAM) does the same work # ≈ 9 hours. BLUEJAY-WS GPU (R9700, 32GB VRAM) does the same work
# in minutes. Memory: feedback_pi5_nomic_embed_slow. # in minutes. Memory: feedback_pi5_nomic_embed_slow.
- name: IntranetSearch__OllamaBaseUrl - name: IntranetSearch__OllamaBaseUrl
value: "http://edge1.iamworkin.lan:11434" value: "http://10.0.56.20:11434"
# External Notes corpus roots are not mounted in the live pod today. # Sprint E Phase 2α — JSON-file-backed PageReadingOverride persistence
# Keep the curated/workflow docs directory active without logging # on the writable PVC at /data. Without this env var the
# repeated /srv/flowercore-notes missing-root warnings. # intranet falls back to the in-memory store (loses state on
- name: IntranetSearch__Enabled # pod restart). Master's PageReadingOverrideOptions binds
value: "false" # PageReadingOverrides:FilePath.
# Page-reading override SQLite persistence on the writable PVC at - name: PageReadingOverrides__FilePath
# /data. This backs pronunciation, notes, corrections, and value: "/data/page-reading-overrides.json"
# page-profile metadata across pod restarts.
- name: PageReadingOverrides__DatabasePath
value: "/data/page-reading-overrides.db"
- name: KnowledgeFleetSearch__BaseUrl - name: KnowledgeFleetSearch__BaseUrl
value: "https://knowledge.iamworkin.lan" value: "https://knowledge.iamworkin.lan"
- name: KnowledgeFleetSearch__ApiKey - name: KnowledgeFleetSearch__ApiKey

View File

@@ -843,7 +843,9 @@ data:
rules: rules:
- alert: PiManagerDown - alert: PiManagerDown
expr: up{job="pimanager-app"} == 0 expr: up{job="pimanager-app"} == 0
for: 3m # Sprint 67: delayed behind NodeDown's critical page so a powered-off
# Pi does not create the first duplicate page for the same host.
for: 8m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@@ -1242,6 +1244,58 @@ data:
summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})" summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug." description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
# ============================================================
# Update Center public-edge probes
# Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
# This K8s ConfigMap is the future migration target; live Prometheus
# still reads the canonical Notes file from noc1 Podman.
# ============================================================
- name: update_center
rules:
# Critical only when the edge is genuinely unreachable. A Cloudflare
# HTTP 429 means the prober hit a rate-limit, not that real clients
# are down, so the warning rule below owns that signal.
- alert: UpdateCenterPublicEdgeDown
expr: |
(probe_success{job="probe-update-center-public-edge"} == 0)
unless on(instance)
(probe_http_status_code{job="probe-update-center-public-edge"} == 429)
for: 10m
labels:
severity: critical
service: update-center
alert_channel: irc
annotations:
summary: "Update Center public edge probe failed for {{ $labels.instance }}"
description: >-
The external probe for {{ $labels.instance }} failed for 10 minutes with a
non-2xx status that is not a rate-limit. Public Update Center clients may be
unable to fetch manifest schema metadata through Cloudflare.
runbook: >-
1. curl -sk https://{{ $labels.instance }}/api/v1/manifests/_schema
2. Verify Cloudflare DNS record is proxied and targets the current public edge IP
3. kubectl -n fc-updater get ingressroute updatecenter-web-public secret cf-origin-flowercore-io
4. Check Traefik logs for Method() or TLS secret errors
- alert: UpdateCenterPublicEdgeRateLimited
expr: probe_http_status_code{job="probe-update-center-public-edge"} == 429
for: 15m
labels:
severity: warning
service: update-center
alert_channel: irc
annotations:
summary: "Cloudflare is rate-limiting (HTTP 429) the public-edge probe for {{ $labels.instance }}"
description: >-
The blackbox prober receives HTTP 429 from Cloudflare for {{ $labels.instance }}
while the origin is healthy. This is a Cloudflare rate-limit / WAF condition on
the public hostname, not an outage.
runbook: >-
1. curl -sk https://{{ $labels.instance }}/api/v1/manifests/_schema (expect 200 from a normal client)
2. Review Cloudflare rate-limit / WAF rules for the hostname; the 5m-cadence prober is tripping a 429
3. Add a Cloudflare rate-limit exception for the prober source IP or the /api/v1/manifests/_schema path
4. Confirm whether the singular host update.flowercore.io is still required, or only updates.flowercore.io
# ============================================================================= # =============================================================================
# ConfigMap: Blackbox Exporter Configuration # ConfigMap: Blackbox Exporter Configuration
# ============================================================================= # =============================================================================

View File

@@ -12,27 +12,28 @@ Source: `D:\git\FlowerCore\FlowerCore.WorldBuilder` (master)
in pfSense Unbound before this manifest is applied, or cert-manager in pfSense Unbound before this manifest is applied, or cert-manager
HTTP-01 silently exponential-backs-off ~2h. HTTP-01 silently exponential-backs-off ~2h.
Memory: `feedback_pfsense_dns_required_for_acme`. Memory: `feedback_pfsense_dns_required_for_acme`.
2. **Image import to ALL Ready RKE2 nodes** — pod can currently schedule to 2. **Image import to ALL RKE2 nodes** — pod can schedule to any of
`rke2-server` (10.0.56.11) and `rke2-agent1` (10.0.56.12). Build with: `rke2-server` (10.0.56.11), `rke2-agent1` (10.0.56.12),
`rke2-agent2` (10.0.56.13). Build with:
```bash ```bash
bash deploy/build.sh # in FlowerCore.WorldBuilder repo bash deploy/build.sh # in FlowerCore.WorldBuilder repo
mkdir -p artifacts/deploy podman save localhost/fc-worldbuilder:v<TAG> -o /tmp/fc-worldbuilder-v<TAG>.tar
podman save localhost/fc-worldbuilder:v<TAG> -o artifacts/deploy/fc-worldbuilder-v<TAG>.tar for h in 10.0.56.11 10.0.56.12 10.0.56.13; do
for h in 10.0.56.11 10.0.56.12; do scp /tmp/fc-worldbuilder-v<TAG>.tar fcadmin@$h:/tmp/
ssh fcadmin@$h "mkdir -p /home/fcadmin/.fcv"
scp artifacts/deploy/fc-worldbuilder-v<TAG>.tar fcadmin@$h:/home/fcadmin/.fcv/
ssh fcadmin@$h \ ssh fcadmin@$h \
"sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock \ "sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock \
-n k8s.io images import /home/fcadmin/.fcv/fc-worldbuilder-v<TAG>.tar" -n k8s.io images import /tmp/fc-worldbuilder-v<TAG>.tar"
done done
``` ```
Memory: `feedback_rke2_image_import_per_node_scp`. Memory: `feedback_rke2_image_import_per_node_scp`.
3. **Bump image tag** in `worldbuilder.yaml` and git push. 3. **Bump image tag** in `worldbuilder.yaml` and git push.
ArgoCD ApplicationSet picks up within ~3 minutes. ArgoCD ApplicationSet picks up within ~3 minutes.
4. **First production render** — verify 4. **First production render** — open
`https://worldbuilder.iamworkin.lan/healthz`, open `https://worldbuilder.iamworkin.lan/studio/c32e0000-0000-4000-8000-000000000004`
`https://worldbuilder.iamworkin.lan/settings`, and confirm the image backend and confirm the Cyberpunk Blue Jay demo prompt loads with five seeded fake
reports ComfyUI before running an operator-owned render lane. generated images. This Sprint 32 visitor-safe profile uses
`ClientMode=fake`; switch the image-generation env vars back to ComfyUI only
for an operator-owned GPU render lane.
## Health probes ## Health probes
@@ -55,8 +56,13 @@ Source: `D:\git\FlowerCore\FlowerCore.WorldBuilder` (master)
## Image generation backend ## Image generation backend
The live internal profile now uses Sprint 32 pins the Kubernetes profile to
`FlowerCore:WorldBuilder:ImageGeneration:ClientMode=comfyui` with `FlowerCore:WorldBuilder:ImageGeneration:ClientMode=fake` with
`BaseUrl=http://10.0.56.20:8188` on BLUEJAY-WS (R9700 / gfx1201 / ROCm 7.2). `BaseUrl=http://127.0.0.1:1`. That keeps the public/internal visitor demo
Keep the public host pre-staging disabled unless the five safe-to-expose gates deterministic, avoids GPU exposure, and still exercises the studio/gallery
are rechecked; the live GPU lane is operator-owned and internal-only. surface with persisted generated-image metadata.
The previous ComfyUI backend target was `http://10.0.56.20:8188` on
BLUEJAY-WS (R9700 / gfx1201 / ROCm 7.2.1). Re-enable it only in an
operator-owned follow-up that also verifies workstation reachability and image
import freshness.

View File

@@ -5,10 +5,10 @@
# #
# Image build (BLUEJAY-WS): # Image build (BLUEJAY-WS):
# bash deploy/build.sh # in FlowerCore.WorldBuilder repo # bash deploy/build.sh # in FlowerCore.WorldBuilder repo
# podman save localhost/fc-worldbuilder:v<TAG> -o artifacts/deploy/fc-worldbuilder-v<TAG>.tar # podman save localhost/fc-worldbuilder:v<TAG> -o /tmp/fc-worldbuilder-v<TAG>.tar
# for h in 10.0.56.11 10.0.56.12; do # for h in 10.0.56.11 10.0.56.12 10.0.56.13; do
# scp artifacts/deploy/fc-worldbuilder-v<TAG>.tar fcadmin@$h:/home/fcadmin/.fcv/ # scp /tmp/fc-worldbuilder-v<TAG>.tar fcadmin@$h:/tmp/
# ssh fcadmin@$h "sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /home/fcadmin/.fcv/fc-worldbuilder-v<TAG>.tar" # ssh fcadmin@$h "sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-worldbuilder-v<TAG>.tar"
# done # done
--- ---
apiVersion: v1 apiVersion: v1
@@ -90,7 +90,7 @@ spec:
containers: containers:
- name: web - name: web
# Bump tag for each rebuild. Initial deploy: v202605062048 # Bump tag for each rebuild. Initial deploy: v202605062048
image: localhost/fc-worldbuilder:v202606121657-35aaa2c-gpu image: localhost/fc-worldbuilder:v202605062048
imagePullPolicy: Never imagePullPolicy: Never
ports: ports:
- containerPort: 8080 - containerPort: 8080
@@ -117,16 +117,14 @@ spec:
value: "/data/gallery" value: "/data/gallery"
- name: FlowerCore__WorldBuilder__Export__RootPath - name: FlowerCore__WorldBuilder__Export__RootPath
value: "/data/exports" value: "/data/exports"
# Operator-approved live GPU lane. Internal-only host targets # Visitor-safe Sprint 32 profile: fake backend keeps public demo
# BLUEJAY-WS ComfyUI; keep public host pre-staging disabled below. # rendering deterministic and avoids exposing BLUEJAY-WS GPU.
- name: FlowerCore__WorldBuilder__ImageGeneration__BaseUrl - name: FlowerCore__WorldBuilder__ImageGeneration__BaseUrl
value: "http://10.0.56.20:8188" value: "http://127.0.0.1:1"
- name: FlowerCore__WorldBuilder__ImageGeneration__ClientMode - name: FlowerCore__WorldBuilder__ImageGeneration__ClientMode
value: "comfyui" value: "fake"
- name: FlowerCore__WorldBuilder__ImageGeneration__BackendId - name: FlowerCore__WorldBuilder__ImageGeneration__BackendId
value: "comfyui" value: "fake"
- name: FlowerCore__WorldBuilder__ImageGeneration__VisitorSafe
value: "false"
resources: resources:
# Cluster CPU-request budget runs hot (99% on all 3 nodes at deploy # Cluster CPU-request budget runs hot (99% on all 3 nodes at deploy
# time) while actual CPU usage is well below capacity. Idle Blazor # time) while actual CPU usage is well below capacity. Idle Blazor

View File

@@ -0,0 +1,74 @@
# Root ApplicationSet for the bluejay-infra GitOps tree.
# It scans the apps/* directories of the bluejay-infra repo on main and
# templates one Application per directory, named infra-<directory basename>.
# NOTE(review): the repo README states this object is not managed by ArgoCD
# itself and must be applied manually with kubectl -n argocd apply; confirm
# that is still the case before relying on auto-sync for changes to this file.
apiVersion: argoproj.io/v1alpha1
kind: ApplicationSet
metadata:
annotations:
argocd.argoproj.io/refresh: "true"
name: bluejay-infra
namespace: argocd
spec:
generators:
- git:
directories:
- path: apps/*
repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git
revision: main
# Generator-level template override is left empty (presumably export
# defaults); the effective template is the spec-level one further down.
template:
metadata: {}
spec:
destination: {}
project: ""
goTemplate: true
# missingkey=error: rendering fails instead of silently emitting an empty
# value when a template references a key the generator did not provide.
goTemplateOptions:
- missingkey=error
template:
metadata:
name: infra-{{.path.basename}}
spec:
destination:
server: https://kubernetes.default.svc
# One entry per database StatefulSet (zabbix, guacamole, matrix, authentik).
# Each entry ignores /spec/volumeClaimTemplates plus any status field under
# volumeClaimTemplates; per the repo README this stops ArgoCD from looping
# on server-side-apply drift. New StatefulSets that declare
# volumeClaimTemplates need their own entry appended here.
ignoreDifferences:
- group: apps
jqPathExpressions:
- .spec.volumeClaimTemplates[]?.status
jsonPointers:
- /spec/volumeClaimTemplates
kind: StatefulSet
name: zabbix-postgres
namespace: zabbix
- group: apps
jqPathExpressions:
- .spec.volumeClaimTemplates[]?.status
jsonPointers:
- /spec/volumeClaimTemplates
kind: StatefulSet
name: guac-mysql
namespace: guacamole
- group: apps
jqPathExpressions:
- .spec.volumeClaimTemplates[]?.status
jsonPointers:
- /spec/volumeClaimTemplates
kind: StatefulSet
name: matrix-postgres
namespace: matrix
- group: apps
jqPathExpressions:
- .spec.volumeClaimTemplates[]?.status
jsonPointers:
- /spec/volumeClaimTemplates
kind: StatefulSet
name: authentik-postgres
namespace: authentik
project: default
# Each generated Application syncs the matched apps/<name> directory from main.
source:
path: '{{.path.path}}'
repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git
targetRevision: main
# Automated sync with pruning and self-heal enabled.
syncPolicy:
automated:
prune: true
selfHeal: true
# RespectIgnoreDifferences is listed so the ignoreDifferences entries above
# are also honoured during sync, not only when computing the diff.
syncOptions:
- CreateNamespace=true
- ServerSideApply=true
- RespectIgnoreDifferences=true

View File

@@ -272,7 +272,6 @@ public sealed class FleetManifestLintTests
var container = deployments[expectedRunner.Key].MainContainerMappings().Should().ContainSingle().Subject; var container = deployments[expectedRunner.Key].MainContainerMappings().Should().ContainSingle().Subject;
EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value); EnvValue(container, "REPO_URL").Should().Be(expectedRunner.Value);
EnvValue(container, "EPHEMERAL").Should().Be("true"); EnvValue(container, "EPHEMERAL").Should().Be("true");
EnvValue(container, "DISABLE_AUTO_UPDATE").Should().Be("true", $"{expectedRunner.Key} must not self-update inside immutable Kubernetes runner pods");
EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux"); EnvValue(container, "LABELS").Should().Be("self-hosted,linux,fc-build-linux");
EnvValue(container, "RUN_AS_ROOT").Should().Be("false"); EnvValue(container, "RUN_AS_ROOT").Should().Be("false");
EnvValue(container, "ACCESS_TOKEN").Should().BeNull("ACCESS_TOKEN must come from github-runner-token Secret, not a literal"); EnvValue(container, "ACCESS_TOKEN").Should().BeNull("ACCESS_TOKEN must come from github-runner-token Secret, not a literal");
@@ -469,6 +468,99 @@ public sealed class FleetManifestLintTests
monitoring.Should().Contain("dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts"); monitoring.Should().Contain("dedicated LinuxRunnerOffline/MacMiniRunnerOffline alerts");
} }
/// <summary>
/// Verifies that the github-runner README still carries the wording that
/// documents ephemeral runner exit/restart churn as accepted noise.
/// </summary>
[Fact]
public void GithubRunnerReadme_DocumentsAcceptedEphemeralExitChurn()
{
    var readmePath = Path.Combine(Inventory.BluejayRoot, "apps", "github-runner", "README.md");
    var readme = File.ReadAllText(readmePath);

    // Phrases are checked in this order so the first missing one is reported.
    var requiredPhrases = new[]
    {
        "Ephemeral runner pods",
        "exit-1/restart churn",
        "accepted operational noise",
        "repo-scoped runner-offline alerts stay quiet",
    };

    foreach (var phrase in requiredPhrases)
    {
        readme.Should().Contain(phrase);
    }
}
/// <summary>
/// Verifies that the canonical Notes alert file and the Kubernetes monitoring
/// manifest both contain the PiManagerDown 8m delay and the
/// UpdateCenterPublicEdgeRateLimited rule text.
/// </summary>
/// <remarks>
/// NOTE(review): every check is a whole-file substring match, so fragments such
/// as "for: 8m" are not tied to a specific alert rule.
/// </remarks>
[Fact]
public void Monitoring_PiManagerDownDelayAndUpdateCenterRateLimit_MatchCanonicalAlerts()
{
    // Fragments that must appear verbatim in both files.
    const string PiManagerAlert = "- alert: PiManagerDown";
    const string PiManagerDelay = "for: 8m";
    const string RateLimitAlert = "- alert: UpdateCenterPublicEdgeRateLimited";
    const string RateLimitExpr = "expr: probe_http_status_code{job=\"probe-update-center-public-edge\"} == 429";
    const string RateLimitDelay = "for: 15m";

    var notesAlertsPath = Path.Combine(
        Inventory.WorkspaceRoot,
        "FlowerCore.Notes",
        "scripts",
        "monitoring",
        "alerts.yml");
    var notesAlerts = File.ReadAllText(notesAlertsPath);

    var monitoringPath = Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml");
    var monitoring = File.ReadAllText(monitoringPath);

    // PiManagerDown: each file carries its own wording of the Sprint 67 comment.
    notesAlerts.Should().Contain("# Sprint 67: keep this warning behind NodeDown's 5m critical page");
    notesAlerts.Should().Contain(PiManagerAlert);
    notesAlerts.Should().Contain(PiManagerDelay);

    monitoring.Should().Contain("# Sprint 67: delayed behind NodeDown's critical page");
    monitoring.Should().Contain(PiManagerAlert);
    monitoring.Should().Contain(PiManagerDelay);

    // Update Center rate-limit rule: identical text expected in both files.
    notesAlerts.Should().Contain(RateLimitAlert);
    notesAlerts.Should().Contain(RateLimitExpr);
    notesAlerts.Should().Contain(RateLimitDelay);

    monitoring.Should().Contain(RateLimitAlert);
    monitoring.Should().Contain(RateLimitExpr);
    monitoring.Should().Contain(RateLimitDelay);
    monitoring.Should().Contain("severity: warning");
}
/// <summary>
/// Verifies that the exported root ApplicationSet manifest exists, contains no
/// status or managedFields sections, and that the repo README documents it as
/// the manually applied root of the GitOps tree.
/// </summary>
[Fact]
public void ApplicationSetExport_MustRemainManualRootOfGitOpsTree()
{
    var readme = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "README.md"));
    var appsetPath = Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml");

    // Assert existence first so a missing export fails here rather than in the read below.
    File.Exists(appsetPath).Should().BeTrue();
    var appset = File.ReadAllText(appsetPath);

    foreach (var identityFragment in new[] { "kind: ApplicationSet", "name: bluejay-infra" })
    {
        appset.Should().Contain(identityFragment);
    }

    // Sections that must not appear in the committed export.
    foreach (var forbiddenFragment in new[] { "\nstatus:", "managedFields:" })
    {
        appset.Should().NotContain(forbiddenFragment);
    }

    var readmeRequirements = new[]
    {
        "root of this GitOps tree",
        "NOT self-managed",
        "kubectl -n argocd apply -f argocd/applicationset-bluejay-infra.yaml",
    };

    foreach (var requirement in readmeRequirements)
    {
        readme.Should().Contain(requirement);
    }
}
/// <summary>
/// Verifies that the exported ApplicationSet still contains the apps/* git
/// generator on main, the templated source path, and the sync options the
/// export is expected to carry.
/// </summary>
[Fact]
public void ApplicationSetExport_MustDiscoverAppsDirectoryOnMain()
{
    var appsetPath = Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml");
    var appset = File.ReadAllText(appsetPath);

    // Checked in order: generator settings, templated source, then sync options.
    var expectedFragments = new[]
    {
        "path: apps/*",
        "revision: main",
        "repoURL: http://gitea-clusterip.gitea.svc:3000/bluejay/bluejay-infra.git",
        "path: '{{.path.path}}'",
        "targetRevision: main",
        "ServerSideApply=true",
        "RespectIgnoreDifferences=true",
    };

    foreach (var fragment in expectedFragments)
    {
        appset.Should().Contain(fragment);
    }
}
/// <summary>
/// Verifies that the exported ApplicationSet keeps exactly four StatefulSet
/// ignoreDifferences entries and that each expected name and namespace
/// appears in the file.
/// </summary>
[Fact]
public void ApplicationSetExport_MustPreserveStatefulSetIgnoreDifferences()
{
    var appsetPath = Path.Combine(Inventory.BluejayRoot, "argocd", "applicationset-bluejay-infra.yaml");
    var appset = File.ReadAllText(appsetPath);

    // Both ignore mechanisms (JSON pointer and jq path) must be present.
    var ignoreFragments = new[]
    {
        "jsonPointers:",
        "- /spec/volumeClaimTemplates",
        ".spec.volumeClaimTemplates[]?.status",
    };

    foreach (var fragment in ignoreFragments)
    {
        appset.Should().Contain(fragment);
    }

    // Exactly one ignoreDifferences entry per expected StatefulSet below.
    Regex.Matches(appset, "kind: StatefulSet").Should().HaveCount(4);

    var expectedStatefulSets = new (string Name, string Namespace)[]
    {
        ("zabbix-postgres", "zabbix"),
        ("guac-mysql", "guacamole"),
        ("matrix-postgres", "matrix"),
        ("authentik-postgres", "authentik"),
    };

    foreach (var entry in expectedStatefulSets)
    {
        appset.Should().Contain($"name: {entry.Name}");
        appset.Should().Contain($"namespace: {entry.Namespace}");
    }
}
[Fact] [Fact]
public void Monitoring_BlackboxTargetsForOidcSensitiveServices_MustUseAnonymousHealthRoutesWhenAvailable() public void Monitoring_BlackboxTargetsForOidcSensitiveServices_MustUseAnonymousHealthRoutesWhenAvailable()
{ {
@@ -650,7 +742,6 @@ public sealed class FleetManifestLintTests
"certificate-web.yaml", "certificate-web.yaml",
"clusterrole-operator.yaml", "clusterrole-operator.yaml",
"clusterrolebinding-operator.yaml", "clusterrolebinding-operator.yaml",
"crds.yaml",
"deployment-operator.yaml", "deployment-operator.yaml",
"deployment-web.yaml", "deployment-web.yaml",
"ingressroute-web.yaml", "ingressroute-web.yaml",
@@ -740,8 +831,7 @@ public sealed class FleetManifestLintTests
.Single(document => document.Kind == "ClusterRole" && document.Name == "fc-devicemgmt-operator"); .Single(document => document.Kind == "ClusterRole" && document.Name == "fc-devicemgmt-operator");
var allScalars = clusterRole.AllScalars().ToList(); var allScalars = clusterRole.AllScalars().ToList();
allScalars.Should().Contain("flowercore.io"); allScalars.Should().Contain("devices.flowercore.io");
allScalars.Should().NotContain("devices.flowercore.io");
allScalars.Should().Contain("*"); allScalars.Should().Contain("*");
allScalars.Should().Contain("deployments"); allScalars.Should().Contain("deployments");
allScalars.Should().Contain("get"); allScalars.Should().Contain("get");
@@ -770,7 +860,7 @@ public sealed class FleetManifestLintTests
FcDeviceManagementDocuments().Should().NotContain(document => document.Kind == "Secret"); FcDeviceManagementDocuments().Should().NotContain(document => document.Kind == "Secret");
appText.Should().Contain("secretKeyRef:"); appText.Should().Contain("secretKeyRef:");
appText.Should().Contain("name: fc-devicemgmt-runtime"); appText.Should().Contain("secretName: fc-devicemgmt-runtime");
appText.Should().NotContain("stringData:"); appText.Should().NotContain("stringData:");
appText.Should().NotContain("from-literal"); appText.Should().NotContain("from-literal");
appText.Should().NotContain("tls.key:"); appText.Should().NotContain("tls.key:");
@@ -867,9 +957,9 @@ public sealed class FleetManifestLintTests
{ {
var deployments = new[] var deployments = new[]
{ {
(App: "fc-dns", Name: "dns-web", Slug: "dns", Secret: "dns-oidc-client", AuthEnabled: "false"), (App: "fc-dns", Name: "dns-web", Slug: "dns", Secret: "dns-oidc-client"),
(App: "fc-media", Name: "fc-media-web", Slug: "media", Secret: "media-oidc-client", AuthEnabled: "true"), (App: "fc-media", Name: "fc-media-web", Slug: "media", Secret: "media-oidc-client"),
(App: "fc-distribution", Name: "fc-distribution", Slug: "distribution", Secret: "distribution-oidc-client", AuthEnabled: "true"), (App: "fc-distribution", Name: "fc-distribution", Slug: "distribution", Secret: "distribution-oidc-client"),
}; };
foreach (var expected in deployments) foreach (var expected in deployments)
@@ -878,7 +968,7 @@ public sealed class FleetManifestLintTests
.Single(document => document.Kind == "Deployment" && document.Name == expected.Name); .Single(document => document.Kind == "Deployment" && document.Name == expected.Name);
var container = deployment.MainContainerMappings().Should().ContainSingle().Subject; var container = deployment.MainContainerMappings().Should().ContainSingle().Subject;
EnvValue(container, "FlowerCore__Auth__Enabled").Should().Be(expected.AuthEnabled); EnvValue(container, "FlowerCore__Auth__Enabled").Should().Be("true");
EnvValue(container, "FlowerCore__Auth__Oidc__Enabled").Should().Be("true"); EnvValue(container, "FlowerCore__Auth__Oidc__Enabled").Should().Be("true");
(EnvValue(container, "FlowerCore__Auth__Oidc__Audience") ?? EnvValue(container, "FlowerCore__Auth__Oidc__ClientId")) (EnvValue(container, "FlowerCore__Auth__Oidc__Audience") ?? EnvValue(container, "FlowerCore__Auth__Oidc__ClientId"))
.Should() .Should()
@@ -927,7 +1017,7 @@ public sealed class FleetManifestLintTests
var dnsPvc = AppDocuments("fc-dns") var dnsPvc = AppDocuments("fc-dns")
.Single(document => document.Kind == "PersistentVolumeClaim" && document.Name == "dns-web-data"); .Single(document => document.Kind == "PersistentVolumeClaim" && document.Name == "dns-web-data");
ManifestNodeExtensions.Scalar(dnsContainer, "image").Should().Be("localhost/fc-dns-web:v20260612-l4dns-a5d2849"); ManifestNodeExtensions.Scalar(dnsContainer, "image").Should().Be("localhost/fc-dns-web:v20260604-oidc-proper");
dnsPvc.Scalar("spec", "storageClassName").Should().Be("longhorn"); dnsPvc.Scalar("spec", "storageClassName").Should().Be("longhorn");
dnsPvc.Scalar("spec", "resources", "requests", "storage").Should().Be("1Gi"); dnsPvc.Scalar("spec", "resources", "requests", "storage").Should().Be("1Gi");