Compare commits
5 Commits
codex/ttsr
...
9b255fefc1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9b255fefc1 | ||
|
|
2489464d4f | ||
|
|
4b777b16ac | ||
|
|
8c60e3a4d3 | ||
|
|
df02b4c3c3 |
@@ -1024,6 +1024,72 @@ data:
|
|||||||
summary: "Longhorn node {{ $labels.node }} not Ready"
|
summary: "Longhorn node {{ $labels.node }} not Ready"
|
||||||
description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."
|
description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# FC Signage Marquee Performance — Track 3 + 8 (2026-05-06)
|
||||||
|
# Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
|
||||||
|
# Source-of-truth for the live Podman Prometheus on noc1 is the
|
||||||
|
# Notes file; this K8s ConfigMap exists so a future migration to
|
||||||
|
# in-cluster Prometheus inherits the ruleset automatically.
|
||||||
|
# See feedback_monitoring_k8s_target_vs_live_podman.
|
||||||
|
# ============================================================
|
||||||
|
- name: fc-signage-marquee
|
||||||
|
rules:
|
||||||
|
- alert: MarqueeDroppedFramesHigh
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
sum by (renderer, phase, node_id) (rate(marquee_dropped_frames_total[5m]))
|
||||||
|
/
|
||||||
|
sum by (renderer, phase, node_id) (rate(marquee_render_latency_ms_count[5m]))
|
||||||
|
) > 0.05
|
||||||
|
unless on()
|
||||||
|
absent_over_time(marquee_dropped_frames_total[7d])
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: signage
|
||||||
|
alert_channel: irc
|
||||||
|
annotations:
|
||||||
|
summary: "Marquee dropped-frame rate >5% on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
|
||||||
|
description: "Renderer {{ $labels.renderer }} on {{ $labels.node_id }} drops >5% of frames during {{ $labels.phase }}. Animation visibly stuttery."
|
||||||
|
|
||||||
|
- alert: MarqueeRenderLatencyP99High
|
||||||
|
expr: |
|
||||||
|
histogram_quantile(
|
||||||
|
0.99,
|
||||||
|
sum by (renderer, phase, node_id, le) (rate(marquee_render_latency_ms_bucket[5m]))
|
||||||
|
) > 16
|
||||||
|
unless on()
|
||||||
|
absent_over_time(marquee_render_latency_ms_bucket[7d])
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: signage
|
||||||
|
alert_channel: irc
|
||||||
|
annotations:
|
||||||
|
summary: "Marquee render latency p99 > 16ms on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
|
||||||
|
description: "Per-frame render latency p99 has exceeded the Pi-class 16ms budget for 10 minutes."
|
||||||
|
|
||||||
|
- alert: MarqueeAnimationDurationDrift
|
||||||
|
expr: |
|
||||||
|
abs(
|
||||||
|
histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m])))
|
||||||
|
-
|
||||||
|
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
|
||||||
|
)
|
||||||
|
/
|
||||||
|
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
|
||||||
|
> 0.10
|
||||||
|
unless on()
|
||||||
|
absent_over_time(marquee_animation_duration_ms_bucket[7d])
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
service: signage
|
||||||
|
alert_channel: irc
|
||||||
|
annotations:
|
||||||
|
summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
|
||||||
|
description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# ConfigMap: Blackbox Exporter Configuration
|
# ConfigMap: Blackbox Exporter Configuration
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
60
apps/worldbuilder/README.md
Normal file
60
apps/worldbuilder/README.md
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
# FlowerCore.WorldBuilder
|
||||||
|
|
||||||
|
ArgoCD-managed manifest for FlowerCore.WorldBuilder.Web — a comic / storyboard
|
||||||
|
authoring service that drives ComfyUI for panel image generation and
|
||||||
|
QuestPDF for letter / A4 export.
|
||||||
|
|
||||||
|
Source: `D:\git\FlowerCore\FlowerCore.WorldBuilder` (master)
|
||||||
|
|
||||||
|
## Deployment order
|
||||||
|
|
||||||
|
1. **DNS preflight** — `worldbuilder.iamworkin.lan -> 10.0.56.200` MUST exist
|
||||||
|
in pfSense Unbound before this manifest is applied, or cert-manager
|
||||||
|
HTTP-01 silently backs off exponentially (~2h).
|
||||||
|
Memory: `feedback_pfsense_dns_required_for_acme`.
|
||||||
|
2. **Image import to ALL RKE2 nodes** — the pod can be scheduled on any of
|
||||||
|
`rke2-server` (10.0.56.11), `rke2-agent1` (10.0.56.12),
|
||||||
|
`rke2-agent2` (10.0.56.13). Build with:
|
||||||
|
```bash
|
||||||
|
bash deploy/build.sh # in FlowerCore.WorldBuilder repo
|
||||||
|
podman save localhost/fc-worldbuilder:v<TAG> -o /tmp/fc-worldbuilder-v<TAG>.tar
|
||||||
|
for h in 10.0.56.11 10.0.56.12 10.0.56.13; do
|
||||||
|
scp /tmp/fc-worldbuilder-v<TAG>.tar fcadmin@$h:/tmp/
|
||||||
|
ssh fcadmin@$h \
|
||||||
|
"sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock \
|
||||||
|
-n k8s.io images import /tmp/fc-worldbuilder-v<TAG>.tar"
|
||||||
|
done
|
||||||
|
```
|
||||||
|
Memory: `feedback_rke2_image_import_per_node_scp`.
|
||||||
|
3. **Bump image tag** in `worldbuilder.yaml` and git push.
|
||||||
|
ArgoCD ApplicationSet picks up the change within ~3 minutes.
|
||||||
|
4. **First production render** — open `https://worldbuilder.iamworkin.lan`,
|
||||||
|
create World → Character → Storyboard → ExportJob, confirm artifact
|
||||||
|
downloads. ComfyUI lives on BLUEJAY-WS at `http://10.0.56.20:8188`.
|
||||||
|
|
||||||
|
## Health probes
|
||||||
|
|
||||||
|
- `startupProbe` + `readinessProbe`: `httpGet /healthz` (registered explicitly
|
||||||
|
in Program.cs — anonymous, no DB or OpenAPI dependency).
|
||||||
|
- `livenessProbe`: `tcpSocket` as a cheap fallback.
|
||||||
|
Memory: `feedback_k8s_probes_must_not_hit_openapi`,
|
||||||
|
`feedback_k8s_probes_behind_auth_middleware`.
|
||||||
|
|
||||||
|
## Storage
|
||||||
|
|
||||||
|
- Longhorn RWO PVC `worldbuilder-data` (5Gi) mounted at `/data`. SQLite DB
|
||||||
|
lives at `/data/worldbuilder.db`, generated images under `/data/gallery/`,
|
||||||
|
PDF/PNG exports under `/data/exports/`.
|
||||||
|
- DataProtection keys persist to the same SQLite database via
|
||||||
|
`AddFlowerCoreDataProtection<WorldBuilderDbContext>` — explicit migration
|
||||||
|
`20260429133417_Initial` already creates `fc_dp_keys`.
|
||||||
|
Memory: `feedback_dataprotection_keys_persist_to_app_dbcontext`,
|
||||||
|
`feedback_intranet_dataprotection_table_must_have_explicit_migration`.
|
||||||
|
|
||||||
|
## Image generation backend
|
||||||
|
|
||||||
|
`FlowerCore:WorldBuilder:ImageGeneration:BaseUrl=http://10.0.56.20:8188` —
|
||||||
|
ComfyUI runs on BLUEJAY-WS Windows (R9700 / gfx1201 / ROCm 7.2.1). Pod reaches
|
||||||
|
the workstation directly across the 10.0.56.0/24 VLAN (no Podman-style
|
||||||
|
host-filter issues — K8s pods route via Calico, which is L3-routed across the
|
||||||
|
VLAN).
|
||||||
208
apps/worldbuilder/worldbuilder.yaml
Normal file
208
apps/worldbuilder/worldbuilder.yaml
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
# FlowerCore.WorldBuilder — comic / storyboard authoring service.
|
||||||
|
#
|
||||||
|
# Namespace + PVC + Deployment + Service + Certificate + IngressRoute. ArgoCD-managed
|
||||||
|
# end-to-end. See apps/worldbuilder/README.md for the per-deploy runbook.
|
||||||
|
#
|
||||||
|
# Image build (BLUEJAY-WS):
|
||||||
|
# bash deploy/build.sh # in FlowerCore.WorldBuilder repo
|
||||||
|
# podman save localhost/fc-worldbuilder:v<TAG> -o /tmp/fc-worldbuilder-v<TAG>.tar
|
||||||
|
# for h in 10.0.56.11 10.0.56.12 10.0.56.13; do
|
||||||
|
# scp /tmp/fc-worldbuilder-v<TAG>.tar fcadmin@$h:/tmp/
|
||||||
|
# ssh fcadmin@$h "sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-worldbuilder-v<TAG>.tar"
|
||||||
|
# done
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: fc-worldbuilder
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
---
|
||||||
|
# SQLite DB + generated image gallery + PDF/PNG exports.
|
||||||
|
# Longhorn RWO — single replica with `Recreate` rollout strategy keeps it safe.
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: worldbuilder-data
|
||||||
|
namespace: fc-worldbuilder
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
storageClassName: longhorn
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 5Gi
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: worldbuilder-web
|
||||||
|
namespace: fc-worldbuilder
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: worldbuilder-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
revisionHistoryLimit: 3
|
||||||
|
strategy:
|
||||||
|
# RWO PVC + single replica. Recreate avoids multi-attach overlap.
|
||||||
|
type: Recreate
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/name: worldbuilder-web
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: worldbuilder-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
annotations:
|
||||||
|
prometheus.io/scrape: "true"
|
||||||
|
prometheus.io/port: "8080"
|
||||||
|
prometheus.io/path: "/metrics/prometheus"
|
||||||
|
spec:
|
||||||
|
securityContext:
|
||||||
|
fsGroup: 1654
|
||||||
|
fsGroupChangePolicy: OnRootMismatch
|
||||||
|
containers:
|
||||||
|
- name: web
|
||||||
|
# Bump tag for each rebuild. Initial deploy: v202605062048
|
||||||
|
image: localhost/fc-worldbuilder:v202605062048
|
||||||
|
imagePullPolicy: Never
|
||||||
|
ports:
|
||||||
|
- containerPort: 8080
|
||||||
|
name: http
|
||||||
|
env:
|
||||||
|
- name: ASPNETCORE_URLS
|
||||||
|
value: "http://+:8080"
|
||||||
|
- name: ASPNETCORE_ENVIRONMENT
|
||||||
|
value: "Production"
|
||||||
|
- name: DOTNET_RUNNING_IN_CONTAINER
|
||||||
|
value: "true"
|
||||||
|
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
||||||
|
value: "false"
|
||||||
|
# SQLite path overrides (default appsettings uses relative paths).
|
||||||
|
- name: ConnectionStrings__DefaultConnection
|
||||||
|
value: "Data Source=/data/worldbuilder.db"
|
||||||
|
- name: FlowerCore__Database__Provider
|
||||||
|
value: "Sqlite"
|
||||||
|
- name: FlowerCore__Database__ConnectionStrings__Sqlite
|
||||||
|
value: "Data Source=/data/worldbuilder.db"
|
||||||
|
# Generated image gallery + exports persist on /data.
|
||||||
|
- name: FlowerCore__WorldBuilder__ImageStore__RootPath
|
||||||
|
value: "/data/gallery"
|
||||||
|
- name: FlowerCore__WorldBuilder__Export__RootPath
|
||||||
|
value: "/data/exports"
|
||||||
|
# ComfyUI on BLUEJAY-WS (R9700 / gfx1201 / ROCm 7.2.1).
|
||||||
|
- name: FlowerCore__WorldBuilder__ImageGeneration__BaseUrl
|
||||||
|
value: "http://10.0.56.20:8188"
|
||||||
|
- name: FlowerCore__WorldBuilder__ImageGeneration__ClientMode
|
||||||
|
value: "comfyui"
|
||||||
|
resources:
|
||||||
|
# Cluster CPU-request budget runs hot (99% on all 3 nodes at deploy
|
||||||
|
# time) while actual CPU usage is well below capacity. Idle Blazor
|
||||||
|
# Server + SignalR + a single ComfyUI poller uses ~5m, so 25m is
|
||||||
|
# generous. Re-evaluate if active rendering/export workers ever
|
||||||
|
# push past the limit.
|
||||||
|
requests:
|
||||||
|
cpu: 25m
|
||||||
|
memory: 256Mi
|
||||||
|
limits:
|
||||||
|
cpu: 1000m
|
||||||
|
memory: 768Mi
|
||||||
|
# /healthz is registered explicitly in Program.cs (anonymous, no DB
|
||||||
|
# or OpenAPI dependency). Liveness uses tcpSocket as a cheap fallback
|
||||||
|
# in case future middleware changes accidentally gate /healthz.
|
||||||
|
# Memory: feedback_k8s_probes_must_not_hit_openapi,
|
||||||
|
# feedback_k8s_probes_behind_auth_middleware.
|
||||||
|
startupProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /healthz
|
||||||
|
port: 8080
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
failureThreshold: 30
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /healthz
|
||||||
|
port: 8080
|
||||||
|
periodSeconds: 10
|
||||||
|
failureThreshold: 3
|
||||||
|
livenessProbe:
|
||||||
|
tcpSocket:
|
||||||
|
port: 8080
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 30
|
||||||
|
failureThreshold: 3
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1654
|
||||||
|
runAsGroup: 1654
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
volumeMounts:
|
||||||
|
- name: data
|
||||||
|
mountPath: /data
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
- name: logs
|
||||||
|
mountPath: /app/logs
|
||||||
|
volumes:
|
||||||
|
- name: data
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: worldbuilder-data
|
||||||
|
- name: tmp
|
||||||
|
emptyDir: {}
|
||||||
|
- name: logs
|
||||||
|
emptyDir: {}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: worldbuilder-web
|
||||||
|
namespace: fc-worldbuilder
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: worldbuilder-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
spec:
|
||||||
|
type: ClusterIP
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/name: worldbuilder-web
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
port: 80
|
||||||
|
targetPort: 8080
|
||||||
|
---
|
||||||
|
apiVersion: cert-manager.io/v1
|
||||||
|
kind: Certificate
|
||||||
|
metadata:
|
||||||
|
name: worldbuilder-web-tls
|
||||||
|
namespace: fc-worldbuilder
|
||||||
|
spec:
|
||||||
|
secretName: worldbuilder-web-tls
|
||||||
|
issuerRef:
|
||||||
|
name: step-ca-acme
|
||||||
|
kind: ClusterIssuer
|
||||||
|
dnsNames:
|
||||||
|
- worldbuilder.iamworkin.lan
|
||||||
|
duration: 2160h # 90d
|
||||||
|
renewBefore: 720h # 30d
|
||||||
|
---
|
||||||
|
apiVersion: traefik.io/v1alpha1
|
||||||
|
kind: IngressRoute
|
||||||
|
metadata:
|
||||||
|
name: worldbuilder-web
|
||||||
|
namespace: fc-worldbuilder
|
||||||
|
spec:
|
||||||
|
entryPoints:
|
||||||
|
- websecure
|
||||||
|
routes:
|
||||||
|
- match: Host(`worldbuilder.iamworkin.lan`)
|
||||||
|
kind: Rule
|
||||||
|
services:
|
||||||
|
- name: worldbuilder-web
|
||||||
|
port: 80
|
||||||
|
tls:
|
||||||
|
secretName: worldbuilder-web-tls
|
||||||
Reference in New Issue
Block a user