Compare commits
6 Commits
claude/ci1
...
f298339152
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f298339152 | ||
|
|
6e7d88db49 | ||
|
|
5ae50bd491 | ||
|
|
653d4472f5 | ||
|
|
eb8693e1ce | ||
|
|
667777a653 |
171
apps/fc-redis/fc-redis.yaml
Normal file
171
apps/fc-redis/fc-redis.yaml
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
# fc-redis — SignalR backplane for cross-product event bus
|
||||||
|
#
|
||||||
|
# Lands per Q-SO-1 resolution (2026-05-11 PM): SignalR backplane in Phase A,
|
||||||
|
# not Phase C as originally drafted. Operator directive: "Redis can be
|
||||||
|
# deployed just fine as it's another FlowerCore technology we'll want to
|
||||||
|
# manage."
|
||||||
|
#
|
||||||
|
# Phase A scope (this file):
|
||||||
|
# - Single Redis 7.x Alpine pod
|
||||||
|
# - 1Gi Longhorn RWO PVC for AOF persistence
|
||||||
|
# - ClusterIP Service at `redis.fc-redis.svc.cluster.local:6379`
|
||||||
|
# - No AUTH (in-cluster only; not exposed externally)
|
||||||
|
# - No IngressRoute (backplane is server-to-server only)
|
||||||
|
#
|
||||||
|
# Consumers (Phase A IMPL across FC services):
|
||||||
|
# - FlowerCore.Signage.Web (OpsConsoleHub)
|
||||||
|
# - FlowerCore.Scoreboard.Web (ScoreboardHub)
|
||||||
|
# - FlowerCore.SignalControl.Web
|
||||||
|
# - FlowerCore.DMS.Web
|
||||||
|
# - Any other product joining the cross-product event bus
|
||||||
|
#
|
||||||
|
# Each consumer adds:
|
||||||
|
# services.AddSignalR()
|
||||||
|
# .AddStackExchangeRedis(
|
||||||
|
# "redis.fc-redis.svc.cluster.local:6379",
|
||||||
|
# opts => opts.Configuration.ChannelPrefix =
|
||||||
|
# StackExchange.Redis.RedisChannel.Literal("fc-opsconsole"));
|
||||||
|
#
|
||||||
|
# Phase B / C follow-ons (out of scope here):
|
||||||
|
# - Redis Sentinel for HA (3-node)
|
||||||
|
# - AUTH password from 1Password Connect (rotate via /rotate-password)
|
||||||
|
# - redis_exporter sidecar for Prometheus scrape
|
||||||
|
# - Network policies restricting which namespaces can dial 6379
|
||||||
|
#
|
||||||
|
# Design: docs/signage/operations-console-phase-2-design.md §3.5
|
||||||
|
# Decision: Q-SO-1 (RESOLVED 2026-05-11 PM)
|
||||||
|
# Memory: feedback_blooming_ui_pattern_no_iframes
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: fc-redis
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: fc-redis-data
|
||||||
|
namespace: fc-redis
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
storageClassName: longhorn
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 1Gi
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: fc-redis-config
|
||||||
|
namespace: fc-redis
|
||||||
|
data:
|
||||||
|
redis.conf: |
|
||||||
|
# Phase A — minimal config; no AUTH, no replication.
|
||||||
|
bind 0.0.0.0
|
||||||
|
protected-mode no
|
||||||
|
port 6379
|
||||||
|
tcp-backlog 511
|
||||||
|
timeout 0
|
||||||
|
tcp-keepalive 300
|
||||||
|
|
||||||
|
# Persistence: AOF (fsync every second is the standard SignalR-backplane
|
||||||
|
# durability sweet spot — the backplane only needs to survive Redis
|
||||||
|
# restarts, not absolute zero loss).
|
||||||
|
appendonly yes
|
||||||
|
appendfsync everysec
|
||||||
|
auto-aof-rewrite-percentage 100
|
||||||
|
auto-aof-rewrite-min-size 64mb
|
||||||
|
|
||||||
|
# Reasonable defaults — let Redis pick most things.
|
||||||
|
maxmemory-policy allkeys-lru
|
||||||
|
maxmemory 256mb
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
loglevel notice
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: fc-redis
|
||||||
|
namespace: fc-redis
|
||||||
|
labels:
|
||||||
|
app: fc-redis
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
strategy:
|
||||||
|
type: Recreate # RWO PVC; do not do rolling update
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: fc-redis
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: fc-redis
|
||||||
|
spec:
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 999 # redis:7-alpine default uid
|
||||||
|
runAsGroup: 999
|
||||||
|
fsGroup: 999
|
||||||
|
containers:
|
||||||
|
- name: redis
|
||||||
|
image: redis:7-alpine
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
command: ["redis-server", "/etc/redis/redis.conf"]
|
||||||
|
ports:
|
||||||
|
- name: redis
|
||||||
|
containerPort: 6379
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: "50m"
|
||||||
|
memory: "128Mi"
|
||||||
|
limits:
|
||||||
|
cpu: "500m"
|
||||||
|
memory: "384Mi"
|
||||||
|
volumeMounts:
|
||||||
|
- name: data
|
||||||
|
mountPath: /data
|
||||||
|
- name: config
|
||||||
|
mountPath: /etc/redis
|
||||||
|
readOnly: true
|
||||||
|
livenessProbe:
|
||||||
|
tcpSocket:
|
||||||
|
port: 6379
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 10
|
||||||
|
readinessProbe:
|
||||||
|
exec:
|
||||||
|
command: ["redis-cli", "ping"]
|
||||||
|
initialDelaySeconds: 2
|
||||||
|
periodSeconds: 5
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop: [ALL]
|
||||||
|
volumes:
|
||||||
|
- name: data
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: fc-redis-data
|
||||||
|
- name: config
|
||||||
|
configMap:
|
||||||
|
name: fc-redis-config
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: redis
|
||||||
|
namespace: fc-redis
|
||||||
|
spec:
|
||||||
|
type: ClusterIP
|
||||||
|
selector:
|
||||||
|
app: fc-redis
|
||||||
|
ports:
|
||||||
|
- name: redis
|
||||||
|
port: 6379
|
||||||
|
targetPort: 6379
|
||||||
|
protocol: TCP
|
||||||
@@ -466,11 +466,11 @@ spec:
|
|||||||
itemPath: vaults/IAmWorkin/items/Guacamole JSON Auth
|
itemPath: vaults/IAmWorkin/items/Guacamole JSON Auth
|
||||||
---
|
---
|
||||||
---
|
---
|
||||||
# 1Password-backed credentials for Mac mini VNC access (Phase 1 — 2026-04-28)
|
# 1Password-backed credentials for Mac mini VNC access (Phase 1 — 2026-04-28)
|
||||||
# The operator mints Secret 'macmini-vnc-creds' with keys: username, password, VNC Password
|
# The operator mints Secret 'macmini-vnc-creds' with keys: username, password, VNC Password
|
||||||
# Note: '1Password' field label 'VNC Password' -> K8s Secret key 'VNC Password' (space retained)
|
# Note: '1Password' field label 'VNC Password' -> K8s Secret key 'VNC Password' (space retained)
|
||||||
# Guacamole VNC connection password is sourced from the 'VNC Password' field.
|
# Guacamole VNC connection password is sourced from the 'VNC Password' field.
|
||||||
# Actual IP is 10.0.56.115 (INFRA VLAN) — the 1P item 'IP' field is kept as backup reference.
|
# Actual IP is 10.0.56.115 (INFRA VLAN) — the 1P item 'IP' field is kept as backup reference.
|
||||||
apiVersion: onepassword.com/v1
|
apiVersion: onepassword.com/v1
|
||||||
kind: OnePasswordItem
|
kind: OnePasswordItem
|
||||||
metadata:
|
metadata:
|
||||||
@@ -481,6 +481,7 @@ metadata:
|
|||||||
app.kubernetes.io/part-of: flowercore
|
app.kubernetes.io/part-of: flowercore
|
||||||
spec:
|
spec:
|
||||||
itemPath: vaults/IAmWorkin/items/Mac Mini
|
itemPath: vaults/IAmWorkin/items/Mac Mini
|
||||||
|
---
|
||||||
# Blue Jay Branding Extension (CSS + translations)
|
# Blue Jay Branding Extension (CSS + translations)
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
|
|||||||
@@ -411,24 +411,22 @@ spec:
|
|||||||
# Confirmed via debug pod: PVC content IS a real bootable ISO9660
|
# Confirmed via debug pod: PVC content IS a real bootable ISO9660
|
||||||
# (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the
|
# (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the
|
||||||
# only bug was boot priority.
|
# only bug was boot priority.
|
||||||
# 2026-05-08 PM: ISO presented as a virtio-blk DISK (not cdrom).
|
# 2026-05-08 PM: cdrom bus SCSI + containerDisk delivery. This
|
||||||
# Both SATA and SCSI cdrom buses hit OVMF BdsDxe "starting Boot0001
|
# combination boots qemu cleanly and reaches OVMF, but OVMF
|
||||||
# ... Time out" regardless of storage backend (NFS, Longhorn PVC,
|
# BdsDxe still hits "starting Boot0001 ... Time out" on the
|
||||||
# containerDisk tmpfs — all rule out IO speed). The qemu cdrom
|
# cdrom — see HANDOFF.md / CODEX-STATUS.md "OPEN — ci1" for the
|
||||||
# emulation path appears to have a deep-seated read window issue
|
# full diagnostic chain. virtio-blk disk swap was attempted as a
|
||||||
# under KubeVirt v1.4.0's OVMF firmware.
|
# workaround but introduced a separate QEMU rootdisk flock issue
|
||||||
#
|
# without fixing the underlying OVMF cdrom problem; reverted.
|
||||||
# Workaround: present the ISO bytes as a regular virtio-blk disk
|
# Operator decision needed for next architectural step (OVMF
|
||||||
# (model="virtio-non-transitional"). UEFI/OVMF still recognizes
|
# custom build with extended timeout, KubeVirt version bump,
|
||||||
# ISO9660 + El Torito boot records on a regular disk, so it can
|
# Hyper-V/VirtualBox-and-export, or BIOS legacy boot). The
|
||||||
# boot the EFI bootloader the same way it would from a USB stick.
|
# containerDisk distribution pipeline (build/save/scp/ctr import)
|
||||||
# This is also closer to the FlowerCore.Distribution USB-key
|
# is proven and ready to reuse for any of those.
|
||||||
# pattern: the ISO bytes live on a block device, UEFI boots from
|
|
||||||
# the GPT/El Torito boot record, Windows installer runs.
|
|
||||||
- name: windows-iso
|
- name: windows-iso
|
||||||
bootOrder: 1
|
bootOrder: 1
|
||||||
disk:
|
cdrom:
|
||||||
bus: virtio
|
bus: scsi
|
||||||
- name: rootdisk
|
- name: rootdisk
|
||||||
bootOrder: 2
|
bootOrder: 2
|
||||||
disk:
|
disk:
|
||||||
|
|||||||
@@ -974,6 +974,39 @@ data:
|
|||||||
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
|
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
|
||||||
description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."
|
description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."
|
||||||
|
|
||||||
|
# Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
|
||||||
|
# cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
|
||||||
|
# outage (21h) hit because no alert fired on the rising multus working
|
||||||
|
# set — only downstream blackbox / Traefik / service alerts. With
|
||||||
|
# 1Gi limit (bluejay-infra@eb8693e), 80% = ~800MiB; steady-state
|
||||||
|
# runs ~150-250MiB so this only fires when an avalanche starts.
|
||||||
|
- alert: MultusMemoryPressure
|
||||||
|
expr: |
|
||||||
|
container_memory_working_set_bytes{container="kube-multus"}
|
||||||
|
/ container_spec_memory_limit_bytes{container="kube-multus"} > 0.8
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
alert_channel: thermal_print
|
||||||
|
annotations:
|
||||||
|
summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m"
|
||||||
|
description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)."
|
||||||
|
|
||||||
|
# Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the
|
||||||
|
# operator-leak avalanche pattern BEFORE it cascades into a multus
|
||||||
|
# CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder)
|
||||||
|
# emitting pods without ownerReferences will accumulate them when
|
||||||
|
# the operator crashes. >25 pending pods in any namespace for 30m
|
||||||
|
# is the signal to investigate the reconciler.
|
||||||
|
- alert: NamespacePendingPodBacklog
|
||||||
|
expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25
|
||||||
|
for: 30m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m"
|
||||||
|
description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade."
|
||||||
|
|
||||||
# Longhorn storage health alerts. Required: longhorn scrape job
|
# Longhorn storage health alerts. Required: longhorn scrape job
|
||||||
# (added 2026-04-26 — see scrape_configs above). The K8s events
|
# (added 2026-04-26 — see scrape_configs above). The K8s events
|
||||||
# for "snapshot becomes not ready to use" are transient lifecycle
|
# for "snapshot becomes not ready to use" are transient lifecycle
|
||||||
|
|||||||
@@ -188,13 +188,24 @@ spec:
|
|||||||
- name: kube-multus
|
- name: kube-multus
|
||||||
image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
|
image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
|
||||||
command: [ "/usr/src/multus-cni/bin/multus-daemon" ]
|
command: [ "/usr/src/multus-cni/bin/multus-daemon" ]
|
||||||
|
# 2026-05-11: upstream default of 50Mi memory limit OOM-cascades when
|
||||||
|
# an operator-owned namespace accumulates >100 pending pods retrying
|
||||||
|
# CNI ADD. RemoteDesktop emitted 219 orphan rd-browser-only pods
|
||||||
|
# (missing OwnerReferences), kubelet's CNI ADD avalanche pushed multus
|
||||||
|
# over 50Mi, OOMKilled, restarted with even bigger backlog → loop.
|
||||||
|
# 21h cluster outage. See FlowerCore.Notes:
|
||||||
|
# feedback_multus_50mi_limit_oom_orphan_pod_avalanche.md
|
||||||
|
# 1Gi limit / 512Mi request comfortably handles a 200+ pod CNI
|
||||||
|
# catchup burst on 64GB nodes (nodes are <25% used in steady-state).
|
||||||
|
# Drop back toward 256Mi only after MultusMemoryPressure alert
|
||||||
|
# proves steady-state working set sits well below 200Mi.
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: "100m"
|
cpu: "100m"
|
||||||
memory: "50Mi"
|
memory: "512Mi"
|
||||||
limits:
|
limits:
|
||||||
cpu: "100m"
|
cpu: "100m"
|
||||||
memory: "50Mi"
|
memory: "1Gi"
|
||||||
securityContext:
|
securityContext:
|
||||||
privileged: true
|
privileged: true
|
||||||
terminationMessagePolicy: FallbackToLogsOnError
|
terminationMessagePolicy: FallbackToLogsOnError
|
||||||
|
|||||||
@@ -127,10 +127,13 @@ spec:
|
|||||||
initContainers:
|
initContainers:
|
||||||
- name: fix-data-perms
|
- name: fix-data-perms
|
||||||
image: busybox:latest
|
image: busybox:latest
|
||||||
# Also chown /shared-tts (hostPath /tmp/tts-audio) so the non-root
|
# Must run as root to chown the hostPath /tmp/tts-audio that may be
|
||||||
# app user (uid 1654) can write Piper .sln16 files that Asterisk
|
# root-owned after node reboot. Pod-level runAsNonRoot:true would
|
||||||
# reads at /var/lib/asterisk/sounds/tts. World-readable (755) is
|
# otherwise inherit and chown would fail with EPERM (see Notes memory
|
||||||
# fine — Asterisk runs as a different uid in the other pod.
|
# feedback_hostpath_initcontainer_chown_perms).
|
||||||
|
securityContext:
|
||||||
|
runAsUser: 0
|
||||||
|
runAsNonRoot: false
|
||||||
command: ["sh", "-c", "chown -R 1654:1654 /data && chown 1654:1654 /shared-tts && chmod 0755 /shared-tts"]
|
command: ["sh", "-c", "chown -R 1654:1654 /data && chown 1654:1654 /shared-tts && chmod 0755 /shared-tts"]
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: telephony-data
|
- name: telephony-data
|
||||||
|
|||||||
Reference in New Issue
Block a user