Compare commits
6 Commits
claude/ci1
...
f298339152
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f298339152 | ||
|
|
6e7d88db49 | ||
|
|
5ae50bd491 | ||
|
|
653d4472f5 | ||
|
|
eb8693e1ce | ||
|
|
667777a653 |
171
apps/fc-redis/fc-redis.yaml
Normal file
171
apps/fc-redis/fc-redis.yaml
Normal file
@@ -0,0 +1,171 @@
|
||||
# fc-redis — SignalR backplane for cross-product event bus
|
||||
#
|
||||
# Lands per Q-SO-1 resolution (2026-05-11 PM): SignalR backplane in Phase A,
|
||||
# not Phase C as originally drafted. Operator directive: "Redis can be
|
||||
# deployed just fine as it's another FlowerCore technology we'll want to
|
||||
# manage."
|
||||
#
|
||||
# Phase A scope (this file):
|
||||
# - Single Redis 7.x Alpine pod
|
||||
# - 1Gi Longhorn RWO PVC for AOF persistence
|
||||
# - ClusterIP Service at `redis.fc-redis.svc.cluster.local:6379`
|
||||
# - No AUTH (in-cluster only; not exposed externally)
|
||||
# - No IngressRoute (backplane is server-to-server only)
|
||||
#
|
||||
# Consumers (Phase A IMPL across FC services):
|
||||
# - FlowerCore.Signage.Web (OpsConsoleHub)
|
||||
# - FlowerCore.Scoreboard.Web (ScoreboardHub)
|
||||
# - FlowerCore.SignalControl.Web
|
||||
# - FlowerCore.DMS.Web
|
||||
# - Any other product joining the cross-product event bus
|
||||
#
|
||||
# Each consumer adds:
|
||||
# services.AddSignalR()
|
||||
# .AddStackExchangeRedis(
|
||||
# "redis.fc-redis.svc.cluster.local:6379",
|
||||
# opts => opts.Configuration.ChannelPrefix =
|
||||
# StackExchange.Redis.RedisChannel.Literal("fc-opsconsole"));
|
||||
#
|
||||
# Phase B / C follow-ons (out of scope here):
|
||||
# - Redis Sentinel for HA (3-node)
|
||||
# - AUTH password from 1Password Connect (rotate via /rotate-password)
|
||||
# - redis_exporter sidecar for Prometheus scrape
|
||||
# - Network policies restricting which namespaces can dial 6379
|
||||
#
|
||||
# Design: docs/signage/operations-console-phase-2-design.md §3.5
|
||||
# Decision: Q-SO-1 (RESOLVED 2026-05-11 PM)
|
||||
# Memory: feedback_blooming_ui_pattern_no_iframes
|
||||
---
|
||||
# Namespace that holds every fc-redis resource declared in this file.
apiVersion: v1
kind: Namespace
metadata:
  name: fc-redis
  labels:
    app.kubernetes.io/managed-by: argocd
    app.kubernetes.io/part-of: flowercore
|
||||
---
|
||||
# 1Gi Longhorn volume for Redis persistence; mounted at /data by the
# fc-redis Deployment. ReadWriteOnce, so only one node can attach it.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: fc-redis
  name: fc-redis-data
spec:
  storageClassName: longhorn
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
|
||||
---
|
||||
# redis.conf for the fc-redis pod. The Deployment in this file mounts this
# ConfigMap read-only at /etc/redis and starts redis-server against it.
apiVersion: v1
kind: ConfigMap
metadata:
  name: fc-redis-config
  namespace: fc-redis
data:
  redis.conf: |
    # Phase A — minimal config; no AUTH, no replication.
    bind 0.0.0.0
    protected-mode no
    port 6379
    tcp-backlog 511
    timeout 0
    tcp-keepalive 300

    # Persistence: AOF (fsync every second is the standard SignalR-backplane
    # durability sweet spot — the backplane only needs to survive Redis
    # restarts, not absolute zero loss).
    #
    # Pin the data directory to the PVC mount instead of relying on the
    # container's working directory: the root filesystem is read-only, so
    # AOF files written anywhere other than /data would fail or be lost.
    dir /data
    appendonly yes
    appendfsync everysec
    auto-aof-rewrite-percentage 100
    auto-aof-rewrite-min-size 64mb

    # Reasonable defaults — let Redis pick most things.
    maxmemory-policy allkeys-lru
    maxmemory 256mb

    # Logging
    loglevel notice
|
||||
---
|
||||
# Single-replica Redis pod for the SignalR backplane.
# - Data lives on the fc-redis-data PVC (mounted at /data).
# - Config comes from the fc-redis-config ConfigMap (mounted at /etc/redis).
# - Runs as non-root uid/gid 999 with a read-only root filesystem, no
#   privilege escalation, all capabilities dropped, and the runtime's
#   default seccomp profile.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: fc-redis
  namespace: fc-redis
  labels:
    app: fc-redis
spec:
  replicas: 1
  strategy:
    type: Recreate  # RWO PVC; do not do rolling update
  selector:
    matchLabels:
      app: fc-redis
  template:
    metadata:
      labels:
        app: fc-redis
    spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: 999  # redis:7-alpine default uid
        runAsGroup: 999
        fsGroup: 999  # makes the mounted PVC group-writable for the redis user
        # The rest of this pod already follows the "restricted" Pod Security
        # Standard; an explicit seccomp profile is the remaining control.
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: redis
          image: redis:7-alpine
          imagePullPolicy: IfNotPresent
          command: ["redis-server", "/etc/redis/redis.conf"]
          ports:
            - name: redis
              containerPort: 6379
          resources:
            requests:
              cpu: "50m"
              memory: "128Mi"
            limits:
              cpu: "500m"
              # Headroom above the 256mb maxmemory set in redis.conf.
              memory: "384Mi"
          volumeMounts:
            - name: data
              mountPath: /data
            - name: config
              mountPath: /etc/redis
              readOnly: true
          # Liveness only checks that the port accepts connections; readiness
          # additionally requires redis-cli to get an answer to PING.
          livenessProbe:
            tcpSocket:
              port: 6379
            initialDelaySeconds: 5
            periodSeconds: 10
          readinessProbe:
            exec:
              command: ["redis-cli", "ping"]
            initialDelaySeconds: 2
            periodSeconds: 5
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: [ALL]
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: fc-redis-data
        - name: config
          configMap:
            name: fc-redis-config
|
||||
---
|
||||
# In-cluster endpoint for the backplane:
# redis.fc-redis.svc.cluster.local:6379 -> pods labelled app=fc-redis.
apiVersion: v1
kind: Service
metadata:
  namespace: fc-redis
  name: redis
spec:
  type: ClusterIP
  selector:
    app: fc-redis
  ports:
    - name: redis
      protocol: TCP
      port: 6379
      targetPort: 6379
|
||||
@@ -466,11 +466,11 @@ spec:
|
||||
itemPath: vaults/IAmWorkin/items/Guacamole JSON Auth
|
||||
---
|
||||
---
|
||||
# 1Password-backed credentials for Mac mini VNC access (Phase 1 — 2026-04-28)
|
||||
# 1Password-backed credentials for Mac mini VNC access (Phase 1 — 2026-04-28)
|
||||
# The operator mints Secret 'macmini-vnc-creds' with keys: username, password, VNC Password
|
||||
# Note: '1Password' field label 'VNC Password' -> K8s Secret key 'VNC Password' (space retained)
|
||||
# Guacamole VNC connection password is sourced from the 'VNC Password' field.
|
||||
# Actual IP is 10.0.56.115 (INFRA VLAN) — the 1P item 'IP' field is kept as backup reference.
|
||||
# Actual IP is 10.0.56.115 (INFRA VLAN) — the 1P item 'IP' field is kept as backup reference.
|
||||
apiVersion: onepassword.com/v1
|
||||
kind: OnePasswordItem
|
||||
metadata:
|
||||
@@ -481,6 +481,7 @@ metadata:
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
itemPath: vaults/IAmWorkin/items/Mac Mini
|
||||
---
|
||||
# Blue Jay Branding Extension (CSS + translations)
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
|
||||
@@ -411,24 +411,22 @@ spec:
|
||||
# Confirmed via debug pod: PVC content IS a real bootable ISO9660
|
||||
# (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the
|
||||
# only bug was boot priority.
|
||||
# 2026-05-08 PM: ISO presented as a virtio-blk DISK (not cdrom).
|
||||
# Both SATA and SCSI cdrom buses hit OVMF BdsDxe "starting Boot0001
|
||||
# ... Time out" regardless of storage backend (NFS, Longhorn PVC,
|
||||
# containerDisk tmpfs — all rule out IO speed). The qemu cdrom
|
||||
# emulation path appears to have a deep-seated read window issue
|
||||
# under KubeVirt v1.4.0's OVMF firmware.
|
||||
#
|
||||
# Workaround: present the ISO bytes as a regular virtio-blk disk
|
||||
# (model="virtio-non-transitional"). UEFI/OVMF still recognizes
|
||||
# ISO9660 + El Torito boot records on a regular disk, so it can
|
||||
# boot the EFI bootloader the same way it would from a USB stick.
|
||||
# This is also closer to the FlowerCore.Distribution USB-key
|
||||
# pattern: the ISO bytes live on a block device, UEFI boots from
|
||||
# the GPT/El Torito boot record, Windows installer runs.
|
||||
# 2026-05-08 PM: cdrom bus SCSI + containerDisk delivery. This
|
||||
# combination boots qemu cleanly and reaches OVMF, but OVMF
|
||||
# BdsDxe still hits "starting Boot0001 ... Time out" on the
|
||||
# cdrom — see HANDOFF.md / CODEX-STATUS.md "OPEN — ci1" for the
|
||||
# full diagnostic chain. virtio-blk disk swap was attempted as a
|
||||
# workaround but introduced a separate QEMU rootdisk flock issue
|
||||
# without fixing the underlying OVMF cdrom problem; reverted.
|
||||
# Operator decision needed for next architectural step (OVMF
|
||||
# custom build with extended timeout, KubeVirt version bump,
|
||||
# Hyper-V/VirtualBox-and-export, or BIOS legacy boot). The
|
||||
# containerDisk distribution pipeline (build/save/scp/ctr import)
|
||||
# is proven and ready to reuse for any of those.
|
||||
- name: windows-iso
|
||||
bootOrder: 1
|
||||
disk:
|
||||
bus: virtio
|
||||
cdrom:
|
||||
bus: scsi
|
||||
- name: rootdisk
|
||||
bootOrder: 2
|
||||
disk:
|
||||
|
||||
@@ -974,6 +974,39 @@ data:
|
||||
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
|
||||
description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."
|
||||
|
||||
# Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
|
||||
# cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
|
||||
# outage (21h) hit because no alert fired on the rising multus working
|
||||
# set — only downstream blackbox / Traefik / service alerts. With
|
||||
# 1Gi limit (bluejay-infra@eb8693e), 80% = ~800MiB; steady-state
|
||||
# runs ~150-250MiB so this only fires when an avalanche starts.
|
||||
- alert: MultusMemoryPressure
|
||||
expr: |
|
||||
container_memory_working_set_bytes{container="kube-multus"}
|
||||
/ container_spec_memory_limit_bytes{container="kube-multus"} > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
alert_channel: thermal_print
|
||||
annotations:
|
||||
summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m"
|
||||
description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)."
|
||||
|
||||
# Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the
|
||||
# operator-leak avalanche pattern BEFORE it cascades into a multus
|
||||
# CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder)
|
||||
# emitting pods without ownerReferences will accumulate them when
|
||||
# the operator crashes. >25 pending pods in any namespace for 30m
|
||||
# is the signal to investigate the reconciler.
|
||||
- alert: NamespacePendingPodBacklog
|
||||
expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m"
|
||||
description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade."
|
||||
|
||||
# Longhorn storage health alerts. Required: longhorn scrape job
|
||||
# (added 2026-04-26 — see scrape_configs above). The K8s events
|
||||
# for "snapshot becomes not ready to use" are transient lifecycle
|
||||
|
||||
@@ -188,13 +188,24 @@ spec:
|
||||
- name: kube-multus
|
||||
image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
|
||||
command: [ "/usr/src/multus-cni/bin/multus-daemon" ]
|
||||
# 2026-05-11: upstream default of 50Mi memory limit OOM-cascades when
|
||||
# an operator-owned namespace accumulates >100 pending pods retrying
|
||||
# CNI ADD. RemoteDesktop emitted 219 orphan rd-browser-only pods
|
||||
# (missing OwnerReferences), kubelet's CNI ADD avalanche pushed multus
|
||||
# over 50Mi, OOMKilled, restarted with even bigger backlog → loop.
|
||||
# 21h cluster outage. See FlowerCore.Notes:
|
||||
# feedback_multus_50mi_limit_oom_orphan_pod_avalanche.md
|
||||
# 1Gi limit / 512Mi request comfortably handles a 200+ pod CNI
|
||||
# catchup burst on 64GB nodes (nodes are <25% used in steady-state).
|
||||
# Drop back toward 256Mi only after MultusMemoryPressure alert
|
||||
# proves steady-state working set sits well below 200Mi.
|
||||
resources:
|
||||
requests:
|
||||
cpu: "100m"
|
||||
memory: "50Mi"
|
||||
memory: "512Mi"
|
||||
limits:
|
||||
cpu: "100m"
|
||||
memory: "50Mi"
|
||||
memory: "1Gi"
|
||||
securityContext:
|
||||
privileged: true
|
||||
terminationMessagePolicy: FallbackToLogsOnError
|
||||
|
||||
@@ -127,10 +127,13 @@ spec:
|
||||
initContainers:
|
||||
- name: fix-data-perms
|
||||
image: busybox:latest
|
||||
# Also chown /shared-tts (hostPath /tmp/tts-audio) so the non-root
|
||||
# app user (uid 1654) can write Piper .sln16 files that Asterisk
|
||||
# reads at /var/lib/asterisk/sounds/tts. World-readable (755) is
|
||||
# fine — Asterisk runs as a different uid in the other pod.
|
||||
# Must run as root to chown the hostPath /tmp/tts-audio that may be
|
||||
# root-owned after node reboot. Pod-level runAsNonRoot:true would
|
||||
# otherwise inherit and chown would fail with EPERM (see Notes memory
|
||||
# feedback_hostpath_initcontainer_chown_perms).
|
||||
securityContext:
|
||||
runAsUser: 0
|
||||
runAsNonRoot: false
|
||||
command: ["sh", "-c", "chown -R 1654:1654 /data && chown 1654:1654 /shared-tts && chmod 0755 /shared-tts"]
|
||||
volumeMounts:
|
||||
- name: telephony-data
|
||||
|
||||
Reference in New Issue
Block a user