Compare commits
12 Commits
codex/sign
...
211ecbf294
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
211ecbf294 | ||
|
|
f298339152 | ||
|
|
6e7d88db49 | ||
|
|
5ae50bd491 | ||
|
|
653d4472f5 | ||
|
|
eb8693e1ce | ||
|
|
667777a653 | ||
|
|
84c9feb893 | ||
|
|
427dbfcef2 | ||
|
|
b651a4e2d0 | ||
|
|
b998f50f48 | ||
|
|
8fd9ae1cd3 |
26
apps/fc-devicemgmt/1password-item.yaml
Normal file
26
apps/fc-devicemgmt/1password-item.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
# Runtime secrets for FlowerCore.DeviceManagement.
|
||||
#
|
||||
# OnePasswordItem operator syncs this item into a Kubernetes Secret with the
|
||||
# same name. Expected fields:
|
||||
# DB-Password
|
||||
# mtls-ca.pem
|
||||
# mtls-client.crt
|
||||
# mtls-client.key
|
||||
# mtls-chain.pem
|
||||
#
|
||||
# Do not add literal secret values to this repo. Runtime pods consume the
|
||||
# synced Secret through env vars and read-only mounts.
|
||||
apiVersion: onepassword.com/v1
|
||||
kind: OnePasswordItem
|
||||
metadata:
|
||||
name: fc-devicemgmt-runtime
|
||||
namespace: fc-devicemgmt
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-devicemgmt
|
||||
app.kubernetes.io/component: secrets
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
spec:
|
||||
itemPath: "vaults/IAmWorkin/items/FlowerCore DeviceManagement Runtime"
|
||||
33
apps/fc-devicemgmt/argocd-application.yaml
Normal file
33
apps/fc-devicemgmt/argocd-application.yaml
Normal file
@@ -0,0 +1,33 @@
|
||||
# Explicit ArgoCD Application shape for bootstrap/review.
|
||||
#
|
||||
# The live bluejay-infra ApplicationSet already discovers apps/* directories
|
||||
# and creates this same Application name (`infra-fc-devicemgmt`) automatically.
|
||||
# Keep repoURL on the internal Gitea ClusterIP URL; ArgoCD does not trust the
|
||||
# external step-ca HTTPS endpoint.
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: infra-fc-devicemgmt
|
||||
namespace: argocd
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-devicemgmt
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
spec:
|
||||
project: default
|
||||
source:
|
||||
repoURL: http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git
|
||||
targetRevision: main
|
||||
path: apps/fc-devicemgmt
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: fc-devicemgmt
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
- ServerSideApply=true
|
||||
30
apps/fc-devicemgmt/certificate-web.yaml
Normal file
30
apps/fc-devicemgmt/certificate-web.yaml
Normal file
@@ -0,0 +1,30 @@
|
||||
# Certificate for devices.iamworkin.lan.
|
||||
#
|
||||
# Preflight gate: FlowerCore.DNS / pfSense must contain an explicit A record:
|
||||
# devices.iamworkin.lan -> 10.0.56.200
|
||||
# before this Certificate is synced. step-ca ACME cannot see the CoreDNS
|
||||
# wildcard, so missing pfSense DNS produces cert-manager HTTP-01 backoff
|
||||
# (feedback_pfsense_dns_required_for_acme).
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: fc-devicemgmt-web-tls
|
||||
namespace: fc-devicemgmt
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-devicemgmt-web
|
||||
app.kubernetes.io/component: web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
annotations:
|
||||
flowercore.io/dns-preflight: "devices.iamworkin.lan must resolve to 10.0.56.200 before ACME sync"
|
||||
spec:
|
||||
secretName: fc-devicemgmt-web-tls
|
||||
issuerRef:
|
||||
name: step-ca-acme
|
||||
kind: ClusterIssuer
|
||||
dnsNames:
|
||||
- devices.iamworkin.lan
|
||||
duration: 720h
|
||||
renewBefore: 240h
|
||||
81
apps/fc-devicemgmt/clusterrole-operator.yaml
Normal file
81
apps/fc-devicemgmt/clusterrole-operator.yaml
Normal file
@@ -0,0 +1,81 @@
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: fc-devicemgmt-operator
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-devicemgmt-operator
|
||||
app.kubernetes.io/component: operator
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
rules:
|
||||
- apiGroups:
|
||||
- devices.flowercore.io
|
||||
resources:
|
||||
- '*'
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
- create
|
||||
- update
|
||||
- patch
|
||||
- delete
|
||||
- apiGroups:
|
||||
- devices.flowercore.io
|
||||
resources:
|
||||
- devices/status
|
||||
- devices/finalizers
|
||||
- devicegroups/status
|
||||
- devicegroups/finalizers
|
||||
- devicepolicies/status
|
||||
- devicepolicies/finalizers
|
||||
- remotecommands/status
|
||||
- remotecommands/finalizers
|
||||
verbs:
|
||||
- get
|
||||
- update
|
||||
- patch
|
||||
- apiGroups:
|
||||
- apps
|
||||
resources:
|
||||
- deployments
|
||||
verbs:
|
||||
- get
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- pods
|
||||
- services
|
||||
- configmaps
|
||||
- secrets
|
||||
- events
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
- create
|
||||
- update
|
||||
- patch
|
||||
- delete
|
||||
- apiGroups:
|
||||
- batch
|
||||
resources:
|
||||
- jobs
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
- create
|
||||
- update
|
||||
- patch
|
||||
- delete
|
||||
- apiGroups:
|
||||
- networking.k8s.io
|
||||
resources:
|
||||
- networkpolicies
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
19
apps/fc-devicemgmt/clusterrolebinding-operator.yaml
Normal file
19
apps/fc-devicemgmt/clusterrolebinding-operator.yaml
Normal file
@@ -0,0 +1,19 @@
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: fc-devicemgmt-operator
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-devicemgmt-operator
|
||||
app.kubernetes.io/component: operator
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: fc-devicemgmt-operator
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: fc-devicemgmt-operator
|
||||
namespace: fc-devicemgmt
|
||||
109
apps/fc-devicemgmt/deployment-operator.yaml
Normal file
109
apps/fc-devicemgmt/deployment-operator.yaml
Normal file
@@ -0,0 +1,109 @@
|
||||
# FlowerCore.DeviceManagement Operator.
|
||||
#
|
||||
# KubeOps controller for devices.flowercore.io resources. Operator-created
|
||||
# children must set OwnerReferences + traceability labels/annotations per
|
||||
# k8s-pod-ownership-and-traceability-standard.md. clusterrole-operator.yaml grants
|
||||
# apps/deployments/get so the process can resolve its own Deployment UID.
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: fc-devicemgmt-operator
|
||||
namespace: fc-devicemgmt
|
||||
labels:
|
||||
app: fc-devicemgmt-operator
|
||||
app.kubernetes.io/name: fc-devicemgmt-operator
|
||||
app.kubernetes.io/component: operator
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
annotations:
|
||||
flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
app: fc-devicemgmt-operator
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: fc-devicemgmt-operator
|
||||
app.kubernetes.io/name: fc-devicemgmt-operator
|
||||
app.kubernetes.io/component: operator
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8080"
|
||||
prometheus.io/path: "/metrics"
|
||||
flowercore.io/audit-trace-id: "runtime-activity-trace"
|
||||
spec:
|
||||
serviceAccountName: fc-devicemgmt-operator
|
||||
securityContext:
|
||||
fsGroup: 1654
|
||||
fsGroupChangePolicy: OnRootMismatch
|
||||
containers:
|
||||
- name: operator
|
||||
image: localhost/fc-devicemgmt-operator:v20260512-cx5
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 8080
|
||||
env:
|
||||
- name: ASPNETCORE_ENVIRONMENT
|
||||
value: "Production"
|
||||
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
||||
value: "false"
|
||||
- name: POD_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.name
|
||||
- name: POD_NAMESPACE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.namespace
|
||||
- name: FLOWERCORE_KUBERNETES_OWNER_DEPLOYMENT
|
||||
value: "fc-devicemgmt-operator"
|
||||
- name: FlowerCore__Service__Name
|
||||
value: "FlowerCore.DeviceManagement.Operator"
|
||||
- name: FlowerCore__DeviceManagement__DefaultTenantId
|
||||
value: "system"
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 20
|
||||
periodSeconds: 30
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1654
|
||||
runAsGroup: 1654
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
volumeMounts:
|
||||
- name: tmp
|
||||
mountPath: /tmp
|
||||
- name: logs
|
||||
mountPath: /app/logs
|
||||
volumes:
|
||||
- name: tmp
|
||||
emptyDir: {}
|
||||
- name: logs
|
||||
emptyDir: {}
|
||||
135
apps/fc-devicemgmt/deployment-web.yaml
Normal file
135
apps/fc-devicemgmt/deployment-web.yaml
Normal file
@@ -0,0 +1,135 @@
|
||||
# FlowerCore.DeviceManagement Web.
|
||||
#
|
||||
# Source repo is expected to ship FlowerCore.DeviceManagement.Web in a later
|
||||
# Sprint 9+ lane. This manifest is static-valid without requiring the image to
|
||||
# exist yet; import localhost/fc-devicemgmt-web:<tag> to all schedulable RKE2
|
||||
# nodes before letting ArgoCD sync a live rollout.
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: fc-devicemgmt-web
|
||||
namespace: fc-devicemgmt
|
||||
labels:
|
||||
app: fc-devicemgmt-web
|
||||
app.kubernetes.io/name: fc-devicemgmt-web
|
||||
app.kubernetes.io/component: web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
annotations:
|
||||
flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
|
||||
spec:
|
||||
replicas: 2
|
||||
revisionHistoryLimit: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
app: fc-devicemgmt-web
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: fc-devicemgmt-web
|
||||
app.kubernetes.io/name: fc-devicemgmt-web
|
||||
app.kubernetes.io/component: web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8080"
|
||||
prometheus.io/path: "/metrics"
|
||||
flowercore.io/audit-trace-id: "runtime-activity-trace"
|
||||
spec:
|
||||
securityContext:
|
||||
fsGroup: 1654
|
||||
fsGroupChangePolicy: OnRootMismatch
|
||||
containers:
|
||||
- name: web
|
||||
image: localhost/fc-devicemgmt-web:v20260512-cx5
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
env:
|
||||
- name: ASPNETCORE_URLS
|
||||
value: "http://+:8080"
|
||||
- name: ASPNETCORE_ENVIRONMENT
|
||||
value: "Production"
|
||||
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
||||
value: "false"
|
||||
- name: FlowerCore__Service__Name
|
||||
value: "FlowerCore.DeviceManagement.Web"
|
||||
- name: FlowerCore__DeviceManagement__DefaultTenantId
|
||||
value: "system"
|
||||
- name: FlowerCore__Database__Provider
|
||||
value: "MySql"
|
||||
- name: FlowerCore__Database__Host
|
||||
value: "mysql.fc-mysql.svc"
|
||||
- name: FlowerCore__Database__Database
|
||||
value: "flowercore_devicemgmt"
|
||||
- name: FlowerCore__Database__User
|
||||
value: "fc_devicemgmt"
|
||||
- name: FlowerCore__Database__Password
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: fc-devicemgmt-runtime
|
||||
key: DB-Password
|
||||
- name: FlowerCore__DeviceManagement__AgentMtls__CaPath
|
||||
value: "/secrets/devicemgmt-mtls/mtls-ca.pem"
|
||||
- name: FlowerCore__DeviceManagement__AgentMtls__ClientCertificatePath
|
||||
value: "/secrets/devicemgmt-mtls/mtls-client.crt"
|
||||
- name: FlowerCore__DeviceManagement__AgentMtls__ClientKeyPath
|
||||
value: "/secrets/devicemgmt-mtls/mtls-client.key"
|
||||
- name: FlowerCore__EventBus__Redis__Configuration
|
||||
value: "redis.fc-redis.svc:6379"
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 1000m
|
||||
memory: 768Mi
|
||||
startupProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
failureThreshold: 30
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
periodSeconds: 10
|
||||
failureThreshold: 3
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
failureThreshold: 3
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1654
|
||||
runAsGroup: 1654
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
volumeMounts:
|
||||
- name: tmp
|
||||
mountPath: /tmp
|
||||
- name: logs
|
||||
mountPath: /app/logs
|
||||
- name: devicemgmt-mtls
|
||||
mountPath: /secrets/devicemgmt-mtls
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: tmp
|
||||
emptyDir: {}
|
||||
- name: logs
|
||||
emptyDir: {}
|
||||
- name: devicemgmt-mtls
|
||||
secret:
|
||||
secretName: fc-devicemgmt-runtime
|
||||
defaultMode: 0400
|
||||
55
apps/fc-devicemgmt/ingressroute-web.yaml
Normal file
55
apps/fc-devicemgmt/ingressroute-web.yaml
Normal file
@@ -0,0 +1,55 @@
|
||||
# LAN ingress for FlowerCore.DeviceManagement Web.
|
||||
#
|
||||
# RKE2 Traefik has no built-in ACME resolver configured. Keep TLS certificate
|
||||
# ownership in cert-manager Certificate/fc-devicemgmt-web-tls.
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: fc-devicemgmt-web
|
||||
namespace: fc-devicemgmt
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-devicemgmt-web
|
||||
app.kubernetes.io/component: web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- match: Host(`devices.iamworkin.lan`)
|
||||
kind: Rule
|
||||
services:
|
||||
- name: fc-devicemgmt-web
|
||||
port: 80
|
||||
tls:
|
||||
secretName: fc-devicemgmt-web-tls
|
||||
|
||||
# Future public agent/update host gate (OFF by default):
|
||||
#
|
||||
# Do not enable `update.flowercore.io` here until Authentik OIDC Q-OIDC-1
|
||||
# resolves the public-device-management auth model and route ownership with
|
||||
# UpdateCenter. When enabled, use a separate public IngressRoute with an
|
||||
# explicit Method allowlist, public-host auth middleware, and public TLS
|
||||
# certificate strategy. Leaving this as comments keeps ArgoCD from stealing
|
||||
# live UpdateCenter traffic.
|
||||
#
|
||||
# apiVersion: traefik.io/v1alpha1
|
||||
# kind: IngressRoute
|
||||
# metadata:
|
||||
# name: fc-devicemgmt-web-public
|
||||
# namespace: fc-devicemgmt
|
||||
# annotations:
|
||||
# flowercore.io/public-host-gate: "disabled-until-Q-OIDC-1"
|
||||
# spec:
|
||||
# entryPoints:
|
||||
# - websecure
|
||||
# routes:
|
||||
# - match: Host(`update.flowercore.io`) && (Method(`GET`) || Method(`HEAD`) || Method(`POST`) || Method(`OPTIONS`))
|
||||
# kind: Rule
|
||||
# services:
|
||||
# - name: fc-devicemgmt-web
|
||||
# port: 80
|
||||
# tls:
|
||||
# secretName: fc-devicemgmt-public-tls
|
||||
13
apps/fc-devicemgmt/namespace.yaml
Normal file
13
apps/fc-devicemgmt/namespace.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
# FlowerCore.DeviceManagement namespace.
|
||||
#
|
||||
# ArgoCD discovers this directory as Application `infra-fc-devicemgmt`.
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: fc-devicemgmt
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-devicemgmt
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
224
apps/fc-devicemgmt/network-policy.yaml
Normal file
224
apps/fc-devicemgmt/network-policy.yaml
Normal file
@@ -0,0 +1,224 @@
|
||||
# FlowerCore.DeviceManagement NetworkPolicies.
|
||||
#
|
||||
# NetworkPolicies belong in bluejay-infra so ArgoCD owns rebuild state.
|
||||
# Rules include Traefik post-DNAT backend ports per
|
||||
# feedback_netpol_dnat_backend_port and Synology NFS egress for the requested
|
||||
# cold-tier / future artifact path.
|
||||
---
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: fc-devicemgmt-web-isolation
|
||||
namespace: fc-devicemgmt
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-devicemgmt-web
|
||||
app.kubernetes.io/component: web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app: fc-devicemgmt-web
|
||||
policyTypes:
|
||||
- Ingress
|
||||
- Egress
|
||||
ingress:
|
||||
# LAN edge: only cluster Traefik should reach the Web pod for
|
||||
# devices.iamworkin.lan.
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: traefik-system
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: traefik
|
||||
ports:
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
# Direct LAN diagnostics are allowed only from FlowerCore LAN/VPN ranges.
|
||||
- from:
|
||||
- ipBlock:
|
||||
cidr: 10.0.56.0/24
|
||||
- ipBlock:
|
||||
cidr: 10.0.57.0/24
|
||||
- ipBlock:
|
||||
cidr: 10.0.58.0/24
|
||||
- ipBlock:
|
||||
cidr: 10.0.68.0/27
|
||||
ports:
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
egress:
|
||||
# CoreDNS.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: kube-system
|
||||
podSelector:
|
||||
matchLabels:
|
||||
k8s-app: kube-dns
|
||||
ports:
|
||||
- port: 53
|
||||
protocol: UDP
|
||||
- port: 53
|
||||
protocol: TCP
|
||||
# Database namespace.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: fc-mysql
|
||||
ports:
|
||||
- port: 3306
|
||||
protocol: TCP
|
||||
# Redis backplane for multi-replica SignalR / live-status fan-out.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: fc-redis
|
||||
ports:
|
||||
- port: 6379
|
||||
protocol: TCP
|
||||
# Traefik VIP / in-cluster Traefik for self-callbacks and public URL
|
||||
# generation tests. Include post-DNAT backend ports 8443 + 8080.
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.56.200/32
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: traefik-system
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: traefik
|
||||
ports:
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
- port: 8443
|
||||
protocol: TCP
|
||||
# Agent egress: LAN/VPN devices may run DM Agent in Generic, Kiosk, Pi,
|
||||
# ThinClient, or Server mode. Keep this private-range only.
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.56.0/24
|
||||
- ipBlock:
|
||||
cidr: 10.0.57.0/24
|
||||
- ipBlock:
|
||||
cidr: 10.0.58.0/24
|
||||
- ipBlock:
|
||||
cidr: 10.0.68.0/27
|
||||
ports:
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
- port: 8443
|
||||
protocol: TCP
|
||||
- port: 5000
|
||||
protocol: TCP
|
||||
- port: 5001
|
||||
protocol: TCP
|
||||
# Synology NFS cold-tier / artifact mount allowance.
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.58.3/32
|
||||
ports:
|
||||
- port: 2049
|
||||
protocol: TCP
|
||||
- port: 2049
|
||||
protocol: UDP
|
||||
- port: 111
|
||||
protocol: TCP
|
||||
- port: 111
|
||||
protocol: UDP
|
||||
---
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: fc-devicemgmt-operator-isolation
|
||||
namespace: fc-devicemgmt
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-devicemgmt-operator
|
||||
app.kubernetes.io/component: operator
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app: fc-devicemgmt-operator
|
||||
policyTypes:
|
||||
- Ingress
|
||||
- Egress
|
||||
ingress:
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: monitoring
|
||||
ports:
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
egress:
|
||||
# CoreDNS.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: kube-system
|
||||
podSelector:
|
||||
matchLabels:
|
||||
k8s-app: kube-dns
|
||||
ports:
|
||||
- port: 53
|
||||
protocol: UDP
|
||||
- port: 53
|
||||
protocol: TCP
|
||||
# Kubernetes API for KubeOps reconciliation and Deployment UID lookup.
# NOTE: the empty `to` matches ALL destinations on these ports, not only the API server.
|
||||
- to: []
|
||||
ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
- port: 6443
|
||||
protocol: TCP
|
||||
# Agent egress for operator-initiated probes / fallback command dispatch.
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.56.0/24
|
||||
- ipBlock:
|
||||
cidr: 10.0.57.0/24
|
||||
- ipBlock:
|
||||
cidr: 10.0.58.0/24
|
||||
- ipBlock:
|
||||
cidr: 10.0.68.0/27
|
||||
ports:
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
- port: 8443
|
||||
protocol: TCP
|
||||
- port: 5000
|
||||
protocol: TCP
|
||||
- port: 5001
|
||||
protocol: TCP
|
||||
# Synology NFS allowance for future cold-tier/audit archival jobs.
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.58.3/32
|
||||
ports:
|
||||
- port: 2049
|
||||
protocol: TCP
|
||||
- port: 2049
|
||||
protocol: UDP
|
||||
- port: 111
|
||||
protocol: TCP
|
||||
- port: 111
|
||||
protocol: UDP
|
||||
22
apps/fc-devicemgmt/service-web.yaml
Normal file
22
apps/fc-devicemgmt/service-web.yaml
Normal file
@@ -0,0 +1,22 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: fc-devicemgmt-web
|
||||
namespace: fc-devicemgmt
|
||||
labels:
|
||||
app: fc-devicemgmt-web
|
||||
app.kubernetes.io/name: fc-devicemgmt-web
|
||||
app.kubernetes.io/component: web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: fc-devicemgmt-web
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: 8080
|
||||
protocol: TCP
|
||||
12
apps/fc-devicemgmt/serviceaccount-operator.yaml
Normal file
12
apps/fc-devicemgmt/serviceaccount-operator.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: fc-devicemgmt-operator
|
||||
namespace: fc-devicemgmt
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-devicemgmt-operator
|
||||
app.kubernetes.io/component: operator
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
flowercore.io/tenant-id: system
|
||||
flowercore.io/created-by: bluejay-infra
|
||||
171
apps/fc-redis/fc-redis.yaml
Normal file
171
apps/fc-redis/fc-redis.yaml
Normal file
@@ -0,0 +1,171 @@
|
||||
# fc-redis — SignalR backplane for cross-product event bus
|
||||
#
|
||||
# Lands per Q-SO-1 resolution (2026-05-11 PM): SignalR backplane in Phase A,
|
||||
# not Phase C as originally drafted. Operator directive: "Redis can be
|
||||
# deployed just fine as it's another FlowerCore technology we'll want to
|
||||
# manage."
|
||||
#
|
||||
# Phase A scope (this file):
|
||||
# - Single Redis 7.x Alpine pod
|
||||
# - 1Gi Longhorn RWO PVC for AOF persistence
|
||||
# - ClusterIP Service at `redis.fc-redis.svc.cluster.local:6379`
|
||||
# - No AUTH (in-cluster only; not exposed externally)
|
||||
# - No IngressRoute (backplane is server-to-server only)
|
||||
#
|
||||
# Consumers (Phase A IMPL across FC services):
|
||||
# - FlowerCore.Signage.Web (OpsConsoleHub)
|
||||
# - FlowerCore.Scoreboard.Web (ScoreboardHub)
|
||||
# - FlowerCore.SignalControl.Web
|
||||
# - FlowerCore.DMS.Web
|
||||
# - Any other product joining the cross-product event bus
|
||||
#
|
||||
# Each consumer adds:
|
||||
# services.AddSignalR()
|
||||
# .AddStackExchangeRedis(
|
||||
# "redis.fc-redis.svc.cluster.local:6379",
|
||||
# opts => opts.Configuration.ChannelPrefix =
|
||||
# StackExchange.Redis.RedisChannel.Literal("fc-opsconsole"));
|
||||
#
|
||||
# Phase B / C follow-ons (out of scope here):
|
||||
# - Redis Sentinel for HA (3-node)
|
||||
# - AUTH password from 1Password Connect (rotate via /rotate-password)
|
||||
# - redis_exporter sidecar for Prometheus scrape
|
||||
# - Network policies restricting which namespaces can dial 6379
|
||||
#
|
||||
# Design: docs/signage/operations-console-phase-2-design.md §3.5
|
||||
# Decision: Q-SO-1 (RESOLVED 2026-05-11 PM)
|
||||
# Memory: feedback_blooming_ui_pattern_no_iframes
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: fc-redis
|
||||
labels:
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: fc-redis-data
|
||||
namespace: fc-redis
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
storageClassName: longhorn
|
||||
resources:
|
||||
requests:
|
||||
storage: 1Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: fc-redis-config
|
||||
namespace: fc-redis
|
||||
data:
|
||||
redis.conf: |
|
||||
# Phase A — minimal config; no AUTH, no replication.
|
||||
bind 0.0.0.0
|
||||
protected-mode no
|
||||
port 6379
|
||||
tcp-backlog 511
|
||||
timeout 0
|
||||
tcp-keepalive 300
|
||||
|
||||
# Persistence: AOF (fsync every second is the standard SignalR-backplane
|
||||
# durability sweet spot — the backplane only needs to survive Redis
|
||||
# restarts, not absolute zero loss).
|
||||
appendonly yes
|
||||
appendfsync everysec
|
||||
auto-aof-rewrite-percentage 100
|
||||
auto-aof-rewrite-min-size 64mb
|
||||
|
||||
# Reasonable defaults — let Redis pick most things.
|
||||
maxmemory-policy allkeys-lru
|
||||
maxmemory 256mb
|
||||
|
||||
# Logging
|
||||
loglevel notice
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: fc-redis
|
||||
namespace: fc-redis
|
||||
labels:
|
||||
app: fc-redis
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate # RWO PVC; do not do rolling update
|
||||
selector:
|
||||
matchLabels:
|
||||
app: fc-redis
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: fc-redis
|
||||
spec:
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 999 # redis:7-alpine default uid
|
||||
runAsGroup: 999
|
||||
fsGroup: 999
|
||||
containers:
|
||||
- name: redis
|
||||
image: redis:7-alpine
|
||||
imagePullPolicy: IfNotPresent
|
||||
command: ["redis-server", "/etc/redis/redis.conf"]
|
||||
ports:
|
||||
- name: redis
|
||||
containerPort: 6379
|
||||
resources:
|
||||
requests:
|
||||
cpu: "50m"
|
||||
memory: "128Mi"
|
||||
limits:
|
||||
cpu: "500m"
|
||||
memory: "384Mi"
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
- name: config
|
||||
mountPath: /etc/redis
|
||||
readOnly: true
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: 6379
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
readinessProbe:
|
||||
exec:
|
||||
command: ["redis-cli", "ping"]
|
||||
initialDelaySeconds: 2
|
||||
periodSeconds: 5
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop: [ALL]
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: fc-redis-data
|
||||
- name: config
|
||||
configMap:
|
||||
name: fc-redis-config
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: redis
|
||||
namespace: fc-redis
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: fc-redis
|
||||
ports:
|
||||
- name: redis
|
||||
port: 6379
|
||||
targetPort: 6379
|
||||
protocol: TCP
|
||||
@@ -58,7 +58,7 @@ spec:
|
||||
nodeName: rke2-server
|
||||
containers:
|
||||
- name: web
|
||||
image: localhost/fc-updater-web:v20260508-pub3-deepening-2bdf108
|
||||
image: localhost/fc-updater-web:v20260509-4162dca-authgate
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
|
||||
@@ -466,11 +466,11 @@ spec:
|
||||
itemPath: vaults/IAmWorkin/items/Guacamole JSON Auth
|
||||
---
|
||||
---
|
||||
# 1Password-backed credentials for Mac mini VNC access (Phase 1 — 2026-04-28)
|
||||
# 1Password-backed credentials for Mac mini VNC access (Phase 1 — 2026-04-28)
|
||||
# The operator mints Secret 'macmini-vnc-creds' with keys: username, password, VNC Password
|
||||
# Note: '1Password' field label 'VNC Password' -> K8s Secret key 'VNC Password' (space retained)
|
||||
# Guacamole VNC connection password is sourced from the 'VNC Password' field.
|
||||
# Actual IP is 10.0.56.115 (INFRA VLAN) — the 1P item 'IP' field is kept as backup reference.
|
||||
# Actual IP is 10.0.56.115 (INFRA VLAN) — the 1P item 'IP' field is kept as backup reference.
|
||||
apiVersion: onepassword.com/v1
|
||||
kind: OnePasswordItem
|
||||
metadata:
|
||||
@@ -481,6 +481,7 @@ metadata:
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
itemPath: vaults/IAmWorkin/items/Mac Mini
|
||||
---
|
||||
# Blue Jay Branding Extension (CSS + translations)
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
|
||||
@@ -377,7 +377,22 @@ spec:
|
||||
firmware:
|
||||
bootloader:
|
||||
efi:
|
||||
secureBoot: true
|
||||
# 2026-05-08: SecureBoot=false during initial install. With SecureBoot
|
||||
# enabled, OVMF's BdsDxe times out reading Boot0001 from the SCSI
|
||||
# CDROM ("BdsDxe: failed to start Boot0001 ... Time out") before the
|
||||
# EFI bootloader signature can verify against the OVMF VARS trust DB.
|
||||
# KubeVirt's `/usr/share/OVMF/OVMF_VARS.secboot.fd` template doesn't
|
||||
# appear to include the Microsoft KEK/DB by default, so signed
|
||||
# Windows EFI bootloaders fail validation. Disabling SecureBoot lets
|
||||
# OVMF skip the chain check and boot directly. This is acceptable for
|
||||
# a CI runner — TPM 2.0 is still emulated (`tpm: {}` below) so
|
||||
# BitLocker / Hyper-V / WSL still work.
|
||||
# When the operator wants SecureBoot back, the path is:
|
||||
# 1. Custom-build OVMF_VARS.fd with Microsoft KEK/DB enrolled
|
||||
# 2. Mount it into the VM via firmware.bootloader.efi.persistent
|
||||
# 3. Set secureBoot: true again
|
||||
# Tracked separately from the install unblock.
|
||||
secureBoot: false
|
||||
devices:
|
||||
tpm: {} # Non-persistent vTPM — sufficient for runner; no BitLocker
|
||||
disks:
|
||||
@@ -396,10 +411,22 @@ spec:
|
||||
# Confirmed via debug pod: PVC content IS a real bootable ISO9660
|
||||
# (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the
|
||||
# only bug was boot priority.
|
||||
# 2026-05-08 PM: cdrom bus SCSI + containerDisk delivery. This
|
||||
# combination boots qemu cleanly and reaches OVMF, but OVMF
|
||||
# BdsDxe still hits "starting Boot0001 ... Time out" on the
|
||||
# cdrom — see HANDOFF.md / CODEX-STATUS.md "OPEN — ci1" for the
|
||||
# full diagnostic chain. virtio-blk disk swap was attempted as a
|
||||
# workaround but introduced a separate QEMU rootdisk flock issue
|
||||
# without fixing the underlying OVMF cdrom problem; reverted.
|
||||
# Operator decision needed for next architectural step (OVMF
|
||||
# custom build with extended timeout, KubeVirt version bump,
|
||||
# Hyper-V/VirtualBox-and-export, or BIOS legacy boot). The
|
||||
# containerDisk distribution pipeline (build/save/scp/ctr import)
|
||||
# is proven and ready to reuse for any of those.
|
||||
- name: windows-iso
|
||||
bootOrder: 1
|
||||
cdrom:
|
||||
bus: sata
|
||||
bus: scsi
|
||||
- name: rootdisk
|
||||
bootOrder: 2
|
||||
disk:
|
||||
@@ -430,17 +457,40 @@ spec:
|
||||
persistentVolumeClaim:
|
||||
claimName: ci1-rootdisk
|
||||
- name: windows-iso
|
||||
# Path B (2026-05-08): mount ISO from Synology NFS instead of
|
||||
# Longhorn Filesystem PVC. The Filesystem-PVC path was confirmed to
|
||||
# contain a valid bootable ISO9660 image but caused OVMF's
|
||||
# SATA-CDROM read window to time out:
|
||||
# BdsDxe: failed to start Boot0001 ... Time out
|
||||
# Block-mode DataVolume was attempted as Path A but blocked by CDI
|
||||
# v1.65.0's upload pod capability drop. NFS-mounted ISO bypasses
|
||||
# both issues. See win2025-iso-nfs-pv.yaml header for full rationale
|
||||
# and Synology layout.
|
||||
persistentVolumeClaim:
|
||||
claimName: windows-server-2025-iso-nfs
|
||||
# 2026-05-08 PM (Path C, CONTAINERDISK): the ISO is now packaged as
|
||||
# a KubeVirt containerDisk OCI image baked from
|
||||
# `FROM scratch ; ADD --chown=107:107 disk.img /disk/disk.img`.
|
||||
# The qemu user (uid 107) reads the ISO directly from a tmpfs view
|
||||
# of the OCI layer, bypassing both:
|
||||
# - Synology NFS export ACL (Path B failed: uid 107 denied at
|
||||
# directory level even with mode 0777, see memory
|
||||
# feedback_synology_iso_export_root_only_uid_107_denied)
|
||||
# - OVMF cdrom read-window timeout (Path A and Path B's SCSI
|
||||
# retry both hit `BdsDxe: failed to start Boot0001 ... Time out`
|
||||
# when the cdrom was backed by a PVC the storage controller
|
||||
# couldn't satisfy reads from fast enough).
|
||||
#
|
||||
# Image build (one-time, per ISO version):
|
||||
# 1. Copy ISO to disk.img, write Dockerfile
|
||||
# 2. podman build --tag localhost/win-server-2025:1.0 . (on noc1)
|
||||
# 3. podman save -o win-server-2025-1.0.tar localhost/win-server-2025:1.0
|
||||
# 4. SCP tar to all 3 RKE2 nodes (rke2-server, rke2-agent1, rke2-agent2)
|
||||
# 5. sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock \
|
||||
# -n k8s.io images import /tmp/win-server-2025-1.0.tar
|
||||
# Standard FC pattern per `feedback_rke2_localhost_imagepullpolicy`.
|
||||
#
|
||||
# When a new Windows ISO version ships, bump the tag (1.1, 1.2, ...),
|
||||
# rebuild + redistribute, and update the image: line below in a new
|
||||
# commit. KubeVirt picks up the new image via a VM restart.
|
||||
#
|
||||
# The legacy NFS PVC + PV (apps/kubevirt-vms/win2025-iso-nfs-pv.yaml)
|
||||
# and CDI Longhorn PVC (`windows-server-2025-iso`) are RETAINED for
|
||||
# this commit so the prior states are recoverable. Once the
|
||||
# containerDisk path proves on a successful Windows install, both
|
||||
# legacy artifacts can be pruned in a follow-up commit.
|
||||
containerDisk:
|
||||
image: localhost/win-server-2025:1.0
|
||||
imagePullPolicy: Never
|
||||
- name: virtio-drivers
|
||||
containerDisk:
|
||||
# Pinned to v1.8.2 (latest stable as of 2026-05-08).
|
||||
|
||||
@@ -974,6 +974,39 @@ data:
|
||||
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
|
||||
description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."
|
||||
|
||||
# Q-MR-3 (2026-05-11): multus memory pressure — catches the next OOM
|
||||
# cascade BEFORE multus is OOM-killed cluster-wide. The 2026-05-10
|
||||
# outage (21h) hit because no alert fired on the rising multus working
|
||||
# set — only downstream blackbox / Traefik / service alerts. With
|
||||
# 1Gi limit (bluejay-infra@eb8693e), 80% = ~800MiB; steady-state
|
||||
# runs ~150-250MiB so this only fires when an avalanche starts.
|
||||
- alert: MultusMemoryPressure
|
||||
expr: |
|
||||
container_memory_working_set_bytes{container="kube-multus"}
|
||||
/ container_spec_memory_limit_bytes{container="kube-multus"} > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
alert_channel: thermal_print
|
||||
annotations:
|
||||
summary: "kube-multus memory >80% of limit on {{ $labels.node }} for 5m"
|
||||
description: "kube-multus working set is {{ $value | humanizePercentage }} of its memory limit on node {{ $labels.node }}. If this keeps climbing, multus will OOM and all new pod networking will halt cluster-wide (precedent: 2026-05-10 outage)."
|
||||
|
||||
# Q-MR-3 (2026-05-11): namespace pending-pod backlog — catches the
|
||||
# operator-leak avalanche pattern BEFORE it cascades into a multus
|
||||
# CNI OOM. Any FC operator (RemoteDesktop / Distribution / WorldBuilder)
|
||||
# emitting pods without ownerReferences will accumulate them when
|
||||
# the operator crashes. >25 pending pods in any namespace for 30m
|
||||
# is the signal to investigate the reconciler.
|
||||
- alert: NamespacePendingPodBacklog
|
||||
expr: sum by (namespace) (kube_pod_status_phase{phase="Pending"}) > 25
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Namespace {{ $labels.namespace }} has {{ $value }} Pending pods for 30m"
|
||||
description: "Pending pod count in {{ $labels.namespace }} exceeds 25 sustained for 30m. Likely operator-leak avalanche pattern — children emitted without ownerReferences. Risk of multus CNI OOM cascade."
|
||||
|
||||
# Longhorn storage health alerts. Required: longhorn scrape job
|
||||
# (added 2026-04-26 — see scrape_configs above). The K8s events
|
||||
# for "snapshot becomes not ready to use" are transient lifecycle
|
||||
@@ -3362,92 +3395,6 @@ data:
|
||||
relativeTimeRange: {from: 120, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
||||
- orgId: 1
|
||||
name: Signage Marquee
|
||||
folder: AI Stack Alerts
|
||||
interval: 1m
|
||||
rules:
|
||||
- uid: marquee-dropped-frames-high
|
||||
title: MarqueeDroppedFramesHigh
|
||||
condition: C
|
||||
for: 5m
|
||||
noDataState: OK
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: Marquee dropped-frame rate above 5%
|
||||
description: "Dropped frames exceeded the IR-21 budget for a renderer/phase/node tuple. Grafana owns alert delivery to IRC #alerts; Prometheus rules remain only the visibility source."
|
||||
runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Filter renderer/node/phase 3. Compare latest AAT baseline diff 4. Restart only the affected player if the issue is node-local"
|
||||
labels:
|
||||
severity: warning
|
||||
service: signage
|
||||
alert_channel: irc
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: '(sum by (renderer, node_id, phase) (rate(marquee_dropped_frames_total[5m])) / sum by (renderer, node_id, phase) (rate(marquee_render_latency_ms_count[5m]))) * 100', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [5], type: gt}}], refId: C}
|
||||
- uid: marquee-render-latency-p99-high
|
||||
title: MarqueeRenderLatencyP99High
|
||||
condition: C
|
||||
for: 5m
|
||||
noDataState: OK
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: Marquee render latency p99 above 16ms
|
||||
description: "Renderer p99 latency exceeded the Pi-class 16ms budget. Grafana delivers this alert to IRC #alerts."
|
||||
runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Check render latency p99 by renderer/node/phase 3. Compare with dropped frames and node CPU 4. If isolated to WPF, capture current Player.Wpf frame set before restart"
|
||||
labels:
|
||||
severity: warning
|
||||
service: signage
|
||||
alert_channel: irc
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: 'histogram_quantile(0.99, sum by (renderer, node_id, phase, le) (rate(marquee_render_latency_ms_bucket[5m])))', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [16], type: gt}}], refId: C}
|
||||
- uid: marquee-animation-duration-drift
|
||||
title: MarqueeAnimationDurationDrift
|
||||
condition: C
|
||||
for: 10m
|
||||
noDataState: OK
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: Marquee animation duration drift above 10%
|
||||
description: "Observed cycle duration has drifted more than 10% from target for a renderer/phase pair. Grafana delivers this alert to IRC #alerts."
|
||||
runbook: "1. Open /d/fc-marquee-perf/marquee-animation-performance 2. Compare observed vs target duration 3. Check recent theme/preset changes 4. Re-run MarqueeHolidayBrandTrajectoryTests before promoting a baseline"
|
||||
labels:
|
||||
severity: warning
|
||||
service: signage
|
||||
alert_channel: irc
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 900, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: 'abs((histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m]))) - avg by (renderer, phase) (marquee_animation_duration_target_ms)) / avg by (renderer, phase) (marquee_animation_duration_target_ms))', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 900, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 900, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.1], type: gt}}], refId: C}
|
||||
- orgId: 1
|
||||
name: Infrastructure
|
||||
folder: AI Stack Alerts
|
||||
|
||||
@@ -188,13 +188,24 @@ spec:
|
||||
- name: kube-multus
|
||||
image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
|
||||
command: [ "/usr/src/multus-cni/bin/multus-daemon" ]
|
||||
# 2026-05-11: upstream default of 50Mi memory limit OOM-cascades when
|
||||
# an operator-owned namespace accumulates >100 pending pods retrying
|
||||
# CNI ADD. RemoteDesktop emitted 219 orphan rd-browser-only pods
|
||||
# (missing OwnerReferences), kubelet's CNI ADD avalanche pushed multus
|
||||
# over 50Mi, OOMKilled, restarted with even bigger backlog → loop.
|
||||
# 21h cluster outage. See FlowerCore.Notes:
|
||||
# feedback_multus_50mi_limit_oom_orphan_pod_avalanche.md
|
||||
# 1Gi limit / 512Mi request comfortably handles a 200+ pod CNI
|
||||
# catchup burst on 64GB nodes (nodes are <25% used in steady-state).
|
||||
# Drop back toward 256Mi only after MultusMemoryPressure alert
|
||||
# proves steady-state working set sits well below 200Mi.
|
||||
resources:
|
||||
requests:
|
||||
cpu: "100m"
|
||||
memory: "50Mi"
|
||||
memory: "512Mi"
|
||||
limits:
|
||||
cpu: "100m"
|
||||
memory: "50Mi"
|
||||
memory: "1Gi"
|
||||
securityContext:
|
||||
privileged: true
|
||||
terminationMessagePolicy: FallbackToLogsOnError
|
||||
|
||||
@@ -127,10 +127,13 @@ spec:
|
||||
initContainers:
|
||||
- name: fix-data-perms
|
||||
image: busybox:latest
|
||||
# Also chown /shared-tts (hostPath /tmp/tts-audio) so the non-root
|
||||
# app user (uid 1654) can write Piper .sln16 files that Asterisk
|
||||
# reads at /var/lib/asterisk/sounds/tts. World-readable (755) is
|
||||
# fine — Asterisk runs as a different uid in the other pod.
|
||||
# Must run as root to chown the hostPath /tmp/tts-audio that may be
|
||||
# root-owned after node reboot. Pod-level runAsNonRoot:true would
|
||||
# otherwise inherit and chown would fail with EPERM (see Notes memory
|
||||
# feedback_hostpath_initcontainer_chown_perms).
|
||||
securityContext:
|
||||
runAsUser: 0
|
||||
runAsNonRoot: false
|
||||
command: ["sh", "-c", "chown -R 1654:1654 /data && chown 1654:1654 /shared-tts && chmod 0755 /shared-tts"]
|
||||
volumeMounts:
|
||||
- name: telephony-data
|
||||
|
||||
@@ -291,6 +291,184 @@ public sealed class FleetManifestLintTests
|
||||
violations.Should().BeEmpty();
|
||||
}
|
||||
|
||||
[Fact]
public void FcDeviceManagement_MustShipExpectedManifestSet()
{
    // Verifies that apps/fc-devicemgmt exists, that the *.yaml files directly
    // inside it are exactly the expected set, and that Inventory.Documents
    // exposes at least one document for each of those files.
    var appRoot = Path.Combine(Inventory.BluejayRoot, "apps", "fc-devicemgmt");
    Directory.Exists(appRoot).Should().BeTrue("Sprint 8 Cx-5 owns apps/fc-devicemgmt.");

    var expectedFiles = new[]
    {
        "1password-item.yaml",
        "argocd-application.yaml",
        "certificate-web.yaml",
        "clusterrole-operator.yaml",
        "clusterrolebinding-operator.yaml",
        "deployment-operator.yaml",
        "deployment-web.yaml",
        "ingressroute-web.yaml",
        "namespace.yaml",
        "network-policy.yaml",
        "service-web.yaml",
        "serviceaccount-operator.yaml",
    };

    // Equivalence (not containment): an unexpected extra manifest fails the
    // test just like a missing one.
    Directory.GetFiles(appRoot, "*.yaml")
        .Select(Path.GetFileName)
        .Should()
        .BeEquivalentTo(expectedFiles);

    // The helper filters and materializes the inventory on every call, so
    // fetch the document list once instead of once per expected file.
    var documents = FcDeviceManagementDocuments();

    foreach (var expectedFile in expectedFiles)
    {
        var relativePath = $"fc-devicemgmt/{expectedFile}";

        documents
            .Should()
            .Contain(
                document => document.RelativePath == relativePath,
                "the manifest inventory should expose a document for {0}",
                relativePath);
    }
}
|
||||
|
||||
[Fact]
public void FcDeviceManagement_ObjectsMustCarryStandardTraceabilityLabels()
{
    // Every fc-devicemgmt object must carry the labels below in
    // metadata.labels. Deployments must additionally repeat them on the pod
    // template and annotate the pod template with flowercore.io/audit-trace-id.
    var requiredLabels = new[]
    {
        "app.kubernetes.io/name",
        "app.kubernetes.io/part-of",
        "app.kubernetes.io/managed-by",
        "flowercore.io/tenant-id",
        "flowercore.io/created-by",
    };

    var documents = FcDeviceManagementDocuments();
    var deployments = documents.Where(document => document.Kind == "Deployment").ToList();
    var violations = new List<string>();

    // Pass 1: object-level labels on every document.
    foreach (var document in documents)
    {
        foreach (var label in requiredLabels)
        {
            if (string.IsNullOrWhiteSpace(document.Scalar("metadata", "labels", label)))
            {
                violations.Add($"{document.Descriptor} is missing metadata.labels['{label}'].");
            }
        }
    }

    // Pass 2: the same labels on each Deployment's pod template.
    foreach (var deployment in deployments)
    {
        foreach (var label in requiredLabels)
        {
            if (string.IsNullOrWhiteSpace(deployment.Scalar("spec", "template", "metadata", "labels", label)))
            {
                violations.Add($"{deployment.Descriptor} pod template is missing metadata.labels['{label}'].");
            }
        }
    }

    // Pass 3: audit trace annotation on each Deployment's pod template.
    foreach (var deployment in deployments)
    {
        var traceId = deployment.Scalar("spec", "template", "metadata", "annotations", "flowercore.io/audit-trace-id");
        if (string.IsNullOrWhiteSpace(traceId))
        {
            violations.Add($"{deployment.Descriptor} pod template is missing flowercore.io/audit-trace-id.");
        }
    }

    violations.Should().BeEmpty();
}
|
||||
|
||||
[Fact]
public void FcDeviceManagement_IngressMustUseCertManagerAndKeepPublicHostDisabled()
{
    // Checks three things about the fc-devicemgmt ingress setup:
    //   1. The raw manifests never mention "certResolver", but do mention the
    //      public host and the "disabled-until-Q-OIDC-1" marker.
    //   2. IngressRoute rules match the LAN host and never the public host.
    //   3. The web Certificate is issued by the step-ca-acme ClusterIssuer for
    //      exactly one DNS name, the LAN host.
    var appText = string.Join(
        Environment.NewLine,
        Directory.GetFiles(Path.Combine(Inventory.BluejayRoot, "apps", "fc-devicemgmt"), "*.yaml")
            .Select(File.ReadAllText));

    appText.Should().NotContain("certResolver");
    appText.Should().Contain("update.flowercore.io");
    appText.Should().Contain("disabled-until-Q-OIDC-1");

    var documents = FcDeviceManagementDocuments();

    documents
        .Where(document => document.Kind == "IngressRoute")
        .SelectMany(document => document.MappingSequence("spec", "routes"))
        .Select(route => ManifestNodeExtensions.Scalar(route, "match") ?? string.Empty)
        .Should()
        .Contain(match => match.Contains("Host(`devices.iamworkin.lan`)", StringComparison.Ordinal))
        .And.NotContain(match => match.Contains("Host(`update.flowercore.io`)", StringComparison.Ordinal));

    var certificate = documents
        .Single(document => document.Kind == "Certificate" && document.Name == "fc-devicemgmt-web-tls");

    certificate.Scalar("spec", "issuerRef", "name").Should().Be("step-ca-acme");
    certificate.Scalar("spec", "issuerRef", "kind").Should().Be("ClusterIssuer");

    // ContainSingle(string) treats its argument as the "because" reason, not
    // as the expected element, so the hostname must be asserted separately on
    // the single element that ContainSingle() returns.
    ManifestNodeExtensions.ScalarSequence(certificate.Root, "spec", "dnsNames")
        .Should()
        .ContainSingle()
        .Which.Should().Be("devices.iamworkin.lan");
}
|
||||
|
||||
[Fact]
public void FcDeviceManagement_OperatorRbacMustCoverDevicesAndOwnerLookup()
{
    // The operator ClusterRole must mention each of these scalar values
    // somewhere in its document.
    var requiredRoleScalars = new[] { "devices.flowercore.io", "*", "deployments", "get" };

    var roleScalars = FcDeviceManagementDocuments()
        .Single(document => document.Kind == "ClusterRole" && document.Name == "fc-devicemgmt-operator")
        .AllScalars()
        .ToList();

    foreach (var requiredScalar in requiredRoleScalars)
    {
        roleScalars.Should().Contain(requiredScalar);
    }

    // The operator Deployment must mention the owner-deployment variable name
    // and its own name among its scalars.
    var deploymentScalars = FcDeviceManagementDocuments()
        .Single(document => document.Kind == "Deployment" && document.Name == "fc-devicemgmt-operator")
        .AllScalars()
        .ToList();

    deploymentScalars.Should().Contain("FLOWERCORE_KUBERNETES_OWNER_DEPLOYMENT");
    deploymentScalars.Should().Contain("fc-devicemgmt-operator");
}
|
||||
|
||||
[Fact]
public void FcDeviceManagement_RuntimeSecretsMustUseOnePasswordItemPattern()
{
    // The runtime secret must be declared as a OnePasswordItem pointing at the
    // expected vault item path.
    var onePasswordItem = FcDeviceManagementDocuments()
        .Single(document => document.Kind == "OnePasswordItem" && document.Name == "fc-devicemgmt-runtime");

    onePasswordItem.Scalar("spec", "itemPath")
        .Should()
        .Be("vaults/IAmWorkin/items/FlowerCore DeviceManagement Runtime");

    // Concatenate the raw YAML of every manifest for substring checks.
    var manifestDirectory = Path.Combine(Inventory.BluejayRoot, "apps", "fc-devicemgmt");
    var manifestTexts = Directory.GetFiles(manifestDirectory, "*.yaml").Select(File.ReadAllText);
    var appText = string.Join(Environment.NewLine, manifestTexts);

    // No document of kind Secret may be present among the app's documents.
    FcDeviceManagementDocuments().Should().NotContain(document => document.Kind == "Secret");

    // The manifests must reference the synced secret...
    appText.Should().Contain("secretKeyRef:");
    appText.Should().Contain("secretName: fc-devicemgmt-runtime");

    // ...and must not contain any of these fragments.
    foreach (var forbiddenFragment in new[] { "stringData:", "from-literal", "tls.key:" })
    {
        appText.Should().NotContain(forbiddenFragment);
    }
}
|
||||
|
||||
[Fact]
public void FcDeviceManagement_NetworkPoliciesMustAllowLanAgentsSynologyAndDnatPorts()
{
    // Exactly two NetworkPolicy documents are expected for this app.
    var policies = FcDeviceManagementDocuments()
        .Where(document => document.Kind == "NetworkPolicy")
        .ToList();

    policies.Should().HaveCount(2);

    // Each of these CIDRs must appear as a scalar in at least one policy.
    var requiredCidrs = new[]
    {
        "10.0.56.0/24",
        "10.0.57.0/24",
        "10.0.58.0/24",
        "10.0.68.0/27",
        "10.0.58.3/32",
    };

    var combinedScalars = policies.SelectMany(policy => policy.AllScalars()).ToList();
    foreach (var cidr in requiredCidrs)
    {
        combinedScalars.Should().Contain(cidr);
    }

    // Across both policies, all of these egress ports must be present.
    var combinedEgressPorts = policies
        .SelectMany(policy => policy.EgressPorts())
        .ToHashSet(StringComparer.Ordinal);
    combinedEgressPorts.Should().Contain(new[] { "80", "443", "8080", "8443", "2049", "111" });

    // Exactly one policy may mention 10.0.56.200, and that policy alone must
    // carry the four HTTP(S)-style egress ports.
    var traefikVipPolicies = policies
        .Where(policy => policy.AllScalars().Any(value => value.Contains("10.0.56.200", StringComparison.Ordinal)))
        .ToList();

    traefikVipPolicies.Should().ContainSingle();

    var traefikVipPolicy = traefikVipPolicies[0];
    traefikVipPolicy.EgressPorts().Should().Contain(new[] { "80", "443", "8080", "8443" });
}
|
||||
|
||||
[Fact]
public void FcDeviceManagement_ArgocdApplicationMustMatchApplicationSetDiscoveryConventions()
{
    // Pins the explicit ArgoCD Application: its name and namespace, the
    // repository URL and path it syncs from, and the namespace it deploys to.
    const string ExpectedRepoUrl = "http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git";

    var argoApplication = FcDeviceManagementDocuments()
        .Single(document => document.Kind == "Application" && document.Name == "infra-fc-devicemgmt");

    argoApplication.Namespace.Should().Be("argocd");

    var repoUrl = argoApplication.Scalar("spec", "source", "repoURL");
    repoUrl.Should().Be(ExpectedRepoUrl);

    var sourcePath = argoApplication.Scalar("spec", "source", "path");
    sourcePath.Should().Be("apps/fc-devicemgmt");

    var destinationNamespace = argoApplication.Scalar("spec", "destination", "namespace");
    destinationNamespace.Should().Be("fc-devicemgmt");
}
|
||||
|
||||
private static IEnumerable<string> ProbeViolations(
|
||||
ManifestDocument document,
|
||||
YamlMappingNode container,
|
||||
@@ -314,6 +492,13 @@ public sealed class FleetManifestLintTests
|
||||
$"{document.Descriptor} container '{containerName}' still uses {probeKey}.httpGet on /health.",
|
||||
};
|
||||
}
|
||||
|
||||
// Returns the inventory documents whose RelativePath begins with
// "fc-devicemgmt/" (ordinal comparison), materialized into a new list on
// every call.
private static IReadOnlyList<ManifestDocument> FcDeviceManagementDocuments()
{
    const string AppPathPrefix = "fc-devicemgmt/";

    var appDocuments = new List<ManifestDocument>();
    foreach (var document in Inventory.Documents)
    {
        if (document.RelativePath.StartsWith(AppPathPrefix, StringComparison.Ordinal))
        {
            appDocuments.Add(document);
        }
    }

    return appDocuments;
}
|
||||
}
|
||||
|
||||
internal sealed class ManifestInventory
|
||||
|
||||
Reference in New Issue
Block a user