Compare commits
9 Commits
e2e93d482c
...
codex/s57-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0ed9b989fa | ||
|
|
404d884863 | ||
| f4bd90f805 | |||
|
|
67d67ab73d | ||
|
|
f7d41cdc60 | ||
|
|
2c0afc28e4 | ||
|
|
ba5f5dd0fb | ||
|
|
dc699da7b3 | ||
|
|
1e8bf54c6e |
2
.gitattributes
vendored
2
.gitattributes
vendored
@@ -1,2 +1,4 @@
|
|||||||
/.gitattributes text eol=lf
|
/.gitattributes text eol=lf
|
||||||
|
*.yaml text eol=lf
|
||||||
|
*.yml text eol=lf
|
||||||
*.sh text eol=lf
|
*.sh text eol=lf
|
||||||
|
|||||||
169
apps/fc-aistation/fc-aistation.yaml
Normal file
169
apps/fc-aistation/fc-aistation.yaml
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
# FlowerCore.AiStation.Web GitOps adoption manifest.
|
||||||
|
#
|
||||||
|
# Authored from the already-live fc-aistation resources on 2026-06-04.
|
||||||
|
# Keep the live image tag, Service ClusterIP, and PVC volumeName unchanged so
|
||||||
|
# ArgoCD adopts in place instead of replacing the workload or data volume.
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: aistation-web-data
|
||||||
|
namespace: fc-aistation
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: aistation-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-aistation
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 1Gi
|
||||||
|
storageClassName: longhorn
|
||||||
|
volumeMode: Filesystem
|
||||||
|
volumeName: pvc-27448d6f-6e66-42a7-a293-73dd8bbd6b3e
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: aistation-web
|
||||||
|
namespace: fc-aistation
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: aistation-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-aistation
|
||||||
|
spec:
|
||||||
|
progressDeadlineSeconds: 600
|
||||||
|
replicas: 1
|
||||||
|
revisionHistoryLimit: 3
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/name: aistation-web
|
||||||
|
strategy:
|
||||||
|
type: Recreate
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
annotations:
|
||||||
|
prometheus.io/path: /metrics/prometheus
|
||||||
|
prometheus.io/port: "5000"
|
||||||
|
prometheus.io/scrape: "true"
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: aistation-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- envFrom:
|
||||||
|
- configMapRef:
|
||||||
|
name: aistation-web-config
|
||||||
|
image: localhost/fc-aistation-web:v20260602-aistation-owned-deploy-fix2
|
||||||
|
imagePullPolicy: Never
|
||||||
|
livenessProbe:
|
||||||
|
failureThreshold: 3
|
||||||
|
httpGet:
|
||||||
|
path: /healthz
|
||||||
|
port: 5000
|
||||||
|
scheme: HTTP
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 30
|
||||||
|
successThreshold: 1
|
||||||
|
timeoutSeconds: 5
|
||||||
|
name: aistation-web
|
||||||
|
ports:
|
||||||
|
- containerPort: 5000
|
||||||
|
name: http
|
||||||
|
protocol: TCP
|
||||||
|
readinessProbe:
|
||||||
|
failureThreshold: 6
|
||||||
|
httpGet:
|
||||||
|
path: /healthz
|
||||||
|
port: 5000
|
||||||
|
scheme: HTTP
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
periodSeconds: 10
|
||||||
|
successThreshold: 1
|
||||||
|
timeoutSeconds: 5
|
||||||
|
resources: {}
|
||||||
|
terminationMessagePath: /dev/termination-log
|
||||||
|
terminationMessagePolicy: File
|
||||||
|
volumeMounts:
|
||||||
|
- mountPath: /data
|
||||||
|
name: data
|
||||||
|
dnsPolicy: ClusterFirst
|
||||||
|
restartPolicy: Always
|
||||||
|
schedulerName: default-scheduler
|
||||||
|
securityContext: {}
|
||||||
|
terminationGracePeriodSeconds: 30
|
||||||
|
volumes:
|
||||||
|
- name: data
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: aistation-web-data
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: aistation-web
|
||||||
|
namespace: fc-aistation
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: aistation-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-aistation
|
||||||
|
spec:
|
||||||
|
clusterIP: 10.43.211.127
|
||||||
|
clusterIPs:
|
||||||
|
- 10.43.211.127
|
||||||
|
internalTrafficPolicy: Cluster
|
||||||
|
ipFamilies:
|
||||||
|
- IPv4
|
||||||
|
ipFamilyPolicy: SingleStack
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
port: 80
|
||||||
|
protocol: TCP
|
||||||
|
targetPort: 5000
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/name: aistation-web
|
||||||
|
sessionAffinity: None
|
||||||
|
type: ClusterIP
|
||||||
|
---
|
||||||
|
apiVersion: cert-manager.io/v1
|
||||||
|
kind: Certificate
|
||||||
|
metadata:
|
||||||
|
name: aistation-web-tls
|
||||||
|
namespace: fc-aistation
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: aistation-web-tls
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-aistation
|
||||||
|
spec:
|
||||||
|
dnsNames:
|
||||||
|
- aistation.iamworkin.lan
|
||||||
|
issuerRef:
|
||||||
|
kind: ClusterIssuer
|
||||||
|
name: step-ca-acme
|
||||||
|
secretName: aistation-web-tls
|
||||||
|
---
|
||||||
|
apiVersion: traefik.io/v1alpha1
|
||||||
|
kind: IngressRoute
|
||||||
|
metadata:
|
||||||
|
name: aistation-web
|
||||||
|
namespace: fc-aistation
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: aistation-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-aistation
|
||||||
|
spec:
|
||||||
|
entryPoints:
|
||||||
|
- websecure
|
||||||
|
routes:
|
||||||
|
- kind: Rule
|
||||||
|
match: Host(`aistation.iamworkin.lan`)
|
||||||
|
services:
|
||||||
|
- name: aistation-web
|
||||||
|
port: 80
|
||||||
|
tls:
|
||||||
|
secretName: aistation-web-tls
|
||||||
@@ -1,5 +1,206 @@
|
|||||||
# FlowerCore Chat — TLS + Ingress
|
# FlowerCore Chat
|
||||||
# Deployment and Service managed by deploy script (not ArgoCD)
|
#
|
||||||
|
# ArgoCD-managed workload plus TLS/Ingress. The chat-web-secret remains an
|
||||||
|
# out-of-band Secret until the values are moved into a 1Password-backed item;
|
||||||
|
# the Deployment references it as optional so GitOps can own the workload
|
||||||
|
# without storing secret material in this repo.
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: fc-chat
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: chat-web-config
|
||||||
|
namespace: fc-chat
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: chat-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
data:
|
||||||
|
ASPNETCORE_ENVIRONMENT: Production
|
||||||
|
ASPNETCORE_URLS: "http://+:8080"
|
||||||
|
ASPNETCORE_FORWARDEDHEADERS_ENABLED: "true"
|
||||||
|
FlowerCore__Auth__Enabled: "false"
|
||||||
|
FlowerCore__Auth__Oidc__Enabled: "true"
|
||||||
|
FlowerCore__Auth__Oidc__Authority: "https://id.iamworkin.lan/application/o/chat/"
|
||||||
|
FlowerCore__Auth__Oidc__Audience: "chat"
|
||||||
|
FlowerCore__Auth__Oidc__ClientId: "chat"
|
||||||
|
FlowerCore__Database__ConnectionStrings__Sqlite: "Data Source=/data/chat.db"
|
||||||
|
# Ollama target. Switched 2026-04-25 from edge1 Pi5 (10.0.57.17) to BLUEJAY-WS
|
||||||
|
# workstation (10.0.56.20, RX 9070 XT 16GB, OLLAMA_HOST=0.0.0.0:11434, Vulkan
|
||||||
|
# backend per feedback_rdna4_vulkan_broken). The Pi5 was timing out every team-
|
||||||
|
# round speaker at the 300s per-turn cap (live-proven 2026-04-25 03:53 UTC,
|
||||||
|
# see feedback_chat_team_round_edge1_too_slow). Workstation has gemma3:4b for
|
||||||
|
# the Cheap tier, plus gemma3:27b/phi4:14b/qwen3:14b for Default/Balanced/Deep.
|
||||||
|
# Piper TTS stays on edge1 below (different service, Pi handles TTS fine).
|
||||||
|
FlowerCore__AI__OllamaBaseUrl: "http://10.0.56.20:11434"
|
||||||
|
FlowerCore__AI__DefaultModelName: "phi4:14b"
|
||||||
|
ChatOptions__BehaviorRuleEngine__OllamaBaseUrl: "http://10.0.56.20:11434"
|
||||||
|
ChatOptions__BehaviorRuleEngine__FallbackOllamaBaseUrl: "http://10.0.57.17:11434"
|
||||||
|
ChatOptions__BehaviorRuleEngine__ModelName: "gemma3:12b"
|
||||||
|
FlowerCore__AI__Memory__UseSharedIndexingAdapter: "true"
|
||||||
|
FlowerCore__AI__Memory__UseOllamaEmbeddings: "true"
|
||||||
|
FlowerCore__AI__Memory__EmbeddingModel: "nomic-embed-text"
|
||||||
|
FlowerCore__AI__Memory__EnableSharedIndexingBackfill: "true"
|
||||||
|
FlowerCore__AI__Memory__SharedIndexingDatabasePath: "/data/chat-memory-index.db"
|
||||||
|
FlowerCore__AI__Skills__Library__LibraryApiUrl: "http://library-web.fc-library.svc.cluster.local"
|
||||||
|
FlowerCore__AI__Skills__Retail__RetailApiUrl: "http://retail-web.fc-retail.svc.cluster.local"
|
||||||
|
FlowerCore__AI__Skills__Intranet__IntranetBaseUrl: "http://intranet-web.intranet.svc.cluster.local"
|
||||||
|
FlowerCore__AI__Skills__Print__PrintMcpBaseUrl: "http://10.0.57.16:5200"
|
||||||
|
FlowerCore__AI__IrcBridge__Enabled: "true"
|
||||||
|
FlowerCore__AI__IrcBridge__DefaultProfileSlug: "it-helpdesk"
|
||||||
|
FlowerCore__AI__IrcBridge__MentionProfileSlug: "it-helpdesk"
|
||||||
|
FlowerCore__AI__IrcBridge__MentionReactiveMode: "mentions-only"
|
||||||
|
FlowerCore__AI__IrcBridge__AllowActionExecution: "false"
|
||||||
|
FlowerCore__AI__Voice__Piper__Host: "10.0.57.17"
|
||||||
|
FlowerCore__AI__Voice__Piper__Port: "10400"
|
||||||
|
FlowerCore__AI__Voice__OutputRoot: "/data/audio"
|
||||||
|
FlowerCore__AI__Voice__RetentionDays: "30"
|
||||||
|
# LLM provider abstraction (ADR-088). Anthropic stays disabled here -- when
|
||||||
|
# an operator wants to enable Claude, they flip Enabled=true and mount
|
||||||
|
# FlowerCore__Anthropic__ApiKey from the onepassword-synced Secret (see
|
||||||
|
# docs/ai-agents/anthropic-integration.md).
|
||||||
|
FlowerCore__Anthropic__Enabled: "false"
|
||||||
|
FlowerCore__Anthropic__BaseUrl: "https://api.anthropic.com"
|
||||||
|
FlowerCore__Anthropic__DefaultModel: "claude-sonnet-4-6"
|
||||||
|
FlowerCore__Anthropic__CheapModel: "claude-haiku-4-5-20251001"
|
||||||
|
FlowerCore__Anthropic__DeepModel: "claude-opus-4-7"
|
||||||
|
FlowerCore__Budget__ResponseCacheEnabled: "true"
|
||||||
|
OTEL_SERVICE_NAME: FlowerCore.Chat
|
||||||
|
OTEL_EXPORTER_OTLP_ENDPOINT: "http://otel-collector.monitoring.svc.cluster.local:4317"
|
||||||
|
OTEL_EXPORTER_OTLP_PROTOCOL: grpc
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: chat-web-data
|
||||||
|
namespace: fc-chat
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: chat-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
storageClassName: longhorn
|
||||||
|
volumeMode: Filesystem
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 1Gi
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: chat-web
|
||||||
|
namespace: fc-chat
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: chat-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
strategy:
|
||||||
|
type: Recreate
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/name: chat-web
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: chat-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
annotations:
|
||||||
|
prometheus.io/scrape: "true"
|
||||||
|
prometheus.io/port: "8080"
|
||||||
|
prometheus.io/path: "/metrics/prometheus"
|
||||||
|
spec:
|
||||||
|
nodeSelector:
|
||||||
|
kubernetes.io/hostname: rke2-server
|
||||||
|
securityContext:
|
||||||
|
fsGroup: 1654
|
||||||
|
fsGroupChangePolicy: OnRootMismatch
|
||||||
|
containers:
|
||||||
|
- name: chat-web
|
||||||
|
image: localhost/fc-chat-web:v20260603-oidc-authentik
|
||||||
|
imagePullPolicy: Never
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
containerPort: 8080
|
||||||
|
envFrom:
|
||||||
|
- configMapRef:
|
||||||
|
name: chat-web-config
|
||||||
|
- secretRef:
|
||||||
|
name: chat-web-secret
|
||||||
|
optional: true
|
||||||
|
env:
|
||||||
|
- name: FlowerCore__Auth__Oidc__Authority
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: chat-oidc-client
|
||||||
|
key: issuer_url
|
||||||
|
optional: true
|
||||||
|
- name: FlowerCore__Auth__Oidc__ClientId
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: chat-oidc-client
|
||||||
|
key: client_id
|
||||||
|
optional: true
|
||||||
|
- name: FlowerCore__Auth__Oidc__ClientSecret
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: chat-oidc-client
|
||||||
|
key: client_secret
|
||||||
|
optional: true
|
||||||
|
volumeMounts:
|
||||||
|
- name: data
|
||||||
|
mountPath: /data
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: "128Mi"
|
||||||
|
cpu: "100m"
|
||||||
|
limits:
|
||||||
|
memory: "512Mi"
|
||||||
|
cpu: "500m"
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /healthz
|
||||||
|
port: 8080
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
periodSeconds: 10
|
||||||
|
timeoutSeconds: 5
|
||||||
|
failureThreshold: 6
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /healthz
|
||||||
|
port: 8080
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 30
|
||||||
|
timeoutSeconds: 5
|
||||||
|
failureThreshold: 3
|
||||||
|
volumes:
|
||||||
|
- name: data
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: chat-web-data
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: chat-web
|
||||||
|
namespace: fc-chat
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: chat-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
spec:
|
||||||
|
type: ClusterIP
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/name: chat-web
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
port: 80
|
||||||
|
targetPort: 8080
|
||||||
|
protocol: TCP
|
||||||
---
|
---
|
||||||
apiVersion: cert-manager.io/v1
|
apiVersion: cert-manager.io/v1
|
||||||
kind: Certificate
|
kind: Certificate
|
||||||
|
|||||||
169
apps/fc-library/fc-library.yaml
Normal file
169
apps/fc-library/fc-library.yaml
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
# FlowerCore.Library.Web GitOps adoption manifest.
|
||||||
|
#
|
||||||
|
# Authored from the already-live fc-library resources on 2026-06-04.
|
||||||
|
# Keep the live image tag, Service ClusterIP, and PVC volumeName unchanged so
|
||||||
|
# ArgoCD adopts in place instead of replacing the workload or data volume.
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: library-web-data
|
||||||
|
namespace: fc-library
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: library-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-library
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 1Gi
|
||||||
|
storageClassName: longhorn
|
||||||
|
volumeMode: Filesystem
|
||||||
|
volumeName: pvc-2690bae2-4ee0-417a-b95f-50ec5c632b63
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: library-web
|
||||||
|
namespace: fc-library
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: library-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-library
|
||||||
|
spec:
|
||||||
|
progressDeadlineSeconds: 600
|
||||||
|
replicas: 1
|
||||||
|
revisionHistoryLimit: 3
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/name: library-web
|
||||||
|
strategy:
|
||||||
|
type: Recreate
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
annotations:
|
||||||
|
prometheus.io/path: /metrics/prometheus
|
||||||
|
prometheus.io/port: "5000"
|
||||||
|
prometheus.io/scrape: "true"
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: library-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- envFrom:
|
||||||
|
- configMapRef:
|
||||||
|
name: library-web-config
|
||||||
|
image: localhost/fc-library-web:v20260602-library-owned-deploy-fix1
|
||||||
|
imagePullPolicy: Never
|
||||||
|
livenessProbe:
|
||||||
|
failureThreshold: 3
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 5000
|
||||||
|
scheme: HTTP
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 30
|
||||||
|
successThreshold: 1
|
||||||
|
timeoutSeconds: 5
|
||||||
|
name: library-web
|
||||||
|
ports:
|
||||||
|
- containerPort: 5000
|
||||||
|
name: http
|
||||||
|
protocol: TCP
|
||||||
|
readinessProbe:
|
||||||
|
failureThreshold: 6
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 5000
|
||||||
|
scheme: HTTP
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
periodSeconds: 10
|
||||||
|
successThreshold: 1
|
||||||
|
timeoutSeconds: 5
|
||||||
|
resources: {}
|
||||||
|
terminationMessagePath: /dev/termination-log
|
||||||
|
terminationMessagePolicy: File
|
||||||
|
volumeMounts:
|
||||||
|
- mountPath: /data
|
||||||
|
name: data
|
||||||
|
dnsPolicy: ClusterFirst
|
||||||
|
restartPolicy: Always
|
||||||
|
schedulerName: default-scheduler
|
||||||
|
securityContext: {}
|
||||||
|
terminationGracePeriodSeconds: 30
|
||||||
|
volumes:
|
||||||
|
- name: data
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: library-web-data
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: library-web
|
||||||
|
namespace: fc-library
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: library-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-library
|
||||||
|
spec:
|
||||||
|
clusterIP: 10.43.179.63
|
||||||
|
clusterIPs:
|
||||||
|
- 10.43.179.63
|
||||||
|
internalTrafficPolicy: Cluster
|
||||||
|
ipFamilies:
|
||||||
|
- IPv4
|
||||||
|
ipFamilyPolicy: SingleStack
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
port: 80
|
||||||
|
protocol: TCP
|
||||||
|
targetPort: 5000
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/name: library-web
|
||||||
|
sessionAffinity: None
|
||||||
|
type: ClusterIP
|
||||||
|
---
|
||||||
|
apiVersion: cert-manager.io/v1
|
||||||
|
kind: Certificate
|
||||||
|
metadata:
|
||||||
|
name: library-web-tls
|
||||||
|
namespace: fc-library
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: library-web-tls
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-library
|
||||||
|
spec:
|
||||||
|
dnsNames:
|
||||||
|
- library.iamworkin.lan
|
||||||
|
issuerRef:
|
||||||
|
kind: ClusterIssuer
|
||||||
|
name: step-ca-acme
|
||||||
|
secretName: library-web-tls
|
||||||
|
---
|
||||||
|
apiVersion: traefik.io/v1alpha1
|
||||||
|
kind: IngressRoute
|
||||||
|
metadata:
|
||||||
|
name: library-web
|
||||||
|
namespace: fc-library
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: library-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-library
|
||||||
|
spec:
|
||||||
|
entryPoints:
|
||||||
|
- websecure
|
||||||
|
routes:
|
||||||
|
- kind: Rule
|
||||||
|
match: Host(`library.iamworkin.lan`)
|
||||||
|
services:
|
||||||
|
- name: library-web
|
||||||
|
port: 80
|
||||||
|
tls:
|
||||||
|
secretName: library-web-tls
|
||||||
170
apps/fc-retail/fc-retail.yaml
Normal file
170
apps/fc-retail/fc-retail.yaml
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
# FlowerCore.Retail.Web GitOps adoption manifest.
|
||||||
|
#
|
||||||
|
# Authored from the already-live fc-retail resources on 2026-06-04.
|
||||||
|
# Keep the live image tag, Service ClusterIP, and PVC volumeName unchanged so
|
||||||
|
# ArgoCD adopts in place instead of replacing the workload or data volume.
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: retail-web-data
|
||||||
|
namespace: fc-retail
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: retail-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-retail
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 1Gi
|
||||||
|
storageClassName: longhorn
|
||||||
|
volumeMode: Filesystem
|
||||||
|
volumeName: pvc-3d40b336-eab4-41b3-812c-d5e9413ce0ab
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: retail-web
|
||||||
|
namespace: fc-retail
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: retail-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-retail
|
||||||
|
spec:
|
||||||
|
progressDeadlineSeconds: 600
|
||||||
|
replicas: 1
|
||||||
|
revisionHistoryLimit: 3
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/name: retail-web
|
||||||
|
strategy:
|
||||||
|
type: Recreate
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
annotations:
|
||||||
|
kubectl.kubernetes.io/restartedAt: "2026-06-02T01:34:08-05:00"
|
||||||
|
prometheus.io/path: /metrics/prometheus
|
||||||
|
prometheus.io/port: "5000"
|
||||||
|
prometheus.io/scrape: "true"
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: retail-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- envFrom:
|
||||||
|
- configMapRef:
|
||||||
|
name: retail-web-config
|
||||||
|
image: localhost/fc-retail-web:v20260602-retail-owned-deploy-fix5
|
||||||
|
imagePullPolicy: Never
|
||||||
|
livenessProbe:
|
||||||
|
failureThreshold: 3
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 5000
|
||||||
|
scheme: HTTP
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 30
|
||||||
|
successThreshold: 1
|
||||||
|
timeoutSeconds: 5
|
||||||
|
name: retail-web
|
||||||
|
ports:
|
||||||
|
- containerPort: 5000
|
||||||
|
name: http
|
||||||
|
protocol: TCP
|
||||||
|
readinessProbe:
|
||||||
|
failureThreshold: 6
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 5000
|
||||||
|
scheme: HTTP
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
periodSeconds: 10
|
||||||
|
successThreshold: 1
|
||||||
|
timeoutSeconds: 5
|
||||||
|
resources: {}
|
||||||
|
terminationMessagePath: /dev/termination-log
|
||||||
|
terminationMessagePolicy: File
|
||||||
|
volumeMounts:
|
||||||
|
- mountPath: /data
|
||||||
|
name: data
|
||||||
|
dnsPolicy: ClusterFirst
|
||||||
|
restartPolicy: Always
|
||||||
|
schedulerName: default-scheduler
|
||||||
|
securityContext: {}
|
||||||
|
terminationGracePeriodSeconds: 30
|
||||||
|
volumes:
|
||||||
|
- name: data
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: retail-web-data
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: retail-web
|
||||||
|
namespace: fc-retail
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: retail-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-retail
|
||||||
|
spec:
|
||||||
|
clusterIP: 10.43.239.8
|
||||||
|
clusterIPs:
|
||||||
|
- 10.43.239.8
|
||||||
|
internalTrafficPolicy: Cluster
|
||||||
|
ipFamilies:
|
||||||
|
- IPv4
|
||||||
|
ipFamilyPolicy: SingleStack
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
port: 80
|
||||||
|
protocol: TCP
|
||||||
|
targetPort: 5000
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/name: retail-web
|
||||||
|
sessionAffinity: None
|
||||||
|
type: ClusterIP
|
||||||
|
---
|
||||||
|
apiVersion: cert-manager.io/v1
|
||||||
|
kind: Certificate
|
||||||
|
metadata:
|
||||||
|
name: retail-web-tls
|
||||||
|
namespace: fc-retail
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: retail-web-tls
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-retail
|
||||||
|
spec:
|
||||||
|
dnsNames:
|
||||||
|
- retail.iamworkin.lan
|
||||||
|
issuerRef:
|
||||||
|
kind: ClusterIssuer
|
||||||
|
name: step-ca-acme
|
||||||
|
secretName: retail-web-tls
|
||||||
|
---
|
||||||
|
apiVersion: traefik.io/v1alpha1
|
||||||
|
kind: IngressRoute
|
||||||
|
metadata:
|
||||||
|
name: retail-web
|
||||||
|
namespace: fc-retail
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: retail-web
|
||||||
|
app.kubernetes.io/part-of: flowercore
|
||||||
|
app.kubernetes.io/managed-by: argocd
|
||||||
|
argocd.argoproj.io/instance: infra-fc-retail
|
||||||
|
spec:
|
||||||
|
entryPoints:
|
||||||
|
- websecure
|
||||||
|
routes:
|
||||||
|
- kind: Rule
|
||||||
|
match: Host(`retail.iamworkin.lan`)
|
||||||
|
services:
|
||||||
|
- name: retail-web
|
||||||
|
port: 80
|
||||||
|
tls:
|
||||||
|
secretName: retail-web-tls
|
||||||
@@ -102,7 +102,7 @@ spec:
|
|||||||
- name: web
|
- name: web
|
||||||
# Placeholder tag — bump to the image you built + imported to ALL
|
# Placeholder tag — bump to the image you built + imported to ALL
|
||||||
# RKE2 nodes via scripts/deploy-knowledge.sh before applying.
|
# RKE2 nodes via scripts/deploy-knowledge.sh before applying.
|
||||||
image: localhost/fc-knowledge-web:v20260429232635
|
image: localhost/fc-knowledge-web:v20260603-oidc-authentik-auditfix
|
||||||
imagePullPolicy: Never
|
imagePullPolicy: Never
|
||||||
command:
|
command:
|
||||||
- /bin/sh
|
- /bin/sh
|
||||||
@@ -123,6 +123,25 @@ spec:
|
|||||||
value: "Production"
|
value: "Production"
|
||||||
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
||||||
value: "false"
|
value: "false"
|
||||||
|
# Authentik/OIDC is wired but not enforced until the
|
||||||
|
# knowledge-oidc-client Secret is provisioned and
|
||||||
|
# FlowerCore__Auth__Enabled is flipped to true.
|
||||||
|
- name: FlowerCore__Auth__Enabled
|
||||||
|
value: "false"
|
||||||
|
- name: FlowerCore__Auth__Oidc__Enabled
|
||||||
|
value: "true"
|
||||||
|
- name: FlowerCore__Auth__Oidc__Authority
|
||||||
|
value: "https://id.iamworkin.lan/application/o/knowledge/"
|
||||||
|
- name: FlowerCore__Auth__Oidc__Audience
|
||||||
|
value: "knowledge"
|
||||||
|
- name: FlowerCore__Auth__Oidc__ClientId
|
||||||
|
value: "knowledge"
|
||||||
|
- name: FlowerCore__Auth__Oidc__ClientSecret
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: knowledge-oidc-client
|
||||||
|
key: client_secret
|
||||||
|
optional: true
|
||||||
# Vector-store directory + embedding model + edition profile dir.
|
# Vector-store directory + embedding model + edition profile dir.
|
||||||
# Profile JSON is baked into the image at /home/app/editions via the
|
# Profile JSON is baked into the image at /home/app/editions via the
|
||||||
# csproj Content-link from FlowerCore.Common/editions/.
|
# csproj Content-link from FlowerCore.Common/editions/.
|
||||||
@@ -134,6 +153,8 @@ spec:
|
|||||||
value: "5"
|
value: "5"
|
||||||
- name: Knowledge__MaxLimit
|
- name: Knowledge__MaxLimit
|
||||||
value: "50"
|
value: "50"
|
||||||
|
- name: Knowledge__Federation__DatabasePath
|
||||||
|
value: "/data/vector-stores/knowledge-federation.db"
|
||||||
- name: FlowerCore__Editions__ProfileDirectory
|
- name: FlowerCore__Editions__ProfileDirectory
|
||||||
value: "/home/app/editions"
|
value: "/home/app/editions"
|
||||||
# Embed via edge1 Pi 5 + AI HAT+ (10.0.57.17:11434). Cluster
|
# Embed via edge1 Pi 5 + AI HAT+ (10.0.57.17:11434). Cluster
|
||||||
|
|||||||
@@ -223,7 +223,7 @@ data:
|
|||||||
service: "pimanager"
|
service: "pimanager"
|
||||||
vlan: "home"
|
vlan: "home"
|
||||||
device: "pi4-ezconnect"
|
device: "pi4-ezconnect"
|
||||||
- targets: ["10.0.58.113:5100"]
|
- targets: ["10.0.58.113:5200"]
|
||||||
labels:
|
labels:
|
||||||
instance: "pirelay"
|
instance: "pirelay"
|
||||||
service: "pimanager"
|
service: "pimanager"
|
||||||
@@ -480,14 +480,16 @@ data:
|
|||||||
- "https://argocd.iamworkin.lan/"
|
- "https://argocd.iamworkin.lan/"
|
||||||
- "https://intranet.iamworkin.lan/"
|
- "https://intranet.iamworkin.lan/"
|
||||||
- "https://signage.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
- "https://signage.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
||||||
|
- "https://signalcontrol.iamworkin.lan/health" # FlowerCore.SignalControl explicit health route
|
||||||
- "https://kiosk.iamworkin.lan/"
|
- "https://kiosk.iamworkin.lan/"
|
||||||
- "https://media.iamworkin.lan/"
|
- "https://media.iamworkin.lan/"
|
||||||
- "https://mysql.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
- "https://mysql.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
||||||
- "https://php.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
- "https://php.iamworkin.lan/healthz" # root 401 auth-gated 2026-06-01; /healthz anon 200
|
||||||
|
- "https://dns.iamworkin.lan/"
|
||||||
- "https://zabbix.iamworkin.lan/"
|
- "https://zabbix.iamworkin.lan/"
|
||||||
|
- "https://flowercore.iamworkin.lan/healthz"
|
||||||
- "https://desktop.iamworkin.lan/"
|
- "https://desktop.iamworkin.lan/"
|
||||||
- "https://print.iamworkin.lan/"
|
- "https://print.iamworkin.lan/"
|
||||||
- "https://dns.iamworkin.lan/"
|
|
||||||
- "https://chat.iamworkin.lan/"
|
- "https://chat.iamworkin.lan/"
|
||||||
- "https://dist.iamworkin.lan/"
|
- "https://dist.iamworkin.lan/"
|
||||||
- "https://dms.iamworkin.lan/"
|
- "https://dms.iamworkin.lan/"
|
||||||
@@ -496,9 +498,15 @@ data:
|
|||||||
- "https://presentations.iamworkin.lan/"
|
- "https://presentations.iamworkin.lan/"
|
||||||
- "https://retail.iamworkin.lan/"
|
- "https://retail.iamworkin.lan/"
|
||||||
- "https://ttsreader.iamworkin.lan/"
|
- "https://ttsreader.iamworkin.lan/"
|
||||||
|
- "https://updates.iamworkin.lan/api/v1/manifests/_schema"
|
||||||
# Explicit healthcheck paths
|
# Explicit healthcheck paths
|
||||||
- "https://fc-llm-bridge.iamworkin.lan/healthz"
|
- "https://fc-llm-bridge.iamworkin.lan/healthz"
|
||||||
- "https://acme.iamworkin.lan/health"
|
- "https://acme.iamworkin.lan/health"
|
||||||
|
- "https://replay.iamworkin.lan/healthz"
|
||||||
|
- "https://updatecenter-internal.iamworkin.lan/api/v1/manifests/_schema"
|
||||||
|
- "https://worldbuilder.iamworkin.lan/healthz"
|
||||||
|
# Coverage gaps logged Q-MR-129/Q-MR-130: devices.iamworkin.lan
|
||||||
|
# returns 503 and e2e-test-pma/wpdemo only return 404.
|
||||||
# NOTE: services intentionally NOT in this probe surface
|
# NOTE: services intentionally NOT in this probe surface
|
||||||
# - grafana.iamworkin.lan: every endpoint (incl. /api/health
|
# - grafana.iamworkin.lan: every endpoint (incl. /api/health
|
||||||
# and /login) returns 401 behind Traefik basic-auth.
|
# and /login) returns 401 behind Traefik basic-auth.
|
||||||
@@ -907,11 +915,14 @@ data:
|
|||||||
# for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min
|
# for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min
|
||||||
# of idle and SNMP times out, so 5m for: would page nightly. A
|
# of idle and SNMP times out, so 5m for: would page nightly. A
|
||||||
# genuine printer outage (jam, disconnected) lasts well over 30m.
|
# genuine printer outage (jam, disconnected) lasts well over 30m.
|
||||||
|
# Use a range-window expression: instant up{} can go stale/absent
|
||||||
|
# after repeated snmp-exporter 500s.
|
||||||
- alert: EpsonPrinterDown
|
- alert: EpsonPrinterDown
|
||||||
expr: up{job="snmp-printer"} == 0
|
# hour() yields a label-less vector while the up{} side carries job/instance
# labels, so the time-of-day gate must be joined with `and on()`; a plain
# `and` requires identical label sets and would never match.
expr: (max_over_time(up{job="snmp-printer"}[35m]) == bool 0) == 1 and on() (hour() >= 13 or hour() < 1)
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
alert_channel: irc
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)"
|
summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)"
|
||||||
|
|
||||||
@@ -1020,7 +1031,9 @@ data:
|
|||||||
- name: kubernetes-state
|
- name: kubernetes-state
|
||||||
rules:
|
rules:
|
||||||
- alert: KubeContainerRestartingFrequently
|
- alert: KubeContainerRestartingFrequently
|
||||||
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
|
# Exclude github-runner: ephemeral runners register, run one job,
|
||||||
|
# exit cleanly, then restart by design.
|
||||||
|
expr: increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[1h]) > 5
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -1029,7 +1042,9 @@ data:
|
|||||||
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
|
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
|
||||||
|
|
||||||
- alert: KubeContainerCrashLooping
|
- alert: KubeContainerCrashLooping
|
||||||
expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
|
# Exclude github-runner: ephemeral runners register, run one job,
|
||||||
|
# exit cleanly, then restart by design.
|
||||||
|
expr: increase(kube_pod_container_status_restarts_total{namespace!="github-runner"}[15m]) > 3
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@@ -1057,7 +1072,8 @@ data:
|
|||||||
description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
|
description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
|
||||||
|
|
||||||
- alert: KubeDeploymentReplicasMismatch
|
- alert: KubeDeploymentReplicasMismatch
|
||||||
expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
|
# Exclude github-runner: ephemeral runner deployments flap 0/1 between jobs by design.
|
||||||
|
expr: kube_deployment_spec_replicas{namespace!="github-runner"} != kube_deployment_status_replicas_available{namespace!="github-runner"}
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -3636,6 +3652,38 @@ data:
|
|||||||
relativeTimeRange: {from: 120, to: 0}
|
relativeTimeRange: {from: 120, to: 0}
|
||||||
datasourceUid: __expr__
|
datasourceUid: __expr__
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
||||||
|
- orgId: 1
|
||||||
|
name: SNMP Devices
|
||||||
|
folder: Infrastructure Alerts
|
||||||
|
interval: 1m
|
||||||
|
rules:
|
||||||
|
- uid: epson-printer-down-stale-window
|
||||||
|
title: EpsonPrinterDown
|
||||||
|
condition: C
|
||||||
|
for: 30m
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: OK
|
||||||
|
annotations:
|
||||||
|
summary: Epson ET-3750 SNMP unreachable
|
||||||
|
description: The Epson ET-3750 snmp-printer target has reported only failed scrapes for at least 35 minutes.
|
||||||
|
runbook: "1. Check if printer is intentionally powered off 2. If printing needed: press power button on printer 3. Ping 10.0.58.107 after wake-up 4. Check WiFi on printer LCD if still unreachable"
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
service: printer
|
||||||
|
alert_channel: irc
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange: {from: 2100, to: 0}
|
||||||
|
datasourceUid: prometheus
|
||||||
|
# `and on()` is required here: hour() carries no labels, so a plain `and`
# never matches the labelled up{} series and the query would always be empty.
model: {expr: '(max_over_time(up{job="snmp-printer"}[35m]) == bool 0) == 1 and on() (hour() >= 13 or hour() < 1)', instant: true, refId: A}
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange: {from: 2100, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 2100, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
name: CI Runners
|
name: CI Runners
|
||||||
folder: CI Alerts
|
folder: CI Alerts
|
||||||
|
|||||||
124
tests/bluejay-infra-lint/MonitoringCoverageLintTests.cs
Normal file
124
tests/bluejay-infra-lint/MonitoringCoverageLintTests.cs
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
using FluentAssertions;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
using Xunit;
|
||||||
|
|
||||||
|
namespace BluejayInfraLint.Tests;
|
||||||
|
|
||||||
|
/// <summary>
/// Lint tests over <c>apps/monitoring/noc-monitoring.yaml</c> (the monitoring mirror).
/// They read the manifest as plain text and assert that specific scrape targets,
/// probe relabels, alert expressions and Grafana rule fields are present.
/// </summary>
[Trait("Category", "Unit")]
public sealed class MonitoringCoverageLintTests
{
    /// <summary>UID of the Grafana-managed Epson printer rule in the mirror.</summary>
    private const string EpsonGrafanaRuleUid = "epson-printer-down-stale-window";

    /// <summary>Range-window fragment the EpsonPrinterDown Prometheus rule must use.</summary>
    private const string EpsonRangeWindowExpression = "max_over_time(up{job=\"snmp-printer\"}[35m]) == bool 0";

    private static readonly ManifestInventory Inventory = ManifestInventory.Load();

    /// <summary>Probe URLs that must appear somewhere in the monitoring mirror.</summary>
    private static readonly string[] Sprint57ProbeTargets =
    {
        "https://dns.iamworkin.lan/",
        "https://flowercore.iamworkin.lan/healthz",
        "https://replay.iamworkin.lan/healthz",
        "https://signalcontrol.iamworkin.lan/health",
        "https://updatecenter-internal.iamworkin.lan/api/v1/manifests/_schema",
        "https://updates.iamworkin.lan/api/v1/manifests/_schema",
        "https://worldbuilder.iamworkin.lan/healthz",
    };

    /// <summary>
    /// The mirror must reference 10.0.58.113 on port 5200 and must not reference port 5100.
    /// </summary>
    [Fact]
    public void PrometheusScrape_MustNotTargetDeadPiManagerPort()
    {
        var monitoring = ReadMonitoringMirror();

        monitoring.Should().NotContain("10.0.58.113:5100");
        monitoring.Should().Contain("10.0.58.113:5200");
    }

    /// <summary>
    /// Every <c>probe-*</c> job in the mirror must relabel to the in-cluster blackbox
    /// service. When <c>FLOWERCORE_NOTES_ROOT</c> is set, the probe jobs in that
    /// checkout's <c>prometheus.yml</c> must relabel to <c>localhost:9115</c> instead.
    /// </summary>
    [Fact]
    public void ProbeJobs_MustKeepEnvironmentSpecificBlackboxRelabels()
    {
        var monitoring = ReadMonitoringMirror();
        var probeJobs = FindProbeJobs(monitoring);

        probeJobs.Should().NotBeEmpty();
        probeJobs.Should().OnlyContain(
            job => job.Contains("replacement: blackbox-exporter.monitoring.svc:9115", StringComparison.Ordinal),
            "the bluejay-infra mirror runs Prometheus in-cluster and should use the blackbox service DNS");

        // Optional cross-check: only runs when the notes checkout location is configured.
        var livePodmanPrometheus = TryReadNotesMonitoringFile("prometheus.yml");
        if (livePodmanPrometheus is not null)
        {
            FindProbeJobs(livePodmanPrometheus).Should().OnlyContain(
                job => job.Contains("replacement: localhost:9115", StringComparison.Ordinal),
                "live Podman monitoring uses host networking, so blackbox probes must relabel to localhost:9115");
        }
    }

    /// <summary>
    /// Every entry of <see cref="Sprint57ProbeTargets"/> must appear in the mirror.
    /// All missing targets are reported in a single failure rather than one at a time.
    /// </summary>
    [Fact]
    public void TraefikServiceProbes_MustCoverSprint57LiveFlowerCoreHosts()
    {
        var monitoring = ReadMonitoringMirror();

        var missingTargets = Sprint57ProbeTargets
            .Where(target => !monitoring.Contains(target, StringComparison.Ordinal))
            .ToList();

        missingTargets.Should().BeEmpty(
            "every Sprint 57 probe target must be listed in noc-monitoring.yaml");
    }

    /// <summary>
    /// The EpsonPrinterDown Prometheus rule itself must use the 35m range-window
    /// expression, and the old instant <c>up == 0</c> expression must be gone.
    /// </summary>
    [Fact]
    public void EpsonPrinterDown_MustUseRangeWindowForStaleScrapeCoverage()
    {
        var alerts = ReadMonitoringMirror();

        // Scope to the alert's own block: the Grafana rule embeds the same expression,
        // so a whole-file search would pass even if the Prometheus rule lost it.
        GetAlertBlock(alerts, "EpsonPrinterDown")
            .Should()
            .Contain(EpsonRangeWindowExpression);
        alerts.Should().NotContain("expr: up{job=\"snmp-printer\"} == 0");
    }

    /// <summary>
    /// The Kubernetes restart/replica alerts must exclude the <c>github-runner</c>
    /// namespace, and the Grafana Epson rule must exist with the expected title and
    /// IRC channel label.
    /// </summary>
    [Fact]
    public void MonitoringMirror_MustCarryRunnerExclusionsAndEpsonGrafanaDelivery()
    {
        var mirror = ReadMonitoringMirror();

        GetAlertBlock(mirror, "KubeContainerRestartingFrequently")
            .Should()
            .Contain("kube_pod_container_status_restarts_total{namespace!=\"github-runner\"}[1h]");
        GetAlertBlock(mirror, "KubeContainerCrashLooping")
            .Should()
            .Contain("kube_pod_container_status_restarts_total{namespace!=\"github-runner\"}[15m]");
        GetAlertBlock(mirror, "KubeDeploymentReplicasMismatch")
            .Should()
            .Contain("kube_deployment_spec_replicas{namespace!=\"github-runner\"} != kube_deployment_status_replicas_available{namespace!=\"github-runner\"}");

        // Check title and channel inside the Grafana rule's own block; a whole-file
        // search for "alert_channel: irc" is satisfied by unrelated rules.
        var epsonGrafanaRule = GetGrafanaRuleBlock(mirror, EpsonGrafanaRuleUid);
        epsonGrafanaRule.Should().Contain("title: EpsonPrinterDown");
        epsonGrafanaRule.Should().Contain("alert_channel: irc");
    }

    /// <summary>Reads <c>apps/monitoring/noc-monitoring.yaml</c> under the inventory's Bluejay root.</summary>
    private static string ReadMonitoringMirror() =>
        File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml"));

    /// <summary>
    /// Reads <c>scripts/monitoring/&lt;fileName&gt;</c> under the directory named by the
    /// <c>FLOWERCORE_NOTES_ROOT</c> environment variable.
    /// </summary>
    /// <param name="fileName">File name inside <c>scripts/monitoring</c>.</param>
    /// <returns>
    /// The file contents, or <see langword="null"/> when the environment variable is
    /// unset or blank. If the variable is set but the file cannot be read, the
    /// exception from <see cref="File.ReadAllText(string)"/> propagates.
    /// </returns>
    private static string? TryReadNotesMonitoringFile(string fileName)
    {
        var overrideRoot = Environment.GetEnvironmentVariable("FLOWERCORE_NOTES_ROOT");
        if (string.IsNullOrWhiteSpace(overrideRoot))
        {
            return null;
        }

        var path = Path.Combine(overrideRoot, "scripts", "monitoring", fileName);
        return File.ReadAllText(path);
    }

    /// <summary>
    /// Extracts every scrape job whose quoted name starts with <c>probe-</c>.
    /// </summary>
    /// <param name="yaml">Raw YAML text to scan.</param>
    /// <returns>
    /// One string per job, running from its indented <c>- job_name:</c> line up to
    /// (not including) the next indented <c>- job_name:</c> line or the end of input.
    /// </returns>
    private static IReadOnlyList<string> FindProbeJobs(string yaml) =>
        Regex.Matches(
                yaml,
                "(?ms)^\\s+- job_name: \"probe-[^\"]+\".*?(?=^\\s+- job_name:|\\z)")
            .Cast<Match>()
            .Select(match => match.Value)
            .ToList();

    /// <summary>
    /// Returns the text of the first alert rule named <paramref name="alertName"/>,
    /// from its indented <c>- alert:</c> line up to the next indented <c>- alert:</c>
    /// line or the end of input. Fails the test if no such rule exists.
    /// </summary>
    /// <param name="yaml">Raw YAML text to scan.</param>
    /// <param name="alertName">Exact alert name; matched up to end of line.</param>
    private static string GetAlertBlock(string yaml, string alertName)
    {
        var match = Regex.Match(
            yaml,
            $"(?ms)^\\s+- alert: {Regex.Escape(alertName)}\\s*$.*?(?=^\\s+- alert:|\\z)");

        match.Success.Should().BeTrue($"alert {alertName} should be present in noc-monitoring.yaml");
        return match.Value;
    }

    /// <summary>
    /// Returns the text of the Grafana rule with the given <paramref name="ruleUid"/>,
    /// from its indented <c>- uid:</c> line up to the next indented <c>- uid:</c> or
    /// <c>- orgId:</c> line, or the end of input. Fails the test if no such rule exists.
    /// </summary>
    /// <param name="yaml">Raw YAML text to scan.</param>
    /// <param name="ruleUid">Exact rule UID; matched up to end of line.</param>
    private static string GetGrafanaRuleBlock(string yaml, string ruleUid)
    {
        var match = Regex.Match(
            yaml,
            $"(?ms)^\\s+- uid: {Regex.Escape(ruleUid)}\\s*$.*?(?=^\\s+- (?:uid|orgId):|\\z)");

        match.Success.Should().BeTrue($"Grafana rule {ruleUid} should be present in noc-monitoring.yaml");
        return match.Value;
    }
}
|
||||||
Reference in New Issue
Block a user