Compare commits
245 Commits
f8eb946704
...
claude/ci1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
84c9feb893 | ||
|
|
427dbfcef2 | ||
|
|
b651a4e2d0 | ||
|
|
b998f50f48 | ||
|
|
8fd9ae1cd3 | ||
|
|
fc2aca0e9e | ||
|
|
ba18c52130 | ||
|
|
9f6dc1a9d5 | ||
|
|
0bf47dfa33 | ||
|
|
87a7d7c70a | ||
|
|
1c4145a581 | ||
|
|
c50a403f74 | ||
|
|
fb7bd10528 | ||
|
|
6c21d14a98 | ||
|
|
b3529f8e96 | ||
|
|
00c11b4eaa | ||
|
|
04881f46f0 | ||
|
|
c0038e4859 | ||
|
|
dee48831c6 | ||
|
|
0f1dc5f871 | ||
|
|
11c5f6e6cc | ||
|
|
d637fe9b30 | ||
|
|
5bfe41beca | ||
|
|
df22774674 | ||
|
|
c4065b15a3 | ||
|
|
a4aa612373 | ||
|
|
c2eb37dee9 | ||
|
|
bf6f542569 | ||
|
|
e150b2102f | ||
|
|
33a765b0bc | ||
|
|
5484ed7db6 | ||
|
|
2aa84349ea | ||
|
|
851f8e673b | ||
|
|
f78f8c8192 | ||
|
|
9b255fefc1 | ||
|
|
6a89a76e39 | ||
|
|
2489464d4f | ||
|
|
4b777b16ac | ||
|
|
8c60e3a4d3 | ||
|
|
df02b4c3c3 | ||
|
|
c0dceafffd | ||
|
|
490db8f9e6 | ||
|
|
1926bdaf3b | ||
|
|
ca8d062826 | ||
|
|
1889462fc4 | ||
|
|
523ba61232 | ||
|
|
53f67c8713 | ||
|
|
6b9cf3d12c | ||
|
|
0b52093b36 | ||
|
|
7a9098d3bd | ||
|
|
57d7ba46a7 | ||
|
|
9ec2e2d52e | ||
|
|
b4d62a8a50 | ||
|
|
fbbc07023b | ||
|
|
4b0eef0fb0 | ||
|
|
bb09a3786f | ||
|
|
006dbcf671 | ||
|
|
1be71d6ba7 | ||
|
|
0c8026c912 | ||
|
|
621ae47e00 | ||
|
|
ae6b8c0142 | ||
|
|
da55220218 | ||
|
|
b1ad253dd6 | ||
|
|
ee935f6e07 | ||
|
|
2853ee2024 | ||
|
|
b4a34e16ca | ||
|
|
0d5a1fd530 | ||
|
|
1b633f57b2 | ||
|
|
ee8afd0a08 | ||
|
|
cf35884eae | ||
|
|
9881767b11 | ||
|
|
c9bf23834b | ||
|
|
174002023d | ||
|
|
b71f9e4ec9 | ||
|
|
f1431f7324 | ||
|
|
35bd055cb4 | ||
|
|
f604ab419e | ||
|
|
b2786252b0 | ||
|
|
45ee40920d | ||
|
|
8ad7eb714b | ||
|
|
3cb44c3104 | ||
|
|
2400329acd | ||
|
|
c17af882cc | ||
|
|
76b1938afa | ||
|
|
ced04a6148 | ||
|
|
f2258b92a2 | ||
|
|
979a7c7b25 | ||
|
|
0df8f7b936 | ||
|
|
38558641c1 | ||
|
|
63d905b4df | ||
|
|
d95f4e0caf | ||
|
|
7bc565d17e | ||
|
|
dfe9c3b67e | ||
|
|
37f8db89e4 | ||
|
|
00c7d8df24 | ||
|
|
c6811eadd8 | ||
|
|
4d9d537d83 | ||
|
|
0f9d56ee16 | ||
|
|
3bf6511d5d | ||
|
|
3e0b9055b0 | ||
|
|
c828832808 | ||
|
|
e2c71c2b8a | ||
|
|
b3028f5119 | ||
|
|
05a273d3a6 | ||
|
|
ab6ade4e46 | ||
|
|
4848f72eec | ||
|
|
f5eafc5def | ||
|
|
2d3fd74bab | ||
|
|
df4e1f78b0 | ||
|
|
2a10b775a8 | ||
|
|
447ddd339d | ||
|
|
7833143c1c | ||
|
|
8ed77c4627 | ||
|
|
437f346aee | ||
|
|
bc32b5ef04 | ||
|
|
263d06acb9 | ||
|
|
25dbb2967f | ||
|
|
a89a774eaf | ||
|
|
dc39747f3f | ||
|
|
87050e72a9 | ||
|
|
e8c5d2afd2 | ||
|
|
eef492125f | ||
|
|
b51ee35bfa | ||
|
|
4abc2fa95d | ||
|
|
d7628a6945 | ||
|
|
df115e4d1e | ||
|
|
9df26620b8 | ||
|
|
08aa7a5bff | ||
|
|
38e20a8b64 | ||
|
|
c945d44b9e | ||
|
|
1f1354f634 | ||
|
|
76ece92cfd | ||
|
|
a760a58846 | ||
|
|
9fb526c7c5 | ||
|
|
dd7980642e | ||
|
|
1d4ad64226 | ||
|
|
774f82c431 | ||
|
|
d2cc36ea0e | ||
|
|
299070e4bf | ||
|
|
a9debd8668 | ||
|
|
675b9da4f9 | ||
|
|
2b471a55b0 | ||
|
|
37ce0aed85 | ||
|
|
a37fc83584 | ||
|
|
3a8aae9e2d | ||
|
|
020a806d08 | ||
|
|
e65de2938b | ||
|
|
5c0c21790e | ||
|
|
292528ec15 | ||
|
|
bb39a0c1fd | ||
|
|
c23e903ba7 | ||
|
|
cae03296f5 | ||
|
|
3c5c1a07bd | ||
|
|
057595de3d | ||
|
|
b02bb4be38 | ||
|
|
e44e9a0062 | ||
|
|
297a2a9bbc | ||
|
|
d4210c819f | ||
|
|
fc0b67f670 | ||
|
|
223e9a9232 | ||
|
|
6c1375b21a | ||
|
|
82529ed9b5 | ||
|
|
3ea8a56dab | ||
|
|
9272abc225 | ||
|
|
436185818d | ||
|
|
c3cc404beb | ||
|
|
90627819cc | ||
|
|
c97d486a3d | ||
|
|
209bdc16cd | ||
|
|
3999634b06 | ||
|
|
61538d3712 | ||
|
|
ccaac367af | ||
|
|
407d473b71 | ||
|
|
f9593e494a | ||
|
|
5b6c7b97fc | ||
|
|
a76eeb5c39 | ||
|
|
8a960ffc73 | ||
|
|
686dbacc66 | ||
|
|
5ccf055465 | ||
|
|
4da60820c6 | ||
|
|
1cc4324cfb | ||
|
|
bfc755057e | ||
|
|
d6008ee205 | ||
|
|
39fe6f1dba | ||
|
|
90fcf0cd5d | ||
|
|
ffef5c9126 | ||
|
|
634e90a9ee | ||
|
|
86ccca18e3 | ||
|
|
1c5caf3f40 | ||
|
|
d3db19b0ca | ||
|
|
702a6e4f52 | ||
|
|
6cbb5d8792 | ||
|
|
62db15c69c | ||
|
|
84634f59f0 | ||
|
|
4cd5806fd0 | ||
|
|
11c48bef30 | ||
|
|
a86e87050b | ||
|
|
0214f94ac4 | ||
|
|
a1b8eb379d | ||
|
|
9a1665907c | ||
|
|
899804215a | ||
|
|
1dc66738e6 | ||
|
|
5623a272c5 | ||
|
|
3d3f91160b | ||
|
|
93f77c1844 | ||
|
|
59efc460fd | ||
|
|
02959f1ac6 | ||
|
|
a3aa84bdae | ||
|
|
01cb9a557f | ||
|
|
0fa46ad53b | ||
|
|
1ded5a61c0 | ||
|
|
3c1d212251 | ||
|
|
c0547a9964 | ||
|
|
973c1dae72 | ||
|
|
475737b36f | ||
|
|
3bb3801fbd | ||
|
|
28b76001a8 | ||
|
|
0c67fa5356 | ||
|
|
62e342cfb2 | ||
|
|
90deacd154 | ||
|
|
f0733ff89d | ||
|
|
313bdcb21a | ||
|
|
5f4818bd96 | ||
|
|
fff998dab5 | ||
|
|
20e4130c74 | ||
|
|
3cf675b8c3 | ||
|
|
2a9f2e4540 | ||
|
|
b15a35a258 | ||
|
|
3f4985ee13 | ||
|
|
e535a8d34b | ||
|
|
6ddbd2cae5 | ||
|
|
e9608651f7 | ||
|
|
abdb7a806e | ||
|
|
7afb5043c4 | ||
|
|
1883953cb8 | ||
|
|
9c555db083 | ||
|
|
cb349c6764 | ||
|
|
3888c4c3e0 | ||
|
|
7aec403e96 | ||
|
|
5685ab0550 | ||
|
|
d4d3455ef2 | ||
|
|
29d557003f | ||
|
|
719aa8c1c6 | ||
|
|
63cf5193ef | ||
|
|
ef0e1f2505 |
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
# .NET build outputs (lint test project)
|
||||
**/bin/
|
||||
**/obj/
|
||||
|
||||
# Editor / temp
|
||||
.DS_Store
|
||||
*.swp
|
||||
128
README.md
128
README.md
@@ -1,3 +1,125 @@
|
||||
# bluejay-infra
|
||||
|
||||
Infrastructure manifests for ArgoCD
|
||||
# bluejay-infra
|
||||
|
||||
Infrastructure manifests for ArgoCD. An `ApplicationSet` in the `argocd` namespace watches the `apps/*` directories in this repo and creates one `Application` per subdirectory (prefixed `infra-<name>`).
|
||||
|
||||
## Adding a new service to the cluster
|
||||
|
||||
Follow these steps in order. **Step 1 must run before step 3** — if you skip it, cert-manager HTTP-01 will silently fail for ~2h per cert (exponential backoff) until someone diagnoses the DNS.
|
||||
|
||||
### 1. Create or verify the FlowerCore.DNS A record (REQUIRED for current HTTP-01 manifests)
|
||||
|
||||
step-ca (the ACME CA on noc1) runs in a Podman container with host networking. Its container resolver uses pfSense Unbound (10.0.56.1), **not** cluster CoreDNS. So even though CoreDNS has a wildcard `*.iamworkin.lan → 10.0.56.200` for in-cluster lookups, step-ca cannot see it. Every new public hostname needs an explicit pfSense host override.
|
||||
|
||||
The management path is now `FlowerCore.DNS`, not `FlowerCore.Notes/scripts/pfsense-add-dns-overrides.py`. Add or verify the public A record there before you apply the manifest:
|
||||
|
||||
```bash
|
||||
curl -sk https://dns.iamworkin.lan/api/v1/servers
|
||||
# Find the pfSense serverId, then create the record using the host label only.
|
||||
# Example: for foo.iamworkin.lan, use "name":"foo".
|
||||
|
||||
curl -sk -X POST https://dns.iamworkin.lan/api/v1/servers/<serverId>/zones/iamworkin.lan/records \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"name":"<yourservice>","type":"A","data":"10.0.56.200","ttl":300}'
|
||||
```
|
||||
|
||||
Verify all referenced iamworkin.lan hosts resolve (run from anywhere on LAN):
|
||||
|
||||
```bash
|
||||
python scripts/check-pfsense-dns.py
|
||||
# Historical filename retained. The script now calls
|
||||
# https://dns.iamworkin.lan/api/v1/zones/iamworkin.lan/resolve-preflight
|
||||
# for every Certificate dnsName and Traefik Host(...) rule it finds.
|
||||
|
||||
python scripts/check-pfsense-dns.py --live
|
||||
# Optional stronger pass when kubectl access is available; also checks
|
||||
# live-cluster Certificates and IngressRoutes for drift outside manifests.
|
||||
```
|
||||
|
||||
**Symptom if you skip this:** the Certificate resource stays `Ready: False` with `status.reason: unexpected non-ACME API error: context deadline exceeded`. Recovery requires `kubectl -n <ns> delete order <order-name>` after adding the DNS to bypass cert-manager's backoff.
|
||||
|
||||
### 2. Create the app manifest
|
||||
|
||||
Create `apps/<name>/<name>.yaml` containing the Namespace, Deployment, Service, Certificate, and IngressRoute. Reference an existing directory (e.g. `apps/fc-messageboard/`) for the canonical shape.
|
||||
|
||||
Conventions:
|
||||
|
||||
- `Namespace` has label `app.kubernetes.io/part-of: bluejay-infra`
|
||||
- `Deployment.spec.selector.matchLabels` and `Service.spec.selector` MUST use the same label key. The historical convention here is `app: <name>` (not `app.kubernetes.io/name`) — don't mix.
|
||||
- Image: `localhost/<name>:v<YYYYMMDD><HHMM>`, `imagePullPolicy: Never`. Import the image to every RKE2 node (server + both agents) via `ctr images import` before applying — pods schedule anywhere.
|
||||
- If the app persists local state (SQLite, uploads), declare the `PersistentVolumeClaim` here with `storageClassName: longhorn` and `accessModes: [ReadWriteOnce]`. Add `strategy.type: Recreate` to the Deployment — RWO PVC blocks rolling updates.
|
||||
- Probes: use `tcpSocket` if the app has middleware that intercepts unauth requests (returns 404/401 for `/health`). Otherwise prefer `httpGet` against whatever the app exposes (verify the path isn't gated by auth).
|
||||
- Certificate: `issuerRef.name: step-ca-acme`, `issuerRef.kind: ClusterIssuer`. `dnsNames` must match the hostname you created in FlowerCore.DNS in step 1.
|
||||
|
||||
### 3. Commit & push
|
||||
|
||||
```bash
|
||||
git add apps/<name>/
|
||||
git commit -m "<name>: initial deployment"
|
||||
git push
|
||||
```
|
||||
|
||||
ArgoCD's `ApplicationSet` picks up the new directory within ~3 minutes and creates `infra-<name>` with auto-sync + self-heal enabled.
|
||||
|
||||
### 4. Verify
|
||||
|
||||
```bash
|
||||
# From noc1
|
||||
fcadmin_ssh noc1 '
|
||||
kubectl -n argocd get application infra-<name>
|
||||
kubectl -n <ns> get certificate,pod
|
||||
curl -sk -m 8 -o /dev/null -w "HTTP %{http_code}\n" https://<name>.iamworkin.lan/
|
||||
'
|
||||
```
|
||||
|
||||
Certificate should be `Ready: True` within ~60s. If it stays at `Ready: False` for >2m, the DNS step (the pfSense host override managed through FlowerCore.DNS) got skipped — go back to step 1, then run `kubectl -n <ns> delete order <order-name>` to bust the backoff.
|
||||
|
||||
### Pre-merge gate
|
||||
|
||||
Before `git push`, always run:
|
||||
|
||||
```bash
|
||||
python scripts/check-pfsense-dns.py
|
||||
```
|
||||
|
||||
It's a quick service-backed check that would have caught the entire 2026-04-22 cert-manager outage. Consider wiring it into a pre-commit hook or a Gitea Actions workflow.
|
||||
|
||||
## Retiring a service
|
||||
|
||||
1. `kubectl -n argocd delete application infra-<name>` (cascade deletes the K8s resources via ArgoCD finalizers)
|
||||
2. `git rm -r apps/<name>/` and push
|
||||
3. Remove the FlowerCore.DNS record through the UI or API, for example:
|
||||
|
||||
```bash
|
||||
curl -sk https://dns.iamworkin.lan/api/v1/servers
|
||||
curl -sk -X DELETE https://dns.iamworkin.lan/api/v1/servers/<serverId>/zones/iamworkin.lan/records/<yourservice>
|
||||
```
|
||||
|
||||
## Known gotchas
|
||||
|
||||
- **CoreDNS template + ndots:5 collision**: inside pods, `<svc>.<ns>.svc.cluster.local` with <5 dots gets search-expanded through `iamworkin.lan` FIRST and hits the wildcard template → resolves to Traefik VIP, not the real ClusterIP. Use short service names (`<svc>`) in K8s manifests. See memory `feedback_coredns_ndots_template_collision.md`.
|
||||
- **Image not on node**: pods stuck in `ErrImageNeverPull` mean the image wasn't imported to the node Kubernetes scheduled the pod onto. Run `ctr images import` on all of rke2-server, rke2-agent1, and rke2-agent2.
|
||||
- **StatefulSet PVC drift**: `volumeClaimTemplates` needs explicit `volumeMode: Filesystem` or ArgoCD SSA self-heals forever. See memory `feedback_argocd_statefulset_pvc_drift.md`.
|
||||
- **IngressRoute namespace split**: this RKE2 Traefik install does not allow cross-namespace service refs. Keep the `IngressRoute`, backend `Service`, and TLS secret in the same namespace; if one host is shared across namespaces, duplicate the `Certificate` and move the route next to the destination service.
|
||||
- **Public read-only hosts**: if a public host fronts a service that also exposes admin writes internally, add a Traefik route match like `Host(...) && (Method(GET) || Method(HEAD))` on the public edge instead of trusting the app to reject unsafe methods.
|
||||
- **Public read-write allowlist hosts**: if a public host accepts a tightly bounded write surface (e.g. bootstrap-JWT POST), pin the allowlist as `(Method(GET) || Method(HEAD) || Method(POST) || Method(OPTIONS))`. PUT/PATCH/DELETE must still 404 at the route. Track A's `updatecenter.iamworkin.lan` / `updates.iamworkin.lan` are the canonical example. The lint test enforces this invariant.
|
||||
- **Traefik VIP netpols**: when a `NetworkPolicy` allows `10.0.56.200`, also allow the post-DNAT backend ports (`8443` for TLS plus `8080` or `8000` for HTTP) or Calico will drop the rewritten flow.
|
||||
- **Auth-safe probes**: services behind API-key or global auth middleware should prefer `tcpSocket` probes unless `/health` is explicitly exempted before the middleware runs.
|
||||
- **ArgoCD must use internal Gitea URL**: `http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git`, not the external HTTPS URL (step-ca cert isn't trusted by ArgoCD). The `ApplicationSet` and any hand-created `Application` must both use the internal URL.
|
||||
|
||||
## Local manifest lint
|
||||
|
||||
The repo now carries a local-first lint pass for the recurring K8s gotchas that have burned the fleet:
|
||||
|
||||
```bash
|
||||
dotnet test tests/bluejay-infra-lint/BluejayInfraLint.Tests.csproj -c Release
|
||||
```
|
||||
|
||||
That test project sweeps `bluejay-infra/apps/**` plus the canonical sibling `FlowerCore.*/k8s` manifests that share the same workspace. Matching `conftest.dev` policy files live under `tests/bluejay-infra-lint/conftest.dev/` for environments that also have `conftest` or `opa`.
|
||||
|
||||
## References
|
||||
|
||||
- Cert-manager recovery playbook: `FlowerCore.Notes/memory/project_cert_manager_recovery_2026_04_22.md`
|
||||
- Why pfSense DNS is required: `FlowerCore.Notes/memory/feedback_pfsense_dns_required_for_acme.md`
|
||||
- Public DNS operator host: `https://dns.iamworkin.lan`
|
||||
- Canonical credential helper: `FlowerCore.Notes/scripts/credential-helper.sh`
|
||||
- pfSense admin automation: `FlowerCore.Notes/memory/feedback_pfsense_automation.md`
|
||||
|
||||
@@ -2,14 +2,15 @@
|
||||
# Agent Zero AI Stack — NUC Deployment (RKE2 Bare-Metal)
|
||||
# =============================================================================
|
||||
# Deploys: AgentZero (agent UI) on RKE2 cluster with Blue Jay profile
|
||||
# Ollama: workstation-first via BLUEJAY-WS (10.0.56.20:11434) with edge1 Pi 5
|
||||
# fallback (10.0.57.17:11434)
|
||||
# Ollama: edge1 Pi 5 + AI HAT+ ONLY (10.0.57.17:11434).
|
||||
# Workstation Ollama (BLUEJAY-WS) is intentionally NOT in the upstream —
|
||||
# the workstation is private dev hardware, not a cluster dependency.
|
||||
# Target: RKE2 bare-metal cluster, namespace: agent-zero
|
||||
# Profile: Blue Jay (21 tools, 3 prompts, 4 extensions, theme)
|
||||
#
|
||||
# Differences from LOCAL (WSL K3s):
|
||||
# - Uses Longhorn StorageClass (not local-path)
|
||||
# - Prefers workstation Ollama on the R9700, falls back to edge1 Pi 5
|
||||
# - Cluster-only Ollama path (edge1) — keeps workstation private
|
||||
# - NO Anthropic API key (free/local models only)
|
||||
# - NO Piper TTS or Kiwix (edge1 handles TTS, no Wikipedia needed)
|
||||
# - NO hostPath volumes — profile/tools/extensions loaded via ConfigMaps
|
||||
@@ -91,8 +92,52 @@ subjects:
|
||||
# =============================================================================
|
||||
# Agent Zero — AI Agent Web UI (NUC Edition, Blue Jay Profile)
|
||||
# =============================================================================
|
||||
# Connects to a local proxy that routes to workstation Ollama first and edge1 second
|
||||
# Blue Jay profile with 21 tools, 3 prompts, 4 extensions
|
||||
# Connects directly to fc-llm-bridge for chat + internal util/embed + browser.
|
||||
# Agent Zero's internal util/embed slots stay on the bridge's OpenAI-compatible
|
||||
# /v1 surface, while browser + corpus-search use the Ollama-compatible /api/*
|
||||
# surface through OLLAMA_HOST.
|
||||
# Blue Jay profile with 21 tools, 3 prompts, 4 extensions.
|
||||
|
||||
---
|
||||
# FC LLM Bridge API key for Agent Zero (ADR-088 chat/util/embed/browser routing).
|
||||
# Syncs from 1Password item "FC LLM Bridge API Keys" (field: agent-zero-k8s).
|
||||
# Consumed by chat, internal util/embed, browser, and corpus-search requests
|
||||
# that traverse fc-llm-bridge.
|
||||
apiVersion: onepassword.com/v1
|
||||
kind: OnePasswordItem
|
||||
metadata:
|
||||
name: fc-llm-bridge-api-keys
|
||||
namespace: agent-zero
|
||||
spec:
|
||||
itemPath: "vaults/IAmWorkin/items/FC LLM Bridge API Keys"
|
||||
|
||||
---
|
||||
# Print.Web API key for Agent Zero's print_web.py Python tool.
|
||||
# Syncs from 1Password item "Print.Web API Keys" (password field = API key).
|
||||
# The print_web.py tool reads PRINT_WEB_API_KEY env var for all HTTP requests
|
||||
# to the thermal print service (GET /api/mcp/tools, POST /api/print/*, etc.).
|
||||
# Note: Print.Web uses the legacy REST MCP shape (/api/mcp/tools/*), not the
|
||||
# streamable-http MCP protocol. The print_web Python tool bridges this gap
|
||||
# and is already present in bluejay-tools ConfigMaps.
|
||||
apiVersion: onepassword.com/v1
|
||||
kind: OnePasswordItem
|
||||
metadata:
|
||||
name: print-web-api-keys
|
||||
namespace: agent-zero
|
||||
spec:
|
||||
itemPath: "vaults/IAmWorkin/items/Print.Web API Keys"
|
||||
|
||||
---
|
||||
# Knowledge MCP bearer token for the direct Agent Zero -> Knowledge.Web path.
|
||||
# The 1Password item currently stores the raw token in its concealed PASSWORD
|
||||
# field, which the operator syncs to Secret key `password`.
|
||||
apiVersion: onepassword.com/v1
|
||||
kind: OnePasswordItem
|
||||
metadata:
|
||||
name: knowledge-mcp-tokens
|
||||
namespace: agent-zero
|
||||
spec:
|
||||
itemPath: "vaults/IAmWorkin/items/FlowerCore Knowledge MCP Tokens"
|
||||
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
@@ -105,7 +150,7 @@ metadata:
|
||||
annotations:
|
||||
agent-zero/deployment: "nuc"
|
||||
agent-zero/profile: "bluejay"
|
||||
agent-zero/ollama: "BLUEJAY-WS primary (10.0.56.20:11434), edge1 fallback (10.0.57.17:11434)"
|
||||
agent-zero/ollama: "fc-llm-bridge fronts edge1 Pi 5 + AI HAT+ Ollama for cluster browser/corpus-search traffic; internal chat/util/embed route through the bridge's authenticated OpenAI surface"
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
@@ -120,19 +165,18 @@ spec:
|
||||
spec:
|
||||
serviceAccountName: agent-zero
|
||||
initContainers:
|
||||
# Wait for either workstation or edge1 Ollama to be reachable before starting Agent Zero.
|
||||
- name: wait-for-ollama
|
||||
# Wait for fc-llm-bridge to be reachable before starting Agent Zero.
|
||||
- name: wait-for-llm-bridge
|
||||
image: busybox:1.37
|
||||
command: ["sh", "-c"]
|
||||
args:
|
||||
- |
|
||||
echo "Waiting for Ollama at BLUEJAY-WS or edge1..."
|
||||
until wget -qO- --timeout=2 http://10.0.56.20:11434/api/tags >/dev/null 2>&1 || \
|
||||
wget -qO- --timeout=2 http://10.0.57.17:11434/api/tags >/dev/null 2>&1; do
|
||||
echo "No Ollama endpoint ready yet, retrying in 5s..."
|
||||
echo "Waiting for fc-llm-bridge..."
|
||||
until wget -qO- --timeout=2 http://fc-llm-bridge.fc-llm-bridge.svc:8080/healthz >/dev/null 2>&1; do
|
||||
echo "fc-llm-bridge not ready yet, retrying in 5s..."
|
||||
sleep 5
|
||||
done
|
||||
echo "At least one Ollama endpoint is reachable."
|
||||
echo "fc-llm-bridge is reachable."
|
||||
# Assemble the Blue Jay profile directory structure from ConfigMaps.
|
||||
# ConfigMaps can't create nested dirs, so we copy into the workspace PVC.
|
||||
- name: setup-bluejay
|
||||
@@ -179,50 +223,6 @@ spec:
|
||||
- name: bluejay-theme
|
||||
mountPath: /tmp/bluejay-theme
|
||||
containers:
|
||||
- name: ollama-proxy
|
||||
image: nginx:1.27-alpine
|
||||
command: ["/bin/sh", "-c"]
|
||||
args:
|
||||
- |
|
||||
cat > /etc/nginx/nginx.conf <<'NGINX'
|
||||
worker_processes 1;
|
||||
events { worker_connections 1024; }
|
||||
http {
|
||||
upstream ollama_upstream {
|
||||
server 10.0.56.20:11434 max_fails=2 fail_timeout=10s;
|
||||
server 10.0.57.17:11434 backup;
|
||||
keepalive 16;
|
||||
}
|
||||
server {
|
||||
listen 11434;
|
||||
location / {
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Connection "";
|
||||
proxy_set_header Host $host;
|
||||
proxy_connect_timeout 5s;
|
||||
proxy_read_timeout 600s;
|
||||
proxy_send_timeout 600s;
|
||||
proxy_next_upstream error timeout invalid_header http_502 http_503 http_504;
|
||||
proxy_pass http://ollama_upstream;
|
||||
}
|
||||
}
|
||||
}
|
||||
NGINX
|
||||
exec nginx -g 'daemon off;'
|
||||
ports:
|
||||
- containerPort: 11434
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /api/tags
|
||||
port: 11434
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 15
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /api/tags
|
||||
port: 11434
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 30
|
||||
- name: agent-zero
|
||||
image: agent0ai/agent-zero:latest
|
||||
command: ["/bin/bash", "-c"]
|
||||
@@ -239,15 +239,45 @@ spec:
|
||||
# Link Blue Jay profile from workspace into Agent Zero's expected path
|
||||
ln -sfn /a0/work/.bluejay/agents/bluejay /a0/agents/bluejay
|
||||
# Write model config BEFORE initialize.sh loads it
|
||||
# The _model_config plugin reads config.json (NOT config.yaml)
|
||||
# Default is OpenRouter; override to the local proxy, which prefers
|
||||
# the workstation and falls back to edge1 automatically.
|
||||
# The _model_config plugin reads config.json (NOT config.yaml).
|
||||
# chat_model: FlowerCore LLM Bridge (ADR-088) — OpenAI-compat,
|
||||
# spend-tracked, tier-aliased (fc:balanced → Claude Sonnet).
|
||||
# api_key comes from A0_SET_chat_model_api_key env var (overrides
|
||||
# config.json). Utility + embedding stay on the authenticated
|
||||
# OpenAI-compatible /v1 surface; browser and direct tool traffic
|
||||
# use the bridge's Ollama-compatible root via OLLAMA_HOST.
|
||||
mkdir -p /a0/usr/plugins/_model_config
|
||||
cat > /a0/usr/plugins/_model_config/config.json << 'MODELCFG'
|
||||
{"allow_chat_override":true,"chat_model":{"provider":"ollama","name":"gemma3:12b","api_base":"http://127.0.0.1:11434","ctx_length":8192,"ctx_history":0.7,"vision":false,"kwargs":{"temperature":0,"num_ctx":8192}},"utility_model":{"provider":"ollama","name":"qwen2.5:1.5b","api_base":"http://127.0.0.1:11434","ctx_length":8192,"ctx_input":0.7,"kwargs":{"num_ctx":8192}},"embedding_model":{"provider":"ollama","name":"nomic-embed-text","api_base":"http://127.0.0.1:11434","kwargs":{}}}
|
||||
{"allow_chat_override":true,"chat_model":{"provider":"openai","name":"fc:balanced","api_base":"http://fc-llm-bridge.fc-llm-bridge.svc:8080/v1","ctx_length":8192,"ctx_history":0.7,"vision":false,"kwargs":{"temperature":0,"num_ctx":8192}},"utility_model":{"provider":"openai","name":"fc:cheap","api_base":"http://fc-llm-bridge.fc-llm-bridge.svc:8080/v1","ctx_length":8192,"ctx_input":0.7,"kwargs":{"num_ctx":8192}},"embedding_model":{"provider":"openai","name":"openai/fc:embedding","api_base":"http://fc-llm-bridge.fc-llm-bridge.svc:8080/v1","kwargs":{}}}
|
||||
MODELCFG
|
||||
# Strip heredoc indentation
|
||||
sed -i 's/^ //' /a0/usr/plugins/_model_config/config.json
|
||||
# Phase 0 Chat MCP pilot: Agent Zero does not interpolate env vars
|
||||
# inside A0_SET_mcp_servers JSON, so build the final JSON here from
|
||||
# the secret-backed env vars before initialize.sh. Keep the local
|
||||
# corpus_search.py tool mounted either way so outage fallback
|
||||
# remains available even when fc_knowledge is not advertised.
|
||||
export KNOWLEDGE_MCP_ENABLED=false
|
||||
if [ -n "${KNOWLEDGE_MCP_BEARER_TOKEN:-}" ]; then
|
||||
if curl -sf --connect-timeout 3 "${KNOWLEDGE_MCP_HEALTH_URL}" > /dev/null && \
|
||||
curl -sf --connect-timeout 5 \
|
||||
-H "Authorization: Bearer ${KNOWLEDGE_MCP_BEARER_TOKEN}" \
|
||||
-H "Accept: application/json, text/event-stream" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"jsonrpc":"2.0","id":"fc-knowledge-bootstrap","method":"initialize","params":{"protocolVersion":"2025-03-26","capabilities":{},"clientInfo":{"name":"agent-zero-bootstrap","version":"1.0"}}}' \
|
||||
"${KNOWLEDGE_MCP_URL}" > /dev/null; then
|
||||
export KNOWLEDGE_MCP_ENABLED=true
|
||||
echo "fc_knowledge enabled from ${KNOWLEDGE_MCP_URL}."
|
||||
else
|
||||
echo "fc_knowledge unavailable or unauthorized; keeping local corpus_search.py as the fallback path."
|
||||
fi
|
||||
else
|
||||
echo "fc_knowledge token missing; keeping local corpus_search.py as the fallback path."
|
||||
fi
|
||||
|
||||
export A0_SET_mcp_servers="$(
|
||||
python3 -c 'import json, os; servers = {}; chat_key = os.getenv("CHAT_MCP_API_KEY"); knowledge_enabled = os.getenv("KNOWLEDGE_MCP_ENABLED", "false").lower() == "true"; token = os.getenv("KNOWLEDGE_MCP_BEARER_TOKEN", "") if knowledge_enabled else ""; chat_key and servers.setdefault("fc_chat", {"type": "streamable-http", "url": "http://chat-web.fc-chat.svc/mcp", "headers": {"X-Api-Key": chat_key}}); token and servers.setdefault("fc_knowledge", {"type": "streamable-http", "url": os.getenv("KNOWLEDGE_MCP_URL", "http://knowledge-web.knowledge.svc/mcp"), "headers": {"Authorization": f"Bearer {token}"}}); print(json.dumps({"mcpServers": servers}, separators=(",", ":")))'
|
||||
)"
|
||||
# Run the original entrypoint
|
||||
exec /exe/initialize.sh $BRANCH
|
||||
ports:
|
||||
@@ -256,42 +286,78 @@ spec:
|
||||
# Agent identity
|
||||
- name: AGENT_NAME
|
||||
value: "Blue Jay (NUC)"
|
||||
# Chat model — workstation primary, edge1 fallback via local proxy
|
||||
# Chat model — routed through FlowerCore LLM Bridge (ADR-088)
|
||||
# so spend is tracked and tier aliases (fc:cheap/fc:balanced/fc:deep)
|
||||
# dispatch to Ollama or Anthropic via a single OpenAI-compat endpoint.
|
||||
# Internal utility + embedding use the authenticated OpenAI surface,
|
||||
# while browser/corpus-search use the bridge's Ollama-compatible
|
||||
# endpoints so Agent Zero no longer needs a local proxy sidecar.
|
||||
- name: A0_SET_chat_model_provider
|
||||
value: "ollama"
|
||||
value: "openai"
|
||||
- name: A0_SET_chat_model_name
|
||||
value: "gemma3:12b"
|
||||
value: "fc:balanced"
|
||||
- name: A0_SET_chat_model_api_base
|
||||
value: "http://127.0.0.1:11434"
|
||||
value: "http://fc-llm-bridge.fc-llm-bridge.svc:8080/v1"
|
||||
- name: A0_SET_chat_model_api_key
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: fc-llm-bridge-api-keys
|
||||
key: agent-zero-k8s
|
||||
# Agent Zero's runtime still resolves provider keys from the
|
||||
# provider-level env names (models.get_api_key -> OPENAI_API_KEY /
|
||||
# API_KEY_OPENAI), not the slot-scoped A0_SET_* value alone.
|
||||
# Mirror the same secret here so real public chat runs can reach
|
||||
# the fc-llm-bridge chat_model path instead of failing before MCP.
|
||||
- name: OPENAI_API_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: fc-llm-bridge-api-keys
|
||||
key: agent-zero-k8s
|
||||
- name: FC_LLM_BRIDGE_API_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: fc-llm-bridge-api-keys
|
||||
key: agent-zero-k8s
|
||||
- name: A0_SET_chat_model_ctx_length
|
||||
value: "8192"
|
||||
- name: A0_SET_chat_model_kwargs
|
||||
value: '{"temperature": 0, "num_ctx": 8192}'
|
||||
# Utility model — fast small helper tier through the same proxy
|
||||
# Utility model — fast small helper tier through the OpenAI surface
|
||||
- name: A0_SET_util_model_provider
|
||||
value: "ollama"
|
||||
value: "openai"
|
||||
- name: A0_SET_util_model_name
|
||||
value: "qwen2.5:1.5b"
|
||||
value: "fc:cheap"
|
||||
- name: A0_SET_util_model_api_base
|
||||
value: "http://127.0.0.1:11434"
|
||||
value: "http://fc-llm-bridge.fc-llm-bridge.svc:8080/v1"
|
||||
- name: A0_SET_util_model_kwargs
|
||||
value: '{"num_ctx": 2048}'
|
||||
# Embedding model — nomic through the same proxy
|
||||
# Embedding model — authenticated bridge alias to nomic-embed-text.
|
||||
# LiteLLM's embedding() path needs an explicit provider prefix here
|
||||
# even though the chat slot can use bare fc:* aliases.
|
||||
- name: A0_SET_embed_model_provider
|
||||
value: "ollama"
|
||||
value: "openai"
|
||||
- name: A0_SET_embed_model_name
|
||||
value: "nomic-embed-text"
|
||||
value: "openai/fc:embedding"
|
||||
- name: A0_SET_embed_model_api_base
|
||||
value: "http://127.0.0.1:11434"
|
||||
value: "http://fc-llm-bridge.fc-llm-bridge.svc:8080/v1"
|
||||
# Browser model — small Gemma candidate through fc-llm-bridge's Ollama-compatible surface
|
||||
- name: A0_SET_browser_model_provider
|
||||
value: "ollama"
|
||||
- name: A0_SET_browser_model_name
|
||||
value: "gemma3:4b"
|
||||
- name: A0_SET_browser_model_api_base
|
||||
value: "http://127.0.0.1:11434"
|
||||
value: "http://fc-llm-bridge.fc-llm-bridge.svc:8080"
|
||||
- name: A0_SET_browser_model_api_key
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: fc-llm-bridge-api-keys
|
||||
key: agent-zero-k8s
|
||||
- name: A0_SET_browser_model_vision
|
||||
value: "true"
|
||||
- name: OLLAMA_HOST
|
||||
value: "http://fc-llm-bridge.fc-llm-bridge.svc:8080"
|
||||
- name: FLOWERCORE_AGENTZERO_OLLAMA_URL
|
||||
value: "http://fc-llm-bridge.fc-llm-bridge.svc:8080"
|
||||
# Agent profile — Blue Jay personality, tools, and system prompt
|
||||
- name: A0_SET_agent_profile
|
||||
value: "bluejay"
|
||||
@@ -307,9 +373,45 @@ spec:
|
||||
# Speech-to-text disabled (no GPU for Whisper)
|
||||
- name: A0_SET_stt_model_size
|
||||
value: "tiny"
|
||||
# Print.Web — Thermal printer service on edge2
|
||||
# FlowerCore.Chat MCP pilot (Phase 0)
|
||||
- name: CHAT_MCP_API_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: chat-mcp-api-key
|
||||
key: api-key
|
||||
optional: true
|
||||
# FlowerCore.Knowledge MCP Phase 1 — direct Agent Zero client path.
|
||||
# Probe /healthz first, then try an authenticated initialize call.
|
||||
# If either fails, Agent Zero boots without fc_knowledge and keeps
|
||||
# the local corpus_search.py tool as the outage-safe path.
|
||||
- name: KNOWLEDGE_MCP_URL
|
||||
value: "http://knowledge-web.knowledge.svc/mcp"
|
||||
- name: KNOWLEDGE_MCP_HEALTH_URL
|
||||
value: "http://knowledge-web.knowledge.svc/healthz"
|
||||
- name: KNOWLEDGE_MCP_BEARER_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: knowledge-mcp-tokens
|
||||
key: password
|
||||
# Print.Web — Thermal printer service on edge2.
|
||||
# PRINT_WEB_URL: internal HTTP (bypasses Traefik TLS — print_web.py
|
||||
# runs in-cluster and can reach edge2 directly on the PROD VLAN).
|
||||
# PRINT_WEB_API_KEY: from 1Password "Print.Web API Keys" password field,
|
||||
# synced by the print-web-api-keys OnePasswordItem CRD above.
|
||||
# The print_web.py Python tool reads both env vars for all HTTP calls.
|
||||
- name: PRINT_WEB_URL
|
||||
value: "http://10.0.57.16:5200"
|
||||
- name: PRINT_WEB_API_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: print-web-api-keys
|
||||
key: password
|
||||
# Intranet search — use in-cluster HTTP (no step-ca TLS needed)
|
||||
# corpus_search.py reads FLOWERCORE_FLEET_VECTOR_DIR but that mount is not
|
||||
# on the cluster yet (BLUEJAY-WS only). The tool gracefully returns a
|
||||
# "no DB found" message with rebuild instructions rather than crashing.
|
||||
- name: FLOWERCORE_INTRANET_URL
|
||||
value: "http://intranet-web.intranet.svc:5300"
|
||||
# Kubernetes
|
||||
- name: KUBERNETES_SERVICE_HOST
|
||||
value: "kubernetes.default.svc"
|
||||
@@ -344,7 +446,7 @@ spec:
|
||||
command:
|
||||
- /bin/bash
|
||||
- -c
|
||||
- "curl -sf http://localhost:80/ > /dev/null && curl -sf --connect-timeout 3 http://127.0.0.1:11434/api/tags > /dev/null"
|
||||
- "curl -sf http://localhost:80/ > /dev/null && curl -sf --connect-timeout 3 http://fc-llm-bridge.fc-llm-bridge.svc:8080/healthz > /dev/null"
|
||||
periodSeconds: 30
|
||||
failureThreshold: 2
|
||||
resources:
|
||||
@@ -482,18 +584,6 @@ spec:
|
||||
protocol: UDP
|
||||
- port: 53
|
||||
protocol: TCP
|
||||
# Ollama on BLUEJAY-WS
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.56.20/32
|
||||
ports:
|
||||
- port: 11434
|
||||
# Ollama on edge1 fallback
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.57.17/32
|
||||
ports:
|
||||
- port: 11434
|
||||
# Print.Web on edge2
|
||||
- to:
|
||||
- ipBlock:
|
||||
@@ -506,6 +596,47 @@ spec:
|
||||
cidr: 10.0.56.11/32
|
||||
ports:
|
||||
- port: 6443
|
||||
# FlowerCore LLM Bridge (ADR-088 chat_model routing) — ClusterIP service
|
||||
# in the fc-llm-bridge namespace on port 8080.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: fc-llm-bridge
|
||||
ports:
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
# FlowerCore.Chat MCP (Phase 0 pilot) — use the in-cluster chat-web
|
||||
# service instead of the public Traefik VIP so MCP traffic stays inside
|
||||
# the cluster and survives the private-range egress denylist.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: fc-chat
|
||||
ports:
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
# FlowerCore.Knowledge MCP (Phase 1) — in-cluster direct route with
|
||||
# anonymous /healthz probe plus authenticated /mcp initialize/tool calls.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: knowledge
|
||||
ports:
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
# Intranet search API — use in-cluster svc so traffic stays inside
|
||||
# the cluster and is not blocked by the private-range egress denylist.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: intranet
|
||||
ports:
|
||||
- port: 5300
|
||||
protocol: TCP
|
||||
# Allow internet (for kubectl image pull, etc)
|
||||
- to:
|
||||
- ipBlock:
|
||||
|
||||
@@ -7209,6 +7209,9 @@ data:
|
||||
"keep_alive": keep_alive,
|
||||
"stream": False,
|
||||
})
|
||||
curl_headers = ["-H", "Content-Type: application/json"]
|
||||
if os.environ.get("FC_LLM_BRIDGE_API_KEY"):
|
||||
curl_headers.extend(["-H", f"X-Api-Key: {os.environ['FC_LLM_BRIDGE_API_KEY']}"])
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
@@ -7216,7 +7219,7 @@ data:
|
||||
"curl", "-s", "--max-time", "120",
|
||||
"-X", "POST",
|
||||
f"{api_base}/api/generate",
|
||||
"-H", "Content-Type: application/json",
|
||||
*curl_headers,
|
||||
"-d", payload,
|
||||
],
|
||||
capture_output=True,
|
||||
@@ -13150,6 +13153,451 @@ data:
|
||||
- PowerShell 5.1 compatibility is assumed (no PowerShell 7+ features).
|
||||
- All commands run with `-NoProfile -NonInteractive` flags for clean execution.
|
||||
"""
|
||||
corpus_search.py: |
|
||||
# FlowerCore Fleet Corpus Vector Search Tool
|
||||
#
|
||||
# Queries an AiStation-built SqliteVecVectorStore DB (fleet-*.db) under /a0/usr/vectors
|
||||
# (bind-mounted read-only from /var/lib/flowercore/vector-stores/ on the host).
|
||||
# Embeds the query through Ollama's nomic-embed-text model, computes cosine
|
||||
# similarity against every stored chunk in pure Python (no numpy — not present
|
||||
# in the container), and returns the top-K nearest neighbors with source metadata.
|
||||
#
|
||||
# This is the offline-friendly counterpart to `intranet_search` (which hits the
|
||||
# Intranet's live REST API). Use it for Bible/Greek/Hebrew/Strong's lookups and
|
||||
# anywhere the workstation has a newer DB than the Intranet one. The store is
|
||||
# refreshed by `aistation-indexer build <edition>` — see the FlowerCore.Knowledge
|
||||
# ADR at docs/ai-agents/flowercore-knowledge-service-plan.md.
|
||||
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import sqlite3
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
from python.helpers.tool import Tool, Response
|
||||
|
||||
|
||||
# Directory that holds the fleet vector DBs; overridable through the environment.
DEFAULT_VECTORS_DIR = os.getenv("FLOWERCORE_FLEET_VECTOR_DIR", "/a0/usr/vectors")

# Candidate DBs tried in order when the caller does not name one. An explicit
# FLOWERCORE_FLEET_VECTOR_DB comes first (empty string when unset), then the
# fleet tiers from largest to smallest: workstation → pi-edge → bmo-bot.
PREFERRED_DB_ORDER = [os.getenv("FLOWERCORE_FLEET_VECTOR_DB", "")] + [
    "fleet-workstation-full.db",
    "fleet-pi-edge.db",
    "fleet-bmo-bot.db",
]

# Base URL of the Ollama-compatible endpoint used to embed queries.
OLLAMA_BASE_URL = os.getenv(
    "FLOWERCORE_AGENTZERO_OLLAMA_URL", "http://host.containers.internal:11434"
)

# Optional API key sent as X-Api-Key with embedding requests; surrounding
# whitespace is removed and an unset variable yields the empty string.
BRIDGE_API_KEY = os.getenv("FC_LLM_BRIDGE_API_KEY", "").strip()

# Embedding model name passed to the embeddings endpoint.
EMBEDDING_MODEL = os.getenv("FLOWERCORE_FLEET_EMBEDDING_MODEL", "nomic-embed-text")
|
||||
|
||||
|
||||
class CorpusSearch(Tool):
    """Agent Zero tool: semantic search over the pre-indexed FlowerCore fleet corpus."""

    async def execute(self, **kwargs) -> Response:
        """
        Semantic search over the FlowerCore fleet corpus (Bible texts, lexicons,
        dictionaries, morphology) pre-indexed by aistation-indexer.

        Args (via self.args):
            query (str):  Search query text. Required unless action=stats.
            limit (int):  Max results. Default 8. Must be a positive integer;
                          anything else yields an error Response instead of an
                          unhandled exception.
            index (str):  Optional index name filter ("bible-texts", "lexicons",
                          "dictionaries", "morphology"). Default: all indexes.
            repo (str):   Optional repo filter (e.g. "world-english-bible").
            db (str):     Override DB path OR file name inside FLOWERCORE_FLEET_VECTOR_DIR
                          (defaults to /a0/usr/vectors). If omitted, the first
                          preferred fleet tier present on disk is picked automatically.
            action (str): Optional. "stats" returns an inventory of all fleet DBs
                          visible to the tool (names, sizes, index counts, chunk
                          counts, last-built timestamps). No embedding call.

        Returns:
            Response with ranked chunks (score, source, text preview) OR
            (when action=stats) a markdown inventory of available fleet DBs.
            Every failure path returns a Response with break_loop=False.
        """
        query = (self.args.get("query") or "").strip()
        index_filter = (self.args.get("index") or "").strip()
        repo_filter = (self.args.get("repo") or "").strip()
        db_override = (self.args.get("db") or "").strip()
        action = (self.args.get("action") or "").strip().lower()

        # The inventory needs neither a query nor a limit, so answer it before
        # validating either of them.
        if action == "stats":
            return Response(message=_render_stats(), break_loop=False)

        if not query:
            return Response(
                message=(
                    "Error: 'query' is required unless action=stats.\n"
                    "Example: query=\"what does Genesis 1:1 say\" limit=5\n"
                    "Inventory: action=stats"
                ),
                break_loop=False,
            )

        # A missing/empty/zero limit falls back to the default of 8; values that
        # are not integers (or are negative) are reported back to the caller.
        raw_limit = self.args.get("limit")
        try:
            limit = int(raw_limit or 8)
        except (TypeError, ValueError):
            limit = 0
        if limit < 1:
            return Response(
                message=f"Error: 'limit' must be a positive integer, got {raw_limit!r}.",
                break_loop=False,
            )

        db = _resolve_db(db_override)
        if db is None:
            return Response(
                message=(
                    f"Error: no fleet vector DB found under {DEFAULT_VECTORS_DIR}.\n"
                    "Host side: run `aistation-indexer build fleet-workstation-full`\n"
                    "(or `fleet-pi-edge`/`fleet-bmo-bot`) to produce\n"
                    "`/var/lib/flowercore/vector-stores/<slug>.db`, then confirm the\n"
                    "Podman unit mounts that directory into `/a0/usr/vectors:ro`."
                ),
                break_loop=False,
            )

        try:
            query_vec = _embed(query)
        except Exception as e:
            return Response(
                message=f"Error: failed to embed query via Ollama at {OLLAMA_BASE_URL}: {e}",
                break_loop=False,
            )

        try:
            hits = _search(db, query_vec, index_filter, repo_filter, limit)
        except Exception as e:
            return Response(
                message=f"Error: corpus search failed: {e}",
                break_loop=False,
            )

        if not hits:
            return Response(
                message=(
                    f"No matches for '{query}' in {db.name}.\n"
                    f"Indexes available: " + _list_indexes_summary(db)
                ),
                break_loop=False,
            )

        lines = [f"**Corpus search: `{query}`** (top {len(hits)} of {limit} requested, DB={db.name})", ""]
        for rank, h in enumerate(hits, 1):
            passage = h.get("passage") or ""
            lang = h.get("language") or ""
            # Drop empty metadata fields so the separator never doubles up.
            meta_bits = [x for x in (h["index"], h["repo"], passage, lang) if x]
            meta = " · ".join(meta_bits)
            # TextContent may come back as NULL from the DB; treat it as empty
            # text rather than crashing on len(None).
            preview = h["text"] or ""
            if len(preview) > 320:
                preview = preview[:320].rstrip() + "…"
            lines.append(f"{rank}. **{h['score']:.3f}**  {meta}")
            lines.append(f"   `{h['source']}`")
            lines.append(f"   {preview}")
            lines.append("")

        return Response(message="\n".join(lines).rstrip() + "\n", break_loop=False)
|
||||
|
||||
|
||||
def _resolve_db(override: str) -> "Path | None":
|
||||
"""Pick a fleet DB by explicit path, explicit filename, or preferred order."""
|
||||
vectors_dir = Path(DEFAULT_VECTORS_DIR)
|
||||
if override:
|
||||
# Absolute or relative path that points at a real file wins outright.
|
||||
p = Path(override)
|
||||
if p.is_absolute() and p.exists():
|
||||
return p
|
||||
# Otherwise treat it as a filename within the vectors dir.
|
||||
candidate = vectors_dir / override
|
||||
if candidate.exists():
|
||||
return candidate
|
||||
return None
|
||||
|
||||
for name in PREFERRED_DB_ORDER:
|
||||
if not name:
|
||||
continue
|
||||
p = Path(name) if Path(name).is_absolute() else vectors_dir / name
|
||||
if p.exists():
|
||||
return p
|
||||
|
||||
# Fallback: any *.db in the dir, largest first.
|
||||
if vectors_dir.is_dir():
|
||||
candidates = sorted(vectors_dir.glob("*.db"), key=lambda p: p.stat().st_size, reverse=True)
|
||||
if candidates:
|
||||
return candidates[0]
|
||||
return None
|
||||
|
||||
|
||||
def _embed(text: str) -> list:
    """Embed a query via Ollama's /api/embeddings. Single-vector response.

    Posts {"model": EMBEDDING_MODEL, "prompt": text} as JSON to
    OLLAMA_BASE_URL, adding an X-Api-Key header when BRIDGE_API_KEY is set.

    Returns:
        The "embedding" array from the response, with every element converted
        to float.

    Raises:
        RuntimeError: the response has no non-empty "embedding" list.
        Any urllib / JSON error from the request itself propagates unchanged.
    """
    payload = json.dumps({"model": EMBEDDING_MODEL, "prompt": text}).encode("utf-8")

    request_headers = {"Content-Type": "application/json"}
    if BRIDGE_API_KEY:
        request_headers["X-Api-Key"] = BRIDGE_API_KEY

    endpoint = f"{OLLAMA_BASE_URL.rstrip('/')}/api/embeddings"
    request = urllib.request.Request(endpoint, data=payload, headers=request_headers)

    # 60 s timeout on the embedding request.
    with urllib.request.urlopen(request, timeout=60) as reply:
        data = json.loads(reply.read().decode("utf-8"))

    embedding = data.get("embedding")
    if isinstance(embedding, list) and embedding:
        return list(map(float, embedding))
    raise RuntimeError(f"Ollama returned no embedding: {data}")
|
||||
|
||||
|
||||
def _cosine(a: list, b: list) -> float:
|
||||
"""Cosine similarity in pure Python — no numpy in the A0 container."""
|
||||
# zip() stops at the shorter — AiStation DB guarantees same dim per index.
|
||||
dot = 0.0
|
||||
na = 0.0
|
||||
nb = 0.0
|
||||
for x, y in zip(a, b):
|
||||
dot += x * y
|
||||
na += x * x
|
||||
nb += y * y
|
||||
if na == 0.0 or nb == 0.0:
|
||||
return 0.0
|
||||
return dot / (math.sqrt(na) * math.sqrt(nb))
|
||||
|
||||
|
||||
def _search(db_path: Path, query_vec: list, index_filter: str, repo_filter: str, limit: int) -> list:
|
||||
"""Load entries, compute cosine, return top-K.
|
||||
|
||||
SqliteVecVectorStore schema:
|
||||
VectorIndexes(IndexName, Dimensions, UpdatedAtUtc)
|
||||
VectorEntries(IndexName, ChunkId, TextContent, SourceRepo, SourceFile,
|
||||
Book, Chapter, VerseRange, Language, ContentType, License,
|
||||
EstimatedTokens, EmbeddingJson)
|
||||
|
||||
Embeddings are stored as JSON arrays in EmbeddingJson; similarity is computed
|
||||
in Python. For ~100k chunks × 768 dims this takes a couple seconds on a
|
||||
workstation — acceptable for interactive A0 use.
|
||||
"""
|
||||
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
|
||||
try:
|
||||
sql = [
|
||||
"SELECT IndexName, ChunkId, TextContent, SourceRepo, SourceFile, ",
|
||||
" Book, Chapter, VerseRange, Language, EmbeddingJson ",
|
||||
"FROM VectorEntries",
|
||||
]
|
||||
where = []
|
||||
params = []
|
||||
if index_filter:
|
||||
where.append("IndexName = ?")
|
||||
params.append(index_filter)
|
||||
if repo_filter:
|
||||
where.append("SourceRepo LIKE ?")
|
||||
params.append(f"%{repo_filter}%")
|
||||
if where:
|
||||
sql.append(" WHERE " + " AND ".join(where))
|
||||
sql.append(";")
|
||||
|
||||
cursor = conn.execute("".join(sql), params)
|
||||
|
||||
# Min-heap by (score, ...) would be faster but for interactive use we
|
||||
# just sort at the end — simpler and readable.
|
||||
scored = []
|
||||
for row in cursor:
|
||||
idx, chunk_id, text, repo, source_file, book, chapter, verses, lang, emb_json = row
|
||||
try:
|
||||
vec = json.loads(emb_json)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
continue
|
||||
score = _cosine(query_vec, vec)
|
||||
passage = None
|
||||
if book and chapter:
|
||||
passage = f"{book} {chapter}"
|
||||
if verses:
|
||||
passage += f":{verses}"
|
||||
scored.append((score, {
|
||||
"index": idx,
|
||||
"chunk_id": chunk_id,
|
||||
"text": text,
|
||||
"repo": repo or "",
|
||||
"source": source_file or "",
|
||||
"passage": passage or "",
|
||||
"language": lang or "",
|
||||
}))
|
||||
scored.sort(key=lambda t: t[0], reverse=True)
|
||||
return [{"score": s, **meta} for s, meta in scored[:limit]]
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _render_stats() -> str:
    """Markdown inventory of every *.db in FLOWERCORE_FLEET_VECTOR_DIR.

    Returns:
        A markdown string: one `###` section per DB file (name and size),
        followed by one bullet per registered index with its chunk count,
        dimensions and UpdatedAtUtc value, and a trailing line describing the
        tool defaults. A short explanatory sentence is returned instead when
        the directory is missing or holds no *.db files. Problems with an
        individual DB are reported inline and never abort the inventory.
    """
    vectors_dir = Path(DEFAULT_VECTORS_DIR)
    if not vectors_dir.is_dir():
        return f"No fleet vector dir mounted at {vectors_dir}. Ask the host operator to build an index with scripts/agent-zero/build-fleet-index.sh."

    dbs = sorted(vectors_dir.glob("*.db"))
    if not dbs:
        return f"No fleet DBs present under {vectors_dir}. Run `scripts/agent-zero/build-fleet-index.sh fleet-workstation-full` on the host."

    lines = [f"**Fleet vector DB inventory** ({vectors_dir})", ""]
    for db in dbs:
        try:
            size_mb = db.stat().st_size / (1024 * 1024)
            lines.append(f"### `{db.name}` ({size_mb:.1f} MB)")
        except OSError as e:
            # The file can disappear between glob() and stat(); keep listing
            # the remaining DBs instead of failing the whole inventory.
            lines.append(f"### `{db.name}` (size unavailable: {e})")
        try:
            # as_uri() percent-encodes '?', '#' and '%' so unusual file names
            # still open as the intended read-only database.
            conn = sqlite3.connect(f"{db.resolve().as_uri()}?mode=ro", uri=True)
            try:
                idx_rows = conn.execute(
                    "SELECT IndexName, Dimensions, UpdatedAtUtc FROM VectorIndexes ORDER BY IndexName;"
                ).fetchall()
                if not idx_rows:
                    lines.append("- (no indexes registered)")
                else:
                    # One grouped query instead of a COUNT(*) per index.
                    counts = dict(conn.execute(
                        "SELECT IndexName, COUNT(*) FROM VectorEntries GROUP BY IndexName;"
                    ).fetchall())
                    for name, dim, updated in idx_rows:
                        count = counts.get(name, 0)
                        lines.append(f"- **{name}** — {count:,} chunks × {dim}d (built {updated})")
            finally:
                conn.close()
        except Exception as e:
            lines.append(f"- (inspect failed: {e})")
        lines.append("")

    lines.append(f"**Tool defaults:** embedding model `{EMBEDDING_MODEL}`, Ollama at `{OLLAMA_BASE_URL}`. Pick a DB with `db=<filename>`; filter by `index=<name>`/`repo=<substring>`.")
    return "\n".join(lines).rstrip() + "\n"
|
||||
|
||||
|
||||
def _list_indexes_summary(db_path: Path) -> str:
|
||||
try:
|
||||
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
|
||||
try:
|
||||
rows = conn.execute(
|
||||
"SELECT IndexName, Dimensions, "
|
||||
" (SELECT COUNT(*) FROM VectorEntries WHERE VectorEntries.IndexName = VectorIndexes.IndexName) "
|
||||
"FROM VectorIndexes ORDER BY IndexName;"
|
||||
).fetchall()
|
||||
if not rows:
|
||||
return "(no indexes)"
|
||||
return ", ".join(f"{r[0]}({r[2]}×{r[1]}d)" for r in rows)
|
||||
finally:
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
return f"(couldn't list: {e})"
|
||||
|
||||
intranet_search.py: |
|
||||
# Intranet Vector Search Tool
|
||||
# Queries the Blue Jay Lab Intranet's Shared.Indexing RAG corpus over its
|
||||
# live REST API (https://intranet.iamworkin.lan/api/search). Returns ranked chunks
|
||||
# with source file paths and scores.
|
||||
|
||||
import json
|
||||
import os
|
||||
import ssl
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
from python.helpers.tool import Tool, Response
|
||||
|
||||
|
||||
INTRANET_BASE_URL = os.environ.get(
|
||||
"FLOWERCORE_INTRANET_URL",
|
||||
"https://intranet.iamworkin.lan",
|
||||
)
|
||||
STEPCA_ROOT_CRT = "/a0/usr/ca/stepca-root.crt"
|
||||
|
||||
|
||||
def _ssl_ctx() -> ssl.SSLContext:
|
||||
ctx = ssl.create_default_context()
|
||||
if os.path.exists(STEPCA_ROOT_CRT):
|
||||
ctx.load_verify_locations(cafile=STEPCA_ROOT_CRT)
|
||||
return ctx
|
||||
|
||||
|
||||
class IntranetSearch(Tool):
    """Agent Zero tool: query the intranet search REST API and format the hits."""

    async def execute(self, **kwargs) -> Response:
        """
        Search the Blue Jay Lab intranet corpus (docs, project notes, dashboards).

        Args (via self.args):
            query (str): Search query. Required.
            limit (int): Max chunks to return. Default 8. Must be a positive
                         integer; anything else yields an error Response.
            corpus (str): Optional corpus filter (e.g. "notes", "docs"); sent to
                          the API as the `indexName` parameter.

        Returns:
            Response with ranked chunk text, source path, and score. Network
            errors, non-JSON replies and empty result sets are reported as a
            Response message (break_loop=False) rather than raised.
        """
        # `or ""` (instead of a .get default) also covers keys that are present
        # but explicitly null; str() tolerates non-string values.
        query = str(self.args.get("query") or "").strip()
        corpus = str(self.args.get("corpus") or "").strip()

        if not query:
            return Response(
                message="Error: 'query' is required.",
                break_loop=False,
            )

        # A missing/empty/zero limit falls back to the default of 8.
        raw_limit = self.args.get("limit")
        try:
            limit = int(raw_limit or 8)
        except (TypeError, ValueError):
            limit = 0
        if limit < 1:
            return Response(
                message=f"Error: 'limit' must be a positive integer, got {raw_limit!r}.",
                break_loop=False,
            )

        params = {"q": query, "topK": str(limit)}
        if corpus:
            params["indexName"] = corpus
        # rstrip('/') so a base URL configured with a trailing slash does not
        # produce "//api/search".
        url = f"{INTRANET_BASE_URL.rstrip('/')}/api/search?{urllib.parse.urlencode(params)}"

        try:
            req = urllib.request.Request(url, headers={"Accept": "application/json"})
            with urllib.request.urlopen(req, timeout=20, context=_ssl_ctx()) as resp:
                raw = resp.read().decode("utf-8", errors="replace")
        except Exception as exc:
            return Response(
                message=f"Intranet search failed: {exc}\nURL: {url}",
                break_loop=False,
            )

        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            return Response(
                message=f"Intranet returned non-JSON response:\n{raw[:500]}",
                break_loop=False,
            )

        # Accept either a bare list of hits or an object wrapping the list under
        # one of several keys; any other JSON shape counts as "no results".
        if isinstance(data, list):
            hits = data
        elif isinstance(data, dict):
            hits = data.get("results") or data.get("hits") or data.get("chunks") or []
        else:
            hits = []
        if not isinstance(hits, list):
            hits = []
        # Entries that are not JSON objects cannot be rendered; drop them.
        hits = [hit for hit in hits if isinstance(hit, dict)]

        if not hits:
            return Response(
                message=f"No intranet results for query: {query!r}",
                break_loop=False,
            )

        lines = [f"# Intranet search: {query} ({len(hits)} hits)\n"]
        for i, hit in enumerate(hits[:limit], 1):
            src = (
                hit.get("sourceFile")
                or hit.get("source")
                or hit.get("path")
                or hit.get("file")
                or "?"
            )
            repo = hit.get("sourceRepo") or ""
            idx = hit.get("indexName") or ""
            # Explicit None checks so a legitimate score of 0 / 0.0 is still shown.
            score = hit.get("score")
            if score is None:
                score = hit.get("similarity")
            text = str(
                hit.get("snippet")
                or hit.get("text")
                or hit.get("content")
                or hit.get("chunk")
                or ""
            ).strip()
            if len(text) > 600:
                text = text[:600] + "..."
            header = f"## [{i}] {repo}/{src}" if repo else f"## [{i}] {src}"
            if idx:
                header += f" ({idx})"
            if score is not None and score != "":
                header += f" score={score:.3f}" if isinstance(score, float) else f" score={score}"
            lines.append(header)
            lines.append(text)
            lines.append("")

        return Response(message="\n".join(lines), break_loop=False)
|
||||
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: bluejay-tools-c
|
||||
|
||||
@@ -96,7 +96,9 @@ data:
|
||||
allow=ulaw
|
||||
allow=alaw
|
||||
direct_media=no
|
||||
dtmf_mode=inband
|
||||
; Yealink provisioning sends RFC2833/RFC4733 DTMF (payload 101).
|
||||
; Keep the PBX template aligned so physical desk phones emit ARI DTMF events.
|
||||
dtmf_mode=rfc4733
|
||||
rtp_symmetric=yes
|
||||
force_rport=yes
|
||||
rewrite_contact=yes
|
||||
@@ -155,11 +157,11 @@ data:
|
||||
remove_existing=yes
|
||||
qualify_frequency=60
|
||||
|
||||
; Extension 103 - Office 3
|
||||
[103](phone-template)
|
||||
auth=auth103
|
||||
aors=103
|
||||
callerid="Office 3" <103>
|
||||
; Extension 103 - Office 3
|
||||
[103](phone-template)
|
||||
auth=auth103
|
||||
aors=103
|
||||
callerid="Office 3" <103>
|
||||
|
||||
[auth103]
|
||||
type=auth
|
||||
@@ -167,90 +169,90 @@ data:
|
||||
username=103
|
||||
password=bluejay-ext-103
|
||||
|
||||
[103]
|
||||
type=aor
|
||||
max_contacts=1
|
||||
remove_existing=yes
|
||||
qualify_frequency=60
|
||||
|
||||
; Test endpoints 901-904 for softphone proof
|
||||
[test-endpoint](!)
|
||||
type=endpoint
|
||||
context=from-internal
|
||||
transport=transport-udp
|
||||
disallow=all
|
||||
allow=ulaw
|
||||
allow=alaw
|
||||
direct_media=no
|
||||
rtp_symmetric=yes
|
||||
force_rport=yes
|
||||
rewrite_contact=yes
|
||||
|
||||
[901](test-endpoint)
|
||||
auth=auth901
|
||||
aors=901
|
||||
callerid="Proof Caller" <901>
|
||||
|
||||
[auth901]
|
||||
type=auth
|
||||
auth_type=userpass
|
||||
username=901
|
||||
password=test-sip-secret-901
|
||||
|
||||
[901]
|
||||
type=aor
|
||||
max_contacts=1
|
||||
remove_existing=yes
|
||||
|
||||
[902](test-endpoint)
|
||||
auth=auth902
|
||||
aors=902
|
||||
callerid="Proof Callee" <902>
|
||||
|
||||
[auth902]
|
||||
type=auth
|
||||
auth_type=userpass
|
||||
username=902
|
||||
password=test-sip-secret-901
|
||||
|
||||
[902]
|
||||
type=aor
|
||||
max_contacts=1
|
||||
remove_existing=yes
|
||||
|
||||
[903](test-endpoint)
|
||||
auth=auth903
|
||||
aors=903
|
||||
callerid="Proof Endpoint 3" <903>
|
||||
|
||||
[auth903]
|
||||
type=auth
|
||||
auth_type=userpass
|
||||
username=903
|
||||
password=test-sip-secret-901
|
||||
|
||||
[903]
|
||||
type=aor
|
||||
max_contacts=1
|
||||
remove_existing=yes
|
||||
|
||||
[904](test-endpoint)
|
||||
auth=auth904
|
||||
aors=904
|
||||
callerid="Proof Endpoint 4" <904>
|
||||
|
||||
[auth904]
|
||||
type=auth
|
||||
auth_type=userpass
|
||||
username=904
|
||||
password=test-sip-secret-901
|
||||
|
||||
[904]
|
||||
type=aor
|
||||
max_contacts=1
|
||||
remove_existing=yes
|
||||
|
||||
extensions.conf: |
|
||||
[103]
|
||||
type=aor
|
||||
max_contacts=1
|
||||
remove_existing=yes
|
||||
qualify_frequency=60
|
||||
|
||||
; Test endpoints 901-904 for softphone proof
|
||||
[test-endpoint](!)
|
||||
type=endpoint
|
||||
context=from-internal
|
||||
transport=transport-udp
|
||||
disallow=all
|
||||
allow=ulaw
|
||||
allow=alaw
|
||||
direct_media=no
|
||||
rtp_symmetric=yes
|
||||
force_rport=yes
|
||||
rewrite_contact=yes
|
||||
|
||||
[901](test-endpoint)
|
||||
auth=auth901
|
||||
aors=901
|
||||
callerid="Proof Caller" <901>
|
||||
|
||||
[auth901]
|
||||
type=auth
|
||||
auth_type=userpass
|
||||
username=901
|
||||
password=test-sip-secret-901
|
||||
|
||||
[901]
|
||||
type=aor
|
||||
max_contacts=1
|
||||
remove_existing=yes
|
||||
|
||||
[902](test-endpoint)
|
||||
auth=auth902
|
||||
aors=902
|
||||
callerid="Proof Callee" <902>
|
||||
|
||||
[auth902]
|
||||
type=auth
|
||||
auth_type=userpass
|
||||
username=902
|
||||
password=test-sip-secret-901
|
||||
|
||||
[902]
|
||||
type=aor
|
||||
max_contacts=1
|
||||
remove_existing=yes
|
||||
|
||||
[903](test-endpoint)
|
||||
auth=auth903
|
||||
aors=903
|
||||
callerid="Proof Endpoint 3" <903>
|
||||
|
||||
[auth903]
|
||||
type=auth
|
||||
auth_type=userpass
|
||||
username=903
|
||||
password=test-sip-secret-901
|
||||
|
||||
[903]
|
||||
type=aor
|
||||
max_contacts=1
|
||||
remove_existing=yes
|
||||
|
||||
[904](test-endpoint)
|
||||
auth=auth904
|
||||
aors=904
|
||||
callerid="Proof Endpoint 4" <904>
|
||||
|
||||
[auth904]
|
||||
type=auth
|
||||
auth_type=userpass
|
||||
username=904
|
||||
password=test-sip-secret-901
|
||||
|
||||
[904]
|
||||
type=aor
|
||||
max_contacts=1
|
||||
remove_existing=yes
|
||||
|
||||
extensions.conf: |
|
||||
[general]
|
||||
static=yes
|
||||
writeprotect=no
|
||||
@@ -268,37 +270,37 @@ data:
|
||||
same => n,Hangup()
|
||||
|
||||
[from-internal]
|
||||
; Internal extension-to-extension dialing
|
||||
exten => _1XX,1,Dial(PJSIP/${EXTEN},30)
|
||||
same => n,Hangup()
|
||||
|
||||
; Softphone proof endpoints and utility extensions
|
||||
exten => _9XX,1,NoOp(Proof call to ${EXTEN})
|
||||
same => n,Dial(PJSIP/${EXTEN},30)
|
||||
same => n,Hangup()
|
||||
|
||||
exten => 999,1,Answer()
|
||||
same => n,Playback(demo-echotest)
|
||||
same => n,Echo()
|
||||
same => n,Hangup()
|
||||
|
||||
exten => 998,1,Answer()
|
||||
same => n,Milliwatt()
|
||||
same => n,Hangup()
|
||||
|
||||
exten => 997,1,Answer()
|
||||
same => n,Wait(0.5)
|
||||
same => n,Playback(hello-world)
|
||||
same => n,Wait(1)
|
||||
same => n,Hangup()
|
||||
|
||||
exten => 996,1,Answer()
|
||||
same => n,Wait(0.5)
|
||||
same => n,Read(DIGITS,,4,,,5)
|
||||
same => n,SayDigits(${DIGITS})
|
||||
same => n,Hangup()
|
||||
|
||||
; Outbound via Twilio SIP trunk (11-digit US)
|
||||
; Internal extension-to-extension dialing
|
||||
exten => _1XX,1,Dial(PJSIP/${EXTEN},30)
|
||||
same => n,Hangup()
|
||||
|
||||
; Softphone proof endpoints and utility extensions
|
||||
exten => _9XX,1,NoOp(Proof call to ${EXTEN})
|
||||
same => n,Dial(PJSIP/${EXTEN},30)
|
||||
same => n,Hangup()
|
||||
|
||||
exten => 999,1,Answer()
|
||||
same => n,Playback(demo-echotest)
|
||||
same => n,Echo()
|
||||
same => n,Hangup()
|
||||
|
||||
exten => 998,1,Answer()
|
||||
same => n,Milliwatt()
|
||||
same => n,Hangup()
|
||||
|
||||
exten => 997,1,Answer()
|
||||
same => n,Wait(0.5)
|
||||
same => n,Playback(hello-world)
|
||||
same => n,Wait(1)
|
||||
same => n,Hangup()
|
||||
|
||||
exten => 996,1,Answer()
|
||||
same => n,Wait(0.5)
|
||||
same => n,Read(DIGITS,,4,,,5)
|
||||
same => n,SayDigits(${DIGITS})
|
||||
same => n,Hangup()
|
||||
|
||||
; Outbound via Twilio SIP trunk (11-digit US)
|
||||
exten => _1NXXNXXXXXX,1,Set(CALLERID(num)=+13202332529)
|
||||
same => n,Dial(PJSIP/+${EXTEN}@twilio-trunk,60)
|
||||
same => n,Hangup()
|
||||
@@ -312,6 +314,13 @@ data:
|
||||
exten => *100,1,Stasis(flowercore-pbx,internal,ivr)
|
||||
same => n,Hangup()
|
||||
|
||||
; Test-only entry into the Victory Day workflow (DID +15074618329).
|
||||
; Used by live SIP AATs to exercise the VDAY Fun Menu + AsteriskGameHandler
|
||||
; path without dialing in over Twilio. Mnemonic: *832 = "V-D-A" (8-3-2).
|
||||
exten => *832,1,NoOp(Test entry: Victory Day workflow via AAT)
|
||||
same => n,Stasis(flowercore-pbx,inbound-pstn,+15074618329)
|
||||
same => n,Hangup()
|
||||
|
||||
; Star codes routed to FlowerCore Stasis app for handling
|
||||
exten => *0,1,Stasis(flowercore-pbx,starcode,*0)
|
||||
same => n,Hangup()
|
||||
|
||||
@@ -16,13 +16,25 @@ spec:
|
||||
metadata:
|
||||
labels:
|
||||
app: asterisk
|
||||
spec:
|
||||
nodeSelector:
|
||||
kubernetes.io/hostname: rke2-agent1
|
||||
hostNetwork: true
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
securityContext:
|
||||
fsGroup: 0
|
||||
spec:
|
||||
nodeSelector:
|
||||
kubernetes.io/hostname: rke2-agent1
|
||||
hostNetwork: true
|
||||
# Keep the search list free of iamworkin.lan so CoreDNS's wildcard
|
||||
# template cannot hijack public egress like downloads.asterisk.org.
|
||||
dnsPolicy: None
|
||||
dnsConfig:
|
||||
nameservers:
|
||||
- 10.43.0.10
|
||||
searches:
|
||||
- telephony.svc.cluster.local
|
||||
- svc.cluster.local
|
||||
- cluster.local
|
||||
options:
|
||||
- name: ndots
|
||||
value: "2"
|
||||
securityContext:
|
||||
fsGroup: 0
|
||||
# CoreDNS in this cluster has an iamworkin.lan wildcard that catches
|
||||
# any unresolved name and returns 10.0.56.200 (Traefik VIP), which
|
||||
# means downloads.asterisk.org inside the pod resolves to Traefik and
|
||||
|
||||
69
apps/cdi/README.md
Normal file
69
apps/cdi/README.md
Normal file
@@ -0,0 +1,69 @@
|
||||
# CDI — Containerized Data Importer
|
||||
|
||||
KubeVirt's `containerized-data-importer` for populating PVCs from external
|
||||
sources (HTTP, HTTPS, container registry, S3, virtctl upload). Required to
|
||||
import the Windows Server 2025 ISO into the `windows-server-2025-iso` PVC
|
||||
that `apps/kubevirt-vms/ci1.yaml` mounts as a CDROM.
|
||||
|
||||
## Files
|
||||
|
||||
| File | Source | Purpose |
|
||||
| ----------------- | ----------------------------------------------------------------------------------------------------------------- | -------------------------------------------------- |
|
||||
| `cdi-operator.yaml` | [`v1.65.0`](https://github.com/kubevirt/containerized-data-importer/releases/tag/v1.65.0) — verbatim copy | Installs operator + CRDs (5779 lines, large) |
|
||||
| `cdi-cr.yaml` | [`v1.65.0`](https://github.com/kubevirt/containerized-data-importer/releases/tag/v1.65.0) — annotated + commented | Tells operator to deploy CDI components |
|
||||
|
||||
`cdi-operator.yaml` is **vendored verbatim** from the upstream release for
|
||||
air-gap reproducibility (no internet fetch at deploy time, ArgoCD prune
|
||||
contracts hold). To bump versions:
|
||||
|
||||
```bash
|
||||
CDI_VER=v1.66.0 # for example
|
||||
curl -sL "https://github.com/kubevirt/containerized-data-importer/releases/download/${CDI_VER}/cdi-operator.yaml" \
|
||||
-o apps/cdi/cdi-operator.yaml
|
||||
curl -sL "https://github.com/kubevirt/containerized-data-importer/releases/download/${CDI_VER}/cdi-cr.yaml" \
|
||||
-o /tmp/cdi-cr-new.yaml # then re-apply project header diff
|
||||
git diff apps/cdi/ # review
|
||||
git commit -am "cdi: bump to ${CDI_VER}" && git push
|
||||
```
|
||||
|
||||
## Verify after deploy
|
||||
|
||||
```bash
|
||||
kubectl -n cdi get pods # operator + apiserver + deployment + uploadproxy
|
||||
kubectl get cdis cdi -o jsonpath='{.status.phase}' # "Deployed"
|
||||
kubectl get crd | grep cdi.kubevirt.io
|
||||
# Expected CRDs: datavolumes.cdi.kubevirt.io, cdiconfigs.cdi.kubevirt.io,
|
||||
# storageprofiles.cdi.kubevirt.io, dataimportcrons.cdi.kubevirt.io,
|
||||
# datasources.cdi.kubevirt.io, objecttransfers.cdi.kubevirt.io
|
||||
```
|
||||
|
||||
## Use after install
|
||||
|
||||
```yaml
|
||||
# Example DataVolume that imports from HTTP
|
||||
apiVersion: cdi.kubevirt.io/v1beta1
|
||||
kind: DataVolume
|
||||
metadata:
|
||||
name: my-iso
|
||||
spec:
|
||||
source:
|
||||
http:
|
||||
url: "https://server/path/to.iso"
|
||||
pvc:
|
||||
accessModes: [ReadWriteOnce]
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
||||
storageClassName: longhorn
|
||||
```
|
||||
|
||||
```bash
|
||||
# Or upload from local disk via virtctl (the uploadproxy URL must be reachable from where virtctl runs, e.g. through kubectl port-forward)
|
||||
virtctl image-upload pvc my-iso \
|
||||
--image-path ./my.iso \
|
||||
--size 10Gi \
|
||||
--storage-class longhorn \
|
||||
--access-mode ReadWriteOnce \
|
||||
--uploadproxy-url https://cdi-uploadproxy.cdi.svc:443 \
|
||||
--insecure
|
||||
```
|
||||
36
apps/cdi/cdi-cr.yaml
Normal file
36
apps/cdi/cdi-cr.yaml
Normal file
@@ -0,0 +1,36 @@
|
||||
# =============================================================================
|
||||
# CDI CR — Tells the CDI operator to install CDI components into the cluster.
|
||||
# =============================================================================
|
||||
# After cdi-operator.yaml is applied, the operator watches for THIS resource
|
||||
# (CDI named "cdi"). When found, it deploys cdi-apiserver, cdi-deployment,
|
||||
# cdi-uploadproxy, cdi-cronjob, and the importer/uploadserver/cloner pods.
|
||||
#
|
||||
# Configuration:
|
||||
# - HonorWaitForFirstConsumer: PVCs created by DataVolumes wait for first
|
||||
# pod to schedule before binding (lets storage class pick best node).
|
||||
# - WebhookPvcRendering: validates PVC creation against CDI policies.
|
||||
# - imagePullPolicy IfNotPresent: pull only when the image is not already on the node (e.g. after a tag change).
|
||||
# - nodeSelector linux: pin to Linux nodes (no Windows worker support).
|
||||
#
|
||||
# Andrew may want to add an `uploadProxyURLOverride` later to expose the
|
||||
# uploadproxy via Traefik IngressRoute for `virtctl image-upload` from
|
||||
# BLUEJAY-WS without `kubectl port-forward`. Phase 2 enhancement.
|
||||
# =============================================================================
|
||||
apiVersion: cdi.kubevirt.io/v1beta1
|
||||
kind: CDI
|
||||
metadata:
|
||||
name: cdi
|
||||
annotations:
|
||||
bluejay.iamworkin.lan/source: "kubevirt/containerized-data-importer v1.65.0"
|
||||
spec:
|
||||
config:
|
||||
featureGates:
|
||||
- HonorWaitForFirstConsumer
|
||||
- WebhookPvcRendering
|
||||
imagePullPolicy: IfNotPresent
|
||||
infra:
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
workload:
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
5779
apps/cdi/cdi-operator.yaml
Normal file
5779
apps/cdi/cdi-operator.yaml
Normal file
File diff suppressed because it is too large
Load Diff
106
apps/edge2-services/edge2-services.yaml
Normal file
106
apps/edge2-services/edge2-services.yaml
Normal file
@@ -0,0 +1,106 @@
|
||||
# edge2 Services — Traefik IngressRoutes for FlowerCore Print.Web on edge2
|
||||
# Proxies print.iamworkin.lan to edge2 (10.0.57.16:5200) via headless Service
|
||||
# + manual Endpoints (same K8s external-proxy pattern as noc-services).
|
||||
#
|
||||
# Print.Web has its own X-Api-Key authentication and exposes anonymous
|
||||
# endpoints for the bookmarklet / Python CLI / cups-notifier flow, so no
|
||||
# Traefik basicAuth middleware is wired here.
|
||||
#
|
||||
# ArgoCD managed - BlueJay Lab
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: edge2-proxy
|
||||
labels:
|
||||
app.kubernetes.io/part-of: bluejay-infra
|
||||
---
|
||||
# ============================================================
|
||||
# Print.Web - edge2:5200 (FlowerCore.Print.Web on Pi 4)
|
||||
# ============================================================
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: print-web-external
|
||||
namespace: edge2-proxy
|
||||
spec:
|
||||
ports:
|
||||
- port: 5200
|
||||
targetPort: 5200
|
||||
name: http
|
||||
clusterIP: None
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Endpoints
|
||||
metadata:
|
||||
name: print-web-external
|
||||
namespace: edge2-proxy
|
||||
subsets:
|
||||
- addresses:
|
||||
- ip: 10.0.57.16
|
||||
ports:
|
||||
- port: 5200
|
||||
name: http
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: print-web-tls
|
||||
namespace: edge2-proxy
|
||||
spec:
|
||||
secretName: print-web-tls
|
||||
issuerRef:
|
||||
name: step-ca-acme
|
||||
kind: ClusterIssuer
|
||||
dnsNames:
|
||||
- print.iamworkin.lan
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: print-web
|
||||
namespace: edge2-proxy
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- kind: Rule
|
||||
match: Host(`print.iamworkin.lan`)
|
||||
services:
|
||||
- name: print-web-external
|
||||
port: 5200
|
||||
tls:
|
||||
secretName: print-web-tls
|
||||
---
|
||||
# NetworkPolicy: allow Traefik ingress, allow egress to edge2 + DNS
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: edge2-proxy-netpol
|
||||
namespace: edge2-proxy
|
||||
spec:
|
||||
podSelector: {}
|
||||
policyTypes:
|
||||
- Ingress
|
||||
- Egress
|
||||
ingress:
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: traefik-system
|
||||
egress:
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.57.16/32
|
||||
ports:
|
||||
- port: 5200
|
||||
protocol: TCP
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: kube-system
|
||||
ports:
|
||||
- port: 53
|
||||
protocol: UDP
|
||||
- port: 53
|
||||
protocol: TCP
|
||||
@@ -1,5 +1,18 @@
|
||||
# FlowerCore Remote Desktop — TLS + Ingress
|
||||
# Deployment and Service managed by deploy script (not ArgoCD)
|
||||
#
|
||||
# Source-of-truth split:
|
||||
# - bluejay-infra OWNS: Certificate, IngressRoute, all NetworkPolicies
|
||||
# (see network-policies.yaml in this directory).
|
||||
# - FlowerCore.RemoteDesktop scripts/deploy-web.sh OWNS: Deployment +
|
||||
# Service. Reason: image refs like `localhost/fc-desktop:linux-xfce`
|
||||
# only exist on each node's containerd after a manual import, so a
|
||||
# Deployment manifest in bluejay-infra would race the image-import
|
||||
# step and crash-loop.
|
||||
#
|
||||
# NetworkPolicies moved into bluejay-infra 2026-05-07 — previously they
|
||||
# were applied via the deploy script's kubectl apply calls, which broke
|
||||
# cluster-rebuild repeatability. See
|
||||
# feedback_networkpolicies_belong_in_bluejay_infra.md.
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
@@ -23,6 +36,14 @@ spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
# Host-level catch-all for desktop.iamworkin.lan. The /guacamole
|
||||
# path-prefix match lives in apps/guacamole/guacamole.yaml as a
|
||||
# separate IngressRoute in the guacamole namespace — the cluster
|
||||
# Traefik disallows cross-namespace service refs, so the PathPrefix
|
||||
# rule can't sit here. Traefik's router matching precedence gives
|
||||
# longer/more-specific rules priority automatically, so as long as
|
||||
# the guacamole IngressRoute exists it takes /guacamole traffic
|
||||
# before this catch-all sees it.
|
||||
- match: Host(`desktop.iamworkin.lan`)
|
||||
kind: Rule
|
||||
services:
|
||||
|
||||
332
apps/fc-desktop/network-policies.yaml
Normal file
332
apps/fc-desktop/network-policies.yaml
Normal file
@@ -0,0 +1,332 @@
|
||||
# FlowerCore Remote Desktop — NetworkPolicies (GitOps-managed)
|
||||
#
|
||||
# Moved into bluejay-infra 2026-05-07 as part of the regroup audit. These
|
||||
# four policies were previously applied via FlowerCore.RemoteDesktop's
|
||||
# scripts/deploy-web.sh `kubectl apply` calls, which meant a fresh cluster
|
||||
# rebuild from bluejay-infra alone would miss them — Browser Lab session
|
||||
# isolation, control-plane allow-list, and HTTP-01 cert renewal would all
|
||||
# silently fail to come up.
|
||||
#
|
||||
# Source-of-truth contract:
|
||||
# - bluejay-infra OWNS all NetworkPolicy + Certificate + IngressRoute
|
||||
# resources for fc-desktop.
|
||||
# - FlowerCore.RemoteDesktop's scripts/deploy-web.sh continues to own
|
||||
# the Deployment + Service apply (because the image ref
|
||||
# `localhost/fc-desktop:linux-xfce` only exists on each node's
|
||||
# containerd after a manual import — it can't be pulled from a
|
||||
# registry, so a Deployment manifest in bluejay-infra would race the
|
||||
# image-import step and crash-loop).
|
||||
---
|
||||
# 1) desktop-isolation — Browser Lab session pods.
|
||||
#
|
||||
# Locks down pods labeled `app.kubernetes.io/name=remote-desktop` (every
|
||||
# session pod regardless of template). Allows guacd ingress for the VNC/RDP
|
||||
# display lane and remotedesktop-web's pre-handoff probing. Egress: NFS + SMB to
|
||||
# Synology, DNS, Traefik (cluster + LB VIP), Intranet (Browser Lab home).
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: desktop-isolation
|
||||
namespace: fc-desktop
|
||||
labels:
|
||||
app.kubernetes.io/part-of: remotedesktop
|
||||
app.kubernetes.io/component: isolation
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: remote-desktop
|
||||
policyTypes:
|
||||
- Ingress
|
||||
- Egress
|
||||
ingress:
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: guacamole
|
||||
ports:
|
||||
- port: 3000
|
||||
protocol: TCP
|
||||
- port: 3001
|
||||
protocol: TCP
|
||||
- port: 5901
|
||||
protocol: TCP
|
||||
- port: 3389
|
||||
protocol: TCP
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: fc-desktop
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: remotedesktop-web
|
||||
ports:
|
||||
- port: 3000
|
||||
protocol: TCP
|
||||
- port: 5901
|
||||
protocol: TCP
|
||||
egress:
|
||||
# NFS to Synology
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.58.3/32
|
||||
ports:
|
||||
- port: 2049
|
||||
protocol: TCP
|
||||
- port: 2049
|
||||
protocol: UDP
|
||||
- port: 111
|
||||
protocol: TCP
|
||||
- port: 111
|
||||
protocol: UDP
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.58.3/32
|
||||
ports:
|
||||
- port: 445
|
||||
protocol: TCP
|
||||
- to: []
|
||||
ports:
|
||||
- port: 53
|
||||
protocol: UDP
|
||||
- port: 53
|
||||
protocol: TCP
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.56.200/32
|
||||
- ipBlock:
|
||||
cidr: 10.43.33.87/32
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: traefik-system
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: traefik
|
||||
ports:
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
- port: 8000
|
||||
protocol: TCP
|
||||
- port: 8443
|
||||
protocol: TCP
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: intranet
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app: intranet-web
|
||||
ports:
|
||||
- port: 5300
|
||||
protocol: TCP
|
||||
---
|
||||
# 2) fc-desktop-default-deny — namespace-wide catch-all.
|
||||
#
|
||||
# Selects every pod EXCEPT remotedesktop-web (the public-surface control
|
||||
# plane) and applies default-deny semantics for both Ingress and Egress.
|
||||
# Closes the gap where session pods land WITHOUT the desktop-isolation
|
||||
# policy's `app.kubernetes.io/name=remote-desktop` label, plus prevents
|
||||
# arbitrary debug sidecars / kubectl debug images from getting cluster
|
||||
# access.
|
||||
#
|
||||
# CRITICAL: also catches transient cm-acme-http-solver pods (that's the
|
||||
# bug this whole regroup chased). The cm-acme-http-solver-allow policy
|
||||
# below is the explicit carve-out.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: fc-desktop-default-deny
|
||||
namespace: fc-desktop
|
||||
labels:
|
||||
app.kubernetes.io/part-of: remotedesktop
|
||||
app.kubernetes.io/component: isolation
|
||||
spec:
|
||||
podSelector:
|
||||
matchExpressions:
|
||||
- key: app.kubernetes.io/name
|
||||
operator: NotIn
|
||||
values:
|
||||
- remotedesktop-web
|
||||
policyTypes:
|
||||
- Ingress
|
||||
- Egress
|
||||
---
|
||||
# 3) remotedesktop-web-isolation — control plane explicit allow-list.
|
||||
#
|
||||
# remotedesktop-web is the only pod label the default-deny excludes, so
|
||||
# without this policy the control plane would have wide-open Ingress AND
|
||||
# Egress. This re-introduces a tight allow-list:
|
||||
# - Ingress: Traefik only on TCP/8080
|
||||
# - Egress: CoreDNS, K8s API, Guacamole admin, NFS, Intranet,
|
||||
# Traefik (cluster + LB), and the fc-desktop namespace itself
|
||||
# (for session pod readiness probing).
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: remotedesktop-web-isolation
|
||||
namespace: fc-desktop
|
||||
labels:
|
||||
app.kubernetes.io/part-of: remotedesktop
|
||||
app.kubernetes.io/component: isolation
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: remotedesktop-web
|
||||
policyTypes:
|
||||
- Ingress
|
||||
- Egress
|
||||
ingress:
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: traefik-system
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: traefik
|
||||
ports:
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
egress:
|
||||
# CoreDNS
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: kube-system
|
||||
podSelector:
|
||||
matchLabels:
|
||||
k8s-app: kube-dns
|
||||
ports:
|
||||
- port: 53
|
||||
protocol: UDP
|
||||
- port: 53
|
||||
protocol: TCP
|
||||
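# K8s API server. NOTE: `to: []` matches every destination, so this rule permits TCP/443 and TCP/6443 egress to any host, not only the API server.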
|
||||
- to: []
|
||||
ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
- port: 6443
|
||||
protocol: TCP
|
||||
# Guacamole admin
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: guacamole
|
||||
ports:
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
# NFS to Synology
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.58.3/32
|
||||
ports:
|
||||
- port: 2049
|
||||
protocol: TCP
|
||||
- port: 2049
|
||||
protocol: UDP
|
||||
- port: 111
|
||||
protocol: TCP
|
||||
- port: 111
|
||||
protocol: UDP
|
||||
# Intranet web
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: intranet
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app: intranet-web
|
||||
ports:
|
||||
- port: 5300
|
||||
protocol: TCP
|
||||
# Cluster Traefik pods (in-cluster service resolution + Guacamole
|
||||
# routing handoff where web app builds URLs against the public host
|
||||
# but resolves internally).
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: traefik-system
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: traefik
|
||||
ports:
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
- port: 8443
|
||||
protocol: TCP
|
||||
# fc-desktop namespace — session pod probing during browser-access
|
||||
# readiness checks.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: fc-desktop
|
||||
ports:
|
||||
- port: 3000
|
||||
protocol: TCP
|
||||
- port: 3001
|
||||
protocol: TCP
|
||||
- port: 5901
|
||||
protocol: TCP
|
||||
- port: 3389
|
||||
protocol: TCP
|
||||
---
|
||||
# 4) cm-acme-http-solver-allow — cert-manager HTTP-01 carve-out.
|
||||
#
|
||||
# Without this, fc-desktop-default-deny catches the transient solver pods
|
||||
# cert-manager creates for each renewal (they don't carry the
|
||||
# remotedesktop-web label). Caused 8-day silent renewal failure on
|
||||
# desktop.iamworkin.lan in 2026-04-28..2026-05-07 (see
|
||||
# feedback_certmanager_renewal_stuck_when_solver_blocked_by_namespace_default_deny.md).
|
||||
#
|
||||
# Authorizes:
|
||||
# - Ingress on TCP/8089 from cluster Traefik (which proxies the external
|
||||
# HTTP-01 GET on port 80 through to the solver).
|
||||
# - Egress for cluster DNS (defensive — newer cert-manager probes from
|
||||
# inside the solver too).
|
||||
#
|
||||
# The `acme.cert-manager.io/http01-solver=true` label is set by
|
||||
# cert-manager itself on every solver pod automatically.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: cm-acme-http-solver-allow
|
||||
namespace: fc-desktop
|
||||
labels:
|
||||
app.kubernetes.io/part-of: remotedesktop
|
||||
app.kubernetes.io/component: cert-renewal
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
acme.cert-manager.io/http01-solver: "true"
|
||||
policyTypes:
|
||||
- Ingress
|
||||
- Egress
|
||||
ingress:
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: traefik-system
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: traefik
|
||||
ports:
|
||||
- port: 8089
|
||||
protocol: TCP
|
||||
egress:
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: kube-system
|
||||
podSelector:
|
||||
matchLabels:
|
||||
k8s-app: kube-dns
|
||||
ports:
|
||||
- port: 53
|
||||
protocol: UDP
|
||||
- port: 53
|
||||
protocol: TCP
|
||||
105
apps/fc-distribution/README.md
Normal file
105
apps/fc-distribution/README.md
Normal file
@@ -0,0 +1,105 @@
|
||||
# fc-distribution — staged deployment (Phase 1, USB provisioning)
|
||||
|
||||
**Status:** manifests staged, **NOT YET APPLIED**. Image must be built +
|
||||
imported and signing 1Password items confirmed before `git push`.
|
||||
|
||||
- Architecture: [`../../../FlowerCore.Notes/docs/infrastructure/usb-provisioning-architecture.md`](../../../FlowerCore.Notes/docs/infrastructure/usb-provisioning-architecture.md)
|
||||
- Repo: `D:\git\FlowerCore\FlowerCore.Distribution\` (`README.md`, `CLAUDE.md`)
|
||||
- Shared lib: `FlowerCore.Common` -> `FlowerCore.Shared.Distribution`
|
||||
|
||||
`FlowerCore.Distribution` publishes signed edition manifests (ECDSA P-256
|
||||
over canonical JSON) and serves the SHA-256 content-addressed blob store
|
||||
that USB builders pull from. The verifier embeds the `IAmWorkin ACME CA
|
||||
Root CA` as the trust anchor; per-edition leaf signing material lives in
|
||||
1Password and is mounted into the pod read-only.
|
||||
|
||||
## Deployment order (do NOT skip / reorder)
|
||||
|
||||
### 1. FlowerCore.DNS preflight — VERIFIED 2026-04-23
|
||||
|
||||
`dist.iamworkin.lan` already resolves to `10.0.56.200`, but keep the
|
||||
FlowerCore.DNS preflight green before push:
|
||||
|
||||
```bash
|
||||
curl -sk "https://dns.iamworkin.lan/api/v1/zones/iamworkin.lan/resolve-preflight?hostname=dist.iamworkin.lan"
|
||||
# Expect: "resolvable": true
|
||||
|
||||
python bluejay-infra/scripts/check-pfsense-dns.py
|
||||
# Historical filename retained; implementation now calls FlowerCore.DNS
|
||||
# resolve-preflight instead of raw resolver lookups.
|
||||
```
|
||||
|
||||
If the record ever disappears, recreate it through FlowerCore.DNS before
|
||||
push/apply:
|
||||
|
||||
```bash
|
||||
curl -sk https://dns.iamworkin.lan/api/v1/servers
|
||||
curl -sk -X POST https://dns.iamworkin.lan/api/v1/servers/<serverId>/zones/iamworkin.lan/records \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"name":"dist","type":"A","data":"10.0.56.200","ttl":300}'
|
||||
```
|
||||
|
||||
If the `dist` A record is missing, cert-manager HTTP-01 will silently back off ~2h. See
|
||||
memory `feedback_pfsense_dns_required_for_acme.md`.
|
||||
|
||||
### 2. 1Password items required in vault `IAmWorkin`
|
||||
|
||||
| Item title | Item id | Used as |
|
||||
|---|---|---|
|
||||
| `FlowerCore Code Signing CA` | (existing) | Informational handle only — root CA is baked into the image at build time, not mounted |
|
||||
| `FlowerCore Edition Signing Key - edition:kiosk-standard` | `3hf33egdvnni6jyuws3r737mqe` | Mounted at `/signing/kiosk-standard/` |
|
||||
| `FlowerCore Edition Signing Key - edition:aistation-field` | `ccxrtsan5samfq4pfuczymacrq` | Mounted at `/signing/aistation-field/` |
|
||||
|
||||
Each edition item must publish three field labels (the operator turns
|
||||
field labels into Secret keys verbatim):
|
||||
|
||||
- `certificate.pem` — leaf certificate
|
||||
- `private-key.pem` — ECDSA P-256 private key
|
||||
- `chain.pem` — leaf + intermediate (referenced by the env var as the
|
||||
cert-path; the verifier uses this for signature path validation)
|
||||
|
||||
### 3. Build + import the image to rke2-server
|
||||
|
||||
The Pod is pinned to `rke2-server` because the Synology NFS export
|
||||
`/volume1/kubernetes` only allows that node. Importing to the agents is
|
||||
optional until the ACL is widened.
|
||||
|
||||
```bash
|
||||
# From BLUEJAY-WS, in D:\git\FlowerCore\FlowerCore.Distribution
|
||||
TAG="v$(date +%Y%m%d%H%M)"
|
||||
dotnet.exe publish -c Release -o deploy/app \
|
||||
src/FlowerCore.Distribution.Web/FlowerCore.Distribution.Web.csproj
|
||||
podman build -t localhost/fc-distribution:$TAG -f deploy/Dockerfile.deploy deploy
|
||||
podman save localhost/fc-distribution:$TAG -o /tmp/fc-distribution.tar
|
||||
scp /tmp/fc-distribution.tar rke2-server:/tmp/
|
||||
ssh rke2-server "sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-distribution.tar"
|
||||
```
|
||||
|
||||
### 4. Bump the image tag + push
|
||||
|
||||
Edit `fc-distribution.yaml`, replace the current `localhost/fc-distribution:v<tag>` image reference
|
||||
with the tag from step 3, then:
|
||||
|
||||
```bash
|
||||
cd D:/git/FlowerCore/bluejay-infra
|
||||
python scripts/check-pfsense-dns.py
|
||||
git add apps/fc-distribution/
|
||||
git commit -m "feat(fc-distribution): deploy Phase 1 manifest publisher"
|
||||
git push
|
||||
```
|
||||
|
||||
ArgoCD picks up within ~3 minutes and creates `infra-fc-distribution`.
|
||||
|
||||
### 5. Verify
|
||||
|
||||
```bash
|
||||
fcadmin_ssh noc1 '
|
||||
kubectl -n argocd get application infra-fc-distribution
|
||||
kubectl -n fc-distribution get certificate,pod,secret
|
||||
curl -sk -m 8 -o /dev/null -w "HTTP %{http_code}\n" https://dist.iamworkin.lan/healthz
|
||||
'
|
||||
```
|
||||
|
||||
Expect: Certificate `Ready: True` within ~60s, `/healthz` HTTP 200, both
|
||||
`edition-kiosk-standard` and `edition-aistation-field` Secrets present
|
||||
with `certificate.pem`, `private-key.pem`, `chain.pem` keys.
|
||||
355
apps/fc-distribution/fc-distribution.yaml
Normal file
355
apps/fc-distribution/fc-distribution.yaml
Normal file
@@ -0,0 +1,355 @@
|
||||
# FlowerCore.Distribution — edition manifest publisher + content-addressed blob store.
|
||||
# Phase 1 of the USB provisioning architecture: signed edition manifests
|
||||
# (ECDSA P-256 over canonical JSON) published per edition, plus a SHA-256
|
||||
# content-addressed blob store that USB builders pull from.
|
||||
#
|
||||
# Architecture: FlowerCore.Notes/docs/infrastructure/usb-provisioning-architecture.md
|
||||
# Repo: FlowerCore.Distribution/{README.md,CLAUDE.md}
|
||||
# Shared lib: FlowerCore.Common -> FlowerCore.Shared.Distribution
|
||||
# (manifest schema, canonical JSON, ECDSA P-256 sign/verify)
|
||||
#
|
||||
# Deployment order (see bluejay-infra/README.md and apps/fc-distribution/README.md):
|
||||
# 1. pfSense Unbound DNS override for dist.iamworkin.lan -> 10.0.56.200
|
||||
# (DONE 2026-04-23 — verify with `python bluejay-infra/scripts/check-pfsense-dns.py`).
|
||||
# 2. 1Password items must exist in vault `IAmWorkin`:
|
||||
# - `FlowerCore Code Signing CA` (informational)
|
||||
# - `FlowerCore Edition Signing Key - edition:kiosk-standard` (3hf33egdvnni6jyuws3r737mqe)
|
||||
# - `FlowerCore Edition Signing Key - edition:aistation-field` (ccxrtsan5samfq4pfuczymacrq)
|
||||
# Each edition item is expected to publish three field labels:
|
||||
# certificate.pem, private-key.pem, chain.pem
|
||||
# 3. Synology NFS export `/volume1/kubernetes` is currently restricted to
|
||||
# rke2-server (10.0.56.11). Pod is pinned via nodeSelector below. The
|
||||
# app writes to subPaths `distribution/data` and `distribution/blobs`.
|
||||
# 4. Build + import image: localhost/fc-distribution:v<YYYYMMDD><HHMM>
|
||||
# Import to rke2-server via `ctr images import` (NFS-pinned, no need
|
||||
# for the agents until ACL is widened — see guacamole pattern).
|
||||
# 5. Bump the image tag below and git push; ArgoCD ApplicationSet picks up
|
||||
# within ~3 minutes and creates `infra-fc-distribution`.
|
||||
#
|
||||
# NOTE on the root trust anchor:
|
||||
# The verifier needs an embedded root CA (`IAmWorkin ACME CA Root CA`).
|
||||
# That root is shipped INSIDE the published image (Phase 2 build step
|
||||
# bakes it into the bundle), NOT mounted from a Secret here. The
|
||||
# `codesigning-root-cert` OnePasswordItem below is informational only —
|
||||
# it gives operators a quick handle to the CA item from the cluster.
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: fc-distribution
|
||||
labels:
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
---
|
||||
# Informational handle to the FlowerCore Code Signing CA item in 1Password.
|
||||
# Not consumed by the pod at runtime — the root trust anchor is baked into
|
||||
# the published image. Operators can `kubectl -n fc-distribution get secret
|
||||
# codesigning-root-cert` to discover the CA item URL/admin handle.
|
||||
apiVersion: onepassword.com/v1
|
||||
kind: OnePasswordItem
|
||||
metadata:
|
||||
name: codesigning-root-cert
|
||||
namespace: fc-distribution
|
||||
spec:
|
||||
itemPath: "vaults/IAmWorkin/items/FlowerCore Code Signing CA"
|
||||
---
|
||||
# Edition signing key + leaf cert + chain for edition:kiosk-standard.
|
||||
# 1Password item id: 3hf33egdvnni6jyuws3r737mqe
|
||||
# Operator syncs each field to a Secret key of the same name. Mounted
|
||||
# read-only at /signing/kiosk-standard inside the pod.
|
||||
apiVersion: onepassword.com/v1
|
||||
kind: OnePasswordItem
|
||||
metadata:
|
||||
name: edition-kiosk-standard
|
||||
namespace: fc-distribution
|
||||
spec:
|
||||
itemPath: "vaults/IAmWorkin/items/FlowerCore Edition Signing Key - edition:kiosk-standard"
|
||||
---
|
||||
# Edition signing key + leaf cert + chain for edition:aistation-field.
|
||||
# 1Password item id: ccxrtsan5samfq4pfuczymacrq
|
||||
apiVersion: onepassword.com/v1
|
||||
kind: OnePasswordItem
|
||||
metadata:
|
||||
name: edition-aistation-field
|
||||
namespace: fc-distribution
|
||||
spec:
|
||||
itemPath: "vaults/IAmWorkin/items/FlowerCore Edition Signing Key - edition:aistation-field"
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: fc-distribution
|
||||
namespace: fc-distribution
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-distribution
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
strategy:
|
||||
# NFS-backed SQLite + blob store on a single node. Recreate avoids any
|
||||
# multi-attach overlap on the same NFS subPath during rollout.
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: fc-distribution
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-distribution
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8080"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
# Synology NFS export `/volume1/kubernetes` ACL only allows rke2-server
|
||||
# (10.0.56.11) right now. Until the ACL is widened in DSM (admin only),
|
||||
# this Pod must run on rke2-server or NFS mounts will be access-denied.
|
||||
nodeSelector:
|
||||
kubernetes.io/hostname: rke2-server
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
fsGroup: 1654
|
||||
fsGroupChangePolicy: OnRootMismatch
|
||||
containers:
|
||||
- name: web
|
||||
# Placeholder tag — bump to the image you built + imported to
|
||||
# rke2-server before applying. Build with:
|
||||
# dotnet.exe publish -c Release -o deploy/app \
|
||||
# src/FlowerCore.Distribution.Web/FlowerCore.Distribution.Web.csproj
|
||||
# podman build -t localhost/fc-distribution:v<tag> -f deploy/Dockerfile.deploy deploy
|
||||
image: localhost/fc-distribution:v202605061948
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: http
|
||||
env:
|
||||
- name: ASPNETCORE_URLS
|
||||
value: "http://+:8080"
|
||||
- name: ASPNETCORE_ENVIRONMENT
|
||||
value: "Production"
|
||||
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
||||
value: "false"
|
||||
# SQLite connection (catalog + data-protection keys via FlowerCoreDbContext).
|
||||
# Read by Data/DatabaseProviderExtensions.cs in precedence order; Sqlite key wins.
|
||||
- name: FlowerCore__Database__Provider
|
||||
value: "Sqlite"
|
||||
- name: FlowerCore__Database__ConnectionStrings__Sqlite
|
||||
value: "Data Source=/data/distribution.db"
|
||||
# Content-addressed blob root (SHA-256 sharded on disk).
|
||||
# Bound by Services/NfsPvcBlobProvider.cs under FlowerCore:Distribution:Blobs.
|
||||
- name: FlowerCore__Distribution__Blobs__Root
|
||||
value: "/blobs"
|
||||
# Per-edition signing material — paths into the read-only
|
||||
# secret mounts below. Field labels in 1Password (and therefore
|
||||
# Secret key names) are: certificate.pem, private-key.pem, chain.pem
|
||||
- name: FlowerCore__Distribution__Signing__EditionCerts__kiosk-standard__CertPath
|
||||
value: "/signing/kiosk-standard/chain.pem"
|
||||
- name: FlowerCore__Distribution__Signing__EditionCerts__kiosk-standard__KeyPath
|
||||
value: "/signing/kiosk-standard/private-key.pem"
|
||||
- name: FlowerCore__Distribution__Signing__EditionCerts__aistation-field__CertPath
|
||||
value: "/signing/aistation-field/chain.pem"
|
||||
- name: FlowerCore__Distribution__Signing__EditionCerts__aistation-field__KeyPath
|
||||
value: "/signing/aistation-field/private-key.pem"
|
||||
# Public distribution host is GET/HEAD-only at Traefik; this
|
||||
# entitlement list controls which editions are readable there.
|
||||
- name: FlowerCore__Distribution__EntitlementPublic__PublicEditions__0
|
||||
value: "*"
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
# /healthz is exposed by the scaffold (StartupGateMiddleware-aware).
|
||||
# Liveness uses tcpSocket as a cheap fallback in case a future
|
||||
# middleware change accidentally gates /healthz behind auth
|
||||
# (memory: feedback_k8s_probes_behind_auth_middleware).
|
||||
startupProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: 8080
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
failureThreshold: 30
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: 8080
|
||||
periodSeconds: 10
|
||||
failureThreshold: 3
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
failureThreshold: 3
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1654
|
||||
runAsGroup: 1654
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
volumeMounts:
|
||||
- name: sqlite
|
||||
mountPath: /data
|
||||
subPath: distribution/data
|
||||
- name: blobs
|
||||
mountPath: /blobs
|
||||
subPath: distribution/blobs
|
||||
- name: tmp
|
||||
mountPath: /tmp
|
||||
- name: logs
|
||||
mountPath: /app/logs
|
||||
- name: kiosk-standard
|
||||
mountPath: /signing/kiosk-standard
|
||||
readOnly: true
|
||||
- name: aistation-field
|
||||
mountPath: /signing/aistation-field
|
||||
readOnly: true
|
||||
volumes:
|
||||
# Synology NFS at /volume1/kubernetes — same export pattern as
|
||||
# apps/guacamole/guacamole.yaml (recordings volume). Pinned by
|
||||
# ACL to rke2-server. Never mount the subpath as nfs.path —
|
||||
# always mount the export root and use volumeMount.subPath.
|
||||
- name: sqlite
|
||||
nfs:
|
||||
server: 10.0.58.3
|
||||
path: /volume1/kubernetes
|
||||
- name: blobs
|
||||
nfs:
|
||||
server: 10.0.58.3
|
||||
path: /volume1/kubernetes
|
||||
- name: tmp
|
||||
emptyDir: {}
|
||||
- name: logs
|
||||
emptyDir: {}
|
||||
- name: kiosk-standard
|
||||
secret:
|
||||
secretName: edition-kiosk-standard
|
||||
defaultMode: 0400
|
||||
- name: aistation-field
|
||||
secret:
|
||||
secretName: edition-aistation-field
|
||||
defaultMode: 0400
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: fc-distribution
|
||||
namespace: fc-distribution
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-distribution
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app.kubernetes.io/name: fc-distribution
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: 8080
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: fc-distribution-tls
|
||||
namespace: fc-distribution
|
||||
spec:
|
||||
secretName: fc-distribution-tls-secret
|
||||
issuerRef:
|
||||
name: step-ca-acme
|
||||
kind: ClusterIssuer
|
||||
dnsNames:
|
||||
- dist.iamworkin.lan
|
||||
# step-ca ACME caps lifetime at 30d; the requested 90d was silently capped, which
|
||||
# made renewBefore=cert-lifetime → perpetual renewal loop (10880+ CRs
|
||||
# in 18h on 2026-05-07). Match working 720h/240h pattern from other
|
||||
# FC services.
|
||||
duration: 720h # 30d (step-ca cap)
|
||||
renewBefore: 240h # 10d
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: fc-distribution
|
||||
namespace: fc-distribution
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- match: Host(`dist.iamworkin.lan`)
|
||||
kind: Rule
|
||||
services:
|
||||
- name: fc-distribution
|
||||
port: 80
|
||||
tls:
|
||||
secretName: fc-distribution-tls-secret
|
||||
---
|
||||
# === dist.flowercore.io public surface (2026-04-24) =========================
|
||||
#
|
||||
# Shares the Deployment + Service + PVC with the internal IngressRoute above.
|
||||
# The controller's NamedEntitlementResolverRouter picks between the internal
|
||||
# (permissive) and public (strict) StaticTokenEntitlementResolver based on
|
||||
# the X-FC-Distribution-Profile header — which the middleware below injects
|
||||
# on every public-host request after stripping any caller-supplied value.
|
||||
#
|
||||
# Cert is the shared Cloudflare Origin Certificate for *.flowercore.io, literal
|
||||
# bytes copied (matches gitea-public, matrix, telephony, mail, flowercore-landing
|
||||
# pattern — not yet via OnePasswordItem operator).
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: cf-origin-flowercore-io
|
||||
namespace: fc-distribution
|
||||
type: kubernetes.io/tls
|
||||
data:
|
||||
tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUVvRENDQTRpZ0F3SUJBZ0lVSXN4c1NKV1VRL0tqZ09ldk81YnNuVi9rZVE4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd2dZc3hDekFKQmdOVkJBWVRBbFZUTVJrd0Z3WURWUVFLRXhCRGJHOTFaRVpzWVhKbExDQkpibU11TVRRdwpNZ1lEVlFRTEV5dERiRzkxWkVac1lYSmxJRTl5YVdkcGJpQlRVMHdnUTJWeWRHbG1hV05oZEdVZ1FYVjBhRzl5CmFYUjVNUll3RkFZRFZRUUhFdzFUWVc0Z1JuSmhibU5wYzJOdk1STXdFUVlEVlFRSUV3cERZV3hwWm05eWJtbGgKTUI0WERUSTJNRE14TURFMk16TXdNRm9YRFRReE1ETXdOakUyTXpNd01Gb3dZakVaTUJjR0ExVUVDaE1RUTJ4dgpkV1JHYkdGeVpTd2dTVzVqTGpFZE1Cc0dBMVVFQ3hNVVEyeHZkV1JHYkdGeVpTQlBjbWxuYVc0Z1EwRXhKakFrCkJnTlZCQU1USFVOc2IzVmtSbXhoY21VZ1QzSnBaMmx1SUVObGNuUnBabWxqWVhSbE1JSUJJakFOQmdrcWhraUcKOXcwQkFRRUZBQU9DQVE0QU1JSUJDZ0tDQVFFQXV0QmpkQ0xEdHdMQlZCU0Y1ZU1OMkt3ckIxTmZmRVhRMjlRRAo1aVR0dzJFcEZXNVJJSllkMjNrYUpCMU5jZXpHWlg4a0Q0cGEyWHpFZW1MVEtJNWw0MU11b3FoWjczNVE3U3RWCkVjRFFTT2ZYTkZQdFMwb0hqb0pRdGF2QjM0ZmJNR3l4Mmx0MU9HUzRNMGtLUWpBNWR6OTJQYjNyZ1RKR0JhOW4KeTZtVThncjRuUHRSdklxZ3NxdjRtMFA3dVU1YjE3NzU1Y2JLSDVoMzIxWHVjMDU4Tzl4M2JHQ0NuRUJXWDdqeApjRGhkUEs1Ri9XRjVBQnl5cFhIQ0ZxUUd4M1NVbmtCQ0ZQSmRabnMra3BHVUZWZGhud3B6NjBtNnlJSzQ0eVR4CjZqR3JOTFEyM1dOK2gwU1lCZU5vb2JBWThydkpiVlZEaGJqSVhBTWtFNGQzVll1TlhRSURBUUFCbzRJQklqQ0MKQVI0d0RnWURWUjBQQVFIL0JBUURBZ1dnTUIwR0ExVWRKUVFXTUJRR0NDc0dBUVVGQndNQ0JnZ3JCZ0VGQlFjRApBVEFNQmdOVkhSTUJBZjhFQWpBQU1CMEdBMVVkRGdRV0JCUkt1NkJVUDZ0N2dpbFRPay9FdEdKQ3R6N3dTREFmCkJnTlZIU01FR0RBV2dCUWs2Rk5YWFh3MFFJZXA2NVRidXVFV2VQd3BwREJBQmdnckJnRUZCUWNCQVFRME1ESXcKTUFZSUt3WUJCUVVITUFHR0pHaDBkSEE2THk5dlkzTndMbU5zYjNWa1pteGhjbVV1WTI5dEwyOXlhV2RwYmw5agpZVEFqQmdOVkhSRUVIREFhZ2d3cUxtbGhiWGR2Y21zdWFXNkNDbWxoYlhkdmNtc3VhVzR3T0FZRFZSMGZCREV3Ckx6QXRvQ3VnS1lZbmFIUjBjRG92TDJOeWJDNWpiRzkxWkdac1lYSmxMbU52YlM5dmNtbG5hVzVmWTJFdVkzSnMKTUEwR0NTcUdTSWIzRFFFQkN3VUFBNElCQVFDSjMvTGNleE5pb0lWdUxoemhmbTZCeDV2SWk3T25CaHF1WUlDdwplNnArZ0prdE16ZFJQcDV0bk03dllBWmxMajVJOTByWDRuczhJc3dEbzJBN2wwYTRGZVJFclFmRklsZXQzbjIyCjUxVTZYVElCSks5c1FZT0FkU3pJUzV1OUNKSFpBUTF5WmxSd3BBR3RVWnhxL1dpcGFWUTRwNXhrcEJNMVlZSlAKNW1jQ09HcFErSnpORlpQc2daYUJncDBYL1BBZkNJRkkyZld
5QWE2elBqRm0rdDVXUXIrZlBaT2VUS2VIbWVzVgo3UlZxUUdEb3Q0eTY1NklEdmdmU2ZLRnFIRW9XNDJVbDBxQ05hMS9keEJld3NIS1VWWE1ETkdiQlNVQjM4TG9YCm1OQ3hJQlVOUjR0TG1CQUxZT3hVMnZhSWRCd0xBc2YrcndnVnVjUGpCUTc2VWMwUQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg==
|
||||
tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQzYwR04wSXNPM0FzRlUKRklYbDR3M1lyQ3NIYTE5OFJkRGIxQVBtSk8zRFlTa1ZibEVnbGgzYmVSb2tIVTF4N01abGZ5UVBpbHJaZk1SNgpZdE1vam1YalV5NmlxRm52ZmxEdEsxVVJ3TkJJNTljMFUrMUxTZ2VPZ2xDMXE4SGZoOXN3YkxIYVczVTRaTGd6ClNRcENNRGwzUDNZOXZldUJNa1lGcjJmTHFaVHlDdmljKzFHOGlxQ3lxL2liUS91NVRsdlh2dm5seHNvZm1IZmIKVmU1elRudzczSGRzWUlLY1FGWmZ1UEZ3T0YwOHJrWDlZWGtBSExLbGNjSVdwQWJIZEpTZVFFSVU4bDFtZXo2UwprWlFWVjJHZkNuUHJTYnJJZ3JqakpQSHFNYXMwdERiZFkzNkhSSmdGNDJpaHNCanl1OGx0VlVPRnVNaGNBeVFUCmgzZFZpNDFkQWdNQkFBRUNnZ0VBTGlseXZkNmVTcEYvZUxtV2lhTVV4NUxwa2dhWHpITkxCQnNNZUpqcytLL0EKVVdlZ1crTkVUdmlLalZ5QlI5SzRocG1IYldDa2lPUDBBQUwrQnlKQ3lvekNOQmJTSEdRejlwc1R5dzZBV1ZlUwpuYjlVWGx1VmFQRktKTTRqbXNydERuYjVic25WT2lGblErTDdTalkwNlFMUlFybjBvUWp0ZFJldUdBMFlQVU90CkhSYzNsMFg2ZHJqdkJYY2prWTQwWm9ZYkRrelJnU1JWbWVOUGFIbjZPR0NtYUVUMXVyK01qYVZ2ME9lbEdIWncKVzljSEIxaHNxRzUvMWU3V0RQN0l0cjkwTmg4ay81NVhiK3lQUnhsRFd5bWtZMzIvdFBtZzdESTRKV2tRRWt3cgpIZUtwODVTcE5ta1liRnVpVFppeU8zZDZ0aXZHNHhFZW8rSzFVVFU4c1FLQmdRRFRNSEU1RDFYVC9HbGR5VHNsCllrODRVL1N0NXUrK2RIUEt1Wmw2dVB0UGgxV1lrdnFRcmdrL05YanVud2xGN0Y3b2tWOGdPeWxreTYwYTZkcXIKeXZwN1ZJdXYzekVlc2h2NjNWMlpaVkMzcXZYSzFheit3Zmx3NitCZmVuRlY5S2NENHN0dTdwOFRPWmFGN01CUgo3YXZzaXVXbWtqdmM1TlVLRmVDRTY0SnZFUUtCZ1FEaWMrbWlNLzBodDN1ajhuOXgyMDFQZFNqbEpVaUc1NjNNCnRYZlBCdDJRT0NhaVluUFNFdTdXdm5pQWRFL2xrMm91cFRWam9LYmZPbDFyQjd6UzVhc2kxdVdDZDhlUy9UWGIKdU5iRmlNMDB4L3JxalMydCtQbTd4MVhrYTB4TFNSRDNmZ0tSQldSN3pscStkYWZ1WE1qelUxRnh5dTIycGphRgpIMEl3NEpCUmpRS0JnUUNOaWhMb0Rob1V5RCtKNXJzb00vb3FJMEtDWnB0WlJzendHbkg5cVFwdFk2Ti9iVXBYCk92emhpeUh3czAvUXVEbG5uejVrNktHMmR6Y2VLWXN2eGdzWUt6S3ZmV043VWgya2hVWWM3NlVvWTREMkh6MGgKUkxtNzc2cGg4enNRUTdiSHlQRlUrTUpPYlRNdnNOdTRUUlVEcEplRGl0QnFIRWVYeWMrKzVlUjJNUUtCZ0h2UgptVHVoWlpVYitEVEtrVGkyQ20yWnlBU1RBRGNUVW9xTjVyYUNNSDk4MUZNUnRmWjFkN1pmYXhBQmlQWWtSbmkrCnlKUnk4UXM1cEg2ek9tR3VSb2JFTGJYS3ZJcjRmSXhwWXJXYmVXaVV0L09yd2dCUUZHekNMNHEzeUgyWnMvYy8KSlRRYVdMa0JPY2pPR0VaUzRXVjZkeHZiTTJNZE9zNUxLeXd
DZmFhNUFvR0FIQUE1eEN0dndOZE4xeExndkZ3RApPK2lyMDl1bXMxOFBzSVpmK1ZrWGtpcHF4MWNUT0hEanpPR01yWXV0M2FFeE00Zjd2ckFHRFMyY2pwZjM0T1JxCit4Y2gwWlNaQ2FDZmlnZG9OelNkcDFLcmo0cnFKdG5ZdS9CNDlDQlVoSDBNaCtSRWswQ0hHOVE4b3FOWFk0V0wKbVVOVTZMYUkwQWtvSzNVb2tWQVJEYXM9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K
|
||||
---
|
||||
# Traefik middleware: strips any caller-supplied X-FC-Distribution-Profile,
|
||||
# then sets an authoritative 'public' value so the controller routes to the
|
||||
# strict entitlement resolver. The trust boundary is this middleware — the
|
||||
# internal IngressRoute (dist.iamworkin.lan) does NOT attach it.
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: Middleware
|
||||
metadata:
|
||||
name: dist-public-profile-header
|
||||
namespace: fc-distribution
|
||||
spec:
|
||||
headers:
|
||||
customRequestHeaders:
|
||||
X-FC-Distribution-Profile: "public"
|
||||
---
|
||||
# Public IngressRoute: binds dist.flowercore.io (Cloudflare-proxied A record
|
||||
# -> pfSense NAT -> Traefik VIP 10.0.56.200) to the same backend Service that
|
||||
# serves dist.iamworkin.lan. Header-injection middleware ensures the
|
||||
# controller uses the public (strict) entitlement resolver.
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: fc-distribution-public
|
||||
namespace: fc-distribution
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
# Method allowlist: Host + (GET || HEAD). Anything else misses every
|
||||
# route and Traefik returns 404 before reaching the pod — edge-level
|
||||
# defense-in-depth over the controller's strict-mode entitlement check.
|
||||
# Together these block admin ops (POST /blobs, POST /manifests*) from
|
||||
# ever being processed on the public surface.
|
||||
- match: Host(`dist.flowercore.io`) && (Method(`GET`) || Method(`HEAD`))
|
||||
kind: Rule
|
||||
middlewares:
|
||||
- name: dist-public-profile-header
|
||||
services:
|
||||
- name: fc-distribution
|
||||
port: 80
|
||||
tls:
|
||||
secretName: cf-origin-flowercore-io
|
||||
9
apps/fc-distribution/kustomization.yaml
Normal file
9
apps/fc-distribution/kustomization.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
# ArgoCD's bluejay-infra ApplicationSet uses a directory generator and does
|
||||
# not require kustomization.yaml (existing apps like fc-llm-bridge and
|
||||
# guacamole have none). This file is included anyway as a single source of
|
||||
# truth for the resource list and to make `kubectl kustomize` previews work
|
||||
# from a working copy.
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- fc-distribution.yaml
|
||||
174
apps/fc-llm-bridge/README.md
Normal file
174
apps/fc-llm-bridge/README.md
Normal file
@@ -0,0 +1,174 @@
|
||||
# fc-llm-bridge — staged deployment (ADR-088)
|
||||
|
||||
**Status:** manifests staged, **NOT YET APPLIED**. Do not `git push` or sync
|
||||
ArgoCD until the two pre-requisites below are done, in order.
|
||||
|
||||
Design: [`../../../FlowerCore.Notes/docs/ai-agents/agent-zero-anthropic-bridge.md`](../../../FlowerCore.Notes/docs/ai-agents/agent-zero-anthropic-bridge.md)
|
||||
ADR: ADR-088 in [`../../../FlowerCore.Notes/ARCHITECTURE.md`](../../../FlowerCore.Notes/ARCHITECTURE.md)
|
||||
|
||||
## Deployment order (do NOT skip / reorder)
|
||||
|
||||
### 1. FlowerCore.DNS preflight — REQUIRED FIRST
|
||||
|
||||
`fc-llm-bridge.iamworkin.lan` must keep resolving to `10.0.56.200` through
|
||||
FlowerCore.DNS before this manifest is applied.
|
||||
|
||||
step-ca (the ACME CA on noc1) uses pfSense Unbound (10.0.56.1), **not**
|
||||
cluster CoreDNS. If you apply this manifest before adding the DNS override,
|
||||
cert-manager's HTTP-01 challenge silently fails for ~2h (exponential backoff)
|
||||
until someone manually runs `kubectl -n fc-llm-bridge delete order <order>`
|
||||
to bust the cache. See memory `feedback_pfsense_dns_required_for_acme.md`.
|
||||
|
||||
Verify the record through the public preflight API:
|
||||
|
||||
```bash
|
||||
curl -sk "https://dns.iamworkin.lan/api/v1/zones/iamworkin.lan/resolve-preflight?hostname=fc-llm-bridge.iamworkin.lan"
|
||||
# Expect: "resolvable": true
|
||||
```
|
||||
|
||||
Alternatively, run the repo's DNS gate script, which performs the same preflight check:
|
||||
|
||||
```bash
|
||||
python scripts/check-pfsense-dns.py
|
||||
# Historical filename retained; implementation now calls FlowerCore.DNS
|
||||
# resolve-preflight instead of raw resolver lookups.
|
||||
```
|
||||
|
||||
If the record is missing, recreate it through FlowerCore.DNS before pushing:
|
||||
|
||||
```bash
|
||||
curl -sk https://dns.iamworkin.lan/api/v1/servers
|
||||
curl -sk -X POST https://dns.iamworkin.lan/api/v1/servers/<serverId>/zones/iamworkin.lan/records \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"name":"fc-llm-bridge","type":"A","data":"10.0.56.200","ttl":300}'
|
||||
```
|
||||
|
||||
### 2. Create the `FC LLM Bridge API Keys` 1Password item
|
||||
|
||||
The `Claude API Key` item in vault `IAmWorkin` already exists (id
|
||||
`e5tth3y5mp3lhdavg35pxadzca`, see `docs/ai-agents/anthropic-integration.md`).
|
||||
|
||||
The new item for per-consumer bridge API keys does NOT yet exist. Create it
|
||||
before the first apply of this manifest — the Deployment marks the individual
|
||||
key env vars `optional: true` so missing keys will not crash the pod, but the
|
||||
bridge will reject every request with 401 until at least one key is populated.
|
||||
|
||||
| Field | Item position | Type | Purpose |
|
||||
|-------|---------------|------|---------|
|
||||
| `credential` | Top section | Password (random, 48 char) | Unused placeholder required by the 1Password schema for single-field items. Can be anything — this field is never read by K8s. |
|
||||
| `agent-zero-ws` | "API Keys" section | Password (random, 48 char) | API key for the BLUEJAY-WS Agent Zero instance. |
|
||||
| `agent-zero-k8s` | "API Keys" section | Password (random, 48 char) | API key for the K8s-hosted `agent-zero` Deployment. |
|
||||
| `spare-1` | "API Keys" section | Password (random, 48 char) | Reserve for future Agent Zero forks / smoke-test scripts. |
|
||||
| `spare-2` | "API Keys" section | Password (random, 48 char) | Reserve. |
|
||||
|
||||
Steps via the CLI (run from a machine with `op` signed in):
|
||||
|
||||
```bash
|
||||
op item create \
|
||||
--category="API Credential" \
|
||||
--title="FC LLM Bridge API Keys" \
|
||||
--vault="IAmWorkin" \
|
||||
"API Keys.agent-zero-ws[password]=$(openssl rand -hex 24)" \
|
||||
"API Keys.agent-zero-k8s[password]=$(openssl rand -hex 24)" \
|
||||
"API Keys.spare-1[password]=$(openssl rand -hex 24)" \
|
||||
"API Keys.spare-2[password]=$(openssl rand -hex 24)"
|
||||
```
|
||||
|
||||
OR via the 1Password GUI — create a new item titled exactly `FC LLM Bridge API
|
||||
Keys` in the `IAmWorkin` vault, add an `API Keys` section, add four password
|
||||
fields named `agent-zero-ws`, `agent-zero-k8s`, `spare-1`, `spare-2` with
|
||||
`openssl rand -hex 24` values.
|
||||
|
||||
**Mapping to K8s:** The 1Password Connect operator syncs each field to a
|
||||
Secret key of the same name. The Deployment's env vars
|
||||
(`FlowerCore__LlmBridge__ApiKeys__agent-zero-ws` etc) reference those Secret
|
||||
keys. In `FlowerCore.Shared.Api.Authentication.ApiKeyAuthMiddleware`, the key
|
||||
name (e.g. `agent-zero-k8s`) becomes the `fc.app` claim on the
|
||||
`ClaimsPrincipal`, which is what `IBudgetLedger` uses to scope spend per
|
||||
consumer.
|
||||
|
||||
### 3. Build + import the image to every RKE2 node
|
||||
|
||||
```bash
|
||||
# From BLUEJAY-WS, in D:\git\FlowerCore\FlowerCore.LlmBridge
|
||||
TAG="v$(date +%Y%m%d%H%M%S)"
|
||||
dotnet.exe publish -c Release -o deploy/app \
|
||||
src/FlowerCore.LlmBridge.Web/FlowerCore.LlmBridge.Web.csproj
|
||||
podman build -t localhost/fc-llm-bridge:$TAG -f deploy/Dockerfile.deploy deploy
|
||||
podman save localhost/fc-llm-bridge:$TAG -o /tmp/fc-llm-bridge.tar
|
||||
|
||||
# SCP to each node and ctr import
|
||||
for NODE in rke2-server rke2-agent1 rke2-agent2; do
|
||||
scp /tmp/fc-llm-bridge.tar $NODE:/tmp/
|
||||
ssh $NODE "sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-llm-bridge.tar"
|
||||
done
|
||||
```
|
||||
|
||||
### 4. Bump the image tag in the manifest
|
||||
|
||||
Edit `fc-llm-bridge.yaml`, replace `localhost/fc-llm-bridge:v00000000000000`
|
||||
with the tag from step 3.
|
||||
|
||||
### 5. Commit + push
|
||||
|
||||
```bash
|
||||
cd D:/git/FlowerCore/bluejay-infra
|
||||
# re-run the DNS gate
|
||||
python scripts/check-pfsense-dns.py
|
||||
git add apps/fc-llm-bridge/
|
||||
git commit -m "feat(fc-llm-bridge): deploy ADR-088 Agent Zero bridge"
|
||||
git push
|
||||
```
|
||||
|
||||
ArgoCD picks up within ~3 minutes and creates `infra-fc-llm-bridge`.
|
||||
|
||||
### 6. Verify
|
||||
|
||||
```bash
|
||||
# From noc1
|
||||
fcadmin_ssh noc1 '
|
||||
kubectl -n argocd get application infra-fc-llm-bridge
|
||||
kubectl -n fc-llm-bridge get certificate,pod
|
||||
curl -sk -m 8 -o /dev/null -w "HTTP %{http_code}\n" https://fc-llm-bridge.iamworkin.lan/healthz
|
||||
'
|
||||
```
|
||||
|
||||
Expect: Certificate `Ready: True` within ~60s, `/healthz` HTTP 200.
|
||||
|
||||
### 7. Flip Agent Zero to the bridge
|
||||
|
||||
After the bridge passes a real chat smoke test, update the Agent Zero
|
||||
ConfigMap (`apps/agent-zero/agent-zero.yaml`) to route through the bridge:
|
||||
|
||||
- `A0_SET_chat_model_api_base` / `config.json > chat_model.api_base`
|
||||
-> `https://fc-llm-bridge.iamworkin.lan/v1`
|
||||
- Add an `A0_SET_chat_model_api_key` env var wired to a K8s Secret sourced
|
||||
from `FC LLM Bridge API Keys` field `agent-zero-k8s`.
|
||||
- Set `chat_model.name` to `fc:balanced` (or a concrete model) — the bridge
|
||||
accepts both tier aliases and concrete model names.
|
||||
|
||||
Do the same for BLUEJAY-WS Agent Zero (`agent-zero-ws` key), or keep the
|
||||
workstation on direct Ollama and only route Anthropic calls through the
|
||||
bridge (the design doc describes this split as the preferred approach).
|
||||
|
||||
## Current state at staging time (2026-04-23)
|
||||
|
||||
- `fc-llm-bridge.iamworkin.lan` — public FlowerCore.DNS preflight is now
|
||||
green and resolves to `10.0.56.200`; keep `python scripts/check-pfsense-dns.py`
|
||||
green before push.
|
||||
- `FC LLM Bridge API Keys` — NOT created in 1Password (user action).
|
||||
- `Claude API Key` — already exists in `IAmWorkin` vault
|
||||
(`e5tth3y5mp3lhdavg35pxadzca`), also consumed by AiStation and Chat.Web.
|
||||
- `localhost/fc-llm-bridge:v*` image — not yet built; `FlowerCore.LlmBridge`
|
||||
repo has local commit `6d285b5` only, no remote.
|
||||
- ArgoCD `infra-fc-llm-bridge` Application — will be auto-created by the
|
||||
`bluejay-infra` ApplicationSet once the directory is on `main`.
|
||||
|
||||
## Why tcpSocket probes (not `/healthz`)
|
||||
|
||||
The bridge runs `ApiKeyAuthMiddleware`. `/healthz` and `/health` are exempt
|
||||
via `FlowerCore:LlmBridge:AuthExemptPaths`, so an HTTP probe would work
|
||||
today. But a future change to the middleware registration order could
|
||||
silently turn kubelet probes into 401/404, which crashes pods on every
|
||||
deploy. `tcpSocket` keeps probes robust against that regression. Memory:
|
||||
`feedback_k8s_probes_behind_auth_middleware.md`.
|
||||
283
apps/fc-llm-bridge/fc-llm-bridge.yaml
Normal file
283
apps/fc-llm-bridge/fc-llm-bridge.yaml
Normal file
@@ -0,0 +1,283 @@
|
||||
# FlowerCore.LlmBridge — OpenAI-compatible bridge for Agent Zero.
|
||||
# Routes through FlowerCore.Shared.Chat (ILlmProviderClient) with budget
|
||||
# enforcement, response caching, and tier-based model routing. Lets Agent
|
||||
# Zero (Python) reach Anthropic and Ollama providers without re-implementing
|
||||
# the C# budget/cache/router primitives.
|
||||
#
|
||||
# Design: FlowerCore.Notes/docs/ai-agents/agent-zero-anthropic-bridge.md
|
||||
# ADR: FlowerCore.Notes/ARCHITECTURE.md (ADR-088)
|
||||
#
|
||||
# Deployment order (see apps/fc-llm-bridge/README.md):
|
||||
# 1. pfSense DNS override for fc-llm-bridge.iamworkin.lan -> 10.0.56.200
|
||||
# (REQUIRED before this is applied — cert-manager HTTP-01 will silently
|
||||
# fail for ~2h backoff otherwise). Run scripts/pfsense-add-dns-overrides.py.
|
||||
# 2. 1Password items `Claude API Key` (already exists) and
|
||||
# `FC LLM Bridge API Keys` (create when first non-dev environment comes up).
|
||||
# 3. Build + import image: localhost/fc-llm-bridge:v<YYYYMMDD><HHMM>
|
||||
# Import to rke2-server, rke2-agent1, rke2-agent2 via ctr images import.
|
||||
# 4. Bump the image tag below and git push; ArgoCD ApplicationSet picks up.
|
||||
# 5. Flip Agent Zero chat.openai.base_url to https://fc-llm-bridge.iamworkin.lan/v1
|
||||
# and api_key to the op://IAmWorkin/FC LLM Bridge API Keys/agent-zero-k8s value.
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: fc-llm-bridge
|
||||
labels:
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
---
|
||||
# Claude (Anthropic) API key — shared across FC services.
|
||||
# Existing 1Password item. `credential` field -> Secret `anthropic-api-key`.
|
||||
apiVersion: onepassword.com/v1
|
||||
kind: OnePasswordItem
|
||||
metadata:
|
||||
name: anthropic-api-key
|
||||
namespace: fc-llm-bridge
|
||||
spec:
|
||||
itemPath: "vaults/IAmWorkin/items/Claude API Key"
|
||||
---
|
||||
# Per-consumer API keys for the bridge itself.
|
||||
# NEW 1Password item — see apps/fc-llm-bridge/README.md for the field layout
|
||||
# to create before first apply. Fields become Secret keys of the same name:
|
||||
# agent-zero-ws, agent-zero-k8s, spare-1, spare-2
|
||||
apiVersion: onepassword.com/v1
|
||||
kind: OnePasswordItem
|
||||
metadata:
|
||||
name: fc-llm-bridge-api-keys
|
||||
namespace: fc-llm-bridge
|
||||
spec:
|
||||
itemPath: "vaults/IAmWorkin/items/FC LLM Bridge API Keys"
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: fc-llm-bridge-data
|
||||
namespace: fc-llm-bridge
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
storageClassName: longhorn
|
||||
resources:
|
||||
requests:
|
||||
storage: 2Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: fc-llm-bridge
|
||||
namespace: fc-llm-bridge
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-llm-bridge
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: fc-llm-bridge
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: fc-llm-bridge
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8080"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
# Use an explicit DNS policy so external FQDNs like api.anthropic.com are
|
||||
# resolved directly instead of being expanded through the cluster search
|
||||
# path that includes iamworkin.lan.
|
||||
dnsPolicy: None
|
||||
dnsConfig:
|
||||
nameservers:
|
||||
- 10.43.0.10
|
||||
searches:
|
||||
- fc-llm-bridge.svc.cluster.local
|
||||
- svc.cluster.local
|
||||
- cluster.local
|
||||
options:
|
||||
- name: ndots
|
||||
value: "2"
|
||||
securityContext:
|
||||
fsGroup: 1654
|
||||
fsGroupChangePolicy: OnRootMismatch
|
||||
containers:
|
||||
- name: web
|
||||
# Placeholder tag — bump to the image you built + imported to every
|
||||
# RKE2 node before applying. Build with:
|
||||
# dotnet.exe publish -c Release -o deploy/app \
|
||||
# src/FlowerCore.LlmBridge.Web/FlowerCore.LlmBridge.Web.csproj
|
||||
# podman build -t localhost/fc-llm-bridge:v<tag> -f deploy/Dockerfile.deploy deploy
|
||||
image: localhost/fc-llm-bridge:v202604300022
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: http
|
||||
env:
|
||||
- name: ASPNETCORE_URLS
|
||||
value: "http://+:8080"
|
||||
- name: ASPNETCORE_ENVIRONMENT
|
||||
value: "Production"
|
||||
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
||||
value: "false"
|
||||
# SQLite (budget ledger + response cache + data-protection keys)
|
||||
- name: FlowerCore__LlmBridge__SqliteConnectionString
|
||||
value: "Data Source=/data/llm-bridge.db"
|
||||
- name: FlowerCore__LlmBridge__DefaultTenantId
|
||||
value: "default"
|
||||
- name: FlowerCore__LlmBridge__DefaultAppName
|
||||
value: "agent-zero"
|
||||
- name: FlowerCore__LlmBridge__UtilModel
|
||||
value: "qwen2.5:1.5b"
|
||||
- name: FlowerCore__LlmBridge__EmbedModel
|
||||
value: "nomic-embed-text"
|
||||
# Per-consumer API keys — from OnePasswordItem fc-llm-bridge-api-keys.
|
||||
# Each field becomes a Secret key of the same name. The key-name
|
||||
# lands in the auth principal's `fc.app` claim for ledger scoping.
|
||||
- name: FlowerCore__LlmBridge__ApiKeys__agent-zero-ws
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: fc-llm-bridge-api-keys
|
||||
key: agent-zero-ws
|
||||
optional: true
|
||||
- name: FlowerCore__LlmBridge__ApiKeys__agent-zero-k8s
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: fc-llm-bridge-api-keys
|
||||
key: agent-zero-k8s
|
||||
optional: true
|
||||
- name: FlowerCore__LlmBridge__ApiKeys__spare-1
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: fc-llm-bridge-api-keys
|
||||
key: spare-1
|
||||
optional: true
|
||||
- name: FlowerCore__LlmBridge__ApiKeys__spare-2
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: fc-llm-bridge-api-keys
|
||||
key: spare-2
|
||||
optional: true
|
||||
# Shared.Chat — Ollama (edge1 Pi 5 + AI HAT+, matches bridge default)
|
||||
- name: FlowerCore__Chat__OllamaBaseUrl
|
||||
value: "http://10.0.57.17:11434"
|
||||
- name: FlowerCore__Chat__HttpTimeout
|
||||
value: "00:05:00"
|
||||
# Shared.Chat — Anthropic
|
||||
- name: FlowerCore__Chat__Anthropic__Enabled
|
||||
value: "true"
|
||||
- name: FlowerCore__Chat__Anthropic__ApiKey
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: anthropic-api-key
|
||||
key: password
|
||||
- name: FlowerCore__Chat__Anthropic__OrganizationId
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: anthropic-api-key
|
||||
key: organization_id
|
||||
optional: true
|
||||
- name: FlowerCore__Chat__Anthropic__BaseUrl
|
||||
value: "https://api.anthropic.com"
|
||||
- name: FlowerCore__Chat__Anthropic__DefaultModel
|
||||
value: "claude-sonnet-4-6"
|
||||
- name: FlowerCore__Chat__Anthropic__AnthropicVersion
|
||||
value: "2023-06-01"
|
||||
- name: FlowerCore__Chat__Anthropic__Timeout
|
||||
value: "00:05:00"
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 1000m
|
||||
memory: 768Mi
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
- name: tmp
|
||||
mountPath: /tmp
|
||||
- name: app-data
|
||||
mountPath: /app/data
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1654
|
||||
runAsGroup: 1654
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
# tcpSocket probes: the app runs ApiKeyAuthMiddleware. /healthz is
|
||||
# registered as anonymous via AuthExemptPaths but tcpSocket avoids any
|
||||
# future accidental middleware ordering regression
|
||||
# (memory: feedback_k8s_probes_behind_auth_middleware).
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 30
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: fc-llm-bridge-data
|
||||
- name: tmp
|
||||
emptyDir: {}
|
||||
# The Dockerfile `WORKDIR /app` pairs with the default
|
||||
# SqliteConnectionString "Data Source=data/llm-bridge.db" (relative).
|
||||
# The env var above overrides to /data, so /app/data can be emptyDir.
|
||||
- name: app-data
|
||||
emptyDir: {}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: fc-llm-bridge
|
||||
namespace: fc-llm-bridge
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: fc-llm-bridge
|
||||
ports:
|
||||
- port: 8080
|
||||
targetPort: 8080
|
||||
name: http
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: fc-llm-bridge-cert
|
||||
namespace: fc-llm-bridge
|
||||
spec:
|
||||
secretName: fc-llm-bridge-tls
|
||||
issuerRef:
|
||||
name: step-ca-acme
|
||||
kind: ClusterIssuer
|
||||
dnsNames:
|
||||
- fc-llm-bridge.iamworkin.lan
|
||||
duration: 720h
|
||||
renewBefore: 240h
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: fc-llm-bridge
|
||||
namespace: fc-llm-bridge
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- match: Host(`fc-llm-bridge.iamworkin.lan`)
|
||||
kind: Rule
|
||||
services:
|
||||
- name: fc-llm-bridge
|
||||
port: 8080
|
||||
tls:
|
||||
secretName: fc-llm-bridge-tls
|
||||
@@ -7,6 +7,21 @@ metadata:
|
||||
labels:
|
||||
app.kubernetes.io/part-of: bluejay-infra
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: messageboard-web-config
|
||||
namespace: fc-messageboard
|
||||
data:
|
||||
ASPNETCORE_ENVIRONMENT: Production
|
||||
ASPNETCORE_URLS: http://+:8080
|
||||
ASPNETCORE_FORWARDEDHEADERS_ENABLED: "true"
|
||||
Security__AllowedOrigins__0: https://messageboard.iamworkin.lan
|
||||
FlowerCore__Database__ConnectionStrings__Sqlite: Data Source=/data/messageboard.db
|
||||
OTEL_SERVICE_NAME: FlowerCore.MessageBoard
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector.monitoring.svc.cluster.local:4317
|
||||
OTEL_EXPORTER_OTLP_PROTOCOL: grpc
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
@@ -16,6 +31,8 @@ metadata:
|
||||
app: messageboard-web
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app: messageboard-web
|
||||
@@ -23,19 +40,27 @@ spec:
|
||||
metadata:
|
||||
labels:
|
||||
app: messageboard-web
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8080"
|
||||
prometheus.io/path: "/metrics/prometheus"
|
||||
spec:
|
||||
containers:
|
||||
- name: messageboard-web
|
||||
image: localhost/fc-messageboard-web:v202604132015
|
||||
image: localhost/fc-messageboard-web:latest
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: http
|
||||
env:
|
||||
- name: ASPNETCORE_ENVIRONMENT
|
||||
value: Production
|
||||
- name: ASPNETCORE_URLS
|
||||
value: "http://+:8080"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: messageboard-web-config
|
||||
- secretRef:
|
||||
name: messageboard-web-secrets
|
||||
optional: true
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
resources:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
@@ -44,17 +69,35 @@ spec:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /metrics/prometheus
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 30
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 3
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /metrics/prometheus
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 5
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 6
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: messageboard-web-data
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: messageboard-web-data
|
||||
namespace: fc-messageboard
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 1Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
|
||||
39
apps/fc-segmentdisplay/fc-segmentdisplay.yaml
Normal file
39
apps/fc-segmentdisplay/fc-segmentdisplay.yaml
Normal file
@@ -0,0 +1,39 @@
|
||||
# FlowerCore SegmentDisplay — TLS + Ingress
|
||||
# Deployment and Service managed by deploy script (not ArgoCD)
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: fc-segmentdisplay
|
||||
labels:
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: segmentdisplay-web-tls
|
||||
namespace: fc-segmentdisplay
|
||||
spec:
|
||||
secretName: segmentdisplay-web-tls
|
||||
issuerRef:
|
||||
name: step-ca-dns01
|
||||
kind: ClusterIssuer
|
||||
dnsNames:
|
||||
- segmentdisplay.iamworkin.lan
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: segmentdisplay-web
|
||||
namespace: fc-segmentdisplay
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- match: Host(`segmentdisplay.iamworkin.lan`)
|
||||
kind: Rule
|
||||
services:
|
||||
- name: segmentdisplay-web
|
||||
port: 80
|
||||
tls:
|
||||
secretName: segmentdisplay-web-tls
|
||||
@@ -7,35 +7,67 @@ metadata:
|
||||
labels:
|
||||
app.kubernetes.io/part-of: bluejay-infra
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: signalcontrol-data
|
||||
namespace: fc-signalcontrol
|
||||
labels:
|
||||
app.kubernetes.io/name: signalcontrol-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
storageClassName: longhorn
|
||||
resources:
|
||||
requests:
|
||||
storage: 1Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: signalcontrol-web
|
||||
namespace: fc-signalcontrol
|
||||
labels:
|
||||
app: signalcontrol-web
|
||||
app.kubernetes.io/name: signalcontrol-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app: signalcontrol-web
|
||||
app.kubernetes.io/name: signalcontrol-web
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: signalcontrol-web
|
||||
app.kubernetes.io/name: signalcontrol-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
containers:
|
||||
- name: signalcontrol-web
|
||||
image: localhost/fc-signalcontrol-web:v202604132015
|
||||
image: localhost/fc-signalcontrol-web:latest
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
- containerPort: 5000
|
||||
name: http
|
||||
env:
|
||||
- name: ASPNETCORE_ENVIRONMENT
|
||||
value: Production
|
||||
- name: ASPNETCORE_URLS
|
||||
value: "http://+:8080"
|
||||
value: "http://+:5000"
|
||||
- name: ConnectionStrings__Default
|
||||
value: Data Source=/data/signalcontrol.db
|
||||
- name: Logging__LogLevel__Default
|
||||
value: Information
|
||||
- name: Auth__ApiKey
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: signalcontrol-auth
|
||||
key: Auth__ApiKey
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
resources:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
@@ -44,29 +76,40 @@ spec:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /metrics/prometheus
|
||||
port: 8080
|
||||
initialDelaySeconds: 10
|
||||
tcpSocket:
|
||||
port: http
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
timeoutSeconds: 5
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /metrics/prometheus
|
||||
port: 8080
|
||||
initialDelaySeconds: 5
|
||||
tcpSocket:
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
failureThreshold: 6
|
||||
timeoutSeconds: 5
|
||||
securityContext:
|
||||
fsGroup: 4200
|
||||
fsGroupChangePolicy: OnRootMismatch
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: signalcontrol-data
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: signalcontrol-web
|
||||
namespace: fc-signalcontrol
|
||||
labels:
|
||||
app.kubernetes.io/name: signalcontrol-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
selector:
|
||||
app: signalcontrol-web
|
||||
app.kubernetes.io/name: signalcontrol-web
|
||||
ports:
|
||||
- port: 80
|
||||
targetPort: 8080
|
||||
targetPort: http
|
||||
name: http
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
|
||||
35
apps/fc-ttsreader/biblical-tts/Dockerfile
Normal file
35
apps/fc-ttsreader/biblical-tts/Dockerfile
Normal file
@@ -0,0 +1,35 @@
|
||||
# FlowerCore biblical-tts — eSpeak-NG-backed TTS for Ancient Greek (grc) and
|
||||
# Hebrew (he). Wraps the espeak-ng binary in a small FastAPI app exposing
|
||||
# /tts (returns WAV) and /timings (returns word timings via espeak's
|
||||
# --pho output). Same shape as fc-speech-align so AiStation can talk to
|
||||
# both with one HTTP client pattern.
|
||||
FROM python:3.12-slim
|
||||
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
||||
PIP_NO_CACHE_DIR=1
|
||||
|
||||
# espeak-ng has built-in support for grc (Ancient Greek) and he (Hebrew).
|
||||
# libsndfile1 is for the wav post-processing step.
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
espeak-ng \
|
||||
libsndfile1 \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
COPY requirements.txt /app/
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY app.py /app/
|
||||
|
||||
RUN useradd --create-home --shell /usr/sbin/nologin --uid 1654 tts
|
||||
USER 1654
|
||||
|
||||
EXPOSE 10402
|
||||
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
|
||||
CMD python -c "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:10402/health',timeout=3); sys.exit(0)" || exit 1
|
||||
|
||||
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "10402", "--workers", "1"]
|
||||
397
apps/fc-ttsreader/biblical-tts/app.py
Normal file
397
apps/fc-ttsreader/biblical-tts/app.py
Normal file
@@ -0,0 +1,397 @@
|
||||
"""FlowerCore biblical-tts — eSpeak-NG wrapper for Ancient Greek + Hebrew.
|
||||
|
||||
Endpoints:
|
||||
|
||||
* POST /tts — body: {"text": "...", "language": "grc|he|el", "voice": "...?", "rate": 175?, "pitch": 50?, "volume": 100?}
|
||||
returns audio/wav. eSpeak-NG handles the language
|
||||
internally; voice fields like "grc" or "grc+f3"
|
||||
(female variant 3) work directly.
|
||||
* POST /timings — same body shape but returns
|
||||
{"text": "...", "words": [{"text", "startMs", "endMs"}],
|
||||
"durationMs": ...}.
|
||||
Uses the total of espeak's --pho phoneme durations,
|
||||
split across whitespace-split words by character count.
|
||||
Read-along clients pair this with /tts for synced
|
||||
playback.
|
||||
* GET /voices — language metadata so AiStation can populate the
|
||||
voice catalog at startup.
|
||||
* GET /health — fast readiness check.
|
||||
|
||||
Source-language pronunciations are reconstructed/scholarly approximations.
|
||||
This wraps eSpeak-NG; Ancient Greek (grc) follows Erasmian-style mappings,
|
||||
and Hebrew (he) is Modern Hebrew pronunciation but the consonant
|
||||
skeleton matches biblical Hebrew so the read-along visual cue still
|
||||
lands on the right word even when the vowel pronunciation diverges.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import shlex
|
||||
import subprocess
|
||||
import unicodedata
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
from pydantic import BaseModel
|
||||
|
||||
LOG = logging.getLogger("biblical_tts")
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
||||
|
||||
app = FastAPI(title="FlowerCore biblical-tts", version="1.0.0")
|
||||
|
||||
# eSpeak-NG language codes we expose. Ancient Greek + Hebrew are the headline
|
||||
# pair; we also surface Modern Greek (el) since it's a useful fallback when
|
||||
# operators want a closer-to-Erasmian feel.
|
||||
LANGUAGES = {
|
||||
"grc": {"label": "Ancient Greek (Erasmian)", "rtl": False, "default_voice": "grc"},
|
||||
"el": {"label": "Modern Greek", "rtl": False, "default_voice": "el"},
|
||||
"he": {"label": "Hebrew (Modern)", "rtl": True, "default_voice": "he"},
|
||||
}
|
||||
|
||||
|
||||
class TtsRequest(BaseModel):
    """Request body shared by the POST /tts and POST /timings handlers."""

    # Text to synthesize; the handlers reject it when it is blank after stripping.
    text: str
    # Language code; the keys of LANGUAGES are the codes this service advertises.
    language: str = "grc"
    # Optional explicit eSpeak-NG voice; when unset the language's default voice is used.
    voice: Optional[str] = None
    rate: int = 175  # words per minute, eSpeak default 175
    pitch: int = 50  # 0-99
    volume: int = 100  # 0-200
|
||||
|
||||
|
||||
HEBREW_CHAR_RE = re.compile(r"[\u0590-\u05FF]")
|
||||
HEBREW_WORD_RE = re.compile(r"[\u0590-\u05FF]+")
|
||||
|
||||
# eSpeak-NG's Hebrew voice can spell unpointed Hebrew as Unicode character
|
||||
# names on some builds. For source-text study reads, prefer a stable
|
||||
# scholarly transliteration so words sound like words even without niqqud.
|
||||
HEBREW_WORD_TRANSLITERATIONS = {
|
||||
"אב": "av",
|
||||
"אבא": "abba",
|
||||
"אברהם": "Avraham",
|
||||
"אדמה": "adamah",
|
||||
"אדני": "Adonai",
|
||||
"אדם": "adam",
|
||||
"אור": "or",
|
||||
"אלהים": "Elohim",
|
||||
"אלוהים": "Elohim",
|
||||
"אמן": "amen",
|
||||
"אם": "em",
|
||||
"אמת": "emet",
|
||||
"ארץ": "eretz",
|
||||
"אש": "esh",
|
||||
"את": "et",
|
||||
"בית": "beit",
|
||||
"בן": "ben",
|
||||
"ברא": "bara",
|
||||
"בראשית": "bereshit",
|
||||
"ברית": "berit",
|
||||
"ברוך": "barukh",
|
||||
"בת": "bat",
|
||||
"גוי": "goy",
|
||||
"גוים": "goyim",
|
||||
"גויים": "goyim",
|
||||
"דבר": "davar",
|
||||
"דברים": "devarim",
|
||||
"דוד": "David",
|
||||
"הלל": "hallel",
|
||||
"הארץ": "ha-aretz",
|
||||
"הברית": "ha-berit",
|
||||
"החדשה": "ha-chadashah",
|
||||
"השמים": "ha-shamayim",
|
||||
"השמיים": "ha-shamayim",
|
||||
"ויאמר": "vayomer",
|
||||
"יהוה": "Adonai",
|
||||
"יוסף": "Yosef",
|
||||
"יוחנן": "Yochanan",
|
||||
"ישראל": "Yisrael",
|
||||
"ישוע": "Yeshua",
|
||||
"יצחק": "Yitzchak",
|
||||
"יעקב": "Yaakov",
|
||||
"ירושלים": "Yerushalayim",
|
||||
"כהן": "kohen",
|
||||
"כהנים": "kohanim",
|
||||
"מים": "mayim",
|
||||
"מות": "mavet",
|
||||
"מושיע": "moshia",
|
||||
"מלך": "melekh",
|
||||
"מלכות": "malkhut",
|
||||
"מרים": "Miriam",
|
||||
"משה": "Moshe",
|
||||
"משיח": "Mashiach",
|
||||
"נביא": "navi",
|
||||
"נביאים": "neviim",
|
||||
"עם": "am",
|
||||
"עולם": "olam",
|
||||
"צדק": "tzedek",
|
||||
"קדוש": "qadosh",
|
||||
"קדושים": "qedoshim",
|
||||
"קול": "qol",
|
||||
"רוח": "ruach",
|
||||
"שאול": "Shaul",
|
||||
"שמים": "shamayim",
|
||||
"שמיים": "shamayim",
|
||||
"שמעון": "Shimon",
|
||||
"שלום": "Shalom",
|
||||
"תורה": "torah",
|
||||
"חכמה": "chokhmah",
|
||||
"חסד": "chesed",
|
||||
"חיים": "chayim",
|
||||
"חושך": "choshekh",
|
||||
}
|
||||
|
||||
HEBREW_LETTERS = {
|
||||
"א": "a",
|
||||
"ב": "b",
|
||||
"ג": "g",
|
||||
"ד": "d",
|
||||
"ה": "h",
|
||||
"ו": "v",
|
||||
"ז": "z",
|
||||
"ח": "kh",
|
||||
"ט": "t",
|
||||
"י": "y",
|
||||
"כ": "kh",
|
||||
"ך": "kh",
|
||||
"ל": "l",
|
||||
"מ": "m",
|
||||
"ם": "m",
|
||||
"נ": "n",
|
||||
"ן": "n",
|
||||
"ס": "s",
|
||||
"ע": "a",
|
||||
"פ": "p",
|
||||
"ף": "f",
|
||||
"צ": "ts",
|
||||
"ץ": "ts",
|
||||
"ק": "q",
|
||||
"ר": "r",
|
||||
"ש": "sh",
|
||||
"ת": "t",
|
||||
}
|
||||
|
||||
HEBREW_VOWELISH = {"a", "e", "i", "o", "u"}
|
||||
|
||||
|
||||
def _strip_hebrew_marks(value: str) -> str:
|
||||
decomposed = unicodedata.normalize("NFD", value)
|
||||
return "".join(
|
||||
ch for ch in decomposed
|
||||
if unicodedata.category(ch) != "Mn" and ch not in {"׳", "״", "־"}
|
||||
)
|
||||
|
||||
|
||||
def _fallback_hebrew_transliteration(word: str) -> str:
    """Build a letter-by-letter Latin rendering for a word with no table entry.

    Each character is looked up in HEBREW_LETTERS (characters without an
    entry are skipped). A word-final he becomes "ah", and yod / vav anywhere
    but the first position become the vowels "i" / "o". An "a" is then
    inserted between any two adjacent tokens that would otherwise put two
    non-vowel letters next to each other. If no character maps at all, the
    word is returned unchanged.
    """
    final_position = len(word) - 1
    tokens: list[str] = []
    for position, letter in enumerate(word):
        base = HEBREW_LETTERS.get(letter)
        if base is None:
            continue
        if letter == "ה" and position == final_position:
            base = "ah"
        elif position > 0 and letter == "י":
            base = "i"
        elif position > 0 and letter == "ו":
            base = "o"
        tokens.append(base)

    if not tokens:
        return word

    pieces: list[str] = [tokens[0]]
    for previous, current in zip(tokens, tokens[1:]):
        # Break up consonant clusters with a filler vowel.
        if previous[-1:] not in HEBREW_VOWELISH and current[:1] not in HEBREW_VOWELISH:
            pieces.append("a")
        pieces.append(current)
    return "".join(pieces)
|
||||
|
||||
|
||||
def _transliterate_hebrew_word(match: re.Match[str]) -> str:
    """Regex-substitution callback that turns one Hebrew run into Latin text.

    Lookup order: the whole mark-stripped word in
    HEBREW_WORD_TRANSLITERATIONS, then the word minus a leading vav
    ("ve-" + entry) or a leading he ("ha-" + entry), and finally the
    letter-by-letter fallback. If stripping marks leaves nothing, the
    original matched text is returned untouched.
    """
    source = match.group(0)
    bare = _strip_hebrew_marks(source)
    if not bare:
        return source

    known = HEBREW_WORD_TRANSLITERATIONS.get(bare)
    if known:
        return known

    if len(bare) > 1:
        for prefix, spoken_prefix in (("ו", "ve"), ("ה", "ha")):
            if not bare.startswith(prefix):
                continue
            stem = HEBREW_WORD_TRANSLITERATIONS.get(bare[1:])
            if stem:
                return f"{spoken_prefix}-{stem}"

    return _fallback_hebrew_transliteration(bare)
|
||||
|
||||
|
||||
def _prepare_synthesis_input(text: str, language: str, voice: str) -> tuple[str, str]:
    """Return the ``(text, voice)`` pair that should actually be sent to eSpeak.

    For a language code starting with "he" whose text contains at least one
    character matched by HEBREW_CHAR_RE, every Hebrew run is replaced by its
    transliteration and the voice is switched to "en-us". All other input
    passes through unchanged.
    """
    is_hebrew_request = language.lower().startswith("he")
    if not is_hebrew_request or HEBREW_CHAR_RE.search(text) is None:
        return text, voice
    return HEBREW_WORD_RE.sub(_transliterate_hebrew_word, text), "en-us"
|
||||
|
||||
|
||||
def _resolve_voice(req: TtsRequest) -> str:
    """Pick the eSpeak-NG voice name for a request.

    An explicit ``req.voice`` wins when it is non-empty after stripping
    whitespace. Otherwise the lower-cased language code is looked up in
    LANGUAGES for its ``default_voice``; unknown languages fall back to the
    language code itself.
    """
    # Strip before testing: a whitespace-only voice must fall through to the
    # language default rather than being passed to eSpeak as an empty name.
    explicit = (req.voice or "").strip()
    if explicit:
        return explicit
    lang = req.language.lower()
    return LANGUAGES.get(lang, {}).get("default_voice", lang)
|
||||
|
||||
|
||||
def _run_espeak(args: list[str], stdin_text: bytes) -> bytes:
    """Run ``espeak-ng`` with ``args``, feeding ``stdin_text`` on stdin.

    Returns the process's raw stdout bytes.

    Raises:
        HTTPException(504): the process did not finish within 60 seconds.
        HTTPException(500): the process could not be started, or exited with
            a non-zero status (the first 512 characters of stderr are
            included in the detail).
    """
    cmd = ["espeak-ng", *args]
    # Only the arguments are logged; the text itself travels via stdin.
    LOG.info("espeak-ng %s", shlex.join(args))
    try:
        proc = subprocess.run(
            cmd,
            input=stdin_text,
            capture_output=True,
            timeout=60,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(status_code=504, detail="espeak-ng timed out") from exc
    except OSError as exc:
        # Missing binary or exec failure: report it through the same
        # HTTPException channel as every other espeak failure.
        raise HTTPException(
            status_code=500,
            detail=f"espeak-ng could not be started: {exc}",
        ) from exc
    if proc.returncode != 0:
        stderr_text = proc.stderr.decode("utf-8", errors="replace")[:512]
        raise HTTPException(
            status_code=500,
            detail=f"espeak-ng exit {proc.returncode}: {stderr_text}",
        )
    return proc.stdout
|
||||
|
||||
|
||||
@app.get("/health")
def health():
    """Readiness payload: a fixed "ok" status plus the exposed language codes."""
    language_codes = list(LANGUAGES)
    return {"status": "ok", "languages": language_codes}
|
||||
|
||||
|
||||
@app.get("/voices")
def voices():
    """List one catalog entry per LANGUAGES key, in the table's own order."""
    catalog = []
    for code, meta in LANGUAGES.items():
        entry = {
            "name": code,
            "displayName": meta["label"],
            "language": code,
            "isRightToLeft": meta["rtl"],
            "engine": "espeak-ng",
        }
        catalog.append(entry)
    return {"voices": catalog}
|
||||
|
||||
|
||||
@app.post("/tts")
def tts(req: TtsRequest) -> Response:
    """Synthesize ``req.text`` and return it as an ``audio/wav`` response.

    Rate, pitch and volume are clamped to 80-450, 0-99 and 0-200 before being
    handed to eSpeak. Responds 400 when the text is blank and 500 when eSpeak
    produces no output.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="text is required")

    spoken_text, synth_voice = _prepare_synthesis_input(
        req.text, req.language, _resolve_voice(req)
    )

    rate = min(450, max(80, req.rate))
    pitch = min(99, max(0, req.pitch))
    volume = min(200, max(0, req.volume))
    espeak_args = ["--stdout", "-v", synth_voice]
    espeak_args += ["-s", str(rate)]
    espeak_args += ["-p", str(pitch)]
    espeak_args += ["-a", str(volume)]

    wav = _run_espeak(espeak_args, spoken_text.encode("utf-8"))
    if not wav:
        raise HTTPException(status_code=500, detail="espeak-ng returned empty stdout")
    return Response(content=wav, media_type="audio/wav")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# /timings — synth + word-level timing from espeak's phoneme/word stream.
|
||||
# --------------------------------------------------------------------------
|
||||
#
|
||||
# espeak-ng's --pho flag emits a phoneme stream:
|
||||
#
|
||||
# _ 5 phon...
|
||||
# _ 56 phon...
|
||||
# _ 67 phon...
|
||||
#
|
||||
# That alone doesn't give word boundaries. Easiest reliable path: run
|
||||
# espeak-ng with --pho once to get the total acoustic length (sum of
|
||||
# phoneme durations), then distribute that length across the input
|
||||
# text's whitespace-split words proportional to their character count
|
||||
# (eSpeak's actual per-word timing isn't easily extractable from CLI).
|
||||
# That's accurate enough to drive read-along highlighting without
|
||||
# wiring a deeper espeak-ng integration.
|
||||
#
|
||||
# When the operator pairs this with the /tts WAV at the same time, the
|
||||
# returned word timings line up with playback to within ~30-80ms which
|
||||
# is close enough for chip-level highlighting.
|
||||
|
||||
PHONEME_DURATION_RE = re.compile(r"^\s*\S+\s+(\d+)\s+", re.MULTILINE)
|
||||
|
||||
|
||||
def _estimate_total_ms(req: TtsRequest, voice: str, spoken_text: str) -> int:
    """Estimate how long ``spoken_text`` takes to speak, in milliseconds.

    Runs eSpeak with ``--pho`` and sums the numeric field captured by
    PHONEME_DURATION_RE on each output line. When nothing is captured, falls
    back to a words-per-minute estimate based on the whitespace-split word
    count of ``req.text``.
    """
    # Clamp exactly as the /tts handler does so the timing estimate is made at
    # the same speech rate as the audio it is paired with.
    rate = max(80, min(450, req.rate))
    args = ["--pho", "--quiet", "-v", voice, "-s", str(rate)]
    out = _run_espeak(args, spoken_text.encode("utf-8"))
    text = out.decode("utf-8", errors="replace")
    total = 0
    for match in PHONEME_DURATION_RE.finditer(text):
        try:
            total += int(match.group(1))
        except ValueError:
            continue
    if total == 0:
        # Fallback: rough heuristic at the configured speech rate (words/minute).
        words = max(1, len(req.text.split()))
        total = int(words / rate * 60_000)
    return total
|
||||
|
||||
|
||||
@app.post("/timings")
def timings(req: TtsRequest):
    """Return estimated per-word timings for ``req.text``.

    The estimated total duration is divided among the whitespace-split tokens
    of the original text in proportion to their character counts. Every
    token, including punctuation-only ones, gets its own entry. Responds 400
    when the text is blank.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="text is required")
    voice = _resolve_voice(req)
    spoken_text, synth_voice = _prepare_synthesis_input(req.text, req.language, voice)
    total_ms = _estimate_total_ms(req, synth_voice, spoken_text)

    # The blank-text check above guarantees at least one token here.
    words = req.text.split()
    char_total = sum(max(1, len(w)) for w in words)

    # Derive each boundary from the cumulative weight rather than summing
    # individually rounded shares: rounding error cannot accumulate, so
    # boundaries never decrease, never pass total_ms, and the last word ends
    # exactly at total_ms.
    out_words: list[dict] = []
    consumed = 0
    start = 0
    for word in words:
        consumed += max(1, len(word))
        end = int(round(total_ms * consumed / char_total))
        out_words.append({"text": word, "startMs": start, "endMs": end})
        start = end

    return JSONResponse(
        {
            "text": req.text,
            "language": req.language,
            "voice": synth_voice,
            "words": out_words,
            "durationMs": total_ms,
        }
    )
|
||||
2
apps/fc-ttsreader/biblical-tts/requirements.txt
Normal file
2
apps/fc-ttsreader/biblical-tts/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
fastapi==0.115.6
|
||||
uvicorn==0.34.0
|
||||
@@ -5,7 +5,502 @@ kind: Namespace
|
||||
metadata:
|
||||
name: fc-ttsreader
|
||||
labels:
|
||||
app.kubernetes.io/part-of: bluejay-infra
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
---
|
||||
# 1Password -> K8s Secret sync for TTS Reader API keys
|
||||
apiVersion: onepassword.com/v1
|
||||
kind: OnePasswordItem
|
||||
metadata:
|
||||
name: ttsreader-secrets
|
||||
namespace: fc-ttsreader
|
||||
spec:
|
||||
itemPath: "vaults/IAmWorkin/items/FlowerCore TTS Reader"
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ttsreader-piper
|
||||
namespace: fc-ttsreader
|
||||
labels:
|
||||
app.kubernetes.io/name: ttsreader-piper
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: ttsreader-piper
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: ttsreader-piper
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
# Bypass CoreDNS's *.iamworkin.lan wildcard so the init container reaches
|
||||
# huggingface.co directly when it seeds voice models.
|
||||
dnsPolicy: None
|
||||
dnsConfig:
|
||||
nameservers:
|
||||
- 10.43.0.10
|
||||
searches:
|
||||
- fc-ttsreader.svc.cluster.local
|
||||
- svc.cluster.local
|
||||
- cluster.local
|
||||
options:
|
||||
- name: ndots
|
||||
value: "2"
|
||||
initContainers:
|
||||
- name: seed-voices
|
||||
image: rhasspy/wyoming-piper:latest
|
||||
command:
|
||||
- python3
|
||||
- -c
|
||||
args:
|
||||
- |
|
||||
import shutil
|
||||
import ssl
|
||||
from pathlib import Path
|
||||
from urllib.request import urlopen
|
||||
|
||||
ssl._create_default_https_context = ssl._create_unverified_context
|
||||
|
||||
files = {
|
||||
"en_US-lessac-high.onnx": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx",
|
||||
"en_US-lessac-high.onnx.json": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx.json",
|
||||
"en_US-lessac-medium.onnx": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx",
|
||||
"en_US-lessac-medium.onnx.json": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx.json",
|
||||
"en_US-amy-medium.onnx": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/amy/medium/en_US-amy-medium.onnx",
|
||||
"en_US-amy-medium.onnx.json": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/amy/medium/en_US-amy-medium.onnx.json",
|
||||
"en_US-john-medium.onnx": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/john/medium/en_US-john-medium.onnx",
|
||||
"en_US-john-medium.onnx.json": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/john/medium/en_US-john-medium.onnx.json",
|
||||
"en_GB-cori-high.onnx": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_GB/cori/high/en_GB-cori-high.onnx",
|
||||
"en_GB-cori-high.onnx.json": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_GB/cori/high/en_GB-cori-high.onnx.json",
|
||||
}
|
||||
|
||||
target = Path("/data")
|
||||
target.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
for name, url in files.items():
|
||||
path = target / name
|
||||
if path.exists() and path.stat().st_size > 0:
|
||||
print(f"cached {name}", flush=True)
|
||||
continue
|
||||
|
||||
print(f"downloading {name}", flush=True)
|
||||
with urlopen(url, timeout=180) as response, open(path, "wb") as download_file:
|
||||
shutil.copyfileobj(response, download_file)
|
||||
print(f"ready {name}", flush=True)
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
containers:
|
||||
- name: piper
|
||||
image: rhasspy/wyoming-piper:latest
|
||||
env:
|
||||
- name: PYTHONHTTPSVERIFY
|
||||
value: "0"
|
||||
args:
|
||||
- "--voice"
|
||||
- "en_US-lessac-high"
|
||||
- "--data-dir"
|
||||
- "/data"
|
||||
- "--download-dir"
|
||||
- "/data"
|
||||
ports:
|
||||
- containerPort: 10200
|
||||
name: wyoming
|
||||
# Memory bumped after observed OOMKills during real chapter
|
||||
# renders 2026-04-25. Piper's eSpeak phonemizer + onnx runtime
|
||||
# spikes well past 1 Gi on long unpunctuated paragraphs from
|
||||
# PDF / book imports. 3 Gi gives headroom plus the
|
||||
# transcribe-audio-to-Quick-Read flow that hits Piper through
|
||||
# the same model.
|
||||
resources:
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
cpu: 2000m
|
||||
memory: 3Gi
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: ttsreader-piper-data
|
||||
---
|
||||
# fc-speech-align — cluster-native faster-whisper wrapper.
|
||||
# Exposes POST /align (fc-align contract used by FlowerCore.Shared.Speech) AND
|
||||
# POST /transcribe (audio-file-in feature). CPU model = base.en, int8 compute.
|
||||
# Source: bluejay-infra/apps/fc-ttsreader/speech-align/ (Dockerfile + app.py).
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: ttsreader-align-models
|
||||
namespace: fc-ttsreader
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 5Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ttsreader-align
|
||||
namespace: fc-ttsreader
|
||||
labels:
|
||||
app.kubernetes.io/name: ttsreader-align
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: ttsreader-align
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: ttsreader-align
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
# Bypass CoreDNS's *.iamworkin.lan template hijack on public hosts
|
||||
# (huggingface.co model download at first boot would otherwise resolve
|
||||
# to Traefik VIP via search expansion). Drops the iamworkin.lan suffix.
|
||||
dnsPolicy: None
|
||||
dnsConfig:
|
||||
nameservers:
|
||||
- 10.43.0.10
|
||||
searches:
|
||||
- fc-ttsreader.svc.cluster.local
|
||||
- svc.cluster.local
|
||||
- cluster.local
|
||||
options:
|
||||
- name: ndots
|
||||
value: "2"
|
||||
securityContext:
|
||||
fsGroup: 1654
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1654
|
||||
containers:
|
||||
- name: align
|
||||
image: localhost/fc-speech-align:v3
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 9200
|
||||
name: http
|
||||
env:
|
||||
- name: WHISPER_MODEL
|
||||
value: "Systran/faster-whisper-base.en"
|
||||
- name: WHISPER_DEVICE
|
||||
value: "cpu"
|
||||
- name: WHISPER_COMPUTE_TYPE
|
||||
value: "int8"
|
||||
- name: WHISPER_CACHE_DIR
|
||||
value: "/models"
|
||||
- name: DEFAULT_LANGUAGE
|
||||
value: "en"
|
||||
resources:
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
cpu: 2000m
|
||||
memory: 2Gi
|
||||
volumeMounts:
|
||||
- name: models
|
||||
mountPath: /models
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 9200
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 18
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 9200
|
||||
initialDelaySeconds: 180
|
||||
periodSeconds: 30
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 3
|
||||
volumes:
|
||||
- name: models
|
||||
persistentVolumeClaim:
|
||||
claimName: ttsreader-align-models
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: ttsreader-align
|
||||
namespace: fc-ttsreader
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: ttsreader-align
|
||||
ports:
|
||||
- port: 9200
|
||||
targetPort: 9200
|
||||
name: http
|
||||
---
|
||||
# ttsreader-kokoro — Kokoro-82M TTS via the kokoro-fastapi container.
|
||||
# Provides high-quality English voices alongside Piper for the TtsReader
|
||||
# render pipeline AND for AiStation when it talks to the cluster TTS plane
|
||||
# (instead of pointing back at BLUEJAY-WS:10401). Model + voices ship
|
||||
# inside the container image, so no PVC is needed.
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ttsreader-kokoro
|
||||
namespace: fc-ttsreader
|
||||
labels:
|
||||
app.kubernetes.io/name: ttsreader-kokoro
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: ttsreader-kokoro
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: ttsreader-kokoro
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
# Same DNS bypass as ttsreader-align — without it, the *.iamworkin.lan
|
||||
# CoreDNS template would hijack hexgrad/Kokoro-82M's HuggingFace-style
|
||||
# repo lookups during model warmup.
|
||||
dnsPolicy: None
|
||||
dnsConfig:
|
||||
nameservers:
|
||||
- 10.43.0.10
|
||||
searches:
|
||||
- fc-ttsreader.svc.cluster.local
|
||||
- svc.cluster.local
|
||||
- cluster.local
|
||||
options:
|
||||
- name: ndots
|
||||
value: "2"
|
||||
containers:
|
||||
- name: kokoro
|
||||
image: ghcr.io/remsky/kokoro-fastapi-cpu:latest
|
||||
ports:
|
||||
- containerPort: 8880
|
||||
name: http
|
||||
resources:
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: 2000m
|
||||
memory: 3Gi
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /v1/audio/voices
|
||||
port: 8880
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 18
|
||||
# Sprint E Phase 1a (kokoro stability) — 4 restarts in 2d6h with
|
||||
# exit 143 traced to liveness probe `context deadline exceeded` while
|
||||
# kokoro was busy synthesizing. /v1/audio/voices shares the FastAPI
|
||||
# worker pool with /v1/audio/speech, so a long synth can starve the
|
||||
# probe out within the prior 5s × 3 = 15s window. Bump timeoutSeconds
|
||||
# 5 → 15 and failureThreshold 3 → 5 (15s × 5 = 75s grace) before kubelet kills
|
||||
# the pod. The TtsCircuitBreaker on the synthesizer side (Phase 1b)
|
||||
# backs this up so the FC backend stops slamming kokoro during
|
||||
# recovery.
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /v1/audio/voices
|
||||
port: 8880
|
||||
initialDelaySeconds: 180
|
||||
periodSeconds: 30
|
||||
timeoutSeconds: 15
|
||||
failureThreshold: 5
|
||||
---
|
||||
# fc-biblical-tts — eSpeak-NG-backed Ancient Greek + Hebrew TTS with
|
||||
# word-level timing for read-along playback. Companion to ttsreader-kokoro
|
||||
# (modern English) and ttsreader-piper (English narrator); operators pick
|
||||
# whichever engine matches the source text. Source:
|
||||
# bluejay-infra/apps/fc-ttsreader/biblical-tts/
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ttsreader-biblical
|
||||
namespace: fc-ttsreader
|
||||
labels:
|
||||
app.kubernetes.io/name: ttsreader-biblical
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: ttsreader-biblical
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: ttsreader-biblical
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
securityContext:
|
||||
fsGroup: 1654
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1654
|
||||
containers:
|
||||
- name: biblical-tts
|
||||
image: localhost/fc-biblical-tts:v20260506-hebrew-translit
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 10402
|
||||
name: http
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 1000m
|
||||
memory: 512Mi
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 10402
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 6
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 10402
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 3
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: ttsreader-biblical
|
||||
namespace: fc-ttsreader
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: ttsreader-biblical
|
||||
ports:
|
||||
- port: 10402
|
||||
targetPort: 10402
|
||||
name: http
|
||||
---
|
||||
# fc-modern-tts — Microsoft Edge Read Aloud bridge for Modern Hebrew
|
||||
# (he-IL-AvriNeural et al) and Modern Greek (el-GR-NestorasNeural et al).
|
||||
# Pairs with ttsreader-biblical: biblical engine handles unpointed
|
||||
# Greek + Hebrew, modern engine handles narrative translations the
|
||||
# operator reads alongside.
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ttsreader-modern
|
||||
namespace: fc-ttsreader
|
||||
labels:
|
||||
app.kubernetes.io/name: ttsreader-modern
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: ttsreader-modern
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: ttsreader-modern
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
# edge-tts needs egress to *.tts.speech.microsoft.com — bypass the
|
||||
# iamworkin.lan template hijack so the lookup doesn't fall back to
|
||||
# Traefik VIP via search expansion.
|
||||
dnsPolicy: None
|
||||
dnsConfig:
|
||||
nameservers:
|
||||
- 10.43.0.10
|
||||
searches:
|
||||
- fc-ttsreader.svc.cluster.local
|
||||
- svc.cluster.local
|
||||
- cluster.local
|
||||
options:
|
||||
- name: ndots
|
||||
value: "2"
|
||||
securityContext:
|
||||
fsGroup: 1654
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1654
|
||||
containers:
|
||||
- name: modern-tts
|
||||
image: localhost/fc-modern-tts:v1
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 10403
|
||||
name: http
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 1000m
|
||||
memory: 512Mi
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 10403
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 6
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 10403
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 3
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: ttsreader-modern
|
||||
namespace: fc-ttsreader
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: ttsreader-modern
|
||||
ports:
|
||||
- port: 10403
|
||||
targetPort: 10403
|
||||
name: http
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: ttsreader-kokoro
|
||||
namespace: fc-ttsreader
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: ttsreader-kokoro
|
||||
ports:
|
||||
- port: 8880
|
||||
targetPort: 8880
|
||||
name: http
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
@@ -13,48 +508,185 @@ metadata:
|
||||
name: ttsreader-web
|
||||
namespace: fc-ttsreader
|
||||
labels:
|
||||
app: ttsreader-web
|
||||
app.kubernetes.io/name: ttsreader-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app: ttsreader-web
|
||||
app.kubernetes.io/name: ttsreader-web
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: ttsreader-web
|
||||
app.kubernetes.io/name: ttsreader-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "5217"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
securityContext:
|
||||
fsGroup: 1654
|
||||
fsGroupChangePolicy: OnRootMismatch
|
||||
containers:
|
||||
- name: ttsreader-web
|
||||
image: localhost/fc-ttsreader-web:v202604132015
|
||||
- name: web
|
||||
image: localhost/fc-ttsreader-web:v20260506-phase6
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
- containerPort: 5217
|
||||
name: http
|
||||
env:
|
||||
- name: ASPNETCORE_ENVIRONMENT
|
||||
value: Production
|
||||
- name: ASPNETCORE_URLS
|
||||
value: "http://+:8080"
|
||||
value: "http://+:5217"
|
||||
- name: ASPNETCORE_ENVIRONMENT
|
||||
value: "Production"
|
||||
- name: FlowerCore__Database__ConnectionStrings__Sqlite
|
||||
value: "Data Source=/data/ttsreader.db"
|
||||
- name: TtsReader__Audio__OutputRoot
|
||||
value: "/data/audio"
|
||||
- name: TtsReader__Audio__FfmpegPath
|
||||
value: "/usr/bin/ffmpeg"
|
||||
- name: TtsReader__Bible__CorpusRoot
|
||||
value: "/data/corpus-cache/world-english-bible/eng/usx"
|
||||
- name: TtsReader__ChapterContext__DatabasePath
|
||||
value: "/data/chapter-context.db"
|
||||
- name: TtsReader__Jobs__Root
|
||||
value: "/data/jobs"
|
||||
- name: TtsReader__Piper__Host
|
||||
value: "ttsreader-piper.fc-ttsreader.svc.cluster.local."
|
||||
- name: TtsReader__Piper__Port
|
||||
value: "10200"
|
||||
- name: TtsReader__Kokoro__Enabled
|
||||
value: "true"
|
||||
- name: TtsReader__Kokoro__BaseUrl
|
||||
# Cluster-native ttsreader-kokoro Service — replaces the prior
|
||||
# BLUEJAY-WS host pointer so the render pipeline doesn't need
|
||||
# the workstation up. AiStation can still hit its local
|
||||
# http://localhost:8880 instance.
|
||||
value: "http://ttsreader-kokoro.fc-ttsreader.svc.cluster.local.:8880"
|
||||
- name: TtsReader__Kokoro__TimeoutSeconds
|
||||
value: "120"
|
||||
- name: FlowerCore__Tts__BiblicalTts__Enabled
|
||||
value: "true"
|
||||
- name: FlowerCore__Tts__BiblicalTts__BaseUrl
|
||||
value: "http://ttsreader-biblical.fc-ttsreader.svc.cluster.local.:10402"
|
||||
- name: FlowerCore__Tts__BiblicalTts__TimeoutSeconds
|
||||
value: "60"
|
||||
- name: FlowerCore__Tts__BiblicalTts__DefaultLanguage
|
||||
value: "grc"
|
||||
- name: Speech__Alignment__Enabled
|
||||
# Cluster-native faster-whisper (Lane F, 2026-04-25). The
|
||||
# ttsreader-align deployment in this manifest wraps
|
||||
# SYSTRAN/faster-whisper with a /align endpoint matching the
|
||||
# FlowerCore.Shared.Speech master contract.
|
||||
value: "true"
|
||||
- name: Speech__Alignment__BaseUrl
|
||||
value: "http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200"
|
||||
- name: Speech__Alignment__TimeoutSeconds
|
||||
value: "120"
|
||||
# Cluster-native transcription endpoint shares the same pod
|
||||
# (POST /transcribe). Lane G consumes this from the
|
||||
# FlowerCore.TtsReader.Web AudioImport feature.
|
||||
- name: TtsReader__Transcription__Enabled
|
||||
value: "true"
|
||||
- name: TtsReader__Transcription__BaseUrl
|
||||
value: "http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200"
|
||||
- name: TtsReader__Transcription__TimeoutSeconds
|
||||
value: "300"
|
||||
- name: TtsReader__Ollama__BaseUrl
|
||||
value: "http://10.0.57.17:11434"
|
||||
- name: TtsReader__Ollama__DefaultModel
|
||||
value: "gemma3:4b"
|
||||
- name: TtsReader__Ollama__TimeoutSeconds
|
||||
value: "45"
|
||||
- name: TtsReader__Runtime__LogsRoot
|
||||
value: "/data/logs"
|
||||
- name: TtsReader__Runtime__SmokeStatePath
|
||||
value: "/data/ops/smoke-status.json"
|
||||
# Sprint E Day 8 voice-preview disk cache — writes WAVs under
|
||||
# this directory. Default "data/voice-previews" resolves to
|
||||
# the read-only $HOME path under runAsNonRoot=true. Pin to
|
||||
# the writable PVC mount.
|
||||
- name: TtsReader__Preview__CacheDirectory
|
||||
value: "/data/voice-previews"
|
||||
- name: TtsReader__VoiceLibrary__ReferenceClip__Directory
|
||||
value: "/data/voice-reference-clips"
|
||||
# Sprint E XXL Phase 4γ — content-addressed CDN bundle dir for
|
||||
# POST /api/v1/render. Default "wwwroot/cdn" resolves under the
|
||||
# read-only app filesystem, so pin to the writable PVC mount
|
||||
# alongside other TtsReader runtime data. Manifests + cue audio
|
||||
# land at /data/cdn/sha256/<hash>/manifest.json + cues/.
|
||||
- name: TtsReader__Render__CdnDirectory
|
||||
value: "/data/cdn"
|
||||
- name: Auth__ApiKey
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: ttsreader-secrets
|
||||
key: Auth__ApiKey
|
||||
optional: true
|
||||
- name: Auth__AdminApiKey
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: ttsreader-secrets
|
||||
key: Auth__AdminApiKey
|
||||
optional: true
|
||||
resources:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
cpu: "100m"
|
||||
# The cluster is currently saturated on requested CPU by
|
||||
# remotedesktop workloads even when real usage is low.
|
||||
# Keep the web frontend schedulable under that pressure.
|
||||
cpu: 10m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /metrics/prometheus
|
||||
port: 8080
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 30
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
- name: tmp
|
||||
mountPath: /tmp
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1654
|
||||
runAsGroup: 1654
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /metrics/prometheus
|
||||
port: 8080
|
||||
path: /health
|
||||
port: 5217
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 5217
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 30
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: ttsreader-data
|
||||
- name: tmp
|
||||
emptyDir: {}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: ttsreader-piper
|
||||
namespace: fc-ttsreader
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: ttsreader-piper
|
||||
ports:
|
||||
- port: 10200
|
||||
targetPort: 10200
|
||||
name: wyoming
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
@@ -63,19 +695,45 @@ metadata:
|
||||
namespace: fc-ttsreader
|
||||
spec:
|
||||
selector:
|
||||
app: ttsreader-web
|
||||
app.kubernetes.io/name: ttsreader-web
|
||||
ports:
|
||||
- port: 80
|
||||
targetPort: 8080
|
||||
- port: 5217
|
||||
targetPort: 5217
|
||||
name: http
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: ttsreader-piper-data
|
||||
namespace: fc-ttsreader
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
storageClassName: longhorn
|
||||
resources:
|
||||
requests:
|
||||
storage: 2Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: ttsreader-data
|
||||
namespace: fc-ttsreader
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
storageClassName: longhorn
|
||||
resources:
|
||||
requests:
|
||||
storage: 5Gi
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: ttsreader-web-tls
|
||||
name: ttsreader-cert
|
||||
namespace: fc-ttsreader
|
||||
spec:
|
||||
secretName: ttsreader-web-tls
|
||||
secretName: ttsreader-tls
|
||||
issuerRef:
|
||||
name: step-ca-acme
|
||||
kind: ClusterIssuer
|
||||
@@ -95,6 +753,6 @@ spec:
|
||||
kind: Rule
|
||||
services:
|
||||
- name: ttsreader-web
|
||||
port: 80
|
||||
port: 5217
|
||||
tls:
|
||||
secretName: ttsreader-web-tls
|
||||
secretName: ttsreader-tls
|
||||
|
||||
36
apps/fc-ttsreader/modern-tts/Dockerfile
Normal file
36
apps/fc-ttsreader/modern-tts/Dockerfile
Normal file
@@ -0,0 +1,36 @@
|
||||
# FlowerCore modern-tts — wraps Microsoft Edge's Read Aloud TTS service
|
||||
# (via the edge-tts Python package) to give the cluster studio-quality
|
||||
# Modern Hebrew (he-IL-*) and Modern Greek (el-GR-*) voices alongside the
|
||||
# eSpeak biblical engine. Same shape as fc-biblical-tts so the .NET client
|
||||
# lives in the same Shared.Speech package.
|
||||
#
|
||||
# Note: edge-tts depends on Microsoft's public Edge endpoint; the cluster
|
||||
# pod needs egress to *.tts.speech.microsoft.com. dnsPolicy: None on the
|
||||
# Deployment makes sure the iamworkin.lan template hijack doesn't rewrite
|
||||
# the lookup back to Traefik VIP.
|
||||
FROM python:3.12-slim
|
||||
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
||||
PIP_NO_CACHE_DIR=1
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
COPY requirements.txt /app/
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY app.py /app/
|
||||
|
||||
RUN useradd --create-home --shell /usr/sbin/nologin --uid 1654 tts
|
||||
USER 1654
|
||||
|
||||
EXPOSE 10403
|
||||
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
|
||||
CMD python -c "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:10403/health',timeout=3); sys.exit(0)" || exit 1
|
||||
|
||||
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "10403", "--workers", "1"]
|
||||
238
apps/fc-ttsreader/modern-tts/app.py
Normal file
238
apps/fc-ttsreader/modern-tts/app.py
Normal file
@@ -0,0 +1,238 @@
|
||||
"""FlowerCore modern-tts — Microsoft Edge Read Aloud bridge for Modern
|
||||
Hebrew and Modern Greek (and other Edge-supported languages).
|
||||
|
||||
Endpoints:
|
||||
|
||||
* POST /tts — body: {"text", "voice"?, "language"?, "rate"?, "volume"?, "pitch"?}
|
||||
returns audio/mpeg (Edge returns MP3) which the
|
||||
upstream FasterWhisperAlignmentClient + the WPF
|
||||
MediaPlayer both handle natively.
|
||||
* POST /timings — same body shape but returns
|
||||
{"text", "voice", "words": [{"text","startMs","endMs"}],
|
||||
"durationMs": ...} sourced from Edge's WordBoundary
|
||||
events — much more accurate than eSpeak's
|
||||
proportional-distribution approach because Edge
|
||||
emits real per-word offsets during synthesis.
|
||||
* GET /voices — voice catalog Edge knows about. Filtered to
|
||||
Hebrew + Greek by default; ?language=all returns
|
||||
everything Edge supports.
|
||||
* GET /health — fast readiness check.
|
||||
|
||||
Pairs with fc-biblical-tts (eSpeak Ancient Greek + Hebrew). The biblical
|
||||
engine handles unpointed Hebrew + Erasmian Greek; this engine handles
|
||||
narrative Modern Hebrew + Modern Greek for translations the operator
|
||||
might be reading alongside the original.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import edge_tts
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
from pydantic import BaseModel
|
||||
|
||||
LOG = logging.getLogger("modern_tts")
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
||||
|
||||
app = FastAPI(title="FlowerCore modern-tts", version="1.0.0")
|
||||
|
||||
# Default voices by short code so AiStation can pick a sensible default
|
||||
# when the operator hasn't explicitly asked for one. Edge has multiple
|
||||
# voices per locale — these are the calmest male+female narrators.
|
||||
DEFAULT_VOICES = {
|
||||
"he": "he-IL-AvriNeural",
|
||||
"he-IL": "he-IL-AvriNeural",
|
||||
"el": "el-GR-NestorasNeural",
|
||||
"el-GR": "el-GR-NestorasNeural",
|
||||
"en": "en-US-AriaNeural",
|
||||
}
|
||||
|
||||
|
||||
class TtsRequest(BaseModel):
|
||||
text: str
|
||||
voice: Optional[str] = None
|
||||
language: Optional[str] = None
|
||||
rate: str = "+0%" # Edge accepts +20%, -10%, etc.
|
||||
volume: str = "+0%"
|
||||
pitch: str = "+0Hz"
|
||||
|
||||
|
||||
def _resolve_voice(req: TtsRequest) -> str:
|
||||
if req.voice:
|
||||
return req.voice.strip()
|
||||
if req.language and req.language in DEFAULT_VOICES:
|
||||
return DEFAULT_VOICES[req.language]
|
||||
return DEFAULT_VOICES["he"]
|
||||
|
||||
|
||||
@app.get("/health")
def health():
    """Fast readiness/liveness check; always answers with a static ok body."""
    status_body = {"status": "ok"}
    return status_body
|
||||
|
||||
|
||||
@app.get("/voices")
async def voices(language: str = "default"):
    """Return the Edge voice catalog.

    ``language="all"`` returns the unfiltered catalog. Any other value returns
    only the entries whose ``ShortName`` starts with ``he-`` or ``el-`` so the
    voice picker is limited to Hebrew and Greek.
    """
    all_voices = await edge_tts.list_voices()
    if language == "all":
        return {"voices": all_voices}

    wanted_prefixes = ("he-", "el-")
    selected = []
    for entry in all_voices:
        short_name = entry.get("ShortName", "")
        # str.startswith accepts a tuple and matches if any prefix matches.
        if short_name.startswith(wanted_prefixes):
            selected.append(entry)
    return {"voices": selected}
|
||||
|
||||
|
||||
async def _synth_with_subtitles(req: TtsRequest):
    """Synthesize ``req.text`` through Edge and collect audio plus word timing.

    Returns a ``(voice, audio_bytes, word_events)`` tuple:

    * ``voice`` — the voice name chosen by ``_resolve_voice``.
    * ``audio_bytes`` — every ``audio`` chunk of the stream, concatenated.
    * ``word_events`` — one ``{"text", "offset", "duration"}`` dict per
      ``WordBoundary`` chunk, with offset/duration in 100-ns ticks.

    Exceptions raised by edge-tts are not caught here; callers translate them
    into HTTP errors.
    """
    voice = _resolve_voice(req)
    LOG.info("edge-tts synth voice=%s len=%d", voice, len(req.text))
    communicate = edge_tts.Communicate(
        req.text,
        voice=voice,
        rate=req.rate,
        volume=req.volume,
        pitch=req.pitch,
        # The loop below only understands WordBoundary chunks, so request them
        # explicitly instead of relying on the library default.
        # NOTE(review): edge-tts 7.x is understood to default to
        # SentenceBoundary metadata — confirm against the pinned release.
        boundary="WordBoundary",
    )
    audio_buf = io.BytesIO()
    word_events: list[dict] = []
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_buf.write(chunk["data"])
        elif chunk["type"] == "WordBoundary":
            word_events.append({
                "text": chunk.get("text") or "",
                "offset": chunk.get("offset", 0),  # 100-ns ticks
                "duration": chunk.get("duration", 0),  # 100-ns ticks
            })
    return voice, audio_buf.getvalue(), word_events
|
||||
|
||||
|
||||
def _to_ms(ticks_100ns: int) -> int:
|
||||
# Edge emits offsets in 100-nanosecond ticks (.NET TimeSpan style).
|
||||
return int(round(ticks_100ns / 10_000))
|
||||
|
||||
|
||||
@app.post("/tts")
async def tts(req: TtsRequest):
    """Synthesize ``req.text`` and return the audio as ``audio/mpeg``.

    Responds 400 when the text is blank and 502 when edge-tts raises or
    produces no audio. The resolved voice is echoed back in the
    ``X-FlowerCore-Voice`` response header.
    """
    text_is_blank = not req.text.strip()
    if text_is_blank:
        raise HTTPException(status_code=400, detail="text is required")

    try:
        voice, payload, _events = await _synth_with_subtitles(req)
    except edge_tts.exceptions.NoAudioReceived:
        raise HTTPException(status_code=502, detail="edge-tts returned no audio for the supplied voice/text.")
    except Exception as ex:
        raise HTTPException(status_code=502, detail=f"edge-tts failure: {ex}")

    if not payload:
        raise HTTPException(status_code=502, detail="edge-tts returned an empty audio stream.")

    response_headers = {"X-FlowerCore-Voice": voice}
    return Response(content=payload, media_type="audio/mpeg", headers=response_headers)
|
||||
|
||||
|
||||
def _estimate_duration_ms_from_mp3(audio_bytes: bytes) -> int:
|
||||
"""Best-effort duration estimate from raw MP3 bytes by walking frame
|
||||
headers. Edge always returns CBR ~24kbps mono so we can infer total ms
|
||||
from frame count. If parsing fails, return 0 and let the caller fall
|
||||
through to a per-character heuristic."""
|
||||
if not audio_bytes:
|
||||
return 0
|
||||
# MP3 sample rates by version+layer (MPEG1 layer3 / MPEG2 layer3 / MPEG2.5 layer3).
|
||||
# We just walk frame headers and count frames; each frame is 1152 samples.
|
||||
sample_rates_v1 = [44100, 48000, 32000, 0]
|
||||
sample_rates_v2 = [22050, 24000, 16000, 0]
|
||||
sample_rates_v25 = [11025, 12000, 8000, 0]
|
||||
bitrates_v1_l3 = [0,32000,40000,48000,56000,64000,80000,96000,112000,128000,160000,192000,224000,256000,320000,0]
|
||||
bitrates_v2_l3 = [0,8000,16000,24000,32000,40000,48000,56000,64000,80000,96000,112000,128000,144000,160000,0]
|
||||
|
||||
pos = 0
|
||||
total_samples = 0
|
||||
sample_rate = 0
|
||||
while pos + 4 <= len(audio_bytes):
|
||||
b0, b1, b2, b3 = audio_bytes[pos], audio_bytes[pos+1], audio_bytes[pos+2], audio_bytes[pos+3]
|
||||
if b0 != 0xFF or (b1 & 0xE0) != 0xE0:
|
||||
pos += 1
|
||||
continue
|
||||
version_bits = (b1 >> 3) & 0x03
|
||||
layer_bits = (b1 >> 1) & 0x03
|
||||
if layer_bits != 0x01: # layer 3 only
|
||||
pos += 1
|
||||
continue
|
||||
bitrate_index = (b2 >> 4) & 0x0F
|
||||
sample_rate_index = (b2 >> 2) & 0x03
|
||||
padding = (b2 >> 1) & 0x01
|
||||
if version_bits == 0x03: # MPEG1
|
||||
sample_rate = sample_rates_v1[sample_rate_index]
|
||||
bitrate = bitrates_v1_l3[bitrate_index]
|
||||
samples_per_frame = 1152
|
||||
elif version_bits == 0x02: # MPEG2
|
||||
sample_rate = sample_rates_v2[sample_rate_index]
|
||||
bitrate = bitrates_v2_l3[bitrate_index]
|
||||
samples_per_frame = 576
|
||||
elif version_bits == 0x00: # MPEG2.5
|
||||
sample_rate = sample_rates_v25[sample_rate_index]
|
||||
bitrate = bitrates_v2_l3[bitrate_index]
|
||||
samples_per_frame = 576
|
||||
else:
|
||||
pos += 1
|
||||
continue
|
||||
if not (sample_rate and bitrate):
|
||||
pos += 1
|
||||
continue
|
||||
frame_length = int((samples_per_frame * bitrate / 8) / sample_rate) + padding
|
||||
if frame_length <= 0:
|
||||
pos += 1
|
||||
continue
|
||||
total_samples += samples_per_frame
|
||||
pos += frame_length
|
||||
|
||||
if sample_rate <= 0:
|
||||
return 0
|
||||
return int(round(total_samples * 1000 / sample_rate))
|
||||
|
||||
|
||||
@app.post("/timings")
async def timings(req: TtsRequest):
    """Synthesize ``req.text`` and return per-word timing as JSON.

    Response keys: ``text``, ``voice``, ``words`` (a list of
    ``{"text", "startMs", "endMs"}``), ``durationMs`` (end of the last word)
    and ``audioBytes`` (size of the synthesized audio).

    Responds 400 when the text is blank and 502 when synthesis raises.

    When the synthesis produced no word events, timings are distributed over
    the whitespace-separated tokens of the input in proportion to their
    character length.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="text is required")
    try:
        voice, audio_bytes, events = await _synth_with_subtitles(req)
    except Exception as ex:
        raise HTTPException(status_code=502, detail=f"edge-tts failure: {ex}")

    words: list[dict] = []
    for event in events:
        start = _to_ms(event["offset"])
        end = start + _to_ms(event["duration"])
        words.append({"text": event.get("text", ""), "startMs": start, "endMs": end})

    if not words:
        # The text is known to be non-blank here, so split() yields at least
        # one token and every token has at least one character.
        tokens = req.text.split()
        total_ms = _estimate_duration_ms_from_mp3(audio_bytes)
        if total_ms <= 0:
            # Last-resort fallback: ~600ms per word at average speaking rate.
            total_ms = len(tokens) * 600

        char_total = sum(len(token) for token in tokens)
        chars_consumed = 0
        start = 0
        for token in tokens:
            # Each boundary is computed from the CUMULATIVE character count, so
            # rounding error cannot accumulate: boundaries never decrease and
            # the final one equals total_ms exactly.
            chars_consumed += len(token)
            end = int(round(total_ms * chars_consumed / char_total))
            words.append({"text": token, "startMs": start, "endMs": end})
            start = end

    duration_ms = words[-1]["endMs"] if words else 0
    return JSONResponse({
        "text": req.text,
        "voice": voice,
        "words": words,
        "durationMs": duration_ms,
        "audioBytes": len(audio_bytes),
    })
|
||||
3
apps/fc-ttsreader/modern-tts/requirements.txt
Normal file
3
apps/fc-ttsreader/modern-tts/requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
fastapi==0.115.6
|
||||
uvicorn==0.34.0
|
||||
edge-tts==7.2.8
|
||||
47
apps/fc-ttsreader/speech-align/Dockerfile
Normal file
47
apps/fc-ttsreader/speech-align/Dockerfile
Normal file
@@ -0,0 +1,47 @@
|
||||
# FlowerCore speech-align — wraps SYSTRAN/faster-whisper with /align +
|
||||
# /transcribe endpoints used by FlowerCore.TtsReader. CPU-only image; the
|
||||
# default int8 compute type runs base.en at ~real-time on a single core.
|
||||
#
|
||||
# Build: podman build -t localhost/fc-speech-align:<ver> .
|
||||
# Run: podman run --rm -p 9200:9200 -v fc-speech-align-models:/models localhost/fc-speech-align:<ver>
|
||||
|
||||
FROM python:3.12-slim AS base
|
||||
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
||||
PIP_NO_CACHE_DIR=1 \
|
||||
WHISPER_MODEL=Systran/faster-whisper-base.en \
|
||||
WHISPER_CACHE_DIR=/models \
|
||||
WHISPER_DEVICE=cpu \
|
||||
WHISPER_COMPUTE_TYPE=int8 \
|
||||
DEFAULT_LANGUAGE=en \
|
||||
MAX_AUDIO_BYTES=52428800
|
||||
|
||||
# faster-whisper depends on libsndfile1 + libgomp1 (OpenMP runtime). ffmpeg is
|
||||
# pulled in for non-WAV inputs (transcribe accepts any container).
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
libsndfile1 \
|
||||
libgomp1 \
|
||||
ffmpeg \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
COPY requirements.txt /app/
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY app.py /app/
|
||||
|
||||
# Run as a non-root user to satisfy K8s securityContext.runAsNonRoot.
|
||||
RUN useradd --create-home --shell /usr/sbin/nologin --uid 1654 align \
|
||||
&& mkdir -p /models \
|
||||
&& chown -R 1654:1654 /models
|
||||
USER 1654
|
||||
|
||||
EXPOSE 9200
|
||||
HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
|
||||
CMD python -c "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:9200/health',timeout=3); sys.exit(0)" || exit 1
|
||||
|
||||
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "9200", "--workers", "1"]
|
||||
181
apps/fc-ttsreader/speech-align/app.py
Normal file
181
apps/fc-ttsreader/speech-align/app.py
Normal file
@@ -0,0 +1,181 @@
|
||||
"""FlowerCore speech-align service.
|
||||
|
||||
Wraps SYSTRAN/faster-whisper (https://github.com/SYSTRAN/faster-whisper) in a
|
||||
small FastAPI app exposing two endpoints:
|
||||
|
||||
* POST /align — fc-align contract used by FlowerCore.Shared.Speech's
|
||||
FasterWhisperAlignmentClient on master. Multipart form
|
||||
(`audio`, `language`) returns
|
||||
`{text, words: [{text, startMs, endMs, confidence}],
|
||||
durationMs, language, elapsedMs}`.
|
||||
* POST /transcribe — audio-file-in transcription used by the new TtsReader
|
||||
audio-import feature. Multipart form (`audio`, optional
|
||||
`language`) returns `{text, language, durationMs,
|
||||
segments: [{startSeconds, endSeconds, text}]}` so the
|
||||
UI can preview the transcript before piping it into
|
||||
Quick Read or saving as a project.
|
||||
|
||||
Both endpoints share the same WhisperModel instance (loaded once at startup).
|
||||
Model is pinned by the WHISPER_MODEL env var (defaults to base.en) and cached
|
||||
under WHISPER_CACHE_DIR (defaults to /models, backed by a PVC in K8s).
|
||||
|
||||
Health: GET /health → {status: ok, model, device, computeType}.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
||||
from fastapi.responses import JSONResponse
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
LOG = logging.getLogger("speech_align")
|
||||
logging.basicConfig(
|
||||
level=os.environ.get("LOG_LEVEL", "INFO"),
|
||||
format="%(asctime)s %(levelname)s %(name)s %(message)s",
|
||||
)
|
||||
|
||||
MODEL_NAME = os.environ.get("WHISPER_MODEL", "Systran/faster-whisper-base.en")
|
||||
DEVICE = os.environ.get("WHISPER_DEVICE", "cpu")
|
||||
COMPUTE_TYPE = os.environ.get("WHISPER_COMPUTE_TYPE", "int8")
|
||||
CACHE_DIR = os.environ.get("WHISPER_CACHE_DIR", "/models")
|
||||
MAX_BYTES = int(os.environ.get("MAX_AUDIO_BYTES", str(50 * 1024 * 1024))) # 50 MB
|
||||
DEFAULT_LANGUAGE = os.environ.get("DEFAULT_LANGUAGE", "en")
|
||||
|
||||
_state: dict[str, object] = {}
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Application lifespan hook: load the Whisper model, then release it.

    The model is built before the ``yield`` and stored under
    ``_state["model"]``; after the ``yield`` the shared state is cleared.
    """
    LOG.info("Loading faster-whisper model %s (device=%s compute=%s cache=%s)", MODEL_NAME, DEVICE, COMPUTE_TYPE, CACHE_DIR)
    load_began = time.time()
    _state["model"] = WhisperModel(
        MODEL_NAME,
        device=DEVICE,
        compute_type=COMPUTE_TYPE,
        download_root=CACHE_DIR,
    )
    load_seconds = time.time() - load_began
    LOG.info("Model loaded in %.2fs", load_seconds)
    yield
    _state.clear()
|
||||
|
||||
|
||||
app = FastAPI(title="FlowerCore speech-align", version="1.0.0", lifespan=lifespan)
|
||||
|
||||
|
||||
def _get_model() -> WhisperModel:
    """Return the loaded model from ``_state``.

    Raises ``HTTPException`` (503) when no model has been stored yet.
    """
    loaded = _state.get("model")
    if loaded is not None:
        return loaded  # type: ignore[return-value]
    raise HTTPException(status_code=503, detail="Model not loaded yet")
|
||||
|
||||
|
||||
async def _read_upload(upload: UploadFile) -> bytes:
    """Read an uploaded audio file into memory, enforcing ``MAX_BYTES``.

    Raises ``HTTPException`` 400 when the upload is empty and 413 when it is
    larger than ``MAX_BYTES``.
    """
    # Reject an oversized upload from its reported size, before pulling the
    # whole body into memory. ``size`` may be missing or None, in which case
    # the post-read check below still applies.
    # NOTE(review): assumes UploadFile.size is the part's byte count — confirm
    # against the Starlette version pulled in by the pinned FastAPI.
    declared_size = getattr(upload, "size", None)
    if declared_size is not None and declared_size > MAX_BYTES:
        raise HTTPException(
            status_code=413,
            detail=f"audio exceeds {MAX_BYTES} byte limit ({declared_size} bytes received)",
        )

    payload = await upload.read()
    if not payload:
        raise HTTPException(status_code=400, detail="audio is empty")
    if len(payload) > MAX_BYTES:
        raise HTTPException(
            status_code=413,
            detail=f"audio exceeds {MAX_BYTES} byte limit ({len(payload)} bytes received)",
        )
    return payload
|
||||
|
||||
|
||||
def _normalize_language(value: Optional[str]) -> Optional[str]:
|
||||
if not value or not value.strip():
|
||||
return DEFAULT_LANGUAGE
|
||||
return value.strip().lower()
|
||||
|
||||
|
||||
def _transcribe_bytes(audio_bytes: bytes, language: Optional[str], word_timestamps: bool):
    """Run the loaded model over in-memory audio.

    Returns ``(segments, info, elapsed_ms)``: the segment iterator fully
    materialised into a list, the info object returned by ``transcribe``, and
    the elapsed time of the call (including materialisation) in milliseconds.
    """
    whisper = _get_model()
    began = time.time()
    transcribe_options = {
        "language": language,
        "word_timestamps": word_timestamps,
        "beam_size": 1,
        "vad_filter": True,
    }
    lazy_segments, info = whisper.transcribe(io.BytesIO(audio_bytes), **transcribe_options)
    # The elapsed time is measured after the iterator has been consumed.
    segments = list(lazy_segments)
    elapsed_ms = int((time.time() - began) * 1000)
    return segments, info, elapsed_ms
|
||||
|
||||
|
||||
@app.get("/health")
def health():
    """Report service status plus the effective configuration.

    ``status`` is ``"ok"`` once a model is present in ``_state`` and
    ``"loading"`` otherwise.
    """
    model_ready = _state.get("model") is not None
    report = {
        "status": "ok" if model_ready else "loading",
        "model": MODEL_NAME,
        "device": DEVICE,
        "computeType": COMPUTE_TYPE,
        "defaultLanguage": DEFAULT_LANGUAGE,
        "maxBytes": MAX_BYTES,
    }
    return report
|
||||
|
||||
|
||||
@app.post("/align")
async def align(audio: UploadFile = File(...), language: str = Form(DEFAULT_LANGUAGE)):
    """fc-align contract — used by FlowerCore.Shared.Speech.FasterWhisperAlignmentClient.

    Accepts a multipart form (``audio``, ``language``) and returns JSON with
    ``text``, ``words`` (``{"text", "startMs", "endMs", "confidence"}`` per
    word), ``durationMs``, ``language`` and ``elapsedMs``.
    """
    # Function-local import so this handler does not depend on the module's
    # import block being edited.
    import asyncio

    payload = await _read_upload(audio)
    lang = _normalize_language(language)
    # Inference is synchronous and CPU-bound. Running it on a worker thread
    # keeps the event loop free to answer /health while a transcription runs.
    segments, info, elapsed_ms = await asyncio.to_thread(_transcribe_bytes, payload, lang, True)

    text_parts: list[str] = []
    words: list[dict] = []
    for segment in segments:
        text_parts.append(segment.text.strip())
        for word in (segment.words or []):
            # Field names MUST match the FlowerCore.Shared.Speech contract:
            # `text` / `startMs` / `endMs`. The deployed FasterWhisperAlignmentClient
            # ignores any other names — see Common's
            # FasterWhisperAlignmentResponse / FasterWhisperWord.
            words.append({
                "text": word.word.strip(),
                "startMs": int((word.start or 0.0) * 1000),
                "endMs": int((word.end or 0.0) * 1000),
                # Confidence is informational and ignored by the C# client today,
                # but kept on the wire for future scoring + fc-align operators
                # that want to surface low-confidence words.
                "confidence": float(getattr(word, "probability", 0.0) or 0.0),
            })

    duration_ms = int((info.duration or 0.0) * 1000)
    return JSONResponse({
        "text": " ".join(p for p in text_parts if p).strip(),
        "words": words,
        "durationMs": duration_ms,
        "language": info.language or lang,
        "elapsedMs": elapsed_ms,
    })
|
||||
|
||||
|
||||
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...), language: Optional[str] = Form(None)):
    """Audio-in transcription contract — used by the new TtsReader audio-import feature.

    Returns full segments (no per-word timestamps) so the UI can preview the
    transcript before piping it into Quick Read or saving as a project.

    Response keys: ``text``, ``segments`` (``{"startSeconds", "endSeconds",
    "text"}`` per segment), ``language``, ``durationMs`` and ``elapsedMs``.
    """
    # Function-local import so this handler does not depend on the module's
    # import block being edited.
    import asyncio

    payload = await _read_upload(audio)
    lang = _normalize_language(language)
    # Inference is synchronous and CPU-bound. Running it on a worker thread
    # keeps the event loop free to answer /health while a transcription runs.
    segments, info, elapsed_ms = await asyncio.to_thread(_transcribe_bytes, payload, lang, False)

    out_segments = [
        {
            "startSeconds": float(segment.start or 0.0),
            "endSeconds": float(segment.end or 0.0),
            "text": segment.text.strip(),
        }
        for segment in segments
    ]

    return JSONResponse({
        "text": " ".join(s["text"] for s in out_segments if s["text"]).strip(),
        "segments": out_segments,
        "language": info.language or lang,
        "durationMs": int((info.duration or 0.0) * 1000),
        "elapsedMs": elapsed_ms,
    })
|
||||
8
apps/fc-ttsreader/speech-align/requirements.txt
Normal file
8
apps/fc-ttsreader/speech-align/requirements.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
faster-whisper==1.0.3
|
||||
fastapi==0.115.0
|
||||
uvicorn[standard]==0.30.6
|
||||
python-multipart==0.0.10
|
||||
# faster-whisper 1.0.3's utils module imports requests but doesn't pin it as a
|
||||
# transitive dep — pin explicitly so the image isn't relying on whatever
|
||||
# happens to be in the base image.
|
||||
requests==2.32.3
|
||||
47
apps/fc-updater/README.md
Normal file
47
apps/fc-updater/README.md
Normal file
@@ -0,0 +1,47 @@
|
||||
# fc-updater — Update Center GitOps adoption
|
||||
|
||||
**Status:** adopted into `bluejay-infra` on 2026-05-06. The live ArgoCD
|
||||
Application is `infra-fc-updater`, generated by the `bluejay-infra`
|
||||
ApplicationSet with automated sync, `prune: true`, and `selfHeal: true`.
|
||||
|
||||
## Managed manifest set
|
||||
|
||||
`apps/fc-updater/fc-updater.yaml` manages:
|
||||
|
||||
- `Namespace/fc-updater`
|
||||
- `PersistentVolumeClaim/updatecenter-data`
|
||||
- `Deployment/updatecenter-web`
|
||||
- `Service/updatecenter-web`
|
||||
- `Certificate/updatecenter-web-tls`
|
||||
- `Certificate/updatecenter-web-internal-tls`
|
||||
- `IngressRoute/updatecenter-web`
|
||||
- `IngressRoute/updatecenter-web-internal`
|
||||
- `IngressRoute/updatecenter-web-public`
|
||||
|
||||
The Deployment intentionally sets `revisionHistoryLimit: 3` and
|
||||
`strategy.type: Recreate`. The service is a singleton that keeps its SQLite database and local bundle
|
||||
storage on `PersistentVolumeClaim/updatecenter-data`, pinned to
|
||||
`rke2-server`.
|
||||
|
||||
## Runtime dependencies intentionally not stored here
|
||||
|
||||
These live Secrets are pre-existing runtime material and are not committed to
|
||||
Git:
|
||||
|
||||
- `updater-bootstrap-auth`
|
||||
- `updater-signing`
|
||||
- `updater-webhooks`
|
||||
- `cf-origin-flowercore-io`
|
||||
|
||||
Rotate the Cloudflare Origin Certificate through
|
||||
`FlowerCore.Notes/docs/standards/code-signing-rotation-runbook.md`; the
|
||||
shared origin cert must exist in every namespace that serves a
|
||||
`*.flowercore.io` public IngressRoute.
|
||||
|
||||
## Verification
|
||||
|
||||
```powershell
|
||||
kubectl.exe --kubeconfig C:\Users\AndrewStoltz\.kube\rke2.yaml -n argocd get application infra-fc-updater
|
||||
kubectl.exe --kubeconfig C:\Users\AndrewStoltz\.kube\rke2.yaml -n fc-updater get deploy,svc,ingressroute,certificate,pvc
|
||||
curl.exe -sk https://update.flowercore.io/api/v1/manifests/_schema
|
||||
```
|
||||
269
apps/fc-updater/fc-updater.yaml
Normal file
269
apps/fc-updater/fc-updater.yaml
Normal file
@@ -0,0 +1,269 @@
|
||||
# FlowerCore Update Center
|
||||
# GitOps adoption of the live fc-updater namespace after PUB-1/PUB-3.
|
||||
# Runtime credentials remain in existing K8s Secrets; do not store them here.
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: fc-updater
|
||||
labels:
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: updatecenter-data
|
||||
namespace: fc-updater
|
||||
labels:
|
||||
app.kubernetes.io/name: updatecenter-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
storageClassName: longhorn
|
||||
volumeMode: Filesystem
|
||||
resources:
|
||||
requests:
|
||||
# Sized for fleet bundle storage (LocalFsBundleStore.MaxTotalBytes
|
||||
# soft cap at 25 GiB per project_uc_remaining_4_apps_signed_2026_05_06).
|
||||
# Mike Bundle alone is ~5.1 GiB; cluster live capacity is already
|
||||
# 20 GiB after a manual expand. PVCs cannot shrink, so git must track
|
||||
# at least the live size to avoid the OutOfSync loop.
|
||||
storage: 25Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: updatecenter-web
|
||||
namespace: fc-updater
|
||||
labels:
|
||||
app: updatecenter-web
|
||||
app.kubernetes.io/name: updatecenter-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
strategy:
|
||||
# SQLite + local bundle storage live on a single RWO PVC. Recreate avoids
|
||||
# two pods overlapping the same write path during future image bumps.
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app: updatecenter-web
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: updatecenter-web
|
||||
spec:
|
||||
nodeName: rke2-server
|
||||
containers:
|
||||
- name: web
|
||||
image: localhost/fc-updater-web:v20260509-4162dca-authgate
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: http
|
||||
env:
|
||||
- name: ASPNETCORE_URLS
|
||||
value: http://+:8080
|
||||
- name: FlowerCore__Updater__Database__Provider
|
||||
value: sqlite
|
||||
- name: FlowerCore__Updater__Database__ConnectionString
|
||||
value: Data Source=/data/updatecenter.db
|
||||
- name: FlowerCore__Updater__BundleStorage__LocalFs__RootDirectory
|
||||
value: /data/bundles
|
||||
- name: FlowerCore__Updater__PublicShares__RequirePublicVisibilityOnPublicHosts
|
||||
value: "true"
|
||||
- name: FlowerCore__Updater__PublicShares__Links__0__Code
|
||||
value: 8f3c2a9e7d41
|
||||
- name: FlowerCore__Updater__PublicShares__Links__0__AppId
|
||||
value: flowercore.faith-ai-mike
|
||||
- name: FlowerCore__Updater__PublicShares__Links__0__Channel
|
||||
value: stable
|
||||
- name: FlowerCore__Updater__PublicShares__Links__0__RuntimeId
|
||||
value: win-x64
|
||||
- name: FlowerCore__Updater__PublicShares__Links__0__DisplayName
|
||||
value: Faith AI Mike Edition
|
||||
- name: FlowerCore__Updater__PublicShares__Links__0__Headline
|
||||
value: Faith AI Mike Edition
|
||||
- name: FlowerCore__Updater__PublicShares__Links__0__Description
|
||||
value: Private release link for Mike's Faith AI bundle.
|
||||
- name: FlowerCore__Updater__Auth__Bootstrap__Enabled
|
||||
value: "true"
|
||||
- name: FlowerCore__Updater__Auth__Bootstrap__Username
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: updater-bootstrap-auth
|
||||
key: username
|
||||
- name: FlowerCore__Updater__Auth__Bootstrap__Password
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: updater-bootstrap-auth
|
||||
key: password
|
||||
- name: FlowerCore__Updater__Auth__Bootstrap__SigningKey
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: updater-bootstrap-auth
|
||||
key: signing-key
|
||||
- name: FlowerCore__Updater__Signing__AutoSignOnPublish
|
||||
value: "true"
|
||||
- name: FlowerCore__Updater__Signing__RequireSignatureOnPublish
|
||||
value: "true"
|
||||
- name: FlowerCore__Updater__Signing__PfxBase64
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: updater-signing
|
||||
key: pfx-base64
|
||||
- name: FlowerCore__Updater__Signing__PfxPassword
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: updater-signing
|
||||
key: pfx-password
|
||||
- name: FlowerCore__Updater__Signing__OpItemReference
|
||||
value: op://FlowerCore/step-ca-codesign
|
||||
- name: FlowerCore__Updater__Signing__TrustAnchorPath
|
||||
value: /etc/flowercore-updater/signing/root-ca.pem
|
||||
- name: FlowerCore__Updater__GitHub__Token
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: updater-webhooks
|
||||
key: github-token
|
||||
- name: FlowerCore__Updater__GitHub__WebhookSecret
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: updater-webhooks
|
||||
key: github-webhook-secret
|
||||
- name: FlowerCore__Updater__Gitea__Token
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: updater-webhooks
|
||||
key: gitea-token
|
||||
- name: FlowerCore__Updater__Gitea__WebhookSecret
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: updater-webhooks
|
||||
key: gitea-webhook-secret
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 15
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: http
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
- name: signing
|
||||
mountPath: /etc/flowercore-updater/signing
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: updatecenter-data
|
||||
- name: signing
|
||||
secret:
|
||||
secretName: updater-signing
|
||||
items:
|
||||
- key: root-ca.pem
|
||||
path: root-ca.pem
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: updatecenter-web
|
||||
namespace: fc-updater
|
||||
labels:
|
||||
app: updatecenter-web
|
||||
app.kubernetes.io/name: updatecenter-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: updatecenter-web
|
||||
ports:
|
||||
- name: http
|
||||
port: 8080
|
||||
targetPort: http
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: updatecenter-web-tls
|
||||
namespace: fc-updater
|
||||
spec:
|
||||
secretName: updatecenter-web-tls
|
||||
issuerRef:
|
||||
name: step-ca-acme
|
||||
kind: ClusterIssuer
|
||||
dnsNames:
|
||||
- updatecenter.iamworkin.lan
|
||||
- updates.iamworkin.lan
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: updatecenter-web-internal-tls
|
||||
namespace: fc-updater
|
||||
spec:
|
||||
secretName: updatecenter-web-internal-tls
|
||||
issuerRef:
|
||||
name: step-ca-acme
|
||||
kind: ClusterIssuer
|
||||
dnsNames:
|
||||
- updatecenter-internal.iamworkin.lan
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: updatecenter-web
|
||||
namespace: fc-updater
|
||||
spec:
|
||||
entryPoints:
|
||||
- web
|
||||
- websecure
|
||||
routes:
|
||||
- match: (Host(`updatecenter.iamworkin.lan`) || Host(`updates.iamworkin.lan`)) && (Method(`GET`) || Method(`HEAD`) || Method(`POST`) || Method(`OPTIONS`))
|
||||
kind: Rule
|
||||
services:
|
||||
- name: updatecenter-web
|
||||
port: 8080
|
||||
tls:
|
||||
secretName: updatecenter-web-tls
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: updatecenter-web-internal
|
||||
namespace: fc-updater
|
||||
spec:
|
||||
entryPoints:
|
||||
- web
|
||||
- websecure
|
||||
routes:
|
||||
- match: Host(`updatecenter-internal.iamworkin.lan`)
|
||||
kind: Rule
|
||||
services:
|
||||
- name: updatecenter-web
|
||||
port: 8080
|
||||
tls:
|
||||
secretName: updatecenter-web-internal-tls
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: updatecenter-web-public
|
||||
namespace: fc-updater
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- match: (Host(`update.flowercore.io`) || Host(`updates.flowercore.io`)) && (Method(`GET`) || Method(`HEAD`) || Method(`POST`) || Method(`OPTIONS`))
|
||||
kind: Rule
|
||||
services:
|
||||
- name: updatecenter-web
|
||||
port: 8080
|
||||
tls:
|
||||
secretName: cf-origin-flowercore-io
|
||||
7
apps/fc-updater/kustomization.yaml
Normal file
7
apps/fc-updater/kustomization.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
# ArgoCD's bluejay-infra ApplicationSet uses a directory generator and does
|
||||
# not require kustomization.yaml. Keep this anyway as the manifest inventory
|
||||
# and for local `kubectl kustomize apps/fc-updater` previews.
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- fc-updater.yaml
|
||||
@@ -1,6 +1,11 @@
|
||||
# FlowerCore Tenant — flowercore.io (main brand)
|
||||
# Public-facing placeholder landing page served by nginx
|
||||
# ArgoCD managed - BlueJay Lab
|
||||
# FlowerCore Tenant — retired flowercore.io placeholder.
|
||||
#
|
||||
# Public flowercore.io/www.flowercore.io routing is now owned by
|
||||
# apps/fc-landing/fc-landing.yaml. This tenant placeholder remains available
|
||||
# only as an in-cluster service; do not create a duplicate public
|
||||
# IngressRoute here because it competes with fc-landing and requires a
|
||||
# namespace-local cf-origin-flowercore-io Secret.
|
||||
# ArgoCD managed - BlueJay Lab
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
@@ -10,15 +15,9 @@ metadata:
|
||||
app.kubernetes.io/part-of: bluejay-infra
|
||||
flowercore.io/tenant: flowercore
|
||||
---
|
||||
# NOTE: The existing cf-origin-flowercore-io secret (covering *.flowercore.io)
|
||||
# must be copied into this namespace. It already exists in other namespaces.
|
||||
# Copy with: kubectl get secret cf-origin-flowercore-io -n fc-system -o yaml \
|
||||
# | sed 's/namespace: .*/namespace: tenant-flowercore/' \
|
||||
# | kubectl apply -f -
|
||||
---
|
||||
# Landing page HTML
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
# Landing page HTML
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: flowercore-web-html
|
||||
namespace: tenant-flowercore
|
||||
@@ -308,25 +307,6 @@ spec:
|
||||
selector:
|
||||
app: flowercore-web
|
||||
ports:
|
||||
- port: 80
|
||||
targetPort: 80
|
||||
name: http
|
||||
---
|
||||
# Traefik IngressRoute — public via Cloudflare
|
||||
# Uses existing cf-origin-flowercore-io cert (must be copied to this namespace)
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: flowercore-web
|
||||
namespace: tenant-flowercore
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- match: Host(`flowercore.io`) || Host(`www.flowercore.io`)
|
||||
kind: Rule
|
||||
services:
|
||||
- name: flowercore-web
|
||||
port: 80
|
||||
tls:
|
||||
secretName: cf-origin-flowercore-io
|
||||
- port: 80
|
||||
targetPort: 80
|
||||
name: http
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -3,6 +3,28 @@ kind: Namespace
|
||||
metadata:
|
||||
name: intranet
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: intranet-vector-store
|
||||
namespace: intranet
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
storageClassName: longhorn
|
||||
resources:
|
||||
requests:
|
||||
storage: 1Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: intranet-config
|
||||
namespace: intranet
|
||||
data:
|
||||
KnowledgeApiKey: ""
|
||||
TrustedHeaderSharedSecret: ""
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
@@ -12,6 +34,8 @@ metadata:
|
||||
app: intranet-web
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app: intranet-web
|
||||
@@ -22,7 +46,7 @@ spec:
|
||||
spec:
|
||||
containers:
|
||||
- name: intranet-web
|
||||
image: localhost/fc-intranet-web:latest
|
||||
image: localhost/fc-intranet-web:v20260508-brochure-w1
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 5300
|
||||
@@ -32,25 +56,58 @@ spec:
|
||||
value: Production
|
||||
- name: ASPNETCORE_URLS
|
||||
value: "http://+:5300"
|
||||
# Bulk corpus indexing on edge1 Pi 5 takes ~6s/chunk × 5665 chunks
|
||||
# ≈ 9 hours. BLUEJAY-WS GPU (R9700, 32GB VRAM) does the same work
|
||||
# in minutes. Memory: feedback_pi5_nomic_embed_slow.
|
||||
- name: IntranetSearch__OllamaBaseUrl
|
||||
value: "http://10.0.56.20:11434"
|
||||
# Sprint E Phase 2α — JSON-file-backed PageReadingOverride persistence
|
||||
# on the writable PVC at /data. Without this env var the
|
||||
# intranet falls back to the in-memory store (loses state on
|
||||
# pod restart). Master's PageReadingOverrideOptions binds
|
||||
# PageReadingOverrides:FilePath.
|
||||
- name: PageReadingOverrides__FilePath
|
||||
value: "/data/page-reading-overrides.json"
|
||||
- name: KnowledgeFleetSearch__BaseUrl
|
||||
value: "https://knowledge.iamworkin.lan"
|
||||
- name: KnowledgeFleetSearch__ApiKey
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
name: intranet-config
|
||||
key: KnowledgeApiKey
|
||||
optional: true
|
||||
- name: TrustedHeaderAuthentication__SharedSecret
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
name: intranet-config
|
||||
key: TrustedHeaderSharedSecret
|
||||
optional: true
|
||||
resources:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
memory: "256Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
memory: "1Gi"
|
||||
cpu: "1000m"
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 5300
|
||||
initialDelaySeconds: 10
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 5300
|
||||
initialDelaySeconds: 5
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
volumeMounts:
|
||||
- name: vector-store
|
||||
mountPath: /data
|
||||
volumes:
|
||||
- name: vector-store
|
||||
persistentVolumeClaim:
|
||||
claimName: intranet-vector-store
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
|
||||
1723
apps/irc/irc.yaml
1723
apps/irc/irc.yaml
File diff suppressed because it is too large
Load Diff
165
apps/knowledge/README.md
Normal file
165
apps/knowledge/README.md
Normal file
@@ -0,0 +1,165 @@
|
||||
# knowledge — FlowerCore.Knowledge.Web (Phase 2.4 K8s deploy)
|
||||
|
||||
**Status:** **LIVE 2026-04-27** at `https://knowledge.iamworkin.lan` —
|
||||
Phase 2.4 closed. Pod running, certificate issued (step-ca-acme), PVC
|
||||
bound (Longhorn 20Gi RWO), ArgoCD `infra-knowledge` synced. `/healthz`
|
||||
returns 200, `/api/v1/editions` returns `[]` (initial-deploy state — no
|
||||
*.db files in the PVC yet; Phase 2.5+ admin UI handles bulk
|
||||
population). Phase 1 of the Agent Zero MCP rollout keeps `/healthz`
|
||||
anonymous and gates `/mcp` behind `Authorization: Bearer <token>` built
|
||||
from the 1Password item `FlowerCore Knowledge MCP Tokens`.
|
||||
|
||||
- Plan: [`../../../FlowerCore.Notes/docs/ai-agents/flowercore-knowledge-service-plan.md`](../../../FlowerCore.Notes/docs/ai-agents/flowercore-knowledge-service-plan.md)
|
||||
- Sprint: [`../../../FlowerCore.Notes/docs/ai-station/sprint-e-xxl-plan.md`](../../../FlowerCore.Notes/docs/ai-station/sprint-e-xxl-plan.md) (Track B)
|
||||
- Repo: `D:\git\FlowerCore\FlowerCore.Knowledge\` (private GitHub repo,
|
||||
bootstrapped Sprint D batch 35)
|
||||
|
||||
`FlowerCore.Knowledge.Web` is the fleet-wide vector-indexing & RAG hub —
|
||||
a REST + MCP service that scans `*.db` files under
|
||||
`/data/vector-stores` and exposes per-edition reachability + corpus
|
||||
search to the rest of the FC ecosystem (Agent Zero, Chat.Web persona
|
||||
memory, AiStation embeddings explorer, TtsReader chapter context, BMO
|
||||
bot, Pi nodes via `fc-index sync`).
|
||||
|
||||
Phase 1 MCP routing is explicit:
|
||||
|
||||
- in-cluster Agent Zero → `http://knowledge-web.knowledge.svc/mcp`
|
||||
- workstation Agent Zero → `https://knowledge.iamworkin.lan/mcp`
|
||||
- probe URL for both lanes → `/healthz`
|
||||
|
||||
## Deployment order (do NOT skip / reorder)
|
||||
|
||||
### 1. FlowerCore.DNS public A record — knowledge.iamworkin.lan -> 10.0.56.200
|
||||
|
||||
Required BEFORE the Certificate resource is created, or cert-manager
|
||||
HTTP-01 silently backs off ~2h. Memory: `feedback_pfsense_dns_required_for_acme`.
|
||||
|
||||
The canonical path is FlowerCore.DNS:
|
||||
|
||||
```bash
|
||||
curl -sk https://dns.iamworkin.lan/api/v1/servers
|
||||
# Find the pfSense serverId, then create the record using the host label only.
|
||||
|
||||
curl -sk -X POST https://dns.iamworkin.lan/api/v1/servers/<serverId>/zones/iamworkin.lan/records \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"name":"knowledge","type":"A","data":"10.0.56.200","ttl":300}'
|
||||
```
|
||||
|
||||
If FlowerCore.DNS provider writes are failing 502 with "pfSense
|
||||
diag_command.php response did not contain a `<pre>` block" (status as of
|
||||
Sprint E Track B authoring 2026-04-27), add the override manually via
|
||||
the pfSense web UI:
|
||||
|
||||
1. Log in to `https://10.0.56.1` as admin
|
||||
2. Services → DNS Resolver → General Settings → Host Overrides
|
||||
3. Add: Host=`knowledge`, Domain=`iamworkin.lan`, IP Address=`10.0.56.200`
|
||||
4. Save + Apply Changes
|
||||
|
||||
Verify resolution from anywhere on LAN:
|
||||
|
||||
```bash
|
||||
nslookup knowledge.iamworkin.lan 10.0.56.1
|
||||
# Expect: 10.0.56.200
|
||||
```
|
||||
|
||||
Or against FlowerCore.DNS once the provider is fixed:
|
||||
|
||||
```bash
|
||||
curl -sk "https://dns.iamworkin.lan/api/v1/zones/iamworkin.lan/resolve-preflight?hostname=knowledge.iamworkin.lan"
|
||||
# Expect: "resolvable": true
|
||||
```
|
||||
|
||||
### 2. Build + import the image to ALL RKE2 nodes
|
||||
|
||||
Pods may schedule on any RKE2 node (server, agent1, agent2). The
|
||||
Longhorn PVC accepts mounts from any node, so the image must be
|
||||
imported to all three. Memory:
|
||||
`feedback_rke2_image_import_targets_all_nodes` +
|
||||
`feedback_rke2_localhost_imagepullpolicy`.
|
||||
|
||||
```bash
|
||||
# From BLUEJAY-WS, in D:\git\FlowerCore\FlowerCore.Knowledge
|
||||
TAG="v$(date +%Y%m%d%H%M)"
|
||||
dotnet.exe publish -c Release -o deploy/app \
|
||||
src/FlowerCore.Knowledge.Web/FlowerCore.Knowledge.Web.csproj
|
||||
podman build -t localhost/fc-knowledge-web:$TAG -f deploy/Dockerfile.deploy deploy
|
||||
podman save localhost/fc-knowledge-web:$TAG -o /tmp/fc-knowledge-web.tar
|
||||
|
||||
# Import to all three RKE2 nodes
|
||||
for node in rke2-server rke2-agent1 rke2-agent2; do
|
||||
scp /tmp/fc-knowledge-web.tar $node:/tmp/
|
||||
ssh $node "sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-knowledge-web.tar"
|
||||
done
|
||||
```
|
||||
|
||||
The repo's `scripts/deploy-knowledge.sh` automates this loop.
|
||||
|
||||
### 3. Bump the image tag + push
|
||||
|
||||
Edit `knowledge.yaml`, replace the current `localhost/fc-knowledge-web:<tag>` image tag
|
||||
with the tag from step 2, then:
|
||||
|
||||
```bash
|
||||
cd D:/git/FlowerCore/bluejay-infra
|
||||
python scripts/check-pfsense-dns.py # confirms the DNS preflight
|
||||
git add apps/knowledge/
|
||||
git commit -m "feat(knowledge): deploy Phase 2.4 K8s manifest"
|
||||
git push
|
||||
```
|
||||
|
||||
ArgoCD picks up within ~3 minutes and creates `infra-knowledge`.
|
||||
|
||||
### 4. Verify
|
||||
|
||||
```bash
|
||||
fcadmin_ssh noc1 '
|
||||
kubectl -n argocd get application infra-knowledge
|
||||
kubectl -n knowledge get certificate,pod,pvc
|
||||
curl -sk -m 8 -o /dev/null -w "HTTP %{http_code}\n" https://knowledge.iamworkin.lan/healthz
|
||||
curl -sk -m 8 https://knowledge.iamworkin.lan/api/v1/editions | jq
|
||||
'
|
||||
```
|
||||
|
||||
Expect: Certificate `Ready: True` within ~60s, `/healthz` HTTP 200,
|
||||
`/api/v1/editions` returns an empty array (no DBs in the PVC yet) on
|
||||
first deploy.
|
||||
|
||||
## Initial-deploy state and Phase 2.5 follow-up
|
||||
|
||||
The Longhorn PVC is empty on first deploy. Knowledge.Web's filesystem
|
||||
catalog will report zero editions until vector-store `*.db` files are
|
||||
pushed into `/data/vector-stores`. Initial population is a follow-up
|
||||
step (Phase 2.5+, Blazor admin UI's "Rebuild" button); for the first
|
||||
deploy the goal is just to prove the pod boots, `/healthz` returns 200,
|
||||
and the Traefik IngressRoute serves the Scalar UI.
|
||||
|
||||
To copy an existing local DB into the PVC (one-time, manual until
|
||||
Phase 2.5 admin UI lands):
|
||||
|
||||
```bash
|
||||
fcadmin_ssh noc1 '
|
||||
POD=$(kubectl -n knowledge get pod -l app=knowledge-web -o jsonpath="{.items[0].metadata.name}")
|
||||
kubectl -n knowledge cp /var/lib/flowercore/vector-stores/bluejay-ai.db $POD:/data/vector-stores/bluejay-ai.db
|
||||
'
|
||||
```
|
||||
|
||||
## Probes + middleware notes
|
||||
|
||||
- `/healthz` is mapped by `Controllers/HealthController.cs` (controller-based
|
||||
attribute route). Cheap — no DB, no dependencies.
|
||||
- Liveness uses `tcpSocket` as a defensive fallback in case future
|
||||
middleware accidentally gates `/healthz` behind auth (memory:
|
||||
`feedback_k8s_probes_behind_auth_middleware`).
|
||||
- `/openapi/v1.json` and `/scalar/v1` are wired by `UseFlowerCoreApi`.
|
||||
Per memory `feedback_k8s_probes_must_not_hit_openapi`, probes must NOT
|
||||
point at OpenAPI documents — the `MapOpenApi` call can be slow during
|
||||
cold startup.
|
||||
|
||||
## Resource sizing
|
||||
|
||||
- 256Mi memory request / 1Gi limit.
|
||||
- 100m CPU request / 1000m limit.
|
||||
- 20Gi Longhorn PVC initial — sufficient for the bluejay-ai 1.94Gi DB +
|
||||
fleet-pi-edge 352Mi + fleet-bmo-bot 141Mi + headroom. Resize via
|
||||
`kubectl -n knowledge edit pvc knowledge-vector-store` if growing
|
||||
past 15Gi.
|
||||
266
apps/knowledge/knowledge.yaml
Normal file
266
apps/knowledge/knowledge.yaml
Normal file
@@ -0,0 +1,266 @@
|
||||
# FlowerCore.Knowledge.Web — fleet vector indexing & RAG hub.
|
||||
#
|
||||
# Phase 2.4 of the Knowledge service plan. REST + MCP service that scans
|
||||
# *.db files under /data/vector-stores and exposes:
|
||||
# - REST: /api/v1/editions, /api/v1/corpus/search, /healthz
|
||||
# - MCP: list_editions, describe_edition, corpus_search
|
||||
# - Static OpenAPI/Scalar via UseFlowerCoreApi
|
||||
#
|
||||
# Architecture:
|
||||
# Plan: FlowerCore.Notes/docs/ai-agents/flowercore-knowledge-service-plan.md
|
||||
# Sprint: FlowerCore.Notes/docs/ai-station/sprint-e-xxl-plan.md (Track B)
|
||||
# Repo: D:\git\FlowerCore\FlowerCore.Knowledge\
|
||||
# Shared: FlowerCore.Common -> FlowerCore.Shared.Indexing (chunkers, vector
|
||||
# stores, edition profiles, ICorpusSearchService facade)
|
||||
#
|
||||
# Deployment order (see apps/knowledge/README.md and the bluejay-infra/README.md
|
||||
# top-level checklist):
|
||||
# 1. FlowerCore.DNS public A record knowledge.iamworkin.lan -> 10.0.56.200
|
||||
# MUST exist BEFORE the Certificate is created, or cert-manager HTTP-01
|
||||
# backs off ~2h. Memory: feedback_pfsense_dns_required_for_acme.
|
||||
# 2. Build + import the image to ALL RKE2 nodes (server + both agents) since
|
||||
# the Pod uses a Longhorn PVC and may schedule anywhere.
|
||||
# Memory: feedback_rke2_localhost_imagepullpolicy.
|
||||
# 3. Bump the image tag in this file, git push.
|
||||
# 4. ArgoCD ApplicationSet picks up within ~3 minutes and creates
|
||||
# infra-knowledge.
|
||||
#
|
||||
# Initial-deploy state:
|
||||
# The Longhorn PVC is empty on first deploy. Knowledge.Web's filesystem
|
||||
# catalog will report zero editions until vector-store *.db files are
|
||||
# pushed into /data/vector-stores. Initial population is a follow-up step
|
||||
# (Phase 2.5+, Blazor admin UI's "Rebuild" button); for the first deploy
|
||||
# the goal is just to prove the pod boots, /healthz returns 200, and the
|
||||
# Traefik IngressRoute serves the Scalar UI.
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: knowledge
|
||||
labels:
|
||||
app.kubernetes.io/part-of: bluejay-infra
|
||||
---
|
||||
# MCP bearer token for the read-only Agent Zero Phase 1 lane. The 1Password
|
||||
# item currently stores the raw token in its concealed PASSWORD field, which
|
||||
# the operator syncs into the namespaced Secret key `password`.
|
||||
apiVersion: onepassword.com/v1
|
||||
kind: OnePasswordItem
|
||||
metadata:
|
||||
name: knowledge-mcp-tokens
|
||||
namespace: knowledge
|
||||
spec:
|
||||
itemPath: "vaults/IAmWorkin/items/FlowerCore Knowledge MCP Tokens"
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: knowledge-vector-store
|
||||
namespace: knowledge
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
storageClassName: longhorn
|
||||
resources:
|
||||
requests:
|
||||
storage: 20Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: knowledge-web
|
||||
namespace: knowledge
|
||||
labels:
|
||||
app: knowledge-web
|
||||
app.kubernetes.io/name: knowledge-web
|
||||
app.kubernetes.io/part-of: bluejay-infra
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
# RWO Longhorn PVC blocks rolling updates (multi-attach error). Recreate
|
||||
# is the canonical pattern (memory: feedback_rwo_pvc_blocks_rolling).
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app: knowledge-web
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: knowledge-web
|
||||
app.kubernetes.io/name: knowledge-web
|
||||
app.kubernetes.io/part-of: bluejay-infra
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8080"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
fsGroup: 1654
|
||||
fsGroupChangePolicy: OnRootMismatch
|
||||
containers:
|
||||
- name: web
|
||||
# Placeholder tag — bump to the image you built + imported to ALL
|
||||
# RKE2 nodes via scripts/deploy-knowledge.sh before applying.
|
||||
image: localhost/fc-knowledge-web:v20260429232635
|
||||
imagePullPolicy: Never
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
args:
|
||||
- |
|
||||
if [ -n "${KNOWLEDGE_MCP_BEARER_TOKEN:-}" ]; then
|
||||
export FlowerCore__Mcp__ApiKey__Key="Bearer ${KNOWLEDGE_MCP_BEARER_TOKEN}"
|
||||
fi
|
||||
exec dotnet FlowerCore.Knowledge.Web.dll
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: http
|
||||
env:
|
||||
- name: ASPNETCORE_URLS
|
||||
value: "http://+:8080"
|
||||
- name: ASPNETCORE_ENVIRONMENT
|
||||
value: "Production"
|
||||
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
||||
value: "false"
|
||||
# Vector-store directory + embedding model + edition profile dir.
|
||||
# Profile JSON is baked into the image at /home/app/editions via the
|
||||
# csproj Content-link from FlowerCore.Common/editions/.
|
||||
- name: Knowledge__VectorStoresDirectory
|
||||
value: "/data/vector-stores"
|
||||
- name: Knowledge__EmbeddingModel
|
||||
value: "nomic-embed-text"
|
||||
- name: Knowledge__DefaultLimit
|
||||
value: "5"
|
||||
- name: Knowledge__MaxLimit
|
||||
value: "50"
|
||||
- name: FlowerCore__Editions__ProfileDirectory
|
||||
value: "/home/app/editions"
|
||||
# Embed via edge1 Pi 5 + AI HAT+ (10.0.57.17:11434). Cluster
|
||||
# services do not depend on BLUEJAY-WS (private dev hardware) per
|
||||
# bluejay-infra@0f9d56e. Query-time embedding is fast enough on
|
||||
# edge1 (~ms per query); bulk index rebuilds (Phase 2.5+) will
|
||||
# need a separate ingestion lane that can opt into the
|
||||
# workstation GPU when present.
|
||||
- name: FlowerCore__Ollama__BaseUrl
|
||||
value: "http://10.0.57.17:11434"
|
||||
- name: FlowerCore__Mcp__ApiKey__Key
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: knowledge-mcp-tokens
|
||||
key: password
|
||||
- name: FlowerCore__Mcp__ApiKey__HeaderName
|
||||
value: "Authorization"
|
||||
- name: KNOWLEDGE_MCP_BEARER_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: knowledge-mcp-tokens
|
||||
key: password
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 1000m
|
||||
memory: 1Gi
|
||||
# /healthz is mapped by HealthController (controller-based route).
|
||||
# tcpSocket liveness is the defensive fallback in case middleware
|
||||
# later gates /healthz behind auth (memory:
|
||||
# feedback_k8s_probes_behind_auth_middleware).
|
||||
startupProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: 8080
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
failureThreshold: 30
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: 8080
|
||||
periodSeconds: 10
|
||||
failureThreshold: 3
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
failureThreshold: 3
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1654
|
||||
runAsGroup: 1654
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
volumeMounts:
|
||||
- name: vector-store
|
||||
mountPath: /data/vector-stores
|
||||
- name: tmp
|
||||
mountPath: /tmp
|
||||
- name: logs
|
||||
mountPath: /home/app/logs
|
||||
volumes:
|
||||
- name: vector-store
|
||||
persistentVolumeClaim:
|
||||
claimName: knowledge-vector-store
|
||||
- name: tmp
|
||||
emptyDir: {}
|
||||
- name: logs
|
||||
emptyDir: {}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: knowledge-web
|
||||
namespace: knowledge
|
||||
labels:
|
||||
app: knowledge-web
|
||||
app.kubernetes.io/name: knowledge-web
|
||||
app.kubernetes.io/part-of: bluejay-infra
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: knowledge-web
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: 8080
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: knowledge-tls
|
||||
namespace: knowledge
|
||||
spec:
|
||||
secretName: knowledge-tls
|
||||
issuerRef:
|
||||
name: step-ca-acme
|
||||
kind: ClusterIssuer
|
||||
dnsNames:
|
||||
- knowledge.iamworkin.lan
|
||||
# step-ca ACME caps lifetime at 30d; the previously requested 90d was silently capped, which
|
||||
# made renewBefore=cert-lifetime → perpetual renewal loop (10888+ CRs
|
||||
# in 18h on 2026-05-07). Match working 720h/240h pattern from other
|
||||
# FC services.
|
||||
duration: 720h # 30d (step-ca cap)
|
||||
renewBefore: 240h # 10d
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: knowledge
|
||||
namespace: knowledge
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- match: Host(`knowledge.iamworkin.lan`)
|
||||
kind: Rule
|
||||
services:
|
||||
- name: knowledge-web
|
||||
port: 80
|
||||
tls:
|
||||
secretName: knowledge-tls
|
||||
7
apps/knowledge/kustomization.yaml
Normal file
7
apps/knowledge/kustomization.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
# ArgoCD's bluejay-infra ApplicationSet uses a directory generator and does
|
||||
# not require kustomization.yaml. Mirrors the fc-distribution shape so
|
||||
# `kubectl kustomize` previews work from a working copy.
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- knowledge.yaml
|
||||
510
apps/kubevirt-vms/ci1.yaml
Normal file
510
apps/kubevirt-vms/ci1.yaml
Normal file
@@ -0,0 +1,510 @@
|
||||
# =============================================================================
|
||||
# ci1 — Windows Server 2025 KubeVirt VM (GitHub Actions Self-Hosted Runner)
|
||||
# =============================================================================
|
||||
# Purpose: dedicated CI runner for FlowerCore.Updater Sandbox E2E nightly +
|
||||
# future fleet WPF AAT lanes. Replaces the never-registered
|
||||
# `bluejay-ws-sandbox-1` runner placeholder. Andrew explicitly does NOT want
|
||||
# BLUEJAY-WS registered as a runner (workstation has personal/operator state).
|
||||
#
|
||||
# Storage layout (2026-05-08):
|
||||
# * ISO is now sourced from Synology NFS (Path B) — see
|
||||
# win2025-iso-nfs-pv.yaml. The Longhorn Filesystem PVC
|
||||
# `windows-server-2025-iso` below is RETAINED but UNUSED so the prior
|
||||
# CDI upload state is preserved as a fallback (and so ArgoCD doesn't
|
||||
# prune it on this commit). It can be deleted in a follow-up commit
|
||||
# after the NFS path is proven on a successful Windows install.
|
||||
#
|
||||
# Status (2026-05-08): LIVE — Phase 1 prereqs satisfied:
|
||||
# * Multus CNI v4.2.2 thick-plugin DaemonSet running on all 3 RKE2 nodes
|
||||
# (apps/multus/multus.yaml; ApplicationSet `infra-multus` Synced/Healthy)
|
||||
# * CDI v1.65.0 operator + CR Deployed (apps/cdi/; ApplicationSet
|
||||
# `infra-cdi` Synced/Healthy; uploadproxy reachable via kubectl port-forward)
|
||||
# * Windows Server 2025 ISO uploaded via CDI virtctl image-upload to
|
||||
# PVC windows-server-2025-iso (7.7 GiB → 10Gi PVC, Bound, Upload Complete)
|
||||
# * Local Administrator password generated, stored in 1Password vault
|
||||
# IAmWorkin (qaphopopkryhbg353ukzhhuqoq) item id h3ix4mgfk65gmkcmvh6ly3d3hu
|
||||
# * NetworkAttachmentDefinition prod-vlan57 registered (apps/kubevirt-vms/
|
||||
# prod-vlan57-nad.yaml). VM still uses pod-network masquerade until Phase 1.5
|
||||
# host bridge work lands (Puppet br-prod + enp86s0.57); switching is a
|
||||
# one-line YAML edit + git push.
|
||||
#
|
||||
# See docs/infrastructure/windows-server-build-runner-plan.md "Phase 1 readiness gate".
|
||||
#
|
||||
# Network choice in this draft: **pod-network fallback** (Calico default).
|
||||
# Outbound-only is fine for the Updater Sandbox E2E runner workload (the runner
|
||||
# polls GitHub Actions over HTTPS; no inbound listener needed). Switch to a
|
||||
# Multus PROD VLAN NetworkAttachmentDefinition once Multus is installed and the
|
||||
# operator wants L2 access from `ci1` to other PROD VLAN services.
|
||||
#
|
||||
# Sizing: 8 vCPU / 16 GB RAM / 200 GB disk on Longhorn (default storageClass).
|
||||
# Capacity check 2026-05-08: each RKE2 node has 16 vCPU / ~64Gi allocatable;
|
||||
# 8 vCPU is 50% of one node's 16 vCPU (~17% of the 48 vCPU across all 3 nodes), fits comfortably.
|
||||
#
|
||||
# Apply (after operator approval + ISO loaded):
|
||||
# kubectl --kubeconfig $env:USERPROFILE\.kube\rke2.yaml apply -f apps/kubevirt-vms/ci1.yaml
|
||||
#
|
||||
# Connect to console for Windows install:
|
||||
# virtctl --kubeconfig $env:USERPROFILE\.kube\rke2.yaml vnc ci1 -n kubevirt-vms
|
||||
# (Or via Guacamole once a connection profile is added.)
|
||||
# =============================================================================
|
||||
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: kubevirt-vms
|
||||
labels:
|
||||
app.kubernetes.io/part-of: kubevirt-stack
|
||||
pod-security.kubernetes.io/enforce: privileged
|
||||
|
||||
---
|
||||
# ISO PVC — populated via CDI virtctl image-upload (CDI is now installed).
|
||||
#
|
||||
# **Volume mode (2026-05-08 status):** Filesystem-mode PVC. A migration to
|
||||
# `volumeMode: Block` via DataVolume was attempted to address an OVMF SATA
|
||||
# CDROM read timeout, but CDI v1.65.0's upload-target pod runs as uid 107
|
||||
# with `capabilities.drop: [ALL]` and cannot open the underlying block
|
||||
# device (`blockdev: cannot open /dev/cdi-block-volume: Permission denied`).
|
||||
# Reverted to Filesystem PVC pending one of:
|
||||
# - CDI deployment override granting CAP_SYS_RAWIO to upload pod
|
||||
# - Pre-populated PVC via privileged init pod that dd's the ISO directly
|
||||
# - Migration to a different storage class that exposes block devices
|
||||
# differently (e.g. iSCSI, where Longhorn's CSI mount path may behave
|
||||
# differently)
|
||||
#
|
||||
# Population workflow (this PVC, Filesystem mode):
|
||||
# 1. virtctl --kubeconfig $env:USERPROFILE\.kube\rke2.yaml image-upload pvc \
|
||||
# windows-server-2025-iso -n kubevirt-vms \
|
||||
# --image-path "$env:USERPROFILE\Downloads\en-us_windows_server_2025_updated_march_2026_x64_dvd_8e06425a.iso" \
|
||||
# --size 10Gi --storage-class longhorn --access-mode ReadWriteOnce \
|
||||
# --uploadproxy-url https://localhost:8443 --insecure
|
||||
# (--uploadproxy-url uses port-forward in practice: `kubectl port-forward
|
||||
# -n cdi service/cdi-uploadproxy 8443:443 &` first.)
|
||||
#
|
||||
# **Open boot issue:** even with the ISO at bootOrder:1, OVMF console showed:
|
||||
# BdsDxe: starting Boot0001 "UEFI QEMU DVD-ROM QM00001 " from ... Sata(...)
|
||||
# BdsDxe: failed to start Boot0001 ... Time out
|
||||
# Diagnosis confirmed PVC content IS a valid bootable ISO9660 image — the
|
||||
# timeout is in OVMF reading from the SATA-CDROM-backed-by-filesystem-PVC.
|
||||
# Block mode would likely fix it; see CDI permission issue above.
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: windows-server-2025-iso
|
||||
namespace: kubevirt-vms
|
||||
labels:
|
||||
app: ci-runner
|
||||
flowercore.io/managed-by: bluejay-infra
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce # Bump to ReadOnlyMany after population for multi-VM use
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi # Server 2025 ISO is 7.7GB; 10Gi for headroom
|
||||
storageClassName: longhorn
|
||||
|
||||
---
|
||||
# Root disk PVC — empty 200Gi volume that Windows installs into.
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: ci1-rootdisk
|
||||
namespace: kubevirt-vms
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 200Gi
|
||||
storageClassName: longhorn
|
||||
|
||||
---
|
||||
# Sysprep ConfigMap — autounattend.xml for hands-off Windows install.
|
||||
# Sets local Administrator password (REPLACE the placeholder), enables RDP,
|
||||
# enables WinRM, sets hostname, and configures static-ish networking via DHCP.
|
||||
# The ISO + VirtIO drivers handle the rest.
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: ci1-autounattend
|
||||
namespace: kubevirt-vms
|
||||
data:
|
||||
autounattend.xml: |
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<unattend xmlns="urn:schemas-microsoft-com:unattend" xmlns:wcm="http://schemas.microsoft.com/WMIConfig/2002/State">
|
||||
|
||||
<!-- Pass 1: WindowsPE — Disk setup and VirtIO driver injection -->
|
||||
<settings pass="windowsPE">
|
||||
<component name="Microsoft-Windows-International-Core-WinPE"
|
||||
processorArchitecture="amd64"
|
||||
publicKeyToken="31bf3856ad364e35"
|
||||
language="neutral" versionScope="nonSxS">
|
||||
<SetupUILanguage>
|
||||
<UILanguage>en-US</UILanguage>
|
||||
</SetupUILanguage>
|
||||
<InputLocale>en-US</InputLocale>
|
||||
<SystemLocale>en-US</SystemLocale>
|
||||
<UILanguage>en-US</UILanguage>
|
||||
<UserLocale>en-US</UserLocale>
|
||||
</component>
|
||||
|
||||
<component name="Microsoft-Windows-PnpCustomizationsWinPE"
|
||||
processorArchitecture="amd64"
|
||||
publicKeyToken="31bf3856ad364e35"
|
||||
language="neutral" versionScope="nonSxS">
|
||||
<DriverPaths>
|
||||
<PathAndCredentials wcm:action="add" wcm:keyValue="1">
|
||||
<Path>E:\amd64\2k25</Path>
|
||||
</PathAndCredentials>
|
||||
</DriverPaths>
|
||||
</component>
|
||||
|
||||
<component name="Microsoft-Windows-Setup"
|
||||
processorArchitecture="amd64"
|
||||
publicKeyToken="31bf3856ad364e35"
|
||||
language="neutral" versionScope="nonSxS">
|
||||
<DiskConfiguration>
|
||||
<Disk wcm:action="add">
|
||||
<DiskID>0</DiskID>
|
||||
<WillWipeDisk>true</WillWipeDisk>
|
||||
<CreatePartitions>
|
||||
<CreatePartition wcm:action="add">
|
||||
<Order>1</Order>
|
||||
<Size>260</Size>
|
||||
<Type>EFI</Type>
|
||||
</CreatePartition>
|
||||
<CreatePartition wcm:action="add">
|
||||
<Order>2</Order>
|
||||
<Size>128</Size>
|
||||
<Type>MSR</Type>
|
||||
</CreatePartition>
|
||||
<CreatePartition wcm:action="add">
|
||||
<Order>3</Order>
|
||||
<Extend>true</Extend>
|
||||
<Type>Primary</Type>
|
||||
</CreatePartition>
|
||||
</CreatePartitions>
|
||||
<ModifyPartitions>
|
||||
<ModifyPartition wcm:action="add">
|
||||
<Order>1</Order>
|
||||
<PartitionID>1</PartitionID>
|
||||
<Format>FAT32</Format>
|
||||
<Label>EFI</Label>
|
||||
</ModifyPartition>
|
||||
<ModifyPartition wcm:action="add">
|
||||
<Order>2</Order>
|
||||
<PartitionID>2</PartitionID>
|
||||
</ModifyPartition>
|
||||
<ModifyPartition wcm:action="add">
|
||||
<Order>3</Order>
|
||||
<PartitionID>3</PartitionID>
|
||||
<Format>NTFS</Format>
|
||||
<Label>Windows</Label>
|
||||
</ModifyPartition>
|
||||
</ModifyPartitions>
|
||||
</Disk>
|
||||
</DiskConfiguration>
|
||||
|
||||
<ImageInstall>
|
||||
<OSImage>
|
||||
<InstallTo>
|
||||
<DiskID>0</DiskID>
|
||||
<PartitionID>3</PartitionID>
|
||||
</InstallTo>
|
||||
<!-- Index 2 = Standard Desktop Experience. Use 4 for Datacenter Desktop. -->
|
||||
<InstallFrom>
|
||||
<MetaData wcm:action="add">
|
||||
<Key>/IMAGE/INDEX</Key>
|
||||
<Value>2</Value>
|
||||
</MetaData>
|
||||
</InstallFrom>
|
||||
</OSImage>
|
||||
</ImageInstall>
|
||||
|
||||
<UserData>
|
||||
<AcceptEula>true</AcceptEula>
|
||||
<FullName>FlowerCore CI Runner</FullName>
|
||||
<Organization>FlowerCore</Organization>
|
||||
<!-- Eval install — no product key needed for 180-day evaluation -->
|
||||
</UserData>
|
||||
</component>
|
||||
</settings>
|
||||
|
||||
<!-- Pass 4: Specialize — Hostname, RDP, WinRM -->
|
||||
<settings pass="specialize">
|
||||
<component name="Microsoft-Windows-Shell-Setup"
|
||||
processorArchitecture="amd64"
|
||||
publicKeyToken="31bf3856ad364e35"
|
||||
language="neutral" versionScope="nonSxS">
|
||||
<ComputerName>CI1</ComputerName>
|
||||
<TimeZone>Central Standard Time</TimeZone>
|
||||
</component>
|
||||
|
||||
<component name="Microsoft-Windows-TerminalServices-LocalSessionManager"
|
||||
processorArchitecture="amd64"
|
||||
publicKeyToken="31bf3856ad364e35"
|
||||
language="neutral" versionScope="nonSxS">
|
||||
<fDenyTSConnections>false</fDenyTSConnections>
|
||||
</component>
|
||||
</settings>
|
||||
|
||||
<!-- Pass 7: OOBE — Admin account, RDP firewall, WinRM -->
|
||||
<settings pass="oobeSystem">
|
||||
<component name="Microsoft-Windows-Shell-Setup"
|
||||
processorArchitecture="amd64"
|
||||
publicKeyToken="31bf3856ad364e35"
|
||||
language="neutral" versionScope="nonSxS">
|
||||
<OOBE>
|
||||
<HideEULAPage>true</HideEULAPage>
|
||||
<HideLocalAccountScreen>true</HideLocalAccountScreen>
|
||||
<HideOEMRegistrationScreen>true</HideOEMRegistrationScreen>
|
||||
<HideOnlineAccountScreens>true</HideOnlineAccountScreens>
|
||||
<HideWirelessSetupInOOBE>true</HideWirelessSetupInOOBE>
|
||||
<ProtectYourPC>3</ProtectYourPC>
|
||||
</OOBE>
|
||||
<UserAccounts>
|
||||
<AdministratorPassword>
|
||||
<!-- Real password is in 1Password — vault qaphopopkryhbg353ukzhhuqoq,
|
||||
item id h3ix4mgfk65gmkcmvh6ly3d3hu, title:
|
||||
"ci1 Administrator (Windows Server 2025 KubeVirt VM)".
|
||||
Field "autounattend AdministratorPassword Value (UTF-16-LE base64)"
|
||||
matches the Value below.
|
||||
To rotate: regenerate, recompute base64
|
||||
$combined = $pw + "AdministratorPassword"
|
||||
[Convert]::ToBase64String([Text.Encoding]::Unicode.GetBytes($combined))
|
||||
then update both 1P item AND this Value field, recreate VM. -->
|
||||
<Value>bAA3AGsANABOAHcAcgBMAG4AeQBTAHUAYgBBAHQAaQBzAFUAcAB6AEMAWQAhADkAYQBCAEEAZABtAGkAbgBpAHMAdAByAGEAdABvAHIAUABhAHMAcwB3AG8AcgBkAA==</Value>
|
||||
<PlainText>false</PlainText>
|
||||
</AdministratorPassword>
|
||||
</UserAccounts>
|
||||
<FirstLogonCommands>
|
||||
<SynchronousCommand wcm:action="add">
|
||||
<Order>1</Order>
|
||||
<CommandLine>powershell.exe -ExecutionPolicy Bypass -Command "Set-NetFirewallRule -DisplayGroup 'Remote Desktop' -Enabled True"</CommandLine>
|
||||
<Description>Enable RDP firewall rule</Description>
|
||||
</SynchronousCommand>
|
||||
<SynchronousCommand wcm:action="add">
|
||||
<Order>2</Order>
|
||||
<CommandLine>powershell.exe -ExecutionPolicy Bypass -Command "Enable-PSRemoting -Force; Set-Item WSMan:\localhost\Service\Auth\Basic $true; Set-Item WSMan:\localhost\Service\AllowUnencrypted $true"</CommandLine>
|
||||
<Description>Enable WinRM (Phase 2 will pivot to HTTPS via step-ca cert)</Description>
|
||||
</SynchronousCommand>
|
||||
<SynchronousCommand wcm:action="add">
|
||||
<Order>3</Order>
|
||||
<CommandLine>cmd.exe /c reg add "HKLM\SOFTWARE\Microsoft\Windows\CurrentVersion\Policies\System" /v EnableLUA /t REG_DWORD /d 0 /f</CommandLine>
|
||||
<Description>Disable UAC (Phase 2 Puppet will re-evaluate)</Description>
|
||||
</SynchronousCommand>
|
||||
</FirstLogonCommands>
|
||||
</component>
|
||||
</settings>
|
||||
</unattend>
|
||||
|
||||
---
|
||||
# VirtualMachine — Windows Server 2025 CI runner.
|
||||
apiVersion: kubevirt.io/v1
|
||||
kind: VirtualMachine
|
||||
metadata:
|
||||
name: ci1
|
||||
namespace: kubevirt-vms
|
||||
labels:
|
||||
app: ci-runner
|
||||
role: github-actions-runner
|
||||
flowercore.io/managed-by: bluejay-infra
|
||||
spec:
|
||||
# `running: true` is deprecated in favor of `runStrategy`. They are mutually
|
||||
# exclusive — KubeVirt's validating webhook rejects any VM that sets both:
|
||||
# admission webhook "virtualmachine-validator.kubevirt.io" denied the request:
|
||||
# Running and RunStrategy are mutually exclusive.
|
||||
# `Always` keeps a VMI running and restarts it if it crashes/exits — same
|
||||
# semantics as the old `running: true`.
|
||||
#
|
||||
# **2026-05-08 status: VM cannot start due to a stale QEMU flock on the
|
||||
# rootdisk PVC** (qemu reports `Failed to get "write" lock` on
|
||||
# `/var/run/kubevirt-private/vmi-disks/rootdisk/disk.img`). The flock was
|
||||
# left by a previous QEMU process during a force-deleted launcher pod
|
||||
# cycle. Recovery requires either (a) a Longhorn engine restart on
|
||||
# rke2-agent2, (b) a Longhorn volume detach via the longhorn-manager API
|
||||
# (kubectl patch on `volume.longhorn.io/<pvc-name>` does not work — the
|
||||
# spec.nodeID is reconciled back), or (c) a node reboot of rke2-agent2.
|
||||
#
|
||||
# **Confirmed working:** the bootOrder swap (windows-iso=1, rootdisk=2)
|
||||
# and the runStrategy migration (above). The ISO PVC was successfully
|
||||
# repopulated via virtctl image-upload pvc on the Filesystem-mode PVC.
|
||||
#
|
||||
# **Open: SATA CDROM read timeout** — even with bootOrder=1, OVMF reported
|
||||
# `BdsDxe: failed to start Boot0001 ... Time out` reading the SATA CDROM
|
||||
# backed by the Filesystem-mode PVC. A switch to Block-mode DataVolume
|
||||
# was attempted but blocked by a CDI v1.65.0 upload-pod permission issue
|
||||
# (capability drop prevents writing to the underlying block device).
|
||||
# See header docstring on the ISO PVC.
|
||||
runStrategy: Always # LIVE — ISO uploaded 2026-05-08, password in 1P
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: ci-runner
|
||||
role: github-actions-runner
|
||||
kubevirt.io/vm: ci1
|
||||
spec:
|
||||
domain:
|
||||
cpu:
|
||||
cores: 8
|
||||
sockets: 1
|
||||
threads: 1
|
||||
memory:
|
||||
guest: 16Gi
|
||||
resources:
|
||||
requests:
|
||||
memory: 16Gi
|
||||
limits:
|
||||
memory: 16Gi
|
||||
clock:
|
||||
utc: {}
|
||||
timer:
|
||||
hpet:
|
||||
present: false
|
||||
pit:
|
||||
tickPolicy: delay
|
||||
rtc:
|
||||
tickPolicy: catchup
|
||||
hyperv: {}
|
||||
features:
|
||||
acpi: {}
|
||||
apic: {}
|
||||
hyperv:
|
||||
relaxed: {}
|
||||
vapic: {}
|
||||
spinlocks:
|
||||
spinlocks: 8191
|
||||
smm: {}
|
||||
firmware:
|
||||
bootloader:
|
||||
efi:
|
||||
# 2026-05-08: SecureBoot=false during initial install. With SecureBoot
|
||||
# enabled, OVMF's BdsDxe times out reading Boot0001 from the SCSI
|
||||
# CDROM ("BdsDxe: failed to start Boot0001 ... Time out") before the
|
||||
# EFI bootloader signature can verify against the OVMF VARS trust DB.
|
||||
# KubeVirt's `/usr/share/OVMF/OVMF_VARS.secboot.fd` template doesn't
|
||||
# appear to include the Microsoft KEK/DB by default, so signed
|
||||
# Windows EFI bootloaders fail validation. Disabling SecureBoot lets
|
||||
# OVMF skip the chain check and boot directly. This is acceptable for
|
||||
# a CI runner — TPM 2.0 is still emulated (`tpm: {}` below) so
|
||||
# BitLocker / Hyper-V / WSL still work.
|
||||
# When the operator wants SecureBoot back, the path is:
|
||||
# 1. Custom-build OVMF_VARS.fd with Microsoft KEK/DB enrolled
|
||||
# 2. Mount it into the VM via firmware.bootloader.efi.persistent
|
||||
# 3. Set secureBoot: true again
|
||||
# Tracked separately from the install unblock.
|
||||
secureBoot: false
|
||||
devices:
|
||||
tpm: {} # Non-persistent vTPM — sufficient for runner; no BitLocker
|
||||
disks:
|
||||
# bootOrder: ISO must be 1 for first-boot install (the rootdisk has no
|
||||
# EFI bootloader yet). After Windows installs, it writes its own UEFI
|
||||
# Boot#### entries pointing at the rootdisk's EFI partition; UEFI then
|
||||
# boots from rootdisk going forward and the ISO at bootOrder:2 acts as
|
||||
# a fallback for re-install scenarios.
|
||||
#
|
||||
# Original (broken) order had rootdisk=1, windows-iso=2 — UEFI tried
|
||||
# the empty virtio disk first, got nothing, fell back to the SATA
|
||||
# CDROM at Boot0001 with a short timeout, and timed out before the
|
||||
# CDROM enumerated. Console showed:
|
||||
# BdsDxe: failed to start Boot0001 ... Time out
|
||||
# BdsDxe: No bootable option or device was found.
|
||||
# Confirmed via debug pod: PVC content IS a real bootable ISO9660
|
||||
# (file: "ISO 9660 CD-ROM filesystem data ... (bootable)"), so the
|
||||
# only bug was boot priority.
|
||||
# 2026-05-08 PM: ISO presented as a virtio-blk DISK (not cdrom).
|
||||
# Both SATA and SCSI cdrom buses hit OVMF BdsDxe "starting Boot0001
|
||||
# ... Time out" regardless of storage backend (NFS, Longhorn PVC,
|
||||
# containerDisk tmpfs — all rule out IO speed). The qemu cdrom
|
||||
# emulation path appears to have a deep-seated read window issue
|
||||
# under KubeVirt v1.4.0's OVMF firmware.
|
||||
#
|
||||
# Workaround: present the ISO bytes as a regular virtio-blk disk
|
||||
# (model="virtio-non-transitional"). UEFI/OVMF still recognizes
|
||||
# ISO9660 + El Torito boot records on a regular disk, so it can
|
||||
# boot the EFI bootloader the same way it would from a USB stick.
|
||||
# This is also closer to the FlowerCore.Distribution USB-key
|
||||
# pattern: the ISO bytes live on a block device, UEFI boots from
|
||||
# the GPT/El Torito boot record, Windows installer runs.
|
||||
- name: windows-iso
|
||||
bootOrder: 1
|
||||
disk:
|
||||
bus: virtio
|
||||
- name: rootdisk
|
||||
bootOrder: 2
|
||||
disk:
|
||||
bus: virtio
|
||||
- name: virtio-drivers
|
||||
cdrom:
|
||||
bus: sata
|
||||
- name: sysprep
|
||||
cdrom:
|
||||
bus: sata
|
||||
interfaces:
|
||||
# Pod-network fallback for Phase 1. To switch to PROD VLAN once Multus
|
||||
# + the prod-vlan57 NAD exist, replace this block with:
|
||||
# - name: prod-net
|
||||
# bridge: {}
|
||||
# model: virtio
|
||||
# and update the networks: stanza to use multus.networkName: kubevirt-vms/prod-vlan57
|
||||
- name: default
|
||||
masquerade: {}
|
||||
model: virtio
|
||||
machine:
|
||||
type: q35
|
||||
networks:
|
||||
- name: default
|
||||
pod: {}
|
||||
volumes:
|
||||
- name: rootdisk
|
||||
persistentVolumeClaim:
|
||||
claimName: ci1-rootdisk
|
||||
- name: windows-iso
|
||||
# 2026-05-08 PM (Path C, CONTAINERDISK): the ISO is now packaged as
|
||||
# a KubeVirt containerDisk OCI image baked from
|
||||
# `FROM scratch ; ADD --chown=107:107 disk.img /disk/disk.img`.
|
||||
# The qemu user (uid 107) reads the ISO directly from a tmpfs view
|
||||
# of the OCI layer, bypassing both:
|
||||
# - Synology NFS export ACL (Path B failed: uid 107 denied at
|
||||
# directory level even with mode 0777, see memory
|
||||
# feedback_synology_iso_export_root_only_uid_107_denied)
|
||||
# - OVMF cdrom read-window timeout (Path A and Path B's SCSI
|
||||
# retry both hit `BdsDxe: failed to start Boot0001 ... Time out`
|
||||
# when the cdrom was backed by a PVC the storage controller
|
||||
# couldn't satisfy reads from fast enough).
|
||||
#
|
||||
# Image build (one-time, per ISO version):
|
||||
# 1. Copy ISO to disk.img, write Dockerfile
|
||||
# 2. podman build --tag localhost/win-server-2025:1.0 . (on noc1)
|
||||
# 3. podman save -o win-server-2025-1.0.tar localhost/win-server-2025:1.0
|
||||
# 4. SCP tar to all 3 RKE2 nodes (rke2-server, rke2-agent1, rke2-agent2)
|
||||
# 5. sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock \
|
||||
# -n k8s.io images import /tmp/win-server-2025-1.0.tar
|
||||
# Standard FC pattern per `feedback_rke2_localhost_imagepullpolicy`.
|
||||
#
|
||||
# When a new Windows ISO version ships, bump the tag (1.1, 1.2, ...),
|
||||
# rebuild + redistribute, and update the image: line below in a new
|
||||
# commit. KubeVirt picks up the new image via a VM restart.
|
||||
#
|
||||
# The legacy NFS PVC + PV (apps/kubevirt-vms/win2025-iso-nfs-pv.yaml)
|
||||
# and CDI Longhorn PVC (`windows-server-2025-iso`) are RETAINED for
|
||||
# this commit so the prior states are recoverable. Once the
|
||||
# containerDisk path proves on a successful Windows install, both
|
||||
# legacy artifacts can be pruned in a follow-up commit.
|
||||
containerDisk:
|
||||
image: localhost/win-server-2025:1.0
|
||||
imagePullPolicy: Never
|
||||
- name: virtio-drivers
|
||||
containerDisk:
|
||||
# Pinned to v1.8.2 (latest stable as of 2026-05-08).
|
||||
# The :latest tag uses Docker manifest v1 schema which containerd
|
||||
# 2.1 (RKE2 v1.34.5) refuses to pull with:
|
||||
# "media type application/vnd.docker.distribution.manifest.v1+prettyjws
|
||||
# is no longer supported since containerd v2.1"
|
||||
# v1.8.2 is rebuilt with manifest v2/OCI and works on containerd 2.1.
|
||||
# Bump available: https://quay.io/repository/kubevirt/virtio-container-disk?tab=tags
|
||||
image: quay.io/kubevirt/virtio-container-disk:v1.8.2
|
||||
- name: sysprep
|
||||
sysprep:
|
||||
configMap:
|
||||
name: ci1-autounattend
|
||||
terminationGracePeriodSeconds: 3600
|
||||
69
apps/kubevirt-vms/prod-vlan57-nad.yaml
Normal file
69
apps/kubevirt-vms/prod-vlan57-nad.yaml
Normal file
@@ -0,0 +1,69 @@
|
||||
# =============================================================================
|
||||
# NetworkAttachmentDefinition — PROD VLAN 57 bridge
|
||||
# =============================================================================
|
||||
# Purpose: makes KubeVirt VMs reachable on the PROD VLAN (10.0.57.0/24)
|
||||
# alongside the existing pod network. Required for ci1 to bridge onto PROD
|
||||
# (e.g. to provision/scrape edge1, edge2, kiosks, Pis on the same L2 segment).
|
||||
#
|
||||
# **DEPLOY GATE — Phase 1.5 host work required first**:
|
||||
# On every RKE2 node (rke2-server, rke2-agent1, rke2-agent2):
|
||||
# 1. Switch port (UniFi USL16LP) trunks VLAN 57 to the node — usually
|
||||
# already true since BLUEJAY-WS reaches 10.0.57.x services. Verify
|
||||
# with `ip link show enp86s0.57` after configuring sub-interface, OR
|
||||
# `tcpdump -ni enp86s0 vlan 57` and ping a known PROD host.
|
||||
# 2. Linux bridge `br-prod` enslaving `enp86s0.57` (VLAN sub-interface).
|
||||
# NetworkManager profile examples in the runbook below.
|
||||
# 3. Verify Multus DaemonSet `kube-multus-ds` is Ready on all nodes.
|
||||
#
|
||||
# Without those, applying this NAD has no effect except to create the NetworkAttachmentDefinition object.
|
||||
# A VM that requests this NAD with no bridge present will fail with:
|
||||
# `error adding pod kubevirt-vms_ci1 to CNI network "prod-vlan57": failed to
|
||||
# plumb VLAN: open /sys/class/net/br-prod/master: no such file or directory`
|
||||
#
|
||||
# Configuration notes:
|
||||
# - cniVersion 0.3.1 to match Multus daemon-config.json
|
||||
# - mtu 1500 (matches enp86s0 default; bump if jumbo frames configured)
|
||||
# - bridge name `br-prod` is convention; if Puppet picks a different name
|
||||
# (e.g. `br57`, `br-vlan57`), edit BOTH this NAD and the ci1.yaml
|
||||
# interface block. Keep them in sync.
|
||||
# - vlan: 0 because the host bridge already strips VLAN tag (br-prod sits
|
||||
# on top of `enp86s0.57`). If we instead used a VLAN-aware bridge with
|
||||
# trunk port, set vlan: 57 here. Current convention is VLAN-stripped at
|
||||
# the sub-interface, so the bridge passes untagged frames.
|
||||
#
|
||||
# Apply:
|
||||
# kubectl --kubeconfig $env:USERPROFILE\.kube\rke2.yaml apply -f apps/kubevirt-vms/prod-vlan57-nad.yaml
|
||||
#
|
||||
# Then update ci1.yaml networks: stanza to:
|
||||
# - name: prod-net
|
||||
# multus:
|
||||
# networkName: kubevirt-vms/prod-vlan57
|
||||
# and the interface block from `masquerade` to `bridge`.
|
||||
# =============================================================================
|
||||
|
||||
---
|
||||
# Namespace must exist already (created by ci1.yaml's first document).
|
||||
# This file imports a NAD into that same namespace.
|
||||
apiVersion: k8s.cni.cncf.io/v1
|
||||
kind: NetworkAttachmentDefinition
|
||||
metadata:
|
||||
name: prod-vlan57
|
||||
namespace: kubevirt-vms
|
||||
annotations:
|
||||
bluejay.iamworkin.lan/host-bridge: "br-prod (enslaves enp86s0.57)"
|
||||
bluejay.iamworkin.lan/cidr: "10.0.57.0/24"
|
||||
bluejay.iamworkin.lan/gateway: "10.0.57.1"
|
||||
bluejay.iamworkin.lan/dns: "10.0.56.1 (pfSense Unbound)"
|
||||
spec:
|
||||
config: |
|
||||
{
|
||||
"cniVersion": "0.3.1",
|
||||
"name": "prod-vlan57",
|
||||
"type": "bridge",
|
||||
"bridge": "br-prod",
|
||||
"ipam": {},
|
||||
"mtu": 1500,
|
||||
"vlan": 0,
|
||||
"promiscMode": true,
|
||||
"preserveDefaultVlan": false
|
||||
}
|
||||
99
apps/kubevirt-vms/win2025-iso-nfs-pv.yaml
Normal file
99
apps/kubevirt-vms/win2025-iso-nfs-pv.yaml
Normal file
@@ -0,0 +1,99 @@
|
||||
# =============================================================================
|
||||
# Windows Server 2025 ISO — Static NFS PV (Path B for SATA-CDROM timeout)
|
||||
# =============================================================================
|
||||
# Purpose: Mount the ISO from Synology NAS via NFS instead of from a Longhorn-
|
||||
# backed Filesystem PVC.
|
||||
#
|
||||
# Why: SATA-CDROM emulation reading from a Longhorn-backed Filesystem PVC is
|
||||
# too slow for OVMF's boot read window — the DVD-ROM enumeration times out
|
||||
# before the bootloader can be read. Symptom on the serial console:
|
||||
# BdsDxe: failed to start Boot0001 "UEFI QEMU DVD-ROM QM00001 " from ...
|
||||
# BdsDxe: failed to start Boot0001 ... Time out
|
||||
# BdsDxe: No bootable option or device was found
|
||||
# Diagnosis confirmed the ISO content is a perfectly valid bootable ISO9660
|
||||
# image — the bug is in the timing path between OVMF and Longhorn-backed
|
||||
# storage, not in the ISO itself.
|
||||
#
|
||||
# Block-mode PVC was tried (`volumeMode: Block` via DataVolume) and would
|
||||
# likely fix the timing, but CDI v1.65.0's upload-target pod cannot open the
|
||||
# block device due to runAsUser:107 + capabilities.drop:[ALL] and we got:
|
||||
# blockdev: cannot open /dev/cdi-block-volume: Permission denied
|
||||
#
|
||||
# NFS-mounted ISO bypasses both issues: no Longhorn slowness, no CDI upload
|
||||
# pod permission concerns. The ISO is read directly from the NAS over a
|
||||
# native NFSv4.1 mount that QEMU's SATA emulator can read at full LAN speed.
|
||||
#
|
||||
# Layout on Synology:
|
||||
# /volume1/ISOs/ (existing export, RKE2 ACL)
|
||||
# en-us_windows_server_2025_updated_march_2026_x64_dvd_8e06425a.iso
|
||||
# win2025-iso-disk/ (new subdir, 2026-05-08)
|
||||
# disk.img -> hardlink to ../en-us_windows_server_2025_..._8e06425a.iso
|
||||
#
|
||||
# KubeVirt's launcher pod expects a PVC mounted at
|
||||
# /var/run/kubevirt-private/vmi-disks/<diskName>/disk.img — by mounting the
|
||||
# `win2025-iso-disk/` subdir as the NFS PV root, `disk.img` lives at the PV's
|
||||
# root and KubeVirt's CDROM emulator finds it without any path manipulation.
|
||||
#
|
||||
# A symlink would NOT work for sub-path NFS mounts (the relative target
|
||||
# `../...iso` falls outside the sub-mount root). A hardlink works because it
|
||||
# references the same inode regardless of mount point.
|
||||
#
|
||||
# Memory references:
|
||||
# - feedback_synology_nfs_volume1_kubernetes_export_scoped (Synology export
|
||||
# scoping pattern — but /volume1/ISOs export, unlike /volume1/kubernetes,
|
||||
# does support sub-path mounts because Synology NFS is configured with
|
||||
# pseudo-fs in NFSv4.1)
|
||||
# - feedback_kubevirt_iso_first_install_bootorder_and_runstrategy (boot
|
||||
# order / runStrategy gotchas, separate from the storage timing issue)
|
||||
#
|
||||
# Validation (2026-05-08, from rke2-server / rke2-agent1 / rke2-agent2):
|
||||
# mount -t nfs -o nfsvers=4.1,ro 10.0.58.3:/volume1/ISOs/win2025-iso-disk /tmp/m
|
||||
# file /tmp/m/disk.img
|
||||
# -> ISO 9660 CD-ROM filesystem data 'SSS_X64FRE_EN-US_DV9' (bootable)
|
||||
# All 3 RKE2 nodes can mount and read.
|
||||
# =============================================================================
|
||||
|
||||
apiVersion: v1
|
||||
kind: PersistentVolume
|
||||
metadata:
|
||||
name: windows-server-2025-iso-nfs
|
||||
labels:
|
||||
flowercore.io/iso: windows-server-2025
|
||||
flowercore.io/managed-by: bluejay-infra
|
||||
spec:
|
||||
capacity:
|
||||
storage: 8Gi
|
||||
accessModes:
|
||||
- ReadOnlyMany
|
||||
volumeMode: Filesystem
|
||||
persistentVolumeReclaimPolicy: Retain
|
||||
storageClassName: "" # static, no provisioner
|
||||
mountOptions:
|
||||
- nfsvers=4.1
|
||||
- ro
|
||||
- hard
|
||||
- timeo=600
|
||||
- retrans=3
|
||||
nfs:
|
||||
server: 10.0.58.3 # BlueJayNAS Synology DS1621+ on HOME VLAN 58
|
||||
path: /volume1/ISOs/win2025-iso-disk
|
||||
readOnly: true
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: windows-server-2025-iso-nfs
|
||||
namespace: kubevirt-vms
|
||||
labels:
|
||||
app: ci-runner
|
||||
flowercore.io/managed-by: bluejay-infra
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadOnlyMany
|
||||
volumeMode: Filesystem
|
||||
resources:
|
||||
requests:
|
||||
storage: 8Gi
|
||||
storageClassName: ""
|
||||
volumeName: windows-server-2025-iso-nfs
|
||||
@@ -76,15 +76,21 @@ apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: matrix-postgres
|
||||
namespace: matrix
|
||||
labels:
|
||||
app: matrix-postgres
|
||||
spec:
|
||||
serviceName: matrix-postgres
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: matrix-postgres
|
||||
namespace: matrix
|
||||
labels:
|
||||
app: matrix-postgres
|
||||
argocd.argoproj.io/instance: infra-matrix
|
||||
spec:
|
||||
persistentVolumeClaimRetentionPolicy:
|
||||
whenDeleted: Retain
|
||||
whenScaled: Retain
|
||||
podManagementPolicy: OrderedReady
|
||||
serviceName: matrix-postgres
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 10
|
||||
selector:
|
||||
matchLabels:
|
||||
app: matrix-postgres
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
@@ -137,12 +143,17 @@ spec:
|
||||
name: matrix-postgres-data
|
||||
spec:
|
||||
accessModes: [ReadWriteOnce]
|
||||
resources:
|
||||
requests:
|
||||
storage: 5Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
volumeMode: Filesystem
|
||||
resources:
|
||||
requests:
|
||||
storage: 5Gi
|
||||
updateStrategy:
|
||||
rollingUpdate:
|
||||
partition: 0
|
||||
type: RollingUpdate
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: matrix-postgres
|
||||
namespace: matrix
|
||||
|
||||
762
apps/monitoring/fc-updatecenter-dashboard.grafana.txt
Normal file
762
apps/monitoring/fc-updatecenter-dashboard.grafana.txt
Normal file
@@ -0,0 +1,762 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [
|
||||
{
|
||||
"icon": "external link",
|
||||
"includeVars": false,
|
||||
"keepTime": false,
|
||||
"targetBlank": true,
|
||||
"title": "Open Service",
|
||||
"type": "link",
|
||||
"url": "https://updatecenter.iamworkin.lan/"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"color": "#f87171",
|
||||
"index": 1,
|
||||
"text": "DOWN"
|
||||
},
|
||||
"1": {
|
||||
"color": "#4ade80",
|
||||
"index": 0,
|
||||
"text": "UP"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "#f87171",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#4ade80",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "probe_success{job=\"probe-traefik-services\",instance=\"updatecenter.iamworkin.lan\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "Availability"
|
||||
}
|
||||
],
|
||||
"title": "Service Availability",
|
||||
"transparent": true,
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"decimals": 2,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "#f87171",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#fbbf24",
|
||||
"value": 95
|
||||
},
|
||||
{
|
||||
"color": "#FFB300",
|
||||
"value": 99
|
||||
},
|
||||
{
|
||||
"color": "#4ade80",
|
||||
"value": 99.9
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "background_solid",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "avg_over_time(probe_success{job=\"probe-traefik-services\",instance=\"updatecenter.iamworkin.lan\"}[24h]) * 100",
|
||||
"refId": "A",
|
||||
"legendFormat": "24h Uptime"
|
||||
}
|
||||
],
|
||||
"title": "24-Hour Uptime",
|
||||
"transparent": true,
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"max": 30,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "#f87171",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#fbbf24",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "#4ade80",
|
||||
"value": 7
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "d"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 0
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"minVizHeight": 75,
|
||||
"minVizWidth": 75,
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "(probe_ssl_earliest_cert_expiry{job=\"probe-traefik-services\",instance=\"updatecenter.iamworkin.lan\"} - time()) / 86400",
|
||||
"refId": "A",
|
||||
"legendFormat": "Days Remaining"
|
||||
}
|
||||
],
|
||||
"title": "Cert Expiry (Days)",
|
||||
"transparent": true,
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "Response Time (seconds)",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 12,
|
||||
"gradientMode": "scheme",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 4,
|
||||
"showPoints": "never",
|
||||
"spanNulls": true,
|
||||
"thresholdsStyle": {
|
||||
"mode": "dashed"
|
||||
}
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "#4ade80",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#fbbf24",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "#f87171",
|
||||
"value": 5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 14,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "probe_duration_seconds{job=\"probe-traefik-services\",instance=\"updatecenter.iamworkin.lan\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "Probe Duration"
|
||||
}
|
||||
],
|
||||
"timeFrom": "1h",
|
||||
"title": "Response Time (1h Trend)",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 10,
|
||||
"x": 14,
|
||||
"y": 4
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"alertInstanceLabelFilter": "{instance=\"updatecenter.iamworkin.lan\"}",
|
||||
"alertName": "",
|
||||
"dashboardAlerts": false,
|
||||
"groupBy": [],
|
||||
"groupMode": "default",
|
||||
"maxItems": 10,
|
||||
"sortOrder": 1,
|
||||
"stateFilter": {
|
||||
"error": true,
|
||||
"firing": true,
|
||||
"noData": true,
|
||||
"normal": false,
|
||||
"pending": true
|
||||
},
|
||||
"viewMode": "list"
|
||||
},
|
||||
"title": "Active Alerts",
|
||||
"type": "alertlist"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 12
|
||||
},
|
||||
"id": 20,
|
||||
"title": "OTEL Counters — Track 1D",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 13
|
||||
},
|
||||
"id": 21,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "sum by (status) (rate(updatecenter_manifest_requests_total[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "status={{status}}"
|
||||
}
|
||||
],
|
||||
"title": "Manifest Requests rate by status (5m)",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "Bps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 13
|
||||
},
|
||||
"id": 22,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "sum by (slug) (rate(updatecenter_bundle_download_bytes_total[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{slug}}"
|
||||
}
|
||||
],
|
||||
"title": "Bundle Download Throughput by slug (5m)",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 21
|
||||
},
|
||||
"id": 23,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "sum by (status) (rate(updatecenter_checkins_total[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "status={{status}}"
|
||||
}
|
||||
],
|
||||
"title": "Agent Check-in Rate by status (5m)",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "#4ade80", "value": null },
|
||||
{ "color": "#f87171", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"decimals": 2
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 21
|
||||
},
|
||||
"id": 24,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["sum"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "increase(updatecenter_signature_verify_failures_total[1h])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Sig Verify Failures (1h)"
|
||||
}
|
||||
],
|
||||
"title": "Signature Verify Failures (1h)",
|
||||
"transparent": true,
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 21
|
||||
},
|
||||
"id": 25,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "sum by (slug, channel) (rate(updatecenter_release_publishes_total[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{slug}}/{{channel}}"
|
||||
}
|
||||
],
|
||||
"title": "Release Publishes rate by slug/channel (5m)",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 29
|
||||
},
|
||||
"id": 26,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "sum by (kind, status) (rate(updatecenter_bundle_downloads_total[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{kind}} / {{status}}"
|
||||
}
|
||||
],
|
||||
"title": "Bundle Download Requests by kind/status (5m)",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 20
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "#4ade80", "value": null },
|
||||
{ "color": "#f87171", "value": 0.01 }
|
||||
]
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 29
|
||||
},
|
||||
"id": 27,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "rate(updatecenter_signature_verify_failures_total[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Sig verify failures/s"
|
||||
}
|
||||
],
|
||||
"title": "Signature Verify Failure Rate (5m) — Critical if >0",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"blue-jay",
|
||||
"flowercore",
|
||||
"synthetic",
|
||||
"updatecenter",
|
||||
"otel"
|
||||
],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-24h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "browser",
|
||||
"title": "FlowerCore.UpdateCenter Dashboard",
|
||||
"uid": "fc-updatecenter",
|
||||
"version": 2
|
||||
}
|
||||
@@ -0,0 +1,226 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (event) (increase(fc_desktop_session_events_total[$__rate_interval]))",
|
||||
"legendFormat": "{{event}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "RemoteDesktop Session Events",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showUnfilled": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (template, event) (increase(fc_desktop_session_events_total[24h]))",
|
||||
"legendFormat": "{{template}} {{event}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "24h Session Events By Template",
|
||||
"type": "bargauge"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "fc_desktop_pool_ready",
|
||||
"legendFormat": "{{template}} ready",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "fc_desktop_pool_desired",
|
||||
"legendFormat": "{{template}} desired",
|
||||
"range": true,
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Warm Pool Ready vs Desired",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum(increase(fc_desktop_session_events_total{event=\"connect\",browser_datasource=\"json\"}[24h])) - sum(increase(fc_desktop_session_events_total{event=\"disconnect\"}[24h]))",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "24h Connect Minus Disconnect",
|
||||
"type": "stat"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"flowercore",
|
||||
"remotedesktop",
|
||||
"guacamole"
|
||||
],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-24h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "browser",
|
||||
"title": "FlowerCore RemoteDesktop",
|
||||
"uid": "flowercore-remotedesktop",
|
||||
"version": 1
|
||||
}
|
||||
249
apps/monitoring/grafana-dashboard-remotedesktop.yaml
Normal file
249
apps/monitoring/grafana-dashboard-remotedesktop.yaml
Normal file
@@ -0,0 +1,249 @@
|
||||
# Grafana dashboard ConfigMap for FlowerCore.RemoteDesktop.
|
||||
#
|
||||
# Inlines the JSON from flowercore-remotedesktop-grafana-dashboard.json.
|
||||
# Kept as a standalone file (not inlined in noc-monitoring.yaml) so the
|
||||
# CRLF-dirty state of noc-monitoring.yaml doesn't have to be normalized
|
||||
# in the same pass. To actually load the dashboard, the Grafana Deployment
|
||||
# in noc-monitoring.yaml needs a matching 'volumes:' entry (and a corresponding 'volumeMounts:' entry on the Grafana container):
|
||||
#
|
||||
# - name: dashboard-remotedesktop
|
||||
# configMap:
|
||||
# name: grafana-dashboard-remotedesktop
|
||||
#
|
||||
# ArgoCD will sync this ConfigMap automatically through the bluejay-infra
|
||||
# ApplicationSet (infra-monitoring App). The dashboard just won't load
|
||||
# until the Grafana Deployment mount is wired.
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-remotedesktop
|
||||
namespace: monitoring
|
||||
data:
|
||||
remotedesktop.json: |
|
||||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (event) (increase(fc_desktop_session_events_total[$__rate_interval]))",
|
||||
"legendFormat": "{{event}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "RemoteDesktop Session Events",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showUnfilled": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (template, event) (increase(fc_desktop_session_events_total[24h]))",
|
||||
"legendFormat": "{{template}} {{event}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "24h Session Events By Template",
|
||||
"type": "bargauge"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "fc_desktop_pool_ready",
|
||||
"legendFormat": "{{template}} ready",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "fc_desktop_pool_desired",
|
||||
"legendFormat": "{{template}} desired",
|
||||
"range": true,
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Warm Pool Ready vs Desired",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum(increase(fc_desktop_session_events_total{event=\"connect\",browser_datasource=\"json\"}[24h])) - sum(increase(fc_desktop_session_events_total{event=\"disconnect\"}[24h]))",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "24h Connect Minus Disconnect",
|
||||
"type": "stat"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"flowercore",
|
||||
"remotedesktop",
|
||||
"guacamole"
|
||||
],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-24h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "browser",
|
||||
"title": "FlowerCore RemoteDesktop",
|
||||
"uid": "flowercore-remotedesktop",
|
||||
"version": 1
|
||||
}
|
||||
@@ -104,21 +104,27 @@ data:
|
||||
- target_label: __address__
|
||||
replacement: snmp-exporter.monitoring.svc:9116
|
||||
|
||||
# UniFi Cloud Key SNMP
|
||||
- job_name: "snmp-cloudkey"
|
||||
static_configs:
|
||||
- targets: ["10.0.56.3"]
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [if_mib]
|
||||
auth: [bluejay_v2]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: snmp-exporter.monitoring.svc:9116
|
||||
# UniFi Cloud Key SNMP — DISABLED 2026-04-26
|
||||
# The Cloud Key Gen2+ runs unifi-core (controller) only — not a network
|
||||
# device — and does NOT run an SNMP agent on UDP/161. Scrapes were
|
||||
# silently failing with "connection refused" from 10.42.x.x:161 every
|
||||
# 30s, leaving the target at up == 0 with a lastError on the Targets page. Hardware
|
||||
# health (CPU/mem/disk) for the Cloud Key host should come from
|
||||
# node_exporter via SSH — not SNMP.
|
||||
# - job_name: "snmp-cloudkey"
|
||||
# static_configs:
|
||||
# - targets: ["10.0.56.3"]
|
||||
# metrics_path: /snmp
|
||||
# params:
|
||||
# module: [if_mib]
|
||||
# auth: [bluejay_v2]
|
||||
# relabel_configs:
|
||||
# - source_labels: [__address__]
|
||||
# target_label: __param_target
|
||||
# - source_labels: [__param_target]
|
||||
# target_label: instance
|
||||
# - target_label: __address__
|
||||
# replacement: snmp-exporter.monitoring.svc:9116
|
||||
|
||||
# UniFi Switch SNMP
|
||||
- job_name: "snmp-switch"
|
||||
@@ -143,7 +149,7 @@ data:
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [synology]
|
||||
auth: [public_v2]
|
||||
auth: [bluejay_v2]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
@@ -247,7 +253,7 @@ data:
|
||||
device_type: "printer"
|
||||
printer_model: "NuPrint 210"
|
||||
|
||||
# Print.Web OTEL metrics (counters: jobs enqueued/completed/failed, bytes, duration histograms)
|
||||
# Print.Web OTEL metrics (print counters/histograms plus Ollama runner gauges)
|
||||
- job_name: "printweb-otel"
|
||||
scrape_interval: 30s
|
||||
metrics_path: /metrics/prometheus
|
||||
@@ -278,6 +284,38 @@ data:
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter.monitoring.svc:9115
|
||||
|
||||
# FlowerCore.RemoteDesktop web health (public cluster VIP)
|
||||
# Module is https_internal — desktop.iamworkin.lan uses a step-ca leaf
|
||||
# cert; blackbox does NOT trust step-ca root, so http_2xx fails with
|
||||
# x509 unknown authority and probe_success=0 even when /health returns HTTP 200.
|
||||
- job_name: "probe-remotedesktop"
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [https_internal]
|
||||
scrape_interval: 30s
|
||||
static_configs:
|
||||
- targets: ["https://desktop.iamworkin.lan/health"]
|
||||
labels:
|
||||
instance: "https://desktop.iamworkin.lan/health"
|
||||
service: "remotedesktop-web"
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter.monitoring.svc:9115
|
||||
|
||||
# FlowerCore.RemoteDesktop /metrics (direct scrape for counters)
|
||||
- job_name: "fc-remotedesktop"
|
||||
metrics_path: /metrics
|
||||
scheme: https
|
||||
scrape_interval: 30s
|
||||
tls_config:
|
||||
insecure_skip_verify: true
|
||||
static_configs:
|
||||
- targets: ["desktop.iamworkin.lan"]
|
||||
labels:
|
||||
service: "remotedesktop-web"
|
||||
|
||||
# CUPS web UI health (port 631)
|
||||
- job_name: "probe-cups"
|
||||
metrics_path: /probe
|
||||
@@ -301,26 +339,12 @@ data:
|
||||
# AI Stack Health Probes (Blackbox Exporter)
|
||||
# =============================================================================
|
||||
|
||||
# Ollama API — workstation (LOCAL Agent Zero)
|
||||
- job_name: "probe-ollama-local"
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_ollama]
|
||||
scrape_interval: 30s
|
||||
static_configs:
|
||||
- targets: ["http://10.0.58.100:11434/api/tags"]
|
||||
labels:
|
||||
instance: "ollama-local"
|
||||
service: "ollama"
|
||||
deployment: "local"
|
||||
gpu: "r9700"
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter.monitoring.svc:9115
|
||||
# NOTE: probe-ollama-local and probe-agentzero-local were REMOVED
|
||||
# 2026-04-26. They pointed at 10.0.58.100 (HOME VLAN) which is not
|
||||
# reachable from cluster pods (firewalled). They had been firing as
|
||||
# OllamaDown / AgentZeroDown since 2026-04-24. Workstation/AI-laptop
|
||||
# Ollama and Agent Zero should be monitored via host-side Puppet
|
||||
# (node_exporter on the box) once the AI laptop is running 24/7.
|
||||
|
||||
# Ollama API — edge1 Pi 5 (NUC Agent Zero)
|
||||
- job_name: "probe-ollama-edge1"
|
||||
@@ -343,34 +367,18 @@ data:
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter.monitoring.svc:9115
|
||||
|
||||
# Agent Zero Web UI — local (K3s)
|
||||
- job_name: "probe-agentzero-local"
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
scrape_interval: 30s
|
||||
static_configs:
|
||||
- targets: ["http://10.0.58.100:30050/"]
|
||||
labels:
|
||||
instance: "agent-zero-local"
|
||||
service: "agent-zero"
|
||||
deployment: "local"
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter.monitoring.svc:9115
|
||||
|
||||
# Agent Zero Web UI — NUC (RKE2 via Traefik)
|
||||
# Agent Zero Web UI — in-cluster (RKE2)
|
||||
# Target uses short svc form (agent-zero.agent-zero.svc) NOT
|
||||
# cluster.local FQDN — the *.cluster.local form gets rewritten to
|
||||
# 10.0.56.200 (Traefik VIP) by the CoreDNS iamworkin.lan template +
|
||||
# ndots:5 search-suffix expansion. Memory: feedback_coredns_ndots_template_collision.
|
||||
- job_name: "probe-agentzero-nuc"
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
scrape_interval: 30s
|
||||
static_configs:
|
||||
- targets: ["http://agent-zero.agent-zero.svc.cluster.local/"]
|
||||
- targets: ["http://agent-zero.agent-zero.svc:80/"]
|
||||
labels:
|
||||
instance: "agent-zero-nuc"
|
||||
service: "agent-zero"
|
||||
@@ -383,6 +391,119 @@ data:
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter.monitoring.svc:9115
|
||||
|
||||
# =============================================================================
|
||||
# K8s Cluster State (kube-state-metrics, cert-manager, traefik, longhorn)
|
||||
# =============================================================================
|
||||
# Use in-cluster ClusterIP service DNS — NOT NodePorts — so a same-node
|
||||
# NodePort hairpin doesn't break the scrape (hit on rke2-agent1 hosting
|
||||
# both prometheus and traefik on 2026-04-26: 10.0.56.12:30900 timed out
|
||||
# from prometheus while .11/.13 worked). NodePorts at 30900-30902 are
|
||||
# still useful for noc1-Podman-style external scrapers, but in-cluster
|
||||
# we should always use the svc DNS form.
|
||||
|
||||
# kube-state-metrics — exposes K8s object state (pods, deployments, nodes)
|
||||
# Required for KubeContainerRestartingFrequently / KubePodNotReady alerts.
|
||||
- job_name: "kube-state-metrics"
|
||||
scrape_interval: 30s
|
||||
static_configs:
|
||||
- targets: ["kube-state-metrics.kube-system.svc:8080"]
|
||||
labels:
|
||||
cluster: "rke2"
|
||||
|
||||
# cert-manager — exposes certmanager_certificate_ready_status,
|
||||
# certmanager_certificate_expiration_timestamp_seconds, etc. Drives the
|
||||
# CertManagerCertificateNotReady / CertManagerCertificateRenewalFailed
|
||||
# alerts. Memory: project_cert_manager_prometheus_scrape.
|
||||
- job_name: "cert-manager"
|
||||
scrape_interval: 30s
|
||||
static_configs:
|
||||
- targets: ["cert-manager-metrics.cert-manager.svc:9402"]
|
||||
labels:
|
||||
cluster: "rke2"
|
||||
|
||||
# Traefik — request rates, latency, TLS cert metadata, router state.
|
||||
# ClusterIP svc routes to one of the traefik pods; per-pod scrape via
|
||||
# the headless `traefik-metrics` selector would be nicer for failover
|
||||
# visibility but the single-replica scrape is enough for steady-state.
|
||||
- job_name: "traefik"
|
||||
scrape_interval: 15s
|
||||
static_configs:
|
||||
- targets: ["traefik-metrics.traefik-system.svc:9100"]
|
||||
labels:
|
||||
service: "traefik"
|
||||
cluster: "rke2"
|
||||
|
||||
# Longhorn — exposes longhorn_volume_robustness, longhorn_backup_*,
|
||||
# longhorn_node_status_*. Enables LonghornVolumeUnhealthy +
|
||||
# LonghornBackupFailed alerts (no real visibility into Longhorn
|
||||
# health before this — was relying on K8s events which are noisy
|
||||
# transient lifecycle messages, not actionable signals).
|
||||
- job_name: "longhorn"
|
||||
scrape_interval: 30s
|
||||
static_configs:
|
||||
- targets: ["longhorn-backend.longhorn-system.svc:9500"]
|
||||
labels:
|
||||
service: "longhorn"
|
||||
cluster: "rke2"
|
||||
|
||||
# FC web services through Traefik — single probe surface to spot any
|
||||
# iamworkin.lan host returning non-200. Uses https_internal because all
|
||||
# certs are step-ca leaves; blackbox would x509-fail with http_2xx.
|
||||
# Some services need explicit healthcheck paths because root returns
|
||||
# 404 (acme, guac) or 401 (grafana, prometheus). Drop them or point at
|
||||
# the right endpoint — don't widen valid_status_codes globally because
|
||||
# 401 from a healthy pod and 401 from an outage look identical.
|
||||
- job_name: "probe-traefik-services"
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [https_internal]
|
||||
scrape_interval: 60s
|
||||
static_configs:
|
||||
- targets:
|
||||
# Root-reachable services (200 or 3xx)
|
||||
- "https://gitea.iamworkin.lan/"
|
||||
- "https://argocd.iamworkin.lan/"
|
||||
- "https://intranet.iamworkin.lan/"
|
||||
- "https://signage.iamworkin.lan/"
|
||||
- "https://kiosk.iamworkin.lan/"
|
||||
- "https://media.iamworkin.lan/"
|
||||
- "https://mysql.iamworkin.lan/"
|
||||
- "https://php.iamworkin.lan/"
|
||||
- "https://zabbix.iamworkin.lan/"
|
||||
- "https://desktop.iamworkin.lan/"
|
||||
- "https://print.iamworkin.lan/"
|
||||
- "https://dns.iamworkin.lan/"
|
||||
- "https://chat.iamworkin.lan/"
|
||||
- "https://dist.iamworkin.lan/"
|
||||
- "https://dms.iamworkin.lan/"
|
||||
- "https://menuboard.iamworkin.lan/"
|
||||
- "https://messageboard.iamworkin.lan/"
|
||||
- "https://presentations.iamworkin.lan/"
|
||||
- "https://retail.iamworkin.lan/"
|
||||
- "https://ttsreader.iamworkin.lan/"
|
||||
# Explicit healthcheck paths
|
||||
- "https://fc-llm-bridge.iamworkin.lan/healthz"
|
||||
- "https://acme.iamworkin.lan/health"
|
||||
# NOTE: services intentionally NOT in this probe surface
|
||||
# - grafana.iamworkin.lan: every endpoint (incl. /api/health
|
||||
# and /login) returns 401 behind Traefik basic-auth.
|
||||
# Health covered by in-cluster monitoring-grafana scrape.
|
||||
# - prometheus.iamworkin.lan: same auth pattern. Health covered
|
||||
# by the prometheus self-scrape job.
|
||||
# - guac.iamworkin.lan: deprecated — Guacamole moved to
|
||||
# desktop.iamworkin.lan/guacamole/ (memory:
|
||||
# feedback_traefik_cross_namespace_refs_disabled).
|
||||
labels:
|
||||
probe_type: "traefik-service"
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
regex: "https?://([^/:]+).*"
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter.monitoring.svc:9115
|
||||
|
||||
# =============================================================================
|
||||
# Self-monitoring (K8s monitoring namespace)
|
||||
# =============================================================================
|
||||
@@ -433,6 +554,8 @@ data:
|
||||
expr: rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m])
|
||||
- record: print:job_duration_p95:5m
|
||||
expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m]))
|
||||
- record: print:ollama_runner_keepalive_remaining_seconds:max
|
||||
expr: max by(instance, model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})
|
||||
- name: relay-rates
|
||||
interval: 15s
|
||||
rules:
|
||||
@@ -519,6 +642,42 @@ data:
|
||||
summary: "Print queue backlog on edge2 ({{ $value }} active jobs)"
|
||||
description: "CUPS has {{ $value }} active jobs queued. Possible printer jam, USB disconnect, or paper out."
|
||||
|
||||
# Paper roll lifecycle alerts (XL Track I, 2026-04-26).
|
||||
# Source-of-truth gauge: print_paper_remaining_percent (Print.Web OTEL,
|
||||
# hydrated on startup from the active PaperRoll row).
|
||||
# alert_channel=thermal_print routes through irc-notify -> Print.Web
|
||||
# /api/print/alert so the printer announces its own paper-out warning
|
||||
# on its remaining paper. Self-referential humor + operator nudge.
|
||||
- alert: PrintPaperRollLow
|
||||
expr: print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
alert_channel: thermal_print
|
||||
annotations:
|
||||
summary: "Print roll low on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
|
||||
description: "NuPrint 210 paper roll has {{ $value | printf \"%.1f\" }}% remaining. Operator should load a fresh roll soon. Run /api/paper/status for the precise mm + estimated jobs left."
|
||||
|
||||
- alert: PrintPaperRollCritical
|
||||
expr: print_paper_remaining_percent{job="printweb-otel"} <= 5
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
alert_channel: thermal_print
|
||||
annotations:
|
||||
summary: "Print roll critical on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
|
||||
description: "NuPrint 210 paper roll at {{ $value | printf \"%.1f\" }}% — load a new roll NOW. The 50ft roll has a ~12% red-stripe zone; once paper passes that, the printer can run dry mid-job."
|
||||
|
||||
- alert: PrintJobDeadLetter
|
||||
expr: increase(print_jobs_dead_letter_total[15m]) > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
alert_channel: thermal_print
|
||||
annotations:
|
||||
summary: "Print job(s) entered dead-letter on edge2 ({{ $value | printf \"%.0f\" }} in last 15m)"
|
||||
description: "{{ $value | printf \"%.0f\" }} print job(s) exhausted MaxRetries and need operator action. Open /print-log, filter Status=DeadLetter, click 'Retry From Start' after fixing the underlying cause (paper jam, USB disconnect, printer power-cycle)."
|
||||
|
||||
- alert: CUPSHighJobRate
|
||||
expr: rate(cups_job_total[5m]) * 60 > 30
|
||||
for: 5m
|
||||
@@ -528,6 +687,99 @@ data:
|
||||
summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)"
|
||||
description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop."
|
||||
|
||||
- alert: PrintOllamaRunnerLongKeepAlive
|
||||
expr: ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"} > 600
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
alert_channel: thermal_print
|
||||
annotations:
|
||||
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
|
||||
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
|
||||
|
||||
- name: remote-desktop
|
||||
rules:
|
||||
- alert: RemoteDesktopWebDown
|
||||
expr: probe_success{job="probe-remotedesktop",instance="https://desktop.iamworkin.lan/health"} == 0
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "FlowerCore RemoteDesktop web is down"
|
||||
description: "https://desktop.iamworkin.lan/health probe has failed for 3 minutes. Catalog + session launch surface offline."
|
||||
|
||||
- alert: RemoteDesktopMetricsStale
|
||||
expr: absent(fc_desktop_session_events_total)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "RemoteDesktop /metrics scrape returning no data"
|
||||
description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape target is misconfigured or the web deployment stopped exporting metrics. Zabbix template carries the same 10m no-data trigger for cross-monitor parity."
|
||||
|
||||
# PUBLISHER QUIRK: fc_desktop_pool_depleted / _deficit emit one
|
||||
# series per template per status (Ready/Warming/BelowDesiredSize/
|
||||
# Disabled), and the historical series for non-current statuses
|
||||
# stay at their last value. So just `_depleted > 0` fires forever
|
||||
# on any template that ever entered a bad state.
|
||||
#
|
||||
# SAFE PATTERN: alert only when the canonical "Ready" status
|
||||
# gauge does NOT report ready=1 for the enabled template. This
|
||||
# is the publisher's own canary — _ready{status="Ready"}==1 is
|
||||
# always the current "everything is fine" signal.
|
||||
- alert: RemoteDesktopPoolDepleted
|
||||
expr: |
|
||||
group by(template) (fc_desktop_pool_ready{enabled="true"})
|
||||
unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "RemoteDesktop pool depleted ({{ $labels.template }})"
|
||||
description: "Pool for template {{ $labels.template }} has no Ready warm pod for 5 minutes. New launches will cold-start. Check pod-scheduling failures, image pull issues, or exhausted node capacity."
|
||||
|
||||
# Same pattern, but only fires when template explicitly reports
|
||||
# a sustained Warning-level alert state (current-status series).
|
||||
- alert: RemoteDesktopPoolDeficitSustained
|
||||
expr: |
|
||||
fc_desktop_pool_deficit{enabled="true",alert_level="Warning"} > 0
|
||||
unless on(template) (fc_desktop_pool_ready{enabled="true",status="Ready"} == 1)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: "RemoteDesktop pool {{ $labels.template }} below desired for 10m"
|
||||
description: "Pool {{ $labels.template }} has a persistent deficit of {{ $value }} warm pods AND no Ready series. Likely image pull, NFS affinity, or claim-init issue."
|
||||
|
||||
- alert: RemoteDesktopSessionChurnSpike
|
||||
expr: sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: "RemoteDesktop launch rate high ({{ $value | printf \"%.0f\" }}/min)"
|
||||
description: "Launch events exceed 20/min for 5 minutes. Could be a user-facing feature launch, a pooled template thrashing, or a runaway automation loop."
|
||||
|
||||
- alert: RemoteDesktopRecordingEventsDropped
|
||||
# Gate on launches inside the same 30m window. A bare `sum(counter) > 0`
# stays true forever after the first launch, so it does not prove that
# launches are currently happening.
expr: absent_over_time(fc_desktop_session_events_total{event="recording"}[30m]) and on() (sum(increase(fc_desktop_session_events_total{event="launch"}[30m])) > 0)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: "RemoteDesktop recording events silent for 30m despite active launches"
|
||||
description: "No recording events in 30 minutes while launches are happening. Recording may be silently disabled on all templates (SessionRecordingEnabled=false), the guacd NFS mount may be unhealthy, or the retention sweep isn't emitting events. Not an error by itself — worth checking."
|
||||
|
||||
# Match by job — instance label carries full URL incl. /health,
|
||||
# not just hostname, so a hostname-only match never fires.
|
||||
- alert: RemoteDesktopTlsExpiry
|
||||
expr: probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time() < 2 * 86400
|
||||
for: 6h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "desktop.iamworkin.lan TLS cert expires within 2 days"
|
||||
description: "The desktop.iamworkin.lan cert is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, and pfSense DNS overrides per the ACME DNS-01 gate."
|
||||
|
||||
- name: pi-fleet
|
||||
rules:
|
||||
- alert: PiManagerDown
|
||||
@@ -607,13 +859,16 @@ data:
|
||||
annotations:
|
||||
summary: "Epson ink CRITICAL: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
|
||||
|
||||
# for: 30m absorbs sleep cycles. The EcoTank sleeps after ~5 min
|
||||
# of idle and SNMP times out, so `for: 5m` would page nightly. A
|
||||
# genuine printer outage (jam, disconnected) lasts well over 30m.
|
||||
- alert: EpsonPrinterDown
|
||||
expr: up{job="snmp-printer"} == 0
|
||||
for: 5m
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Epson ET-3750 SNMP unreachable"
|
||||
summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)"
|
||||
|
||||
- alert: SynologyDiskLow
|
||||
expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85
|
||||
@@ -667,6 +922,174 @@ data:
|
||||
annotations:
|
||||
summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
|
||||
|
||||
# K8s pod-state alerts. Require kube-state-metrics scrape (added
|
||||
# 2026-04-26 — see scrape_configs above). Would have surfaced the
|
||||
# agent-zero ollama-proxy 172x crash-loop instead of letting it
|
||||
# silently churn for ~3 days.
|
||||
- name: kubernetes-state
|
||||
rules:
|
||||
- alert: KubeContainerRestartingFrequently
|
||||
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} restarting >5x/hr"
|
||||
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour. Check 'kubectl describe pod' + last-state termination reason."
|
||||
|
||||
- alert: KubeContainerCrashLooping
|
||||
expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
alert_channel: thermal_print
|
||||
annotations:
|
||||
summary: "{{ $labels.namespace }}/{{ $labels.pod }} crashlooping ({{ $value | printf \"%.0f\" }} restarts/15m)"
|
||||
description: "Container {{ $labels.container }} restarted {{ $value | printf \"%.0f\" }} times in 15 minutes — actively crashlooping."
|
||||
|
||||
- alert: KubePodNotReady
|
||||
expr: sum by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Failed|Unknown"}) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.namespace }}/{{ $labels.pod }} not Ready for >15m"
|
||||
description: "Pod is in a non-Running, non-Succeeded phase for over 15 minutes. Common causes: ImagePullBackOff (registry/Nexus down, wrong image tag), pending PVC, scheduling failure (taint/resources)."
|
||||
|
||||
- alert: KubePodImagePullBackOff
|
||||
expr: sum by(namespace, pod) (kube_pod_container_status_waiting_reason{reason=~"ImagePullBackOff|ErrImagePull"}) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.namespace }}/{{ $labels.pod }} ImagePullBackOff for >10m"
|
||||
description: "Pod can't pull image. Check the image ref (often a stale tag or unreachable registry) and clean up if it's an orphan."
|
||||
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
|
||||
# $value is the left-hand side of the `!=` comparison, i.e. the desired
# (spec) replica count. There is no `spec_replicas` label on either series.
description: "Deployment wants {{ $value }} replicas but the available count has not matched for 15 minutes. Likely a rollout stuck on probe failure, scheduling, or PVC."
|
||||
|
||||
# Longhorn storage health alerts. Required: longhorn scrape job
|
||||
# (added 2026-04-26 — see scrape_configs above). The K8s events
|
||||
# for "snapshot becomes not ready to use" are transient lifecycle
|
||||
# noise, not actionable — these alerts use the actual Longhorn
|
||||
# gauges that reflect persistent state.
|
||||
- name: longhorn-storage
|
||||
rules:
|
||||
# Volume robustness: 0=unknown, 1=healthy, 2=degraded, 3=faulted.
|
||||
# Detached volumes report 0 — that's normal for unattached PVCs,
|
||||
# so filter to only attached.
|
||||
- alert: LonghornVolumeDegraded
|
||||
# The gauge VALUE encodes the state (2 = degraded); selecting on a
# `robustness` label and comparing to 1 never matches. Detached volumes
# report 0 and are excluded by the equality check.
expr: longhorn_volume_robustness == 2
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Longhorn volume {{ $labels.volume }} degraded for >15m"
|
||||
description: "Volume {{ $labels.volume }} on node {{ $labels.node }} has been degraded (one or more replicas unhealthy) for 15+ minutes. Auto-rebuild may need help — check 'kubectl describe volume.longhorn.io {{ $labels.volume }} -n longhorn-system'."
|
||||
|
||||
- alert: LonghornVolumeFaulted
|
||||
# The gauge VALUE encodes the state (3 = faulted); selecting on a
# `robustness` label and comparing to 1 never matches.
expr: longhorn_volume_robustness == 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
alert_channel: thermal_print
|
||||
annotations:
|
||||
summary: "Longhorn volume {{ $labels.volume }} FAULTED"
|
||||
description: "Volume {{ $labels.volume }} on node {{ $labels.node }} is faulted — all replicas unavailable. Data inaccessible. Manual intervention required."
|
||||
|
||||
# No backup in 36h indicates the daily-backup recurringJob is
|
||||
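# silently failing. Allows for one missed run + slack.
# NOTE(review): the expr below subtracts longhorn_backup_actual_size_bytes
# (by its name a size, not a timestamp) from time() and filters on a
# `state` label — confirm the exporter exposes both before relying on
# this alert.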
|
||||
- alert: LonghornBackupStale
|
||||
expr: |
|
||||
(time() - max by(volume) (longhorn_backup_state{state="Completed"} * on(backup) group_left() longhorn_backup_actual_size_bytes)) > 36 * 3600
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Longhorn volume {{ $labels.volume }} has no completed backup in >36h"
|
||||
description: "Daily backup recurringJob (cron 0 2 * * *) appears to have skipped this volume. Check 'kubectl get backups.longhorn.io -n longhorn-system' and the daily-backup CronJob logs."
|
||||
|
||||
- alert: LonghornNodeUnhealthy
|
||||
expr: longhorn_node_status{condition="ready",condition_reason!=""} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Longhorn node {{ $labels.node }} not Ready"
|
||||
description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."
|
||||
|
||||
# ============================================================
|
||||
# FC Signage Marquee Performance — Track 3 + 8 (2026-05-06)
|
||||
# Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
|
||||
# Source-of-truth for the live Podman Prometheus on noc1 is the
|
||||
# Notes file; this K8s ConfigMap exists so a future migration to
|
||||
# in-cluster Prometheus inherits the ruleset automatically.
|
||||
# See feedback_monitoring_k8s_target_vs_live_podman.
|
||||
# ============================================================
|
||||
- name: fc-signage-marquee
|
||||
rules:
|
||||
- alert: MarqueeDroppedFramesHigh
|
||||
expr: |
|
||||
(
|
||||
sum by (renderer, phase, node_id) (rate(marquee_dropped_frames_total[5m]))
|
||||
/
|
||||
sum by (renderer, phase, node_id) (rate(marquee_render_latency_ms_count[5m]))
|
||||
) > 0.05
|
||||
unless on()
|
||||
absent_over_time(marquee_dropped_frames_total[7d])
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: signage
|
||||
alert_channel: irc
|
||||
annotations:
|
||||
summary: "Marquee dropped-frame rate >5% on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
|
||||
description: "Renderer {{ $labels.renderer }} on {{ $labels.node_id }} drops >5% of frames during {{ $labels.phase }}. Animation visibly stuttery."
|
||||
|
||||
- alert: MarqueeRenderLatencyP99High
|
||||
expr: |
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (renderer, phase, node_id, le) (rate(marquee_render_latency_ms_bucket[5m]))
|
||||
) > 16
|
||||
unless on()
|
||||
absent_over_time(marquee_render_latency_ms_bucket[7d])
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: signage
|
||||
alert_channel: irc
|
||||
annotations:
|
||||
summary: "Marquee render latency p99 > 16ms on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
|
||||
description: "Per-frame render latency p99 has exceeded the Pi-class 16ms budget for 10 minutes."
|
||||
|
||||
- alert: MarqueeAnimationDurationDrift
|
||||
expr: |
|
||||
abs(
|
||||
histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m])))
|
||||
-
|
||||
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
|
||||
)
|
||||
/
|
||||
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
|
||||
> 0.10
|
||||
unless on()
|
||||
absent_over_time(marquee_animation_duration_ms_bucket[7d])
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
service: signage
|
||||
alert_channel: irc
|
||||
annotations:
|
||||
summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
|
||||
description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
|
||||
|
||||
# =============================================================================
|
||||
# ConfigMap: Blackbox Exporter Configuration
|
||||
# =============================================================================
|
||||
@@ -698,6 +1121,22 @@ data:
|
||||
fail_if_body_not_matches_regexp:
|
||||
- '"models"'
|
||||
preferred_ip_protocol: ip4
|
||||
# https_internal — for Traefik-fronted services with step-ca leaf
|
||||
# certs. blackbox does not trust the step-ca root CA, so http_2xx
|
||||
# against any *.iamworkin.lan host fails with x509 unknown authority.
|
||||
# Redirects + multiple status codes are accepted because some hosts
|
||||
# 302 to /login or /scalar.
|
||||
https_internal:
|
||||
prober: http
|
||||
timeout: 10s
|
||||
http:
|
||||
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
|
||||
valid_status_codes: [200, 301, 302, 303, 307, 308]
|
||||
method: GET
|
||||
follow_redirects: true
|
||||
preferred_ip_protocol: ip4
|
||||
tls_config:
|
||||
insecure_skip_verify: true
|
||||
|
||||
# =============================================================================
|
||||
# ConfigMap: IRC Notify Script
|
||||
@@ -720,7 +1159,7 @@ data:
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import URLError
|
||||
|
||||
IRC_HOST = "unrealircd.irc.svc.cluster.local"
|
||||
IRC_HOST = "unrealircd.irc.svc" # short name: CoreDNS ndots:5 + iamworkin.lan template hijacks full .cluster.local (see memory)
|
||||
IRC_PORT = 6667
|
||||
IRC_NICK = "grafana-bot"
|
||||
IRC_CHANNEL = "#alerts"
|
||||
@@ -2896,6 +3335,33 @@ data:
|
||||
relativeTimeRange: {from: 120, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||
- uid: print-ollama-runner-long-keepalive
|
||||
title: Print.Web Ollama runner keep-alive >10m
|
||||
condition: C
|
||||
for: 2m
|
||||
noDataState: NoData
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: Print.Web Ollama runner held too long
|
||||
description: "A Print.Web Ollama runner has more than 10 minutes of keep-alive remaining. Check active AI requests before the Pi 5 Ollama lane thrashes."
|
||||
runbook: "1. Open https://print.iamworkin.lan/admin 2. Check Ollama Fleet model + remaining keep-alive 3. Query Prometheus: ai_ollama_runner_keepalive_remaining_seconds{job=\"printweb-otel\"} 4. Stop duplicate model callers before restarting Ollama."
|
||||
labels:
|
||||
severity: warning
|
||||
service: print-web
|
||||
alert_channel: thermal_print
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 120, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: 'max by(model) (ai_ollama_runner_keepalive_remaining_seconds{job="printweb-otel"})', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 120, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 120, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
||||
- orgId: 1
|
||||
name: Infrastructure
|
||||
folder: AI Stack Alerts
|
||||
@@ -3006,6 +3472,172 @@ data:
|
||||
relativeTimeRange: {from: 600, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
|
||||
- orgId: 1
|
||||
name: RemoteDesktop
|
||||
folder: AI Stack Alerts
|
||||
interval: 1m
|
||||
rules:
|
||||
- uid: remotedesktop-web-down
|
||||
title: RemoteDesktop Web DOWN
|
||||
condition: C
|
||||
for: 3m
|
||||
noDataState: Alerting
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: FlowerCore RemoteDesktop /health probe failing
|
||||
description: "https://desktop.iamworkin.lan/health has failed for 3 minutes. Catalog + session launch surface offline."
|
||||
runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remotedesktop-web 2. kubectl -n fc-desktop logs deploy/remotedesktop-web --tail=50 3. Check Traefik IngressRoute + step-ca cert 4. Rollout restart if pod is stuck"
|
||||
labels:
|
||||
severity: warning
|
||||
service: remotedesktop
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 180, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: 'probe_success{job="probe-remotedesktop"}', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 180, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 180, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||
|
||||
- uid: remotedesktop-metrics-stale
|
||||
title: RemoteDesktop metrics stale
|
||||
condition: C
|
||||
for: 10m
|
||||
noDataState: Alerting
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: RemoteDesktop /metrics returning no series
|
||||
description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape is misconfigured or the web deployment stopped exporting metrics. Cross-checked by Zabbix template's identical 10m no-data trigger."
|
||||
runbook: "1. curl -sk https://desktop.iamworkin.lan/metrics | head 2. kubectl -n monitoring exec deploy/prometheus -- wget -qO- localhost:9090/api/v1/targets?scrapePool=fc-remotedesktop 3. Check monitoring-netpol egress allows to fc-desktop:8080"
|
||||
labels:
|
||||
severity: warning
|
||||
service: remotedesktop
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 600, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: 'count(fc_desktop_session_events_total) or vector(0)', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 600, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 600, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||
|
||||
- uid: remotedesktop-pool-depleted
|
||||
title: RemoteDesktop pool depleted
|
||||
condition: C
|
||||
for: 5m
|
||||
noDataState: OK
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: RemoteDesktop warm pool depleted for 5m
|
||||
description: "A RemoteDesktop warm pool has fc_desktop_pool_depleted=1 for 5 minutes. New launches will cold-start. Check pod scheduling, image pull, node capacity."
|
||||
runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop --sort-by=.status.startTime 2. kubectl -n fc-desktop describe desktoppool <name> 3. Verify localhost/fc-desktop:* images imported on all 3 RKE2 nodes"
|
||||
labels:
|
||||
severity: warning
|
||||
service: remotedesktop
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: 'max(fc_desktop_pool_depleted)', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.5], type: gt}}], refId: C}
|
||||
|
||||
- uid: remotedesktop-pool-deficit-sustained
|
||||
title: RemoteDesktop pool below desired
|
||||
condition: C
|
||||
for: 10m
|
||||
noDataState: OK
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: RemoteDesktop pool sustained deficit
|
||||
description: "A pool has fc_desktop_pool_deficit>0 for 10 minutes. Operator is reconciling but can't reach desired size — likely image pull, NFS affinity, or claim-init issue."
|
||||
runbook: "1. kubectl -n fc-desktop get pods -l flowercore.io/pool=<pool> 2. kubectl logs -n fc-desktop deploy/remotedesktop-operator 3. Check claim-init hook env on template"
|
||||
labels:
|
||||
severity: info
|
||||
service: remotedesktop
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 600, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: 'max(fc_desktop_pool_deficit)', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 600, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 600, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
|
||||
|
||||
- uid: remotedesktop-session-churn-spike
|
||||
title: RemoteDesktop launch rate spike
|
||||
condition: C
|
||||
for: 5m
|
||||
noDataState: OK
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: RemoteDesktop launch rate exceeds 20/min
|
||||
description: "Launch events >20/min for 5 minutes. Could be a user-facing feature launch, pooled template thrashing, or runaway automation loop."
|
||||
runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop -o wide | wc -l 2. curl -sk https://desktop.iamworkin.lan/api/sessions/active 3. Check operator logs for reconcile loops"
|
||||
labels:
|
||||
severity: info
|
||||
service: remotedesktop
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: 'sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 300, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [20], type: gt}}], refId: C}
|
||||
|
||||
- uid: remotedesktop-tls-expiry
|
||||
title: RemoteDesktop TLS cert expiring
|
||||
condition: C
|
||||
for: 6h
|
||||
noDataState: OK
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: desktop.iamworkin.lan cert <2d to expiry
|
||||
description: "The desktop.iamworkin.lan certificate is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, FlowerCore.DNS preflight for dnsNames."
|
||||
runbook: "1. kubectl -n fc-desktop get certificate remotedesktop-web-tls 2. kubectl -n cert-manager logs deploy/cert-manager --tail=50 3. Verify pfSense DNS override for desktop.iamworkin.lan"
|
||||
labels:
|
||||
severity: critical
|
||||
service: remotedesktop
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange: {from: 21600, to: 0}
|
||||
datasourceUid: prometheus
|
||||
model: {expr: '(probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time()) / 86400', instant: true, refId: A}
|
||||
- refId: B
|
||||
relativeTimeRange: {from: 21600, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 21600, to: 0}
|
||||
datasourceUid: __expr__
|
||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [2], type: lt}}], refId: C}
|
||||
|
||||
# =============================================================================
|
||||
# Deployment: Grafana
|
||||
@@ -3083,6 +3715,9 @@ spec:
|
||||
- name: dashboards-infra-overview
|
||||
mountPath: /var/lib/grafana/dashboards/infra-overview
|
||||
readOnly: true
|
||||
- name: dashboards-remotedesktop
|
||||
mountPath: /var/lib/grafana/dashboards/remotedesktop
|
||||
readOnly: true
|
||||
- name: datasource-provisioning
|
||||
mountPath: /etc/grafana/provisioning/datasources
|
||||
readOnly: true
|
||||
@@ -3133,6 +3768,9 @@ spec:
|
||||
- name: dashboards-infra-overview
|
||||
configMap:
|
||||
name: grafana-dashboard-infra-overview
|
||||
- name: dashboards-remotedesktop
|
||||
configMap:
|
||||
name: grafana-dashboard-remotedesktop
|
||||
- name: datasource-provisioning
|
||||
configMap:
|
||||
name: grafana-datasource-provisioning
|
||||
@@ -3694,6 +4332,66 @@ spec:
|
||||
ports:
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
# FlowerCore.RemoteDesktop /metrics scrape via the fc-desktop
|
||||
# ClusterIP Service (remotedesktop-web:8080). Also covers the
|
||||
# Traefik VIP hairpin path since after kube-proxy DNAT, the egress
|
||||
# destination is the backend pod IP on the service port (see
|
||||
# feedback_netpol_dnat_backend_port).
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: fc-desktop
|
||||
ports:
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
# Traefik backend ports — needed for in-cluster egress to public
|
||||
# iamworkin.lan hostnames that CoreDNS wildcard resolves to the
|
||||
# LoadBalancer VIP. Post-DNAT destination is a Traefik pod on 8080/8443.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: traefik-system
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: traefik
|
||||
ports:
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
- port: 8443
|
||||
protocol: TCP
|
||||
# Traefik /metrics endpoint (port 9100) — separate from the data-path
|
||||
# ports above. Required for the in-cluster `traefik` scrape job.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: traefik-system
|
||||
ports:
|
||||
- port: 9100
|
||||
protocol: TCP
|
||||
# kube-state-metrics — required for kubernetes-state alert group.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: kube-system
|
||||
ports:
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
# cert-manager metrics — required for CertManagerCertificate* alerts.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: cert-manager
|
||||
ports:
|
||||
- port: 9402
|
||||
protocol: TCP
|
||||
# Longhorn manager metrics — required for Longhorn* alerts.
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: longhorn-system
|
||||
ports:
|
||||
- port: 9500
|
||||
protocol: TCP
|
||||
# IRC (irc-notify → UnrealIRCd in irc namespace via K8s DNS)
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
|
||||
286
apps/multus/multus.yaml
Normal file
286
apps/multus/multus.yaml
Normal file
@@ -0,0 +1,286 @@
|
||||
# =============================================================================
|
||||
# Multus CNI — Meta-CNI for multi-network attachment to pods/VMs
|
||||
# =============================================================================
|
||||
# Purpose: enable KubeVirt VMs (and any future workload) to attach additional
|
||||
# network interfaces beyond the default Calico-managed pod network. Required
|
||||
# for ci1 (Windows Server 2025 KubeVirt VM) to bridge onto PROD VLAN 57.
|
||||
#
|
||||
# Source: upstream k8snetworkplumbingwg/multus-cni v4.2.2
|
||||
# https://github.com/k8snetworkplumbingwg/multus-cni/blob/v4.2.2/deployments/multus-daemonset-thick.yml
|
||||
#
|
||||
# Inlined verbatim (with project header + version pin annotation) for
|
||||
# reproducibility and air-gap safety. Bumping versions = edit this file +
|
||||
# git push. ArgoCD picks up via the bluejay-infra ApplicationSet
|
||||
# (apps/* directory generator on main).
|
||||
#
|
||||
# Why thick plugin (not thin):
|
||||
# - Thick = daemon + thin shim binary; daemon handles NAD watch + CRD reads
|
||||
# centrally so each pod's CNI ADD doesn't hit the K8s API server. Better
|
||||
# for clusters with many NAD-using pods.
|
||||
# - Thin = each CNI ADD process directly contacts K8s API. Simpler but
|
||||
# scales worse and has more failure modes.
|
||||
# - KubeVirt + multi-VM workload pattern fits thick perfectly.
|
||||
#
|
||||
# Cluster context (verified 2026-05-08):
|
||||
# - RKE2 v1.34.5 on 3 nodes (rke2-server, rke2-agent1, rke2-agent2)
|
||||
# - Calico CNI (Tigera-managed) at /etc/cni/net.d + /opt/cni/bin (default)
|
||||
# - openSUSE Leap 16, kernel 6.12, containerd 2.1.5
|
||||
# - host bridge for PROD VLAN 57 = `br-prod` (PUPPET HOST WORK — see Phase 1.5
|
||||
# in docs/infrastructure/windows-server-build-runner-plan.md)
|
||||
#
|
||||
# Version pin: NOT pinned yet — the DaemonSet below runs the floating
|
||||
# `snapshot-thick` image tag. Pinning to the v4.2.2 release tag at deploy time
|
||||
# would require a private mirror of the image. Upstream `snapshot-thick` is
|
||||
# updated on every release, so for now we trust upstream + Calico's established pattern. TODO: pin to a specific SHA256 once we mirror to Gitea OCI.
|
||||
#
|
||||
# Apply (once committed to bluejay-infra main, ApplicationSet auto-syncs):
|
||||
# git add apps/multus/multus.yaml && git commit && git push origin main
|
||||
# # ArgoCD `infra-multus` Application appears within 3 min via ApplicationSet
|
||||
#
|
||||
# Verify:
|
||||
# kubectl -n kube-system get ds kube-multus-ds
|
||||
# kubectl -n kube-system rollout status ds kube-multus-ds
|
||||
# kubectl get crd network-attachment-definitions.k8s.cni.cncf.io
|
||||
# =============================================================================
|
||||
|
||||
---
|
||||
apiVersion: apiextensions.k8s.io/v1
|
||||
kind: CustomResourceDefinition
|
||||
metadata:
|
||||
name: network-attachment-definitions.k8s.cni.cncf.io
|
||||
annotations:
|
||||
bluejay.iamworkin.lan/source: "k8snetworkplumbingwg/multus-cni v4.2.2"
|
||||
spec:
|
||||
group: k8s.cni.cncf.io
|
||||
scope: Namespaced
|
||||
names:
|
||||
plural: network-attachment-definitions
|
||||
singular: network-attachment-definition
|
||||
kind: NetworkAttachmentDefinition
|
||||
shortNames:
|
||||
- net-attach-def
|
||||
versions:
|
||||
- name: v1
|
||||
served: true
|
||||
storage: true
|
||||
schema:
|
||||
openAPIV3Schema:
|
||||
description: 'NetworkAttachmentDefinition is a CRD schema specified by the Network Plumbing
|
||||
Working Group to express the intent for attaching pods to one or more logical or physical
|
||||
networks. More information available at: https://github.com/k8snetworkplumbingwg/multi-net-spec'
|
||||
type: object
|
||||
properties:
|
||||
apiVersion:
|
||||
type: string
|
||||
kind:
|
||||
type: string
|
||||
metadata:
|
||||
type: object
|
||||
spec:
|
||||
description: 'NetworkAttachmentDefinition spec defines the desired state of a network attachment'
|
||||
type: object
|
||||
properties:
|
||||
config:
|
||||
description: 'NetworkAttachmentDefinition config is a JSON-formatted CNI configuration'
|
||||
type: string
|
||||
---
|
||||
kind: ClusterRole
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
metadata:
|
||||
name: multus
|
||||
rules:
|
||||
- apiGroups: ["k8s.cni.cncf.io"]
|
||||
resources:
|
||||
- '*'
|
||||
verbs:
|
||||
- '*'
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- pods
|
||||
- pods/status
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- update
|
||||
- watch
|
||||
- apiGroups:
|
||||
- ""
|
||||
- events.k8s.io
|
||||
resources:
|
||||
- events
|
||||
verbs:
|
||||
- create
|
||||
- patch
|
||||
- update
|
||||
---
|
||||
kind: ClusterRoleBinding
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
metadata:
|
||||
name: multus
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: multus
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: multus
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: multus
|
||||
namespace: kube-system
|
||||
---
|
||||
kind: ConfigMap
|
||||
apiVersion: v1
|
||||
metadata:
|
||||
name: multus-daemon-config
|
||||
namespace: kube-system
|
||||
labels:
|
||||
tier: node
|
||||
app: multus
|
||||
data:
|
||||
daemon-config.json: |
|
||||
{
|
||||
"chrootDir": "/hostroot",
|
||||
"cniVersion": "0.3.1",
|
||||
"logLevel": "verbose",
|
||||
"logToStderr": true,
|
||||
"cniConfigDir": "/host/etc/cni/net.d",
|
||||
"multusAutoconfigDir": "/host/etc/cni/net.d",
|
||||
"multusConfigFile": "auto",
|
||||
"socketDir": "/host/run/multus/"
|
||||
}
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: kube-multus-ds
|
||||
namespace: kube-system
|
||||
labels:
|
||||
tier: node
|
||||
app: multus
|
||||
name: multus
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
name: multus
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
tier: node
|
||||
app: multus
|
||||
name: multus
|
||||
spec:
|
||||
hostNetwork: true
|
||||
hostPID: true
|
||||
tolerations:
|
||||
- operator: Exists
|
||||
effect: NoSchedule
|
||||
- operator: Exists
|
||||
effect: NoExecute
|
||||
serviceAccountName: multus
|
||||
containers:
|
||||
- name: kube-multus
|
||||
image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
|
||||
command: [ "/usr/src/multus-cni/bin/multus-daemon" ]
|
||||
resources:
|
||||
requests:
|
||||
cpu: "100m"
|
||||
memory: "50Mi"
|
||||
limits:
|
||||
cpu: "100m"
|
||||
memory: "50Mi"
|
||||
securityContext:
|
||||
privileged: true
|
||||
terminationMessagePolicy: FallbackToLogsOnError
|
||||
volumeMounts:
|
||||
- name: cni
|
||||
mountPath: /host/etc/cni/net.d
|
||||
# multus-daemon expects that cnibin path must be identical between pod and container host.
|
||||
# e.g. if the cni bin is in '/opt/cni/bin' on the container host side, then it should be mount to '/opt/cni/bin' in multus-daemon,
|
||||
# not to any other directory, like '/opt/bin' or '/usr/bin'.
|
||||
- name: cnibin
|
||||
mountPath: /opt/cni/bin
|
||||
- name: host-run
|
||||
mountPath: /host/run
|
||||
- name: host-var-lib-cni-multus
|
||||
mountPath: /var/lib/cni/multus
|
||||
- name: host-var-lib-kubelet
|
||||
mountPath: /var/lib/kubelet
|
||||
mountPropagation: HostToContainer
|
||||
- name: host-run-k8s-cni-cncf-io
|
||||
mountPath: /run/k8s.cni.cncf.io
|
||||
- name: host-run-netns
|
||||
mountPath: /run/netns
|
||||
mountPropagation: HostToContainer
|
||||
- name: multus-daemon-config
|
||||
mountPath: /etc/cni/net.d/multus.d
|
||||
readOnly: true
|
||||
- name: hostroot
|
||||
mountPath: /hostroot
|
||||
mountPropagation: HostToContainer
|
||||
- mountPath: /etc/cni/multus/net.d
|
||||
name: multus-conf-dir
|
||||
env:
|
||||
- name: MULTUS_NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
initContainers:
|
||||
- name: install-multus-binary
|
||||
image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
|
||||
command:
|
||||
- "sh"
|
||||
- "-c"
|
||||
- "cp /usr/src/multus-cni/bin/multus-shim /host/opt/cni/bin/multus-shim && cp /usr/src/multus-cni/bin/passthru /host/opt/cni/bin/passthru"
|
||||
resources:
|
||||
requests:
|
||||
cpu: "10m"
|
||||
memory: "15Mi"
|
||||
securityContext:
|
||||
privileged: true
|
||||
terminationMessagePolicy: FallbackToLogsOnError
|
||||
volumeMounts:
|
||||
- name: cnibin
|
||||
mountPath: /host/opt/cni/bin
|
||||
mountPropagation: Bidirectional
|
||||
terminationGracePeriodSeconds: 10
|
||||
volumes:
|
||||
- name: cni
|
||||
hostPath:
|
||||
path: /etc/cni/net.d
|
||||
- name: cnibin
|
||||
hostPath:
|
||||
path: /opt/cni/bin
|
||||
- name: hostroot
|
||||
hostPath:
|
||||
path: /
|
||||
- name: multus-daemon-config
|
||||
configMap:
|
||||
name: multus-daemon-config
|
||||
items:
|
||||
- key: daemon-config.json
|
||||
path: daemon-config.json
|
||||
- name: host-run
|
||||
hostPath:
|
||||
path: /run
|
||||
- name: host-var-lib-cni-multus
|
||||
hostPath:
|
||||
path: /var/lib/cni/multus
|
||||
- name: host-var-lib-kubelet
|
||||
hostPath:
|
||||
path: /var/lib/kubelet
|
||||
- name: host-run-k8s-cni-cncf-io
|
||||
hostPath:
|
||||
path: /run/k8s.cni.cncf.io
|
||||
- name: host-run-netns
|
||||
hostPath:
|
||||
path: /run/netns/
|
||||
- name: multus-conf-dir
|
||||
hostPath:
|
||||
path: /etc/cni/multus/net.d
|
||||
@@ -219,6 +219,65 @@ spec:
|
||||
tls:
|
||||
secretName: cockpit-tls
|
||||
---
|
||||
# ============================================================
|
||||
# PuppetDB Dashboard - noc1:8080 (HTTP, web UI only)
|
||||
# Agent-to-PuppetDB mTLS still uses port 8081 directly via Puppet CA
|
||||
# (NOT via this proxy). See docs/infrastructure/cert-recovery-2026-04-28.md
|
||||
# ============================================================
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: puppetdb-external
|
||||
namespace: noc-proxy
|
||||
spec:
|
||||
ports:
|
||||
- port: 8080
|
||||
targetPort: 8080
|
||||
name: http
|
||||
clusterIP: None
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Endpoints
|
||||
metadata:
|
||||
name: puppetdb-external
|
||||
namespace: noc-proxy
|
||||
subsets:
|
||||
- addresses:
|
||||
- ip: 10.0.56.10
|
||||
ports:
|
||||
- port: 8080
|
||||
name: http
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: puppetdb-tls
|
||||
namespace: noc-proxy
|
||||
spec:
|
||||
secretName: puppetdb-tls
|
||||
issuerRef:
|
||||
name: step-ca-acme
|
||||
kind: ClusterIssuer
|
||||
dnsNames:
|
||||
- puppetdb.iamworkin.lan
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: puppetdb
|
||||
namespace: noc-proxy
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- kind: Rule
|
||||
match: Host(`puppetdb.iamworkin.lan`)
|
||||
services:
|
||||
- name: puppetdb-external
|
||||
port: 8080
|
||||
tls:
|
||||
secretName: puppetdb-tls
|
||||
---
|
||||
# NetworkPolicy: allow Traefik ingress, allow egress to noc1
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
@@ -242,6 +301,8 @@ spec:
|
||||
ports:
|
||||
- port: 3000
|
||||
protocol: TCP
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
- port: 9090
|
||||
protocol: TCP
|
||||
- port: 9091
|
||||
|
||||
210
apps/selenium/network-policy.yaml
Normal file
210
apps/selenium/network-policy.yaml
Normal file
@@ -0,0 +1,210 @@
|
||||
# Selenium Grid NetworkPolicy.
|
||||
#
|
||||
# Captured into bluejay-infra 2026-05-07 during the regroup audit. This
|
||||
# NetworkPolicy was previously applied via `kubectl apply` directly to
|
||||
# the cluster with no source-of-truth anywhere — a fresh cluster rebuild
|
||||
# would have lost all of it (including the Selenium Grid → Traefik VIP
|
||||
# allow rule for AAT runs against `*.iamworkin.lan` services).
|
||||
#
|
||||
# The Selenium Grid Deployment + Services themselves are still managed
|
||||
# outside ArgoCD (deployed via raw kubectl from the original Selenium
|
||||
# Grid bring-up). Migrating those into bluejay-infra is a separate lane —
|
||||
# this commit only restores GitOps repeatability for the NetworkPolicy.
|
||||
#
|
||||
# Rules captured from the live cluster's `kubectl get netpol -n selenium
|
||||
# selenium-netpol -o yaml` on 2026-05-07. Originally applied 2026-03-15
|
||||
# (from `metadata.creationTimestamp` before the field was stripped).
|
||||
#
|
||||
# Allows:
|
||||
# - Egress: CoreDNS, intra-namespace pod-to-pod (4442/4443/4444/5555),
|
||||
# Traefik VIP for `*.iamworkin.lan` AAT runs, all namespaces (empty
|
||||
# namespaceSelector) on 80/443 + FC service ports (5100/5200/5300/5400/8080), pod CIDR
|
||||
# (10.42.0.0/16) + service CIDR (10.43.0.0/16) for the same ports plus 8443,
|
||||
# LAN gateway range (10.0.56.0/24) for HTTP/HTTPS (80/443/8443), edge2 CUPS print
|
||||
# (10.0.57.16:5200), public internet 80/443 (excluding only 172.16.0.0/12 and 192.168.0.0/16 — 10.0.0.0/8 is NOT in the except list), and
|
||||
# fc-signage:5190 for the signage AAT lane.
|
||||
# - Ingress: Traefik (4444 + 8089 ACME-solver-style), intra-namespace pod-to-pod (4442/4443/4444/5555),
|
||||
# telephony / gitea / fc-system / fc-signage namespaces on 4444.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: selenium-netpol
|
||||
namespace: selenium
|
||||
labels:
|
||||
app.kubernetes.io/part-of: selenium
|
||||
app.kubernetes.io/component: isolation
|
||||
spec:
|
||||
egress:
|
||||
- ports:
|
||||
- port: 53
|
||||
protocol: UDP
|
||||
- port: 53
|
||||
protocol: TCP
|
||||
to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: kube-system
|
||||
- ports:
|
||||
- port: 4442
|
||||
protocol: TCP
|
||||
- port: 4443
|
||||
protocol: TCP
|
||||
- port: 4444
|
||||
protocol: TCP
|
||||
- port: 5555
|
||||
protocol: TCP
|
||||
to:
|
||||
- podSelector: {}
|
||||
- ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.56.200/32
|
||||
- ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
- port: 5200
|
||||
protocol: TCP
|
||||
- port: 5300
|
||||
protocol: TCP
|
||||
- port: 5400
|
||||
protocol: TCP
|
||||
- port: 5100
|
||||
protocol: TCP
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
to:
|
||||
- namespaceSelector: {}
|
||||
- ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
- port: 8443
|
||||
protocol: TCP
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
- port: 5200
|
||||
protocol: TCP
|
||||
- port: 5300
|
||||
protocol: TCP
|
||||
- port: 5400
|
||||
protocol: TCP
|
||||
- port: 5100
|
||||
protocol: TCP
|
||||
to:
|
||||
- ipBlock:
|
||||
cidr: 10.43.0.0/16
|
||||
- ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
- port: 8443
|
||||
protocol: TCP
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
- port: 5200
|
||||
protocol: TCP
|
||||
- port: 5300
|
||||
protocol: TCP
|
||||
- port: 5400
|
||||
protocol: TCP
|
||||
- port: 5100
|
||||
protocol: TCP
|
||||
to:
|
||||
- ipBlock:
|
||||
cidr: 10.42.0.0/16
|
||||
- ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
- port: 8443
|
||||
protocol: TCP
|
||||
to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.56.0/24
|
||||
- ports:
|
||||
- port: 5200
|
||||
protocol: TCP
|
||||
to:
|
||||
- ipBlock:
|
||||
cidr: 10.0.57.16/32
|
||||
- ports:
|
||||
- port: 80
|
||||
protocol: TCP
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
to:
|
||||
- ipBlock:
|
||||
cidr: 0.0.0.0/0
|
||||
except:
|
||||
- 172.16.0.0/12
|
||||
- 192.168.0.0/16
|
||||
- ports:
|
||||
- port: 5190
|
||||
protocol: TCP
|
||||
to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: fc-signage
|
||||
ingress:
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: traefik-system
|
||||
ports:
|
||||
- port: 4444
|
||||
protocol: TCP
|
||||
- port: 8089
|
||||
protocol: TCP
|
||||
- from:
|
||||
- podSelector: {}
|
||||
ports:
|
||||
- port: 4442
|
||||
protocol: TCP
|
||||
- port: 4443
|
||||
protocol: TCP
|
||||
- port: 4444
|
||||
protocol: TCP
|
||||
- port: 5555
|
||||
protocol: TCP
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: telephony
|
||||
ports:
|
||||
- port: 4444
|
||||
protocol: TCP
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: gitea
|
||||
ports:
|
||||
- port: 4444
|
||||
protocol: TCP
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: fc-system
|
||||
ports:
|
||||
- port: 4444
|
||||
protocol: TCP
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: fc-signage
|
||||
ports:
|
||||
- port: 4444
|
||||
protocol: TCP
|
||||
podSelector: {}
|
||||
policyTypes:
|
||||
- Ingress
|
||||
- Egress
|
||||
|
||||
@@ -148,7 +148,7 @@ spec:
|
||||
topologyKey: kubernetes.io/hostname
|
||||
containers:
|
||||
- name: telephony-web
|
||||
image: localhost/fc-telephony-web:v20260325d
|
||||
image: localhost/fc-telephony-web:v202604252156
|
||||
imagePullPolicy: Never
|
||||
securityContext:
|
||||
readOnlyRootFilesystem: true
|
||||
|
||||
60
apps/worldbuilder/README.md
Normal file
60
apps/worldbuilder/README.md
Normal file
@@ -0,0 +1,60 @@
|
||||
# FlowerCore.WorldBuilder
|
||||
|
||||
ArgoCD-managed manifest for FlowerCore.WorldBuilder.Web — comic / storyboard
|
||||
authoring service that drives ComfyUI for panel image generation and
|
||||
QuestPDF for letter / A4 export.
|
||||
|
||||
Source: `D:\git\FlowerCore\FlowerCore.WorldBuilder` (master)
|
||||
|
||||
## Deployment order
|
||||
|
||||
1. **DNS preflight** — `worldbuilder.iamworkin.lan -> 10.0.56.200` MUST exist
|
||||
in pfSense Unbound before this manifest is applied, or cert-manager
|
||||
HTTP-01 silently backs off exponentially (~2h).
|
||||
Memory: `feedback_pfsense_dns_required_for_acme`.
|
||||
2. **Image import to ALL RKE2 nodes** — pod can schedule to any of
|
||||
`rke2-server` (10.0.56.11), `rke2-agent1` (10.0.56.12),
|
||||
`rke2-agent2` (10.0.56.13). Build with:
|
||||
```bash
|
||||
bash deploy/build.sh # in FlowerCore.WorldBuilder repo
|
||||
podman save localhost/fc-worldbuilder:v<TAG> -o /tmp/fc-worldbuilder-v<TAG>.tar
|
||||
for h in 10.0.56.11 10.0.56.12 10.0.56.13; do
|
||||
scp /tmp/fc-worldbuilder-v<TAG>.tar fcadmin@$h:/tmp/
|
||||
ssh fcadmin@$h \
|
||||
"sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock \
|
||||
-n k8s.io images import /tmp/fc-worldbuilder-v<TAG>.tar"
|
||||
done
|
||||
```
|
||||
Memory: `feedback_rke2_image_import_per_node_scp`.
|
||||
3. **Bump image tag** in `worldbuilder.yaml` and git push.
|
||||
ArgoCD ApplicationSet picks up within ~3 minutes.
|
||||
4. **First production render** — open `https://worldbuilder.iamworkin.lan`,
|
||||
create World → Character → Storyboard → ExportJob, confirm artifact
|
||||
downloads. ComfyUI lives on BLUEJAY-WS at `http://10.0.56.20:8188`.
|
||||
|
||||
## Health probes
|
||||
|
||||
- `startupProbe` + `readinessProbe`: `httpGet /healthz` (registered explicitly
|
||||
in Program.cs — anonymous, no DB or OpenAPI dependency).
|
||||
- `livenessProbe`: `tcpSocket` as a cheap fallback.
|
||||
Memory: `feedback_k8s_probes_must_not_hit_openapi`,
|
||||
`feedback_k8s_probes_behind_auth_middleware`.
|
||||
|
||||
## Storage
|
||||
|
||||
- Longhorn RWO PVC `worldbuilder-data` (5Gi) mounted at `/data`. SQLite DB
|
||||
lives at `/data/worldbuilder.db`, generated images under `/data/gallery/`,
|
||||
PDF/PNG exports under `/data/exports/`.
|
||||
- DataProtection keys persist to the same SQLite via
|
||||
`AddFlowerCoreDataProtection<WorldBuilderDbContext>` — explicit migration
|
||||
`20260429133417_Initial` already creates `fc_dp_keys`.
|
||||
Memory: `feedback_dataprotection_keys_persist_to_app_dbcontext`,
|
||||
`feedback_intranet_dataprotection_table_must_have_explicit_migration`.
|
||||
|
||||
## Image generation backend
|
||||
|
||||
`FlowerCore:WorldBuilder:ImageGeneration:BaseUrl=http://10.0.56.20:8188` —
|
||||
ComfyUI runs on BLUEJAY-WS Windows (R9700 / gfx1201 / ROCm 7.2.1). Pod reaches
|
||||
the workstation directly across the 10.0.56.0/24 VLAN (no Podman-style host-
|
||||
filter issues — K8s pods route via Calico, which is L3-routed across the
|
||||
VLAN).
|
||||
213
apps/worldbuilder/worldbuilder.yaml
Normal file
213
apps/worldbuilder/worldbuilder.yaml
Normal file
@@ -0,0 +1,213 @@
|
||||
# FlowerCore.WorldBuilder — comic / storyboard authoring service.
|
||||
#
|
||||
# Deployment + Service + PVC + Certificate + IngressRoute. ArgoCD-managed
|
||||
# end-to-end. See apps/worldbuilder/README.md for the per-deploy runbook.
|
||||
#
|
||||
# Image build (BLUEJAY-WS):
|
||||
# bash deploy/build.sh # in FlowerCore.WorldBuilder repo
|
||||
# podman save localhost/fc-worldbuilder:v<TAG> -o /tmp/fc-worldbuilder-v<TAG>.tar
|
||||
# for h in 10.0.56.11 10.0.56.12 10.0.56.13; do
|
||||
# scp /tmp/fc-worldbuilder-v<TAG>.tar fcadmin@$h:/tmp/
|
||||
# ssh fcadmin@$h "sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-worldbuilder-v<TAG>.tar"
|
||||
# done
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: fc-worldbuilder
|
||||
labels:
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
---
|
||||
# SQLite DB + generated image gallery + PDF/PNG exports.
|
||||
# Longhorn RWO — single replica with `Recreate` rollout strategy keeps it safe.
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: worldbuilder-data
|
||||
namespace: fc-worldbuilder
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
storageClassName: longhorn
|
||||
resources:
|
||||
requests:
|
||||
storage: 5Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: worldbuilder-web
|
||||
namespace: fc-worldbuilder
|
||||
labels:
|
||||
app.kubernetes.io/name: worldbuilder-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
strategy:
|
||||
# RWO PVC + single replica. Recreate avoids multi-attach overlap.
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: worldbuilder-web
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: worldbuilder-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8080"
|
||||
prometheus.io/path: "/metrics/prometheus"
|
||||
spec:
|
||||
securityContext:
|
||||
fsGroup: 1654
|
||||
fsGroupChangePolicy: OnRootMismatch
|
||||
containers:
|
||||
- name: web
|
||||
# Bump tag for each rebuild. Initial deploy: v202605062048
|
||||
image: localhost/fc-worldbuilder:v202605062048
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: http
|
||||
env:
|
||||
- name: ASPNETCORE_URLS
|
||||
value: "http://+:8080"
|
||||
- name: ASPNETCORE_ENVIRONMENT
|
||||
value: "Production"
|
||||
- name: DOTNET_RUNNING_IN_CONTAINER
|
||||
value: "true"
|
||||
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
||||
value: "false"
|
||||
# SQLite path overrides (default appsettings uses relative paths).
|
||||
- name: ConnectionStrings__DefaultConnection
|
||||
value: "Data Source=/data/worldbuilder.db"
|
||||
- name: FlowerCore__Database__Provider
|
||||
value: "Sqlite"
|
||||
- name: FlowerCore__Database__ConnectionStrings__Sqlite
|
||||
value: "Data Source=/data/worldbuilder.db"
|
||||
# Generated image gallery + exports persist on /data.
|
||||
- name: FlowerCore__WorldBuilder__ImageStore__RootPath
|
||||
value: "/data/gallery"
|
||||
- name: FlowerCore__WorldBuilder__Export__RootPath
|
||||
value: "/data/exports"
|
||||
# ComfyUI on BLUEJAY-WS (R9700 / gfx1201 / ROCm 7.2.1).
|
||||
- name: FlowerCore__WorldBuilder__ImageGeneration__BaseUrl
|
||||
value: "http://10.0.56.20:8188"
|
||||
- name: FlowerCore__WorldBuilder__ImageGeneration__ClientMode
|
||||
value: "comfyui"
|
||||
resources:
|
||||
# Cluster CPU-request budget runs hot (99% on all 3 nodes at deploy
|
||||
# time) while actual CPU usage is well below capacity. Idle Blazor
|
||||
# Server + SignalR + a single ComfyUI poller uses ~5m, so 25m is
|
||||
# generous. Re-evaluate if active rendering/export workers ever
|
||||
# push past the limit.
|
||||
requests:
|
||||
cpu: 25m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 1000m
|
||||
memory: 768Mi
|
||||
# /healthz is registered explicitly in Program.cs (anonymous, no DB
|
||||
# or OpenAPI dependency). Liveness uses tcpSocket as a cheap fallback
|
||||
# in case future middleware changes accidentally gate /healthz.
|
||||
# Memory: feedback_k8s_probes_must_not_hit_openapi,
|
||||
# feedback_k8s_probes_behind_auth_middleware.
|
||||
startupProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: 8080
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
failureThreshold: 30
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: 8080
|
||||
periodSeconds: 10
|
||||
failureThreshold: 3
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
failureThreshold: 3
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1654
|
||||
runAsGroup: 1654
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
- name: tmp
|
||||
mountPath: /tmp
|
||||
- name: logs
|
||||
mountPath: /app/logs
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: worldbuilder-data
|
||||
- name: tmp
|
||||
emptyDir: {}
|
||||
- name: logs
|
||||
emptyDir: {}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: worldbuilder-web
|
||||
namespace: fc-worldbuilder
|
||||
labels:
|
||||
app.kubernetes.io/name: worldbuilder-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app.kubernetes.io/name: worldbuilder-web
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: 8080
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: worldbuilder-web-tls
|
||||
namespace: fc-worldbuilder
|
||||
spec:
|
||||
secretName: worldbuilder-web-tls
|
||||
issuerRef:
|
||||
name: step-ca-acme
|
||||
kind: ClusterIssuer
|
||||
dnsNames:
|
||||
- worldbuilder.iamworkin.lan
|
||||
# step-ca ACME provisioner caps lifetime at 30d. A 90d request was
|
||||
# silently capped to 30d, making renewBefore 720h (30d) equal to the
|
||||
# actual cert lifetime, which triggered a perpetual renewal loop that
|
||||
# generated 2365+ CertificateRequest objects in 18h. Match the working
|
||||
# 720h/240h pattern used by every other FC service cert.
|
||||
duration: 720h # 30d (step-ca cap)
|
||||
renewBefore: 240h # 10d
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: worldbuilder-web
|
||||
namespace: fc-worldbuilder
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- match: Host(`worldbuilder.iamworkin.lan`)
|
||||
kind: Rule
|
||||
services:
|
||||
- name: worldbuilder-web
|
||||
port: 80
|
||||
tls:
|
||||
secretName: worldbuilder-web-tls
|
||||
96
apps/zabbix/templates/flowercore-print-ollama.yaml
Normal file
96
apps/zabbix/templates/flowercore-print-ollama.yaml
Normal file
@@ -0,0 +1,96 @@
|
||||
zabbix_export:
|
||||
version: '7.2'
|
||||
template_groups:
|
||||
- uuid: 30a90fb5fb3e4a7f9bb4517022c7726a
|
||||
name: Templates/FlowerCore
|
||||
templates:
|
||||
- uuid: 89cecb27144c4b539bd8972d4d949063
|
||||
template: FlowerCore Print Ollama
|
||||
name: FlowerCore Print Ollama
|
||||
description: FlowerCore Print.Web Ollama health probe checks. Import this template into Zabbix and link it to the Print.Web host.
|
||||
groups:
|
||||
- name: Templates/FlowerCore
|
||||
items:
|
||||
- uuid: 8fd2720255d54bc8bda0fe3ab4677c6c
|
||||
name: Print.Web metrics snapshot
|
||||
type: HTTP_AGENT
|
||||
key: flowercore.print.ollama.snapshot
|
||||
delay: 30s
|
||||
history: 7d
|
||||
trends: '0'
|
||||
value_type: TEXT
|
||||
url: http://10.0.57.16:5200/api/metrics
|
||||
timeout: 5s
|
||||
description: Raw JSON from Print.Web GET /api/metrics. The Ollama summary is public monitoring data; /api/ai/ollama-snapshot remains API-key protected.
|
||||
- uuid: 5cb902556e9f45c2b4c29c5c4a32fd73
|
||||
name: Print.Web Ollama long keep-alive runner count
|
||||
type: DEPENDENT
|
||||
key: flowercore.print.ollama.long_keepalive.count
|
||||
delay: '0'
|
||||
history: 7d
|
||||
trends: 30d
|
||||
value_type: UNSIGNED
|
||||
description: Number of active Ollama runners whose keep-alive window remains above 10 minutes.
|
||||
preprocessing:
|
||||
- type: JAVASCRIPT
|
||||
parameters:
|
||||
- |
|
||||
var payload = JSON.parse(value);
|
||||
var ollama = payload.ollama || payload.Ollama || {};
|
||||
var runners = ollama.runners || ollama.Runners || [];
|
||||
if (!Array.isArray(runners)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
var count = 0;
|
||||
for (var i = 0; i < runners.length; i += 1) {
|
||||
var runner = runners[i] || {};
|
||||
var markedLong = runner.longKeepAlive || runner.LongKeepAlive;
|
||||
var remainingRaw = runner.keepAliveRemainingSeconds;
|
||||
if (remainingRaw === undefined || remainingRaw === null) {
|
||||
remainingRaw = runner.KeepAliveRemainingSeconds;
|
||||
}
|
||||
var remaining = Number(remainingRaw || 0);
|
||||
if (markedLong === true || remaining > 600) {
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
return count;
|
||||
master_item:
|
||||
key: flowercore.print.ollama.snapshot
|
||||
- uuid: 73680dcbbe4844f48378c9f3042641f1
|
||||
name: Print.Web Ollama active runner count
|
||||
type: DEPENDENT
|
||||
key: flowercore.print.ollama.active_runner.count
|
||||
delay: '0'
|
||||
history: 7d
|
||||
trends: 30d
|
||||
value_type: UNSIGNED
|
||||
description: Active runner count from the Print.Web Ollama snapshot.
|
||||
preprocessing:
|
||||
- type: JAVASCRIPT
|
||||
parameters:
|
||||
- |
|
||||
var payload = JSON.parse(value);
|
||||
var ollama = payload.ollama || payload.Ollama || {};
|
||||
var activeRunnerRaw = ollama.activeRunnerCount;
|
||||
if (activeRunnerRaw === undefined || activeRunnerRaw === null) {
|
||||
activeRunnerRaw = ollama.ActiveRunnerCount;
|
||||
}
|
||||
var activeRunnerCount = Number(activeRunnerRaw);
|
||||
if (!isNaN(activeRunnerCount)) {
|
||||
return activeRunnerCount;
|
||||
}
|
||||
|
||||
var runners = ollama.runners || ollama.Runners || [];
|
||||
return Array.isArray(runners) ? runners.length : 0;
|
||||
master_item:
|
||||
key: flowercore.print.ollama.snapshot
|
||||
triggers:
|
||||
- uuid: 8fcd85b7e6e9423099b5e2bcbba3537e
|
||||
expression: last(/FlowerCore Print Ollama/flowercore.print.ollama.long_keepalive.count)>0
|
||||
name: Print.Web Ollama runner keep-alive exceeds 10 minutes
|
||||
priority: WARNING
|
||||
description: Print.Web reports at least one active Ollama runner with more than 10 minutes of keep-alive remaining. Check the Admin Ollama Fleet panel and stop duplicate model callers before the Pi 5 Ollama lane thrashes.
|
||||
manual_close: 'YES'
|
||||
174
apps/zabbix/templates/flowercore-remotedesktop.yaml
Normal file
174
apps/zabbix/templates/flowercore-remotedesktop.yaml
Normal file
@@ -0,0 +1,174 @@
|
||||
zabbix_export:
|
||||
version: '7.2'
|
||||
template_groups:
|
||||
- uuid: 2ce6df1168bd4797aa5374fd19438746
|
||||
name: Templates/FlowerCore
|
||||
templates:
|
||||
- uuid: 5b20d8f9d3c346f7b1c7fe6922e9d4d1
|
||||
template: FlowerCore RemoteDesktop
|
||||
name: FlowerCore RemoteDesktop
|
||||
description: Optional RemoteDesktop observability import. This template reads the Prometheus exposition from FlowerCore.RemoteDesktop and extracts launch/connect/disconnect/recording counters plus warm-pool gauges. Adjust the metrics URL if the Zabbix host should scrape a different endpoint than the public desktop host.
|
||||
groups:
|
||||
- name: Templates/FlowerCore
|
||||
items:
|
||||
- uuid: 357ab8ec721a4d31a5488bdd60a6679d
|
||||
name: RemoteDesktop metrics snapshot
|
||||
type: HTTP_AGENT
|
||||
key: flowercore.remotedesktop.metrics
|
||||
delay: 30s
|
||||
history: 7d
|
||||
trends: '0'
|
||||
value_type: TEXT
|
||||
url: https://desktop.iamworkin.lan/metrics
|
||||
timeout: 10s
|
||||
description: Raw Prometheus exposition from FlowerCore.RemoteDesktop.
|
||||
- uuid: 59af4d77fbb54dc6a733f8dc86d73c3d
|
||||
name: RemoteDesktop launch events total
|
||||
type: DEPENDENT
|
||||
key: flowercore.remotedesktop.launch.total
|
||||
delay: '0'
|
||||
history: 30d
|
||||
trends: 365d
|
||||
value_type: FLOAT
|
||||
preprocessing:
|
||||
- type: JAVASCRIPT
|
||||
parameters:
|
||||
- |
|
||||
var lines = String(value || '').split(/\r?\n/);
|
||||
var sum = 0;
|
||||
for (var i = 0; i < lines.length; i += 1) {
|
||||
var line = lines[i];
|
||||
if (line.indexOf('fc_desktop_session_events_total{') !== 0 || line.indexOf('event="launch"') === -1) {
|
||||
continue;
|
||||
}
|
||||
var parts = line.trim().split(/\s+/);
|
||||
var metricValue = Number(parts[parts.length - 1]);
|
||||
if (!isNaN(metricValue)) {
|
||||
sum += metricValue;
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
master_item:
|
||||
key: flowercore.remotedesktop.metrics
|
||||
- uuid: 479e5d87f8e14e9cb4c45f1832723a34
|
||||
name: RemoteDesktop connect events total (json datasource)
|
||||
type: DEPENDENT
|
||||
key: flowercore.remotedesktop.connect.json.total
|
||||
delay: '0'
|
||||
history: 30d
|
||||
trends: 365d
|
||||
value_type: FLOAT
|
||||
preprocessing:
|
||||
- type: JAVASCRIPT
|
||||
parameters:
|
||||
- |
|
||||
var lines = String(value || '').split(/\r?\n/);
|
||||
var sum = 0;
|
||||
for (var i = 0; i < lines.length; i += 1) {
|
||||
var line = lines[i];
|
||||
if (line.indexOf('fc_desktop_session_events_total{') !== 0
|
||||
|| line.indexOf('event="connect"') === -1
|
||||
|| line.indexOf('browser_datasource="json"') === -1) {
|
||||
continue;
|
||||
}
|
||||
var parts = line.trim().split(/\s+/);
|
||||
var metricValue = Number(parts[parts.length - 1]);
|
||||
if (!isNaN(metricValue)) {
|
||||
sum += metricValue;
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
master_item:
|
||||
key: flowercore.remotedesktop.metrics
|
||||
- uuid: 8ad073699ca74a99ab36ef1e4a4b06b8
|
||||
name: RemoteDesktop disconnect events total
|
||||
type: DEPENDENT
|
||||
key: flowercore.remotedesktop.disconnect.total
|
||||
delay: '0'
|
||||
history: 30d
|
||||
trends: 365d
|
||||
value_type: FLOAT
|
||||
preprocessing:
|
||||
- type: JAVASCRIPT
|
||||
parameters:
|
||||
- |
|
||||
var lines = String(value || '').split(/\r?\n/);
|
||||
var sum = 0;
|
||||
for (var i = 0; i < lines.length; i += 1) {
|
||||
var line = lines[i];
|
||||
if (line.indexOf('fc_desktop_session_events_total{') !== 0 || line.indexOf('event="disconnect"') === -1) {
|
||||
continue;
|
||||
}
|
||||
var parts = line.trim().split(/\s+/);
|
||||
var metricValue = Number(parts[parts.length - 1]);
|
||||
if (!isNaN(metricValue)) {
|
||||
sum += metricValue;
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
master_item:
|
||||
key: flowercore.remotedesktop.metrics
|
||||
- uuid: 0a50ab8cd4ab4c97ac52f3d94b02ff8f
|
||||
name: RemoteDesktop recording events total
|
||||
type: DEPENDENT
|
||||
key: flowercore.remotedesktop.recording.total
|
||||
delay: '0'
|
||||
history: 30d
|
||||
trends: 365d
|
||||
value_type: FLOAT
|
||||
preprocessing:
|
||||
- type: JAVASCRIPT
|
||||
parameters:
|
||||
- |
|
||||
var lines = String(value || '').split(/\r?\n/);
|
||||
var sum = 0;
|
||||
for (var i = 0; i < lines.length; i += 1) {
|
||||
var line = lines[i];
|
||||
if (line.indexOf('fc_desktop_session_events_total{') !== 0 || line.indexOf('event="recording"') === -1) {
|
||||
continue;
|
||||
}
|
||||
var parts = line.trim().split(/\s+/);
|
||||
var metricValue = Number(parts[parts.length - 1]);
|
||||
if (!isNaN(metricValue)) {
|
||||
sum += metricValue;
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
master_item:
|
||||
key: flowercore.remotedesktop.metrics
|
||||
- uuid: 5d4d5e7b38d14c68a72877e37d7f1bde
|
||||
name: RemoteDesktop warm pools ready
|
||||
type: DEPENDENT
|
||||
key: flowercore.remotedesktop.pool.ready
|
||||
delay: '0'
|
||||
history: 30d
|
||||
trends: 365d
|
||||
value_type: FLOAT
|
||||
preprocessing:
|
||||
- type: JAVASCRIPT
|
||||
parameters:
|
||||
- |
|
||||
var lines = String(value || '').split(/\r?\n/);
|
||||
var sum = 0;
|
||||
for (var i = 0; i < lines.length; i += 1) {
|
||||
var line = lines[i];
|
||||
if (line.indexOf('fc_desktop_pool_ready{') !== 0) {
|
||||
continue;
|
||||
}
|
||||
var parts = line.trim().split(/\s+/);
|
||||
var metricValue = Number(parts[parts.length - 1]);
|
||||
if (!isNaN(metricValue)) {
|
||||
sum += metricValue;
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
master_item:
|
||||
key: flowercore.remotedesktop.metrics
|
||||
valuemaps: []
|
||||
triggers:
|
||||
- uuid: 5ef71c752fa94d2e8ce3ced79fcfe0f4
|
||||
expression: nodata(/FlowerCore RemoteDesktop/flowercore.remotedesktop.metrics,10m)=1
|
||||
name: FlowerCore RemoteDesktop metrics unavailable
|
||||
priority: WARNING
|
||||
description: FlowerCore.RemoteDesktop /metrics has not returned data for 10 minutes. Check the web deployment, ingress, or the scrape URL configured in this template.
|
||||
manual_close: 'YES'
|
||||
@@ -15,15 +15,21 @@ apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: zabbix-postgres
|
||||
namespace: zabbix
|
||||
labels:
|
||||
app: zabbix-postgres
|
||||
spec:
|
||||
serviceName: zabbix-postgres
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: zabbix-postgres
|
||||
namespace: zabbix
|
||||
labels:
|
||||
app: zabbix-postgres
|
||||
argocd.argoproj.io/instance: infra-zabbix
|
||||
spec:
|
||||
persistentVolumeClaimRetentionPolicy:
|
||||
whenDeleted: Retain
|
||||
whenScaled: Retain
|
||||
podManagementPolicy: OrderedReady
|
||||
serviceName: zabbix-postgres
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 10
|
||||
selector:
|
||||
matchLabels:
|
||||
app: zabbix-postgres
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
@@ -99,12 +105,17 @@ spec:
|
||||
name: zabbix-postgres-data
|
||||
spec:
|
||||
accessModes: [ReadWriteOnce]
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
volumeMode: Filesystem
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
||||
updateStrategy:
|
||||
rollingUpdate:
|
||||
partition: 0
|
||||
type: RollingUpdate
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: zabbix-postgres
|
||||
namespace: zabbix
|
||||
|
||||
399
scripts/check-pfsense-dns.py
Normal file
399
scripts/check-pfsense-dns.py
Normal file
@@ -0,0 +1,399 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
check-pfsense-dns.py
|
||||
|
||||
Historical name retained for continuity, but the check now runs through the
|
||||
public FlowerCore.DNS preflight API instead of a raw local resolver lookup.
|
||||
|
||||
Fails if any *.iamworkin.lan hostname referenced by a cert-manager Certificate
|
||||
`spec.dnsNames` or a Traefik IngressRoute `Host(...)` match rule is NOT
|
||||
resolvable via FlowerCore.DNS:
|
||||
|
||||
GET /api/v1/zones/{zone}/resolve-preflight?hostname=<host>
|
||||
|
||||
Two sources are scanned:
|
||||
|
||||
1. apps/*/*.yaml in this bluejay-infra checkout — the pre-merge gate.
|
||||
2. Live-cluster Certificates + IngressRoutes (opt-in with --live, or auto when
|
||||
kubectl is on PATH AND kubeconfig is usable). This catches hostnames that
|
||||
exist in the running cluster but aren't (yet) tracked in bluejay-infra —
|
||||
e.g. services deployed via their own repo's deploy script. Retail.Web on
|
||||
2026-04-23 was stuck Issuing for 15h because of exactly this gap.
|
||||
|
||||
Run from anywhere that can reach the FlowerCore.DNS host:
|
||||
|
||||
python scripts/check-pfsense-dns.py # auto live scan if kubectl works
|
||||
python scripts/check-pfsense-dns.py --live # require live scan
|
||||
python scripts/check-pfsense-dns.py --no-live # manifests only (CI default)
|
||||
|
||||
Exit code 0: all referenced hosts pass FlowerCore.DNS preflight.
|
||||
Exit code 1: at least one host fails preflight.
|
||||
Exit code 2: --live requested but kubectl was unusable.
|
||||
|
||||
This is intentionally narrow: it only flags hostnames that cert-manager will
|
||||
actually try to validate or that Traefik will route. IRC server-link names,
|
||||
Docker image tags, comments, etc. are ignored.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import ssl
|
||||
import subprocess
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import yaml # PyYAML
|
||||
except ImportError:
|
||||
sys.exit("PyYAML required: pip install pyyaml")
|
||||
|
||||
# Repository root: two directory levels above this script's resolved path.
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
# Directory tree whose *.yaml manifests are scanned by collect_hosts_from_manifests().
APPS_DIR = REPO_ROOT / "apps"
|
||||
|
||||
# Captures the backtick-quoted hostname inside a single-argument Host(`...`) matcher.
HOST_RE = re.compile(r"Host\(`([^`]+)`\)")
|
||||
|
||||
# Prefix used in reference strings for hosts discovered via kubectl rather than a manifest file.
LIVE_SOURCE = "live-cluster"
|
||||
# Defaults for the CLI options below; each can be overridden through its FLOWERCORE_DNS_* environment variable.
DEFAULT_BASE_URL = os.environ.get("FLOWERCORE_DNS_BASE_URL", "https://dns.iamworkin.lan")
|
||||
DEFAULT_ZONE = os.environ.get("FLOWERCORE_DNS_ZONE", "iamworkin.lan")
|
||||
# Parsed with float() at import time, so a non-numeric value raises ValueError on startup.
DEFAULT_TIMEOUT_SECONDS = float(os.environ.get("FLOWERCORE_DNS_TIMEOUT_SECONDS", "20"))
|
||||
# Parsed with int() at import time and clamped to a minimum of one worker.
DEFAULT_WORKERS = max(1, int(os.environ.get("FLOWERCORE_DNS_WORKERS", "8")))
|
||||
|
||||
|
||||
# Immutable record of one hostname's resolve-preflight outcome, as built by preflight_host().
@dataclass(frozen=True)
|
||||
class PreflightResult:
|
||||
# Hostname that was checked.
host: str
|
||||
# True only when the response reported ACME DNS-01 support, resolvability, and a resolved zone.
ok: bool
|
||||
# Taken from the response's "resolvedZone"; None when the request itself failed.
resolved_zone: str | None
|
||||
# Taken from the response's "serverName"; None when the request itself failed.
server_name: str | None
|
||||
# Taken from the response's "provider"; None when the request itself failed.
provider: str | None
|
||||
# String entries of the response's "addresses"; empty when the request itself failed.
addresses: list[str]
|
||||
# Response's "challengeFqdn", or "_acme-challenge.<host>." when the response does not supply one.
challenge_fqdn: str
|
||||
# Response message, or a description of the HTTP/transport error on failure.
message: str
|
||||
|
||||
|
||||
def extract_hosts_from_doc(doc: dict) -> set[str]:
    """Pull iamworkin.lan hostnames from a single K8s manifest doc.

    Only two kinds are inspected:

    * ``Certificate``  - every string in ``spec.dnsNames``.
    * ``IngressRoute`` - every ``Host(`...`)`` matcher found in each route's
      ``match`` rule.

    Anything that is not shaped as expected (non-mapping document or spec,
    non-list ``dnsNames``/``routes``, non-string ``match``) contributes no
    hosts instead of raising, because this function is fed every YAML
    document found under apps/ and must not abort the scan on an odd file.

    Returns the set of hostnames ending in ``.iamworkin.lan``.
    """
    out: set[str] = set()
    if not isinstance(doc, dict):
        return out

    kind = doc.get("kind", "")
    spec = doc.get("spec")
    if not isinstance(spec, dict):
        # Missing, null, or malformed spec: nothing to extract.
        return out

    if kind == "Certificate":
        names = spec.get("dnsNames")
        for name in names if isinstance(names, list) else []:
            if isinstance(name, str) and name.endswith(".iamworkin.lan"):
                out.add(name)

    elif kind == "IngressRoute":
        routes = spec.get("routes")
        for route in routes if isinstance(routes, list) else []:
            match = route.get("match") if isinstance(route, dict) else None
            if not isinstance(match, str):
                # `match: null` or a non-string value would make findall() raise.
                continue
            for h in HOST_RE.findall(match):
                if h.endswith(".iamworkin.lan"):
                    out.add(h)

    return out
|
||||
|
||||
|
||||
def collect_hosts_from_manifests() -> dict[str, list[str]]:
    """hostname -> [list of manifest files that referenced it].

    Walks every ``*.yaml`` file under APPS_DIR (recursively, in sorted order),
    parses each YAML document in it, and records the repo-relative path of
    the file for every hostname extract_hosts_from_doc() reports.

    A file that cannot be read, decoded as UTF-8, or parsed as YAML is
    reported on stderr and skipped so one bad file does not abort the gate.
    Hosts collected from documents preceding the failure in that file are
    kept.
    """
    index: dict[str, list[str]] = {}
    for path in sorted(APPS_DIR.rglob("*.yaml")):
        rel = str(path.relative_to(REPO_ROOT))
        try:
            with path.open("r", encoding="utf-8") as f:
                # safe_load_all is lazy, so parse errors surface inside this loop.
                for doc in yaml.safe_load_all(f):
                    for host in extract_hosts_from_doc(doc):
                        index.setdefault(host, []).append(rel)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
            print(f"warn: could not parse {path}: {e}", file=sys.stderr)
    return index
|
||||
|
||||
|
||||
def _kubectl_json(args: list[str]) -> dict | None:
    """Run `kubectl ... -o json` and return the parsed result, or None on failure.

    ``args`` are the kubectl arguments without the output flag; ``-o json``
    is appended here. None is returned when kubectl cannot be started
    (missing, not executable, ...), exceeds the 20 second timeout, exits
    non-zero, prints invalid JSON, or prints JSON whose top level is not an
    object (callers index the result with ``.get``).
    """
    try:
        r = subprocess.run(
            ["kubectl", *args, "-o", "json"],
            capture_output=True,
            text=True,
            timeout=20,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers FileNotFoundError as well as PermissionError etc.
        return None
    if r.returncode != 0:
        return None
    try:
        parsed = json.loads(r.stdout)
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None
|
||||
|
||||
|
||||
def collect_hosts_from_cluster() -> tuple[dict[str, list[str]], bool]:
    """
    Scan live cluster. Returns (host_index, ok).

    ok=False means kubectl wasn't usable; the caller decides whether that's
    fatal (--live) or just a warning (auto mode).

    host_index maps each iamworkin.lan hostname to reference strings of the
    form "<LIVE_SOURCE> <Kind> <namespace>/<name>". Hostname extraction is
    delegated to extract_hosts_from_doc() so live objects and checked-in
    manifests are interpreted by exactly the same rules.
    """
    if not shutil.which("kubectl"):
        return {}, False

    # The Certificate listing decides whether the scan counts as usable.
    certs = _kubectl_json(["get", "certificate", "-A"])
    if not isinstance(certs, dict):
        return {}, False

    # IngressRoutes are best-effort: a failed listing is skipped without
    # turning the whole scan into a failure.
    irs = _kubectl_json(["get", "ingressroute", "-A"])

    index: dict[str, list[str]] = {}
    for kind, listing in (("Certificate", certs), ("IngressRoute", irs)):
        if not isinstance(listing, dict):
            continue
        for item in listing.get("items") or []:
            if not isinstance(item, dict):
                continue
            meta = item.get("metadata") or {}
            ns = meta.get("namespace", "?")
            name = meta.get("name", "?")
            ref = f"{LIVE_SOURCE} {kind} {ns}/{name}"
            # Pass the kind explicitly rather than trusting the item to carry it.
            found = extract_hosts_from_doc({"kind": kind, "spec": item.get("spec")})
            for host in sorted(found):
                index.setdefault(host, []).append(ref)

    return index, True
|
||||
|
||||
|
||||
def _ssl_context(insecure: bool) -> ssl.SSLContext:
    """Build the TLS context used for FlowerCore.DNS requests.

    With ``insecure`` false this is the standard verifying client context.
    With ``insecure`` true, certificate and hostname verification are turned
    off using the documented public attributes rather than the private
    ``ssl._create_unverified_context`` helper.
    """
    context = ssl.create_default_context()
    if insecure:
        # check_hostname has to be cleared first; setting CERT_NONE while it
        # is still enabled raises ValueError.
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE
    return context
|
||||
|
||||
|
||||
def _preflight_failure(host: str, message: str) -> PreflightResult:
    """Build the failed PreflightResult used when no usable response was obtained."""
    return PreflightResult(
        host=host,
        ok=False,
        resolved_zone=None,
        server_name=None,
        provider=None,
        addresses=[],
        challenge_fqdn=f"_acme-challenge.{host.rstrip('.')}.",
        message=message,
    )


def preflight_host(
    base_url: str,
    zone: str,
    host: str,
    timeout_seconds: float,
    insecure: bool,
) -> PreflightResult:
    """Ask FlowerCore.DNS whether ``host`` passes resolve-preflight.

    Issues ``GET {base_url}/api/v1/zones/{zone}/resolve-preflight?hostname={host}``
    and converts the JSON answer into a PreflightResult.

    Args:
        base_url: FlowerCore.DNS base URL; a trailing slash is tolerated.
        zone: Zone name placed in the URL path (percent-encoded).
        host: Hostname to check (percent-encoded into the query string).
        timeout_seconds: Timeout handed to urlopen for this request.
        insecure: When true, TLS verification is skipped.

    Returns:
        A PreflightResult. ``ok`` is true only when the response reports
        ``supportsAcmeDns01``, ``resolvable`` and a non-empty ``resolvedZone``.
        HTTP errors, transport errors, invalid JSON and a JSON body that is
        not an object all yield ``ok=False`` with the reason in ``message``;
        this function does not raise for those cases.
    """
    path = (
        f"/api/v1/zones/{urllib.parse.quote(zone, safe='')}/resolve-preflight"
        f"?hostname={urllib.parse.quote(host, safe='')}"
    )
    url = urllib.parse.urljoin(base_url.rstrip("/") + "/", path.lstrip("/"))
    request = urllib.request.Request(url, headers={"Accept": "application/json"})

    try:
        with urllib.request.urlopen(
            request,
            timeout=timeout_seconds,
            context=_ssl_context(insecure),
        ) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        body = exc.read().decode("utf-8", errors="replace").strip()
        # Keep the failure line readable: at most 200 characters of the body.
        detail = body[:200] if body else exc.reason
        return _preflight_failure(host, f"HTTP {exc.code}: {detail}")
    except Exception as exc:  # noqa: BLE001 - surfaced as preflight failure detail
        return _preflight_failure(host, f"{type(exc).__name__}: {exc}")

    if not isinstance(payload, dict):
        # Valid JSON but not an object: report it instead of crashing the worker.
        return _preflight_failure(
            host,
            f"Unexpected preflight response type: {type(payload).__name__}",
        )

    resolved_zone = payload.get("resolvedZone")
    server_name = payload.get("serverName")
    provider = payload.get("provider")
    # A JSON null (or any non-list) for "addresses" is treated as "no answers".
    raw_addresses = payload.get("addresses")
    addresses = (
        [value for value in raw_addresses if isinstance(value, str)]
        if isinstance(raw_addresses, list)
        else []
    )
    supports_acme = bool(payload.get("supportsAcmeDns01"))
    resolvable = bool(payload.get("resolvable"))
    # `or` rather than a .get() default so an explicit JSON null does not
    # become the literal text "None".
    challenge_fqdn = str(payload.get("challengeFqdn") or f"_acme-challenge.{host.rstrip('.')}.")
    message = str(payload.get("message") or "").strip()

    if not supports_acme and not message:
        message = "Matched DNS server does not advertise ACME DNS-01 support."

    ok = supports_acme and resolvable and bool(resolved_zone)
    return PreflightResult(
        host=host,
        ok=ok,
        resolved_zone=resolved_zone,
        server_name=server_name,
        provider=provider,
        addresses=addresses,
        challenge_fqdn=challenge_fqdn,
        message=message,
    )
|
||||
|
||||
|
||||
def run_preflight(
    hosts: list[str],
    base_url: str,
    zone: str,
    timeout_seconds: float,
    insecure: bool,
    workers: int,
) -> dict[str, PreflightResult]:
    """Run preflight_host() for every entry of ``hosts`` on a thread pool.

    The pool size is ``workers`` clamped to the range 1..len(hosts). The
    returned dict maps each hostname to its result; an empty ``hosts`` list
    yields an empty dict without creating a pool. An exception raised by a
    worker propagates to the caller through ``Future.result()``.
    """
    outcome: dict[str, PreflightResult] = {}
    if not hosts:
        return outcome

    pool_size = max(1, min(workers, len(hosts)))
    with ThreadPoolExecutor(max_workers=pool_size) as executor:
        # Remember which hostname each future belongs to.
        pending = {}
        for name in hosts:
            job = executor.submit(preflight_host, base_url, zone, name, timeout_seconds, insecure)
            pending[job] = name
        # Collect in completion order; callers sort by hostname when printing.
        for job in as_completed(pending):
            outcome[pending[job]] = job.result()
    return outcome
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point for the FlowerCore.DNS preflight gate.

    Collects hostnames from the manifests (and, unless --no-live is given,
    from the live cluster), runs resolve-preflight for each one, prints an
    OK/FAIL line per host, and returns the process exit code:

    0 - every host passed, or there was nothing to check
    1 - at least one host failed preflight
    2 - --live was requested but the cluster scan was unusable
    """
    summary = __doc__.splitlines()[1] if __doc__ else None
    cli = argparse.ArgumentParser(description=summary)
    # --live / --no-live share one destination; it stays None when neither is
    # given, which selects the "auto" behaviour below. --live is registered
    # first so that its default of None is the one that takes effect.
    cli.add_argument(
        "--live",
        dest="live",
        action="store_true",
        default=None,
        help="Require a live-cluster scan (fail if kubectl unreachable).",
    )
    cli.add_argument(
        "--no-live",
        dest="live",
        action="store_false",
        help="Skip live-cluster scan (manifests only).",
    )
    cli.add_argument(
        "--base-url",
        default=DEFAULT_BASE_URL,
        help=f"FlowerCore.DNS base URL (default: {DEFAULT_BASE_URL}).",
    )
    cli.add_argument(
        "--zone",
        default=DEFAULT_ZONE,
        help=f"Zone passed to resolve-preflight (default: {DEFAULT_ZONE}).",
    )
    cli.add_argument(
        "--timeout-seconds",
        type=float,
        default=DEFAULT_TIMEOUT_SECONDS,
        help=f"Per-host resolve-preflight timeout (default: {DEFAULT_TIMEOUT_SECONDS}).",
    )
    cli.add_argument(
        "--workers",
        type=int,
        default=DEFAULT_WORKERS,
        help=f"Parallel preflight workers (default: {DEFAULT_WORKERS}).",
    )
    cli.add_argument(
        "--insecure",
        action="store_true",
        help="Skip TLS verification when calling FlowerCore.DNS.",
    )
    opts = cli.parse_args()

    known = collect_hosts_from_manifests()

    want_live = opts.live is True
    maybe_live = opts.live is None

    if want_live or maybe_live:
        cluster_hosts, cluster_ok = collect_hosts_from_cluster()
        if want_live and not cluster_ok:
            print("ERROR: --live requested but kubectl is not available or auth failed.", file=sys.stderr)
            return 2
        if cluster_ok:
            # Merge live references in, counting hostnames the manifests did not have.
            baseline = len(known)
            for name, sources in cluster_hosts.items():
                known.setdefault(name, []).extend(sources)
            added = len(known) - baseline
            print(f"(live scan: {len(cluster_hosts)} cluster host(s); {added} not covered by manifests)")
        elif maybe_live:
            print("(kubectl not reachable — skipping live scan; run from a workstation with cluster access to catch retail-style drift)")

    if not known:
        print("No iamworkin.lan hostnames found in manifests or cluster — nothing to check.")
        return 0

    api_root = opts.base_url.rstrip('/')
    ordered = sorted(known)

    print(
        f"(preflight: {len(known)} host(s) via {api_root}"
        f"/api/v1/zones/{opts.zone}/resolve-preflight)"
    )
    results = run_preflight(
        ordered,
        base_url=opts.base_url,
        zone=opts.zone,
        timeout_seconds=opts.timeout_seconds,
        insecure=opts.insecure,
        workers=opts.workers,
    )

    failed: list[tuple[str, list[str], PreflightResult]] = []
    for name in ordered:
        outcome = results[name]
        if not outcome.ok:
            print(f"FAIL {name:<45} ({outcome.message})")
            failed.append((name, known[name], outcome))
            continue
        if outcome.addresses:
            addresses = ", ".join(outcome.addresses)
        else:
            addresses = "(no A/AAAA answers)"
        zone_label = outcome.resolved_zone or opts.zone
        server_label = outcome.server_name or "unknown-server"
        print(f"OK {name:<45} -> {addresses} via {server_label} [{zone_label}]")

    if not failed:
        print()
        print(f"All {len(known)} iamworkin.lan host(s) passed FlowerCore.DNS preflight. Safe to deploy.")
        return 0

    print()
    print(f"ERROR: {len(failed)} host(s) failed FlowerCore.DNS preflight.")
    for name, sources, outcome in failed:
        print(f" {name}")
        print(f" preflight: {outcome.message}")
        print(f" challenge: {outcome.challenge_fqdn}")
        # The same reference can be recorded more than once; list each once.
        for ref in sorted(set(sources)):
            print(f" via: {ref}")
    print()
    print("Fix the DNS record in FlowerCore.DNS before merging, then rerun this gate.")
    print()
    print("Example:")
    print(f" curl -sk {api_root}/api/v1/servers")
    print(
        " curl -sk -X POST "
        f"{api_root}/api/v1/servers/<serverId>/zones/{opts.zone}/records "
        "-H 'Content-Type: application/json' "
        "-d '{\"name\":\"<host>\",\"type\":\"A\",\"data\":\"10.0.56.200\",\"ttl\":300}'"
    )
    return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
24
tests/bluejay-infra-lint/BluejayInfraLint.Tests.csproj
Normal file
24
tests/bluejay-infra-lint/BluejayInfraLint.Tests.csproj
Normal file
@@ -0,0 +1,24 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<IsPackable>false</IsPackable>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="coverlet.collector" Version="6.0.2">
|
||||
<PrivateAssets>all</PrivateAssets>
|
||||
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
|
||||
</PackageReference>
|
||||
<PackageReference Include="FluentAssertions" Version="6.12.1" />
|
||||
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.12.0" />
|
||||
<PackageReference Include="xunit" Version="2.9.2" />
|
||||
<PackageReference Include="xunit.runner.visualstudio" Version="2.8.2">
|
||||
<PrivateAssets>all</PrivateAssets>
|
||||
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
|
||||
</PackageReference>
|
||||
<PackageReference Include="YamlDotNet" Version="16.2.0" />
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
639
tests/bluejay-infra-lint/FleetManifestLintTests.cs
Normal file
639
tests/bluejay-infra-lint/FleetManifestLintTests.cs
Normal file
@@ -0,0 +1,639 @@
|
||||
using FluentAssertions;
|
||||
using System.Text.RegularExpressions;
|
||||
using Xunit;
|
||||
using YamlDotNet.Core;
|
||||
using YamlDotNet.RepresentationModel;
|
||||
|
||||
namespace BluejayInfraLint.Tests;
|
||||
|
||||
// Lint facts over every manifest document returned by ManifestInventory.Load().
// Each fact collects human-readable violation strings and asserts that the
// list is empty, so one failing run reports every offending manifest at once.
[Trait("Category", "Unit")]
public sealed class FleetManifestLintTests
{
    // Loaded once and shared by every fact in this class.
    private static readonly ManifestInventory Inventory = ManifestInventory.Load();

    // Public hosts whose routes must pin an explicit GET/HEAD allowlist and
    // must not list any state-changing method.
    private static readonly HashSet<string> PublicReadOnlyHosts = new(StringComparer.Ordinal)
    {
        "dist.flowercore.io",
        "dns.iamworkin.lan",
    };

    // Public hosts that allow a tightly bounded write surface in addition to
    // GET/HEAD. updatecenter.iamworkin.lan accepts POST /api/v1/checkin/{id}
    // (bootstrap-JWT) so its allowlist is GET||HEAD||POST||OPTIONS — but
    // PUT/PATCH/DELETE must still 404 at the route. Anything wider than this
    // set should fail this lint.
    //
    // PUB-1 (2026-05-06): update.flowercore.io / updates.flowercore.io were
    // added for the Cloudflare-proxied public Update Center edge. They use the
    // same bounded read-write allowlist as the LAN pair.
    private static readonly HashSet<string> PublicReadWriteAllowlistHosts = new(StringComparer.Ordinal)
    {
        "updatecenter.iamworkin.lan",
        "updates.iamworkin.lan",
        "update.flowercore.io",
        "updates.flowercore.io",
    };

    // Deployments whose probes must not be httpGet on /health.
    private static readonly HashSet<string> ApiKeyProtectedDeployments = new(StringComparer.Ordinal)
    {
        "messageboard-web",
        "scoreboard-web",
        "segmentdisplay-web",
        "signalcontrol-web",
    };

    // Workloads that must set dnsPolicy: None and keep iamworkin.lan out of
    // dnsConfig.searches.
    private static readonly HashSet<string> PublicEgressDeployments = new(StringComparer.Ordinal)
    {
        "asterisk",
        "fc-llm-bridge",
        "mysql-web",
        "php-web",
        "ttsreader-align",
        "ttsreader-kokoro",
        "ttsreader-modern",
        "ttsreader-piper",
    };

    // State-changing methods that must never appear on a read-only host.
    private static readonly string[] ReadOnlyForbiddenMethods = { "POST", "PUT", "PATCH", "DELETE" };

    // Methods every bounded read-write route must list explicitly.
    private static readonly string[] ReadWriteRequiredMethods = { "GET", "HEAD", "POST", "OPTIONS" };

    // Methods that stay forbidden even on the bounded read-write hosts.
    private static readonly string[] ReadWriteForbiddenMethods = { "PUT", "PATCH", "DELETE" };

    // An IngressRoute service entry that names a namespace must name the
    // IngressRoute's own namespace; entries without a namespace are ignored.
    [Fact]
    public void IngressRoutes_MustKeepServiceReferencesInTheSameNamespace()
    {
        var violations = Inventory.Documents
            .Where(document => document.Kind == "IngressRoute")
            .SelectMany(document =>
                document.MappingSequence("spec", "routes")
                    .SelectMany(route =>
                        route.MappingSequence("services")
                            .Select(service => new
                            {
                                Document = document,
                                ServiceName = ManifestNodeExtensions.Scalar(service, "name"),
                                ServiceNamespace = ManifestNodeExtensions.Scalar(service, "namespace"),
                            })))
            .Where(entry => !string.IsNullOrWhiteSpace(entry.ServiceNamespace))
            .Where(entry => !string.Equals(entry.ServiceNamespace, entry.Document.Namespace, StringComparison.Ordinal))
            .Select(entry =>
                $"{entry.Document.Descriptor} references Service '{entry.ServiceName}' in namespace '{entry.ServiceNamespace}'.")
            .ToList();

        violations.Should().BeEmpty();
    }

    // Every route that matches a read-only host must list Method(`GET`) and
    // Method(`HEAD`) and must not list any state-changing method. Checking
    // only for the presence of GET/HEAD would let a route that also allows
    // POST/PUT/PATCH/DELETE pass. OPTIONS is deliberately not rejected here.
    [Fact]
    public void PublicReadOnlyIngressRoutes_MustExplicitlyAllowOnlyGetAndHead()
    {
        var violations = IngressRouteMatchesFor(PublicReadOnlyHosts)
            .SelectMany(entry =>
            {
                var localViolations = new List<string>();

                if (!HasMethod(entry.Match, "GET") || !HasMethod(entry.Match, "HEAD"))
                {
                    localViolations.Add($"{entry.Document.Descriptor} is missing an explicit GET/HEAD method allowlist.");
                }

                foreach (var forbidden in ReadOnlyForbiddenMethods)
                {
                    if (HasMethod(entry.Match, forbidden))
                    {
                        localViolations.Add($"{entry.Document.Descriptor} must not include Method(`{forbidden}`) on a public read-only host.");
                    }
                }

                return localViolations;
            })
            .ToList();

        violations.Should().BeEmpty();
    }

    [Fact]
    public void PublicReadWriteIngressRoutes_MustPinGetHeadPostOptionsAllowlist()
    {
        // For hosts in PublicReadWriteAllowlistHosts, the route match MUST
        // contain Method(`GET`), Method(`HEAD`), Method(`POST`), and
        // Method(`OPTIONS`) AND MUST NOT contain Method(`PUT`),
        // Method(`PATCH`), or Method(`DELETE`). This keeps the public
        // allowlist invariant against regression — see Track A's
        // updatecenter-web ingressroute hardening.
        var violations = IngressRouteMatchesFor(PublicReadWriteAllowlistHosts)
            .SelectMany(entry =>
            {
                var localViolations = new List<string>();

                foreach (var required in ReadWriteRequiredMethods)
                {
                    if (!HasMethod(entry.Match, required))
                    {
                        localViolations.Add($"{entry.Document.Descriptor} is missing required Method(`{required}`).");
                    }
                }

                foreach (var forbidden in ReadWriteForbiddenMethods)
                {
                    if (HasMethod(entry.Match, forbidden))
                    {
                        localViolations.Add($"{entry.Document.Descriptor} must not include Method(`{forbidden}`) on a public host.");
                    }
                }

                return localViolations;
            })
            .ToList();

        violations.Should().BeEmpty();
    }

    // A NetworkPolicy that mentions 10.0.56.200 anywhere and allows egress
    // port 443 must also allow 8443; one that allows 80 must also allow 8000
    // or 8080.
    [Fact]
    public void TraefikVipNetworkPolicies_MustAllowPostDnatBackendPorts()
    {
        var violations = Inventory.Documents
            .Where(document => document.Kind == "NetworkPolicy")
            .Where(document => document.AllScalars().Any(value => value.Contains("10.0.56.200", StringComparison.Ordinal)))
            .SelectMany(document =>
            {
                var ports = document.EgressPorts().ToHashSet(StringComparer.Ordinal);
                var localViolations = new List<string>();

                if (ports.Contains("443") && !ports.Contains("8443"))
                {
                    localViolations.Add($"{document.Descriptor} allows Traefik VIP 443 without backend port 8443.");
                }

                if (ports.Contains("80") && !ports.Contains("8000") && !ports.Contains("8080"))
                {
                    localViolations.Add($"{document.Descriptor} allows Traefik VIP 80 without a backend HTTP port (8000/8080).");
                }

                return localViolations;
            })
            .ToList();

        violations.Should().BeEmpty();
    }

    // Flags readiness/liveness probes that use httpGet on /health in the
    // listed deployments (containers and initContainers alike).
    [Fact]
    public void ApiKeyProtectedDeployments_MustUseTcpSocketHealthProbes()
    {
        var violations = Inventory.Documents
            .Where(document => document.Kind == "Deployment")
            .Where(document => ApiKeyProtectedDeployments.Contains(document.Name))
            .SelectMany(document => document.ContainerMappings().SelectMany(container =>
                ProbeViolations(document, container, "readinessProbe")
                    .Concat(ProbeViolations(document, container, "livenessProbe"))))
            .ToList();

        violations.Should().BeEmpty();
    }

    // StatefulSets with at least one volumeClaimTemplate must set
    // spec.podManagementPolicy, spec.revisionHistoryLimit, and
    // volumeMode: Filesystem on every claim template.
    [Fact]
    public void StatefulSets_WithVolumeClaimTemplates_MustDeclareFilesystemDefaults()
    {
        var violations = Inventory.Documents
            .Where(document => document.Kind == "StatefulSet")
            .Where(document => document.MappingSequence("spec", "volumeClaimTemplates").Count > 0)
            .SelectMany(document =>
            {
                var localViolations = new List<string>();

                if (string.IsNullOrWhiteSpace(document.Scalar("spec", "podManagementPolicy")))
                {
                    localViolations.Add($"{document.Descriptor} is missing spec.podManagementPolicy.");
                }

                if (string.IsNullOrWhiteSpace(document.Scalar("spec", "revisionHistoryLimit")))
                {
                    localViolations.Add($"{document.Descriptor} is missing spec.revisionHistoryLimit.");
                }

                foreach (var claimTemplate in document.MappingSequence("spec", "volumeClaimTemplates"))
                {
                    if (!string.Equals(
                            ManifestNodeExtensions.Scalar(claimTemplate, "spec", "volumeMode"),
                            "Filesystem",
                            StringComparison.Ordinal))
                    {
                        var claimName = ManifestNodeExtensions.Scalar(claimTemplate, "metadata", "name") ?? "<unnamed>";
                        localViolations.Add($"{document.Descriptor} volumeClaimTemplate '{claimName}' is missing volumeMode: Filesystem.");
                    }
                }

                return localViolations;
            })
            .ToList();

        violations.Should().BeEmpty();
    }

    // Two separate failure modes are reported:
    //   1. a localhost/ image whose imagePullPolicy is not "Never";
    //   2. an image that starts with "fc-" and contains no '/', i.e. an
    //      fc-* image referenced without any registry/localhost prefix.
    [Fact]
    public void LocallyImportedImages_MustUseLocalhostPrefixAndNeverPullPolicy()
    {
        var violations = Inventory.Documents
            .Where(document => document.PodSpec() is not null)
            .SelectMany(document => document.ContainerSpecs()
                .Where(container => !string.IsNullOrWhiteSpace(container.Image))
                .Select(container => new
                {
                    Document = document,
                    Container = container,
                }))
            .Where(entry =>
                (entry.Container.Image.StartsWith("localhost/", StringComparison.Ordinal)
                    && !string.Equals(entry.Container.ImagePullPolicy, "Never", StringComparison.Ordinal))
                || (entry.Container.Image.StartsWith("fc-", StringComparison.Ordinal)
                    && !entry.Container.Image.Contains('/', StringComparison.Ordinal)))
            .Select(entry =>
            {
                if (entry.Container.Image.StartsWith("localhost/", StringComparison.Ordinal))
                {
                    return $"{entry.Document.Descriptor} container '{entry.Container.Name}' uses {entry.Container.Image} without imagePullPolicy: Never.";
                }

                return $"{entry.Document.Descriptor} container '{entry.Container.Name}' uses non-local image '{entry.Container.Image}' for a node-imported FlowerCore workload.";
            })
            .ToList();

        violations.Should().BeEmpty();
    }

    // Workloads named in PublicEgressDeployments must set dnsPolicy: None and
    // provide a non-empty dnsConfig.searches list that never mentions
    // iamworkin.lan (compared case-insensitively).
    [Fact]
    public void PublicEgressDeployments_MustOptOutOfIamworkinLanSearchSuffixes()
    {
        var violations = Inventory.Documents
            .Where(document => document.PodSpec() is not null)
            .Where(document => PublicEgressDeployments.Contains(document.Name))
            .SelectMany(document =>
            {
                var localViolations = new List<string>();
                var podSpec = document.PodSpec()!;
                var dnsPolicy = ManifestNodeExtensions.Scalar(podSpec, "dnsPolicy");
                var searches = ManifestNodeExtensions.ScalarSequence(podSpec, "dnsConfig", "searches").ToList();

                if (!string.Equals(dnsPolicy, "None", StringComparison.Ordinal))
                {
                    localViolations.Add($"{document.Descriptor} is missing dnsPolicy: None.");
                }

                if (searches.Count == 0)
                {
                    localViolations.Add($"{document.Descriptor} is missing dnsConfig.searches.");
                }
                else if (searches.Any(search => search.Contains("iamworkin.lan", StringComparison.OrdinalIgnoreCase)))
                {
                    localViolations.Add($"{document.Descriptor} still includes iamworkin.lan in dnsConfig.searches.");
                }

                return localViolations;
            })
            .ToList();

        violations.Should().BeEmpty();
    }

    // Projects every IngressRoute route to (document, match expression) and
    // keeps the routes whose match contains Host(`<host>`) for any given host.
    // A route without a scalar "match" is treated as an empty match string.
    private static IEnumerable<(ManifestDocument Document, string Match)> IngressRouteMatchesFor(IEnumerable<string> hosts)
    {
        return Inventory.Documents
            .Where(document => document.Kind == "IngressRoute")
            .SelectMany(document =>
                document.MappingSequence("spec", "routes")
                    .Select(route => (
                        Document: document,
                        Match: ManifestNodeExtensions.Scalar(route, "match") ?? string.Empty)))
            .Where(entry => hosts.Any(host => entry.Match.Contains($"Host(`{host}`)", StringComparison.Ordinal)));
    }

    // True when the match expression contains the literal token Method(`<method>`).
    private static bool HasMethod(string match, string method)
    {
        return match.Contains($"Method(`{method}`)", StringComparison.Ordinal);
    }

    // Returns one violation when the container's probe (probeKey) is an
    // httpGet probe whose path is exactly "/health"; otherwise returns nothing.
    private static IEnumerable<string> ProbeViolations(
        ManifestDocument document,
        YamlMappingNode container,
        string probeKey)
    {
        if (!ManifestNodeExtensions.TryGetMapping(container, probeKey, out var probe)
            || !ManifestNodeExtensions.TryGetMapping(probe, "httpGet", out var httpGet))
        {
            return Array.Empty<string>();
        }

        var path = ManifestNodeExtensions.Scalar(httpGet, "path");
        if (!string.Equals(path, "/health", StringComparison.Ordinal))
        {
            return Array.Empty<string>();
        }

        var containerName = ManifestNodeExtensions.Scalar(container, "name") ?? "<unnamed>";
        return new[]
        {
            $"{document.Descriptor} container '{containerName}' still uses {probeKey}.httpGet on /health.",
        };
    }
}
|
||||
|
||||
// Discovers and parses every Kubernetes-style YAML document the lint facts
// inspect. Load() locates the bluejay-infra root by walking up from the test
// output directory, then reads all *.yaml files under the known manifest roots.
internal sealed class ManifestInventory
{
    private static readonly Regex LineBreakPattern = new(@"\r?\n", RegexOptions.Compiled);

    // A YAML document marker must start at column 0; an indented "---" is
    // content (for example inside a block scalar), not a separator. A trailing
    // "# comment" after the marker is accepted.
    private static readonly Regex DocumentSeparatorPattern = new(@"^---(?:\s+#.*)?\s*$", RegexOptions.Compiled);

    // Only a column-0 "apiVersion:" starts a new manifest. Nested keys such as
    // fieldRef.apiVersion, scaleTargetRef.apiVersion, or
    // ownerReferences[].apiVersion are indented and must not split a document.
    private static readonly Regex TopLevelApiVersionPattern = new(@"^apiVersion:\s*", RegexOptions.Compiled);

    private ManifestInventory(string workspaceRoot, string bluejayRoot, IReadOnlyList<ManifestDocument> documents)
    {
        WorkspaceRoot = workspaceRoot;
        BluejayRoot = bluejayRoot;
        Documents = documents;
    }

    // Parent directory of BluejayRoot; sibling FlowerCore.* repositories are
    // looked up beneath it.
    public string WorkspaceRoot { get; }

    // Directory that contains both "apps/" and "README.md".
    public string BluejayRoot { get; }

    // Every parsed document whose root mapping has a scalar "kind".
    public IReadOnlyList<ManifestDocument> Documents { get; }

    // Builds the inventory from disk. Throws DirectoryNotFoundException when
    // the repository root or its parent cannot be resolved.
    public static ManifestInventory Load()
    {
        var bluejayRoot = FindBluejayInfraRoot();
        var workspaceRoot = Directory.GetParent(bluejayRoot)?.FullName
            ?? throw new DirectoryNotFoundException($"Could not resolve workspace root from '{bluejayRoot}'.");

        var documents = ManifestRoots(workspaceRoot, bluejayRoot)
            .SelectMany(LoadDocumentsFromRoot)
            .ToList();

        return new ManifestInventory(workspaceRoot, bluejayRoot, documents);
    }

    // Walks up from AppContext.BaseDirectory until a directory containing
    // both "apps/" and "README.md" is found.
    private static string FindBluejayInfraRoot()
    {
        var current = new DirectoryInfo(AppContext.BaseDirectory);
        while (current is not null)
        {
            if (Directory.Exists(Path.Combine(current.FullName, "apps"))
                && File.Exists(Path.Combine(current.FullName, "README.md")))
            {
                return current.FullName;
            }

            current = current.Parent;
        }

        throw new DirectoryNotFoundException("Could not find the bluejay-infra repository root from the test output directory.");
    }

    // Candidate manifest directories; ones that do not exist on disk are
    // dropped, so a partial workspace checkout still loads.
    private static IEnumerable<string> ManifestRoots(string workspaceRoot, string bluejayRoot)
    {
        var roots = new[]
        {
            Path.Combine(bluejayRoot, "apps"),
            Path.Combine(workspaceRoot, "FlowerCore.Chat", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.DMS", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.DNS", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.Intranet.Web", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.Kiosk", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.Media", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.MenuBoard", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.MessageBoard", "k8s"),
            // FlowerCore.Notes/k8s/selenium/ is the live Selenium Grid
            // manifest tree (consumed by deploy-selenium scripts).
            // FlowerCore.Notes/k8s/guacamole/ + FlowerCore.Notes/k8s/monitoring/
            // are historical scaffolds that have diverged from the live state
            // (bluejay-infra/apps/guacamole + bluejay-infra/apps/monitoring are
            // canonical). Operator review is required before bringing them in
            // line OR decommissioning them — keep them out of the lint scope
            // until that decision lands. See xxl-regroup-2026-05-03-followup.md
            // "Codex 7 §0 stop conditions" + the C7 close-session output.
            Path.Combine(workspaceRoot, "FlowerCore.Notes", "k8s", "selenium"),
            Path.Combine(workspaceRoot, "FlowerCore.MySQL", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.PHP", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.Presentations", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.Print.Web", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.RemoteDesktop", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.Scoreboard", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.SegmentDisplay", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.SignalControl", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.TtsReader", "k8s"),
            Path.Combine(workspaceRoot, "FlowerCore.Updater", "k8s"),
        };

        return roots.Where(Directory.Exists);
    }

    // Reads every *.yaml file under root (recursively), splits it into
    // documents, and yields those whose root node is a mapping with a scalar
    // "kind". DocumentIndex is the segment's position within its file.
    private static IEnumerable<ManifestDocument> LoadDocumentsFromRoot(string root)
    {
        foreach (var filePath in Directory.EnumerateFiles(root, "*.yaml", SearchOption.AllDirectories))
        {
            var fileText = File.ReadAllText(filePath);
            var segments = SplitManifestDocuments(fileText);

            for (var index = 0; index < segments.Count; index++)
            {
                var yaml = new YamlStream();
                try
                {
                    using var reader = new StringReader(segments[index]);
                    yaml.Load(reader);
                }
                catch (YamlException)
                {
                    // Best effort: a segment that does not parse as YAML is
                    // skipped instead of failing the whole inventory.
                    // NOTE(review): presumably this tolerates templated or
                    // partial files — confirm that silent skipping is intended.
                    continue;
                }

                if (yaml.Documents.Count == 0)
                {
                    continue;
                }

                if (yaml.Documents[0].RootNode is YamlMappingNode mapping
                    && ManifestNodeExtensions.Scalar(mapping, "kind") is not null)
                {
                    yield return new ManifestDocument(root, filePath, index, fileText, mapping);
                }
            }
        }
    }

    // Splits a manifest file into per-document text segments. A new segment
    // starts at a column-0 "---" marker, or at a column-0 "apiVersion:" when
    // the current segment has already seen one (files that concatenate
    // manifests without "---"). Blank segments are dropped.
    private static IReadOnlyList<string> SplitManifestDocuments(string fileText)
    {
        var documents = new List<string>();
        var currentLines = new List<string>();
        var seenApiVersion = false;

        foreach (var line in LineBreakPattern.Split(fileText))
        {
            if (DocumentSeparatorPattern.IsMatch(line))
            {
                FlushCurrentDocument();
                continue;
            }

            var startsManifest = TopLevelApiVersionPattern.IsMatch(line);

            if (startsManifest
                && seenApiVersion
                && currentLines.Any(existing => !string.IsNullOrWhiteSpace(existing)))
            {
                FlushCurrentDocument();
            }

            currentLines.Add(line);
            if (startsManifest)
            {
                seenApiVersion = true;
            }
        }

        FlushCurrentDocument();
        return documents;

        // Emits the accumulated lines as one segment (if non-blank) and resets
        // the per-segment state.
        void FlushCurrentDocument()
        {
            var text = string.Join(Environment.NewLine, currentLines).Trim();
            if (!string.IsNullOrWhiteSpace(text))
            {
                documents.Add(text);
            }

            currentLines.Clear();
            seenApiVersion = false;
        }
    }
}
|
||||
|
||||
// One parsed YAML document together with where it came from: the manifest
// root it was found under, its file, and its zero-based position in that file.
internal sealed record ManifestDocument(
    string RootPath,
    string FilePath,
    int DocumentIndex,
    string FileText,
    YamlMappingNode Root)
{
    // Top-level "kind", or an empty string when absent.
    public string Kind
    {
        get { return Scalar("kind") ?? string.Empty; }
    }

    // metadata.name, falling back to a placeholder built from DocumentIndex.
    public string Name
    {
        get { return Scalar("metadata", "name") ?? $"document-{DocumentIndex}"; }
    }

    // metadata.namespace, or an empty string when absent.
    public string Namespace
    {
        get { return Scalar("metadata", "namespace") ?? string.Empty; }
    }

    // FilePath relative to RootPath, always with forward slashes.
    public string RelativePath
    {
        get
        {
            var relative = Path.GetRelativePath(RootPath, FilePath);
            return relative.Replace('\\', '/');
        }
    }

    // Human-readable identity used in violation messages; the document number
    // shown is one-based.
    public string Descriptor
    {
        get
        {
            var ordinal = DocumentIndex + 1;
            return $"{Kind} {Namespace}/{Name} [{RelativePath}#{ordinal}]";
        }
    }

    // Scalar value at the given key path below the document root, or null.
    public string? Scalar(params string[] path)
    {
        return ManifestNodeExtensions.Scalar(Root, path);
    }

    // Mapping children of the sequence at the given key path (empty if absent).
    public IReadOnlyList<YamlMappingNode> MappingSequence(params string[] path)
    {
        return ManifestNodeExtensions.MappingSequence(Root, path);
    }

    // Every non-blank scalar (keys and values) anywhere in the document.
    public IEnumerable<string> AllScalars()
    {
        return ManifestNodeExtensions.AllScalars(Root);
    }

    // All non-blank spec.egress[].ports[].port values, as text.
    public IReadOnlyList<string> EgressPorts()
    {
        var collected = new List<string>();

        foreach (var rule in MappingSequence("spec", "egress"))
        {
            foreach (var portEntry in ManifestNodeExtensions.MappingSequence(rule, "ports"))
            {
                var port = ManifestNodeExtensions.Scalar(portEntry, "port");
                if (!string.IsNullOrWhiteSpace(port))
                {
                    collected.Add(port);
                }
            }
        }

        return collected;
    }

    // Pod spec mapping for the workload kinds handled here; null for any
    // other kind or when the path is missing.
    public YamlMappingNode? PodSpec()
    {
        switch (Kind)
        {
            case "Deployment":
            case "StatefulSet":
            case "DaemonSet":
            case "Job":
                return ManifestNodeExtensions.Mapping(Root, "spec", "template", "spec");

            case "CronJob":
                return ManifestNodeExtensions.Mapping(Root, "spec", "jobTemplate", "spec", "template", "spec");

            default:
                return null;
        }
    }

    // containers followed by initContainers of the pod spec; empty when the
    // document has no pod spec.
    public IReadOnlyList<YamlMappingNode> ContainerMappings()
    {
        if (PodSpec() is not { } podSpec)
        {
            return Array.Empty<YamlMappingNode>();
        }

        var all = new List<YamlMappingNode>(ManifestNodeExtensions.MappingSequence(podSpec, "containers"));
        all.AddRange(ManifestNodeExtensions.MappingSequence(podSpec, "initContainers"));
        return all;
    }

    // ContainerMappings() reduced to name/image/imagePullPolicy, with
    // "<unnamed>" and empty strings standing in for missing values.
    public IReadOnlyList<ContainerSpec> ContainerSpecs()
    {
        var specs = new List<ContainerSpec>();

        foreach (var container in ContainerMappings())
        {
            var name = ManifestNodeExtensions.Scalar(container, "name") ?? "<unnamed>";
            var image = ManifestNodeExtensions.Scalar(container, "image") ?? string.Empty;
            var pullPolicy = ManifestNodeExtensions.Scalar(container, "imagePullPolicy") ?? string.Empty;
            specs.Add(new ContainerSpec(name, image, pullPolicy));
        }

        return specs;
    }
}
|
||||
|
||||
// Name, image, and imagePullPolicy of one container entry, as produced by
// ManifestDocument.ContainerSpecs(): a missing name becomes "<unnamed>" and a
// missing image or pull policy becomes an empty string.
internal sealed record ContainerSpec(string Name, string Image, string ImagePullPolicy);
|
||||
|
||||
// Null-tolerant lookups over YamlDotNet representation-model nodes. Every
// path lookup walks mapping keys with ordinal comparison and reports "not
// found" instead of throwing when a segment is missing or is not a mapping.
internal static class ManifestNodeExtensions
{
    // Value of the scalar at the key path, or null when the path is missing
    // or does not end in a scalar.
    public static string? Scalar(this YamlMappingNode mapping, params string[] path)
    {
        if (!TryGetNode(mapping, path, out var found))
        {
            return null;
        }

        return (found as YamlScalarNode)?.Value;
    }

    // Mapping at the key path, or null when the path is missing or ends in a
    // different node type.
    public static YamlMappingNode? Mapping(this YamlMappingNode mapping, params string[] path)
    {
        if (!TryGetNode(mapping, path, out var found))
        {
            return null;
        }

        return found as YamlMappingNode;
    }

    // Single-key variant: true (and result set) only when the direct child
    // exists and is itself a mapping.
    public static bool TryGetMapping(this YamlMappingNode mapping, string key, out YamlMappingNode result)
    {
        if (!TryGetChild(mapping, key, out var child) || child is not YamlMappingNode childMapping)
        {
            result = null!;
            return false;
        }

        result = childMapping;
        return true;
    }

    // Mapping elements of the sequence at the key path; non-mapping elements
    // are skipped. Empty when the path is missing or is not a sequence.
    public static IReadOnlyList<YamlMappingNode> MappingSequence(this YamlMappingNode mapping, params string[] path)
    {
        if (!TryGetNode(mapping, path, out var found) || found is not YamlSequenceNode sequence)
        {
            return Array.Empty<YamlMappingNode>();
        }

        var items = new List<YamlMappingNode>();
        foreach (var element in sequence.Children)
        {
            if (element is YamlMappingNode item)
            {
                items.Add(item);
            }
        }

        return items;
    }

    // Non-blank scalar elements of the sequence at the key path. Empty when
    // the path is missing or is not a sequence.
    public static IReadOnlyList<string> ScalarSequence(this YamlMappingNode mapping, params string[] path)
    {
        if (!TryGetNode(mapping, path, out var found) || found is not YamlSequenceNode sequence)
        {
            return Array.Empty<string>();
        }

        var values = new List<string>();
        foreach (var element in sequence.Children)
        {
            if (element is YamlScalarNode { Value: var text } && !string.IsNullOrWhiteSpace(text))
            {
                values.Add(text);
            }
        }

        return values;
    }

    // Depth-first walk yielding every non-blank scalar below node; for a
    // mapping entry the key's scalars come before the value's.
    public static IEnumerable<string> AllScalars(YamlNode node)
    {
        switch (node)
        {
            case YamlScalarNode scalar:
                if (!string.IsNullOrWhiteSpace(scalar.Value))
                {
                    yield return scalar.Value;
                }

                break;

            case YamlSequenceNode sequence:
                foreach (var element in sequence.Children)
                {
                    foreach (var text in AllScalars(element))
                    {
                        yield return text;
                    }
                }

                break;

            case YamlMappingNode nested:
                foreach (var entry in nested.Children)
                {
                    foreach (var text in AllScalars(entry.Key))
                    {
                        yield return text;
                    }

                    foreach (var text in AllScalars(entry.Value))
                    {
                        yield return text;
                    }
                }

                break;
        }
    }

    // Follows path one key at a time. Fails as soon as the current node is
    // not a mapping or lacks the next key; an empty path yields mapping itself.
    private static bool TryGetNode(YamlMappingNode mapping, IReadOnlyList<string> path, out YamlNode node)
    {
        YamlNode cursor = mapping;

        for (var depth = 0; depth < path.Count; depth++)
        {
            if (cursor is YamlMappingNode parent && TryGetChild(parent, path[depth], out var next))
            {
                cursor = next;
                continue;
            }

            node = null!;
            return false;
        }

        node = cursor;
        return true;
    }

    // First child whose key is a scalar equal (ordinal) to key.
    private static bool TryGetChild(YamlMappingNode mapping, string key, out YamlNode value)
    {
        foreach (var pair in mapping.Children)
        {
            if (pair.Key is not YamlScalarNode candidate
                || !string.Equals(candidate.Value, key, StringComparison.Ordinal))
            {
                continue;
            }

            value = pair.Value;
            return true;
        }

        value = null!;
        return false;
    }
}
|
||||
@@ -0,0 +1,12 @@
|
||||
package bluejayinfra.cross_namespace_ingressroute

# Denies an IngressRoute whose service entry names a namespace other than the
# IngressRoute's own. Service entries without a namespace are ignored.
#
# Names are read with object.get: a reference to a missing field inside
# sprintf would make the whole rule undefined and silently drop the violation.
deny[msg] {
	input.kind == "IngressRoute"
	meta := object.get(input, "metadata", {})
	ns := object.get(meta, "namespace", "")
	route := input.spec.routes[_]
	service := route.services[_]
	svc_ns := object.get(service, "namespace", "")
	svc_ns != ""
	svc_ns != ns
	msg := sprintf("IngressRoute %s/%s references Service %s in namespace %s", [ns, object.get(meta, "name", "<unnamed>"), object.get(service, "name", "<unnamed>"), svc_ns])
}
|
||||
@@ -0,0 +1,23 @@
|
||||
package bluejayinfra.public_method_allowlist

# Hosts published read-only: every route that matches one of them must list
# Method(`GET`) and Method(`HEAD`) and must not list a state-changing method.
public_hosts := {"dist.flowercore.io", "dns.iamworkin.lan"}

required_methods := {"GET", "HEAD"}

# OPTIONS is deliberately not rejected here.
forbidden_methods := {"POST", "PUT", "PATCH", "DELETE"}

# Read with defaults so that a manifest without metadata.namespace / name
# still produces a violation instead of leaving the rule undefined.
resource_namespace := object.get(object.get(input, "metadata", {}), "namespace", "")

resource_name := object.get(object.get(input, "metadata", {}), "name", "<unnamed>")

# A required method is missing from the route match.
deny[msg] {
	input.kind == "IngressRoute"
	route := input.spec.routes[_]
	match := object.get(route, "match", "")
	host := public_hosts[_]
	contains(match, sprintf("Host(`%s`)", [host]))
	required := required_methods[_]
	not contains(match, sprintf("Method(`%s`)", [required]))
	msg := sprintf("IngressRoute %s/%s is missing Method(%s) for public read-only host %s", [resource_namespace, resource_name, required, host])
}

# A state-changing method is present on a read-only host.
deny[msg] {
	input.kind == "IngressRoute"
	route := input.spec.routes[_]
	match := object.get(route, "match", "")
	host := public_hosts[_]
	contains(match, sprintf("Host(`%s`)", [host]))
	forbidden := forbidden_methods[_]
	contains(match, sprintf("Method(`%s`)", [forbidden]))
	msg := sprintf("IngressRoute %s/%s must not include Method(%s) on public read-only host %s", [resource_namespace, resource_name, forbidden, host])
}
|
||||
@@ -0,0 +1,30 @@
|
||||
package bluejayinfra.traefik_vip_backend_ports

# A NetworkPolicy that allows egress to 10.0.56.200/32 on 443 must also allow
# 8443, and one that allows 80 must also allow 8080 or 8000.

# Read with a default so that a manifest without metadata.namespace still
# produces a violation instead of leaving the rule undefined.
resource_namespace := object.get(object.get(input, "metadata", {}), "namespace", "")

resource_name := object.get(object.get(input, "metadata", {}), "name", "<unnamed>")

# True when any egress rule targets the 10.0.56.200/32 ipBlock.
has_vip {
	some i
	some j
	input.spec.egress[i].to[j].ipBlock.cidr == "10.0.56.200/32"
}

# True when any egress rule lists the port. Both sides are compared as text so
# a quoted port ("443") matches the same as a numeric one (443).
has_port(port) {
	some i
	some j
	declared := input.spec.egress[i].ports[j].port
	sprintf("%v", [declared]) == sprintf("%v", [port])
}

deny[msg] {
	input.kind == "NetworkPolicy"
	has_vip
	has_port(443)
	not has_port(8443)
	msg := sprintf("NetworkPolicy %s/%s allows 10.0.56.200:443 without backend port 8443", [resource_namespace, resource_name])
}

deny[msg] {
	input.kind == "NetworkPolicy"
	has_vip
	has_port(80)
	not has_port(8080)
	not has_port(8000)
	msg := sprintf("NetworkPolicy %s/%s allows 10.0.56.200:80 without backend HTTP port 8080 or 8000", [resource_namespace, resource_name])
}
|
||||
@@ -0,0 +1,28 @@
|
||||
package bluejayinfra.auth_probe_path

# Deployments listed here must not use an httpGet probe on /health for
# readiness or liveness.
protected_deployments := {
	"messageboard-web",
	"scoreboard-web",
	"segmentdisplay-web",
	"signalcontrol-web",
}

# Probe fields inspected on every container.
probe_keys := {"readinessProbe", "livenessProbe"}

# Read with a default so that a manifest without metadata.namespace still
# produces a violation instead of leaving the rule undefined.
resource_namespace := object.get(object.get(input, "metadata", {}), "namespace", "")

# One rule covers both probe kinds; the probe key is interpolated into the
# message so the text matches what the two separate rules used to emit.
deny[msg] {
	input.kind == "Deployment"
	protected_deployments[input.metadata.name]
	container := input.spec.template.spec.containers[_]
	probe_key := probe_keys[_]
	probe := object.get(container, probe_key, {})
	http_get := object.get(probe, "httpGet", {})
	object.get(http_get, "path", "") == "/health"
	msg := sprintf("Deployment %s/%s must not use %s.httpGet /health behind API key middleware", [resource_namespace, input.metadata.name, probe_key])
}
|
||||
@@ -0,0 +1,23 @@
|
||||
package bluejayinfra.statefulset_volumeclaim_defaults

# StatefulSets with at least one volumeClaimTemplate must declare
# spec.podManagementPolicy, spec.revisionHistoryLimit, and
# volumeMode: Filesystem on every claim template.

# Read with defaults so that a manifest without metadata.namespace / name
# still produces a violation instead of leaving the rule undefined.
resource_namespace := object.get(object.get(input, "metadata", {}), "namespace", "")

resource_name := object.get(object.get(input, "metadata", {}), "name", "<unnamed>")

deny[msg] {
	input.kind == "StatefulSet"
	count(object.get(input.spec, "volumeClaimTemplates", [])) > 0
	object.get(input.spec, "podManagementPolicy", "") == ""
	msg := sprintf("StatefulSet %s/%s is missing spec.podManagementPolicy", [resource_namespace, resource_name])
}

# null is the "absent" sentinel: defaulting to 0 would report an explicit
# `revisionHistoryLimit: 0` as missing.
deny[msg] {
	input.kind == "StatefulSet"
	count(object.get(input.spec, "volumeClaimTemplates", [])) > 0
	object.get(input.spec, "revisionHistoryLimit", null) == null
	msg := sprintf("StatefulSet %s/%s is missing spec.revisionHistoryLimit", [resource_namespace, resource_name])
}

# claim.spec and claim.metadata are read with defaults so a template that
# omits either is still reported.
deny[msg] {
	input.kind == "StatefulSet"
	claim := input.spec.volumeClaimTemplates[_]
	claim_spec := object.get(claim, "spec", {})
	object.get(claim_spec, "volumeMode", "") != "Filesystem"
	claim_name := object.get(object.get(claim, "metadata", {}), "name", "<unnamed>")
	msg := sprintf("StatefulSet %s/%s volumeClaimTemplate %s is missing volumeMode: Filesystem", [resource_namespace, resource_name, claim_name])
}
|
||||
@@ -0,0 +1,40 @@
|
||||
package bluejayinfra.localhost_image_pull_policy

# Two checks over every container and initContainer of a workload:
#   1. a localhost/ image must set imagePullPolicy: Never;
#   2. an image that starts with "fc-" and contains no "/" is rejected.

# Kinds whose pod spec lives at spec.template.spec.
template_kinds := {"Deployment", "StatefulSet", "DaemonSet", "Job"}

# Read with defaults so that a manifest without metadata.namespace / name
# still produces a violation instead of leaving the rule undefined.
resource_namespace := object.get(object.get(input, "metadata", {}), "namespace", "")

resource_name := object.get(object.get(input, "metadata", {}), "name", "<unnamed>")

pod_spec(spec) = pod {
	template_kinds[input.kind]
	pod := spec.template.spec
}

pod_spec(spec) = pod {
	input.kind == "CronJob"
	pod := spec.jobTemplate.spec.template.spec
}

# Every container of the workload, tagged with the role used in messages.
workload_containers[entry] {
	pod := pod_spec(input.spec)
	container := object.get(pod, "containers", [])[_]
	entry := {"role": "container", "container": container}
}

workload_containers[entry] {
	pod := pod_spec(input.spec)
	container := object.get(pod, "initContainers", [])[_]
	entry := {"role": "initContainer", "container": container}
}

deny[msg] {
	entry := workload_containers[_]
	startswith(object.get(entry.container, "image", ""), "localhost/")
	object.get(entry.container, "imagePullPolicy", "") != "Never"
	msg := sprintf("%s/%s %s %s uses a localhost image without imagePullPolicy: Never", [resource_namespace, resource_name, entry.role, object.get(entry.container, "name", "<unnamed>")])
}

deny[msg] {
	entry := workload_containers[_]
	image := object.get(entry.container, "image", "")
	startswith(image, "fc-")
	not contains(image, "/")
	msg := sprintf("%s/%s %s %s uses a non-localhost FlowerCore image reference %s", [resource_namespace, resource_name, entry.role, object.get(entry.container, "name", "<unnamed>"), image])
}
|
||||
@@ -0,0 +1,27 @@
|
||||
package bluejayinfra.public_egress_dns_none

# Deployments listed here must set dnsPolicy: None and provide a non-empty
# dnsConfig.searches list that never mentions iamworkin.lan.
public_egress_workloads := {
	"asterisk",
	"fc-llm-bridge",
	"mysql-web",
	"php-web",
	"ttsreader-align",
	"ttsreader-kokoro",
	"ttsreader-modern",
	"ttsreader-piper",
}

# Read with a default so that a manifest without metadata.namespace still
# produces a violation instead of leaving the rule undefined.
resource_namespace := object.get(object.get(input, "metadata", {}), "namespace", "")

# Pod spec and search list, defaulting to empty when any level is absent.
pod_spec := object.get(object.get(object.get(input, "spec", {}), "template", {}), "spec", {})

dns_searches := object.get(object.get(pod_spec, "dnsConfig", {}), "searches", [])

deny[msg] {
	input.kind == "Deployment"
	public_egress_workloads[input.metadata.name]
	object.get(pod_spec, "dnsPolicy", "") != "None"
	msg := sprintf("Deployment %s/%s must set dnsPolicy: None for public-internet egress", [resource_namespace, input.metadata.name])
}

# An absent or empty search list is reported, matching the C# lint's
# "is missing dnsConfig.searches" check.
deny[msg] {
	input.kind == "Deployment"
	public_egress_workloads[input.metadata.name]
	count(dns_searches) == 0
	msg := sprintf("Deployment %s/%s is missing dnsConfig.searches", [resource_namespace, input.metadata.name])
}

deny[msg] {
	input.kind == "Deployment"
	public_egress_workloads[input.metadata.name]
	search := dns_searches[_]
	contains(lower(search), "iamworkin.lan")
	msg := sprintf("Deployment %s/%s must not include iamworkin.lan in dnsConfig.searches", [resource_namespace, input.metadata.name])
}
|
||||
@@ -0,0 +1,40 @@
|
||||
package bluejayinfra.public_readwrite_allowlist

# Public hosts that allow a tightly bounded write surface in addition to
# GET/HEAD. updatecenter.iamworkin.lan accepts POST /api/v1/checkin/{id}
# (bootstrap-JWT) so its allowlist is GET||HEAD||POST||OPTIONS — but
# PUT/PATCH/DELETE must still 404 at the route. Any host in this set MUST
# include all four required methods AND MUST NOT include any forbidden
# method.
public_readwrite_hosts := {
	"updatecenter.iamworkin.lan",
	"updates.iamworkin.lan",
	"update.flowercore.io",
	"updates.flowercore.io",
}

required_methods := {"GET", "HEAD", "POST", "OPTIONS"}

forbidden_methods := {"PUT", "PATCH", "DELETE"}

# Read with defaults so that a manifest without metadata.namespace / name
# still produces a violation instead of leaving the rule undefined.
resource_namespace := object.get(object.get(input, "metadata", {}), "namespace", "")

resource_name := object.get(object.get(input, "metadata", {}), "name", "<unnamed>")

# A required method is missing from the route match.
deny[msg] {
	input.kind == "IngressRoute"
	route := input.spec.routes[_]
	match := object.get(route, "match", "")
	host := public_readwrite_hosts[_]
	contains(match, sprintf("Host(`%s`)", [host]))
	required := required_methods[_]
	not contains(match, sprintf("Method(`%s`)", [required]))
	msg := sprintf("IngressRoute %s/%s is missing required Method(%s) for public read-write host %s", [resource_namespace, resource_name, required, host])
}

# A forbidden method is present in the route match.
deny[msg] {
	input.kind == "IngressRoute"
	route := input.spec.routes[_]
	match := object.get(route, "match", "")
	host := public_readwrite_hosts[_]
	contains(match, sprintf("Host(`%s`)", [host]))
	forbidden := forbidden_methods[_]
	contains(match, sprintf("Method(`%s`)", [forbidden]))
	msg := sprintf("IngressRoute %s/%s must not include Method(%s) on public read-write host %s", [resource_namespace, resource_name, forbidden, host])
}
|
||||
Reference in New Issue
Block a user