From 3bf6511d5d5424745f3603e3e0775156f3a51e95 Mon Sep 17 00:00:00 2001 From: Andrew Stoltz Date: Mon, 27 Apr 2026 16:28:26 -0500 Subject: [PATCH] feat(knowledge): stage Phase 2.4 K8s deployment manifests (Sprint E B2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NOT YET APPLIED — push to origin/main is gated on the DNS A record knowledge.iamworkin.lan -> 10.0.56.200 being live. Per memory feedback_pfsense_dns_required_for_acme, applying the Certificate without DNS in place puts cert-manager into ~2h HTTP-01 backoff and needs `kubectl -n knowledge delete order ` recovery. Manifests authored: - apps/knowledge/knowledge.yaml — Namespace, PVC (knowledge-vector-store Longhorn 20Gi RWO), Deployment (single replica, Recreate, image localhost/fc-knowledge-web:v202604272200 placeholder, runAsNonRoot 1654, readOnlyRootFilesystem, drop ALL caps, /healthz startupProbe + readinessProbe, tcpSocket livenessProbe), Service (ClusterIP port 80 -> 8080), Certificate (step-ca-acme ClusterIssuer, 90d duration), IngressRoute (knowledge.iamworkin.lan, websecure entrypoint). - apps/knowledge/kustomization.yaml — `kubectl kustomize` preview file (matches fc-distribution shape; ApplicationSet uses dir generator). - apps/knowledge/README.md — deployment order checklist with the DNS preflight, image build/import loop for all 3 RKE2 nodes, push procedure, smoke verification, initial-deploy-state notes (zero editions until *.db files are pushed to the PVC), resource sizing, probe + middleware notes. Companion artifacts (separate repos, separate commits): - FlowerCore.Knowledge@eb91eb4 — Dockerfile.deploy at repo root - FlowerCore.Notes@96cd443 — scripts/deploy-knowledge.sh Apply order (from apps/knowledge/README.md): 1. Add DNS A record knowledge.iamworkin.lan -> 10.0.56.200 via FlowerCore.DNS or pfSense web UI. 2. 
Run `bash scripts/deploy-knowledge.sh` from FlowerCore.Notes — this builds + imports the image to all 3 RKE2 nodes with FLOWERCORE_DEPLOY_SKIP_ROLLOUT=1 (since the Deployment doesn't exist yet on the cluster). 3. Bump the image tag in this manifest to match the freshly-imported tag, then `git push` from this repo to land on main. ArgoCD picks up within ~3 minutes and creates `infra-knowledge`. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/knowledge/README.md | 153 ++++++++++++++++++++ apps/knowledge/knowledge.yaml | 227 ++++++++++++++++++++++++++++++ apps/knowledge/kustomization.yaml | 7 + 3 files changed, 387 insertions(+) create mode 100644 apps/knowledge/README.md create mode 100644 apps/knowledge/knowledge.yaml create mode 100644 apps/knowledge/kustomization.yaml diff --git a/apps/knowledge/README.md b/apps/knowledge/README.md new file mode 100644 index 0000000..89e9c70 --- /dev/null +++ b/apps/knowledge/README.md @@ -0,0 +1,153 @@ +# knowledge — FlowerCore.Knowledge.Web (Phase 2.4 K8s deploy) + +**Status:** manifests staged, **NOT YET APPLIED**. Image must be built + +imported AND DNS record provisioned before `git push`. + +- Plan: [`../../../FlowerCore.Notes/docs/ai-agents/flowercore-knowledge-service-plan.md`](../../../FlowerCore.Notes/docs/ai-agents/flowercore-knowledge-service-plan.md) +- Sprint: [`../../../FlowerCore.Notes/docs/ai-station/sprint-e-xxl-plan.md`](../../../FlowerCore.Notes/docs/ai-station/sprint-e-xxl-plan.md) (Track B) +- Repo: `D:\git\FlowerCore\FlowerCore.Knowledge\` (private GitHub repo, + bootstrapped Sprint D batch 35) + +`FlowerCore.Knowledge.Web` is the fleet-wide vector-indexing & RAG hub — +a REST + MCP service that scans `*.db` files under +`/data/vector-stores` and exposes per-edition reachability + corpus +search to the rest of the FC ecosystem (Agent Zero, Chat.Web persona +memory, AiStation embeddings explorer, TtsReader chapter context, BMO +bot, Pi nodes via `fc-index sync`). 
+
+## Deployment order (do NOT skip / reorder)
+
+### 1. FlowerCore.DNS public A record — knowledge.iamworkin.lan -> 10.0.56.200
+
+Required BEFORE the Certificate resource is created, or cert-manager
+HTTP-01 silently backs off ~2h. Memory: `feedback_pfsense_dns_required_for_acme`.
+
+The canonical path is FlowerCore.DNS:
+
+```bash
+curl -sk https://dns.iamworkin.lan/api/v1/servers
+# Find the pfSense serverId, then create the record using the host label only.
+
+curl -sk -X POST https://dns.iamworkin.lan/api/v1/servers/<serverId>/zones/iamworkin.lan/records \
+  -H "Content-Type: application/json" \
+  -d '{"name":"knowledge","type":"A","data":"10.0.56.200","ttl":300}'
+```
+
+If FlowerCore.DNS provider writes are failing 502 with "pfSense
+diag_command.php response did not contain a `<pre>` block" (status as of
+Sprint E Track B authoring 2026-04-27), add the override manually via
+the pfSense web UI:
+
+1. Log in to `https://10.0.56.1` as admin
+2. Services → DNS Resolver → General Settings → Host Overrides
+3. Add: Host=`knowledge`, Domain=`iamworkin.lan`, IP Address=`10.0.56.200`
+4. Save + Apply Changes
+
+Verify resolution from anywhere on LAN:
+
+```bash
+nslookup knowledge.iamworkin.lan 10.0.56.1
+# Expect: 10.0.56.200
+```
+
+Or against FlowerCore.DNS once the provider is fixed:
+
+```bash
+curl -sk "https://dns.iamworkin.lan/api/v1/zones/iamworkin.lan/resolve-preflight?hostname=knowledge.iamworkin.lan"
+# Expect: "resolvable": true
+```
+
+### 2. Build + import the image to ALL RKE2 nodes
+
+Pods may schedule on any RKE2 node (server, agent1, agent2). The
+Longhorn PVC accepts mounts from any node, so the image must be
+imported to all three. Memory:
+`feedback_rke2_image_import_targets_all_nodes` +
+`feedback_rke2_localhost_imagepullpolicy`.
+
+```bash
+# From BLUEJAY-WS, in D:\git\FlowerCore\FlowerCore.Knowledge
+TAG="v$(date +%Y%m%d%H%M)"
+dotnet.exe publish -c Release -o deploy/app \
+  src/FlowerCore.Knowledge.Web/FlowerCore.Knowledge.Web.csproj
+podman build -t localhost/fc-knowledge-web:$TAG -f deploy/Dockerfile.deploy deploy
+podman save localhost/fc-knowledge-web:$TAG -o /tmp/fc-knowledge-web.tar
+
+# Import to all three RKE2 nodes
+for node in rke2-server rke2-agent1 rke2-agent2; do
+  scp /tmp/fc-knowledge-web.tar $node:/tmp/
+  ssh $node "sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-knowledge-web.tar"
+done
+```
+
+`scripts/deploy-knowledge.sh` in the FlowerCore.Notes repo automates this loop.
+
+### 3. Bump the image tag + push
+
+Edit `knowledge.yaml`, replace `localhost/fc-knowledge-web:v202604272200`
+with the tag from step 2, then:
+
+```bash
+cd D:/git/FlowerCore/bluejay-infra
+python scripts/check-pfsense-dns.py     # confirms the DNS preflight
+git add apps/knowledge/
+git commit -m "feat(knowledge): deploy Phase 2.4 K8s manifest"
+git push
+```
+
+ArgoCD picks up within ~3 minutes and creates `infra-knowledge`.
+
+### 4. Verify
+
+```bash
+fcadmin_ssh noc1 '
+  kubectl -n argocd get application infra-knowledge
+  kubectl -n knowledge get certificate,pod,pvc
+  curl -sk -m 8 -o /dev/null -w "HTTP %{http_code}\n" https://knowledge.iamworkin.lan/healthz
+  curl -sk -m 8 https://knowledge.iamworkin.lan/api/v1/editions | jq
+'
+```
+
+Expect: Certificate `Ready: True` within ~60s, `/healthz` HTTP 200,
+`/api/v1/editions` returns an empty array (no DBs in the PVC yet) on
+first deploy.
+
+## Initial-deploy state and Phase 2.5 follow-up
+
+The Longhorn PVC is empty on first deploy. Knowledge.Web's filesystem
+catalog will report zero editions until vector-store `*.db` files are
+pushed into `/data/vector-stores`. Initial population is a follow-up
+step (Phase 2.5+, Blazor admin UI's "Rebuild" button); for the first
+deploy the goal is just to prove the pod boots, `/healthz` returns 200,
+and the Traefik IngressRoute serves the Scalar UI.
+
+To copy an existing local DB into the PVC (one-time, manual until
+Phase 2.5 admin UI lands):
+
+```bash
+fcadmin_ssh noc1 '
+  POD=$(kubectl -n knowledge get pod -l app=knowledge-web -o jsonpath="{.items[0].metadata.name}")
+  kubectl -n knowledge cp /var/lib/flowercore/vector-stores/bluejay-ai.db $POD:/data/vector-stores/bluejay-ai.db
+'
+```
+
+## Probes + middleware notes
+
+- `/healthz` is mapped by `Controllers/HealthController.cs` (controller-based
+  attribute route). Cheap — no DB, no dependencies.
+- Liveness uses `tcpSocket` as a defensive fallback in case future
+  middleware accidentally gates `/healthz` behind auth (memory:
+  `feedback_k8s_probes_behind_auth_middleware`).
+- `/openapi/v1.json` and `/scalar/v1` are wired by `UseFlowerCoreApi`.
+  Per memory `feedback_k8s_probes_must_not_hit_openapi`, probes must NOT
+  point at OpenAPI documents — the `MapOpenApi` call can be slow during
+  cold startup.
+
+## Resource sizing
+
+- 256Mi memory request / 1Gi limit.
+- 100m CPU request / 1000m limit.
+- 20Gi Longhorn PVC initial — sufficient for the bluejay-ai 1.94Gi DB +
+  fleet-pi-edge 352Mi + fleet-bmo-bot 141Mi + headroom. Resize via
+  `kubectl -n knowledge edit pvc knowledge-vector-store` if growing
+  past 15Gi.
diff --git a/apps/knowledge/knowledge.yaml b/apps/knowledge/knowledge.yaml
new file mode 100644
index 0000000..1467ad8
--- /dev/null
+++ b/apps/knowledge/knowledge.yaml
@@ -0,0 +1,227 @@
+# FlowerCore.Knowledge.Web — fleet vector indexing & RAG hub.
+#
+# Phase 2.4 of the Knowledge service plan. REST + MCP service that scans
+# *.db files under /data/vector-stores and exposes:
+#   - REST: /api/v1/editions, /api/v1/corpus/search, /healthz
+#   - MCP:  list_editions, describe_edition, corpus_search
+#   - Static OpenAPI/Scalar via UseFlowerCoreApi
+#
+# Architecture:
+#   Plan:    FlowerCore.Notes/docs/ai-agents/flowercore-knowledge-service-plan.md
+#   Sprint:  FlowerCore.Notes/docs/ai-station/sprint-e-xxl-plan.md (Track B)
+#   Repo:    D:\git\FlowerCore\FlowerCore.Knowledge\
+#   Shared:  FlowerCore.Common -> FlowerCore.Shared.Indexing (chunkers, vector
+#            stores, edition profiles, ICorpusSearchService facade)
+#
+# Deployment order (see apps/knowledge/README.md and the bluejay-infra/README.md
+# top-level checklist):
+#   1. FlowerCore.DNS public A record knowledge.iamworkin.lan -> 10.0.56.200
+#      MUST exist BEFORE the Certificate is created, or cert-manager HTTP-01
+#      backs off ~2h. Memory: feedback_pfsense_dns_required_for_acme.
+#   2. Build + import the image to ALL RKE2 nodes (server + both agents) since
+#      the Pod uses a Longhorn PVC and may schedule anywhere.
+#      Memory: feedback_rke2_localhost_imagepullpolicy.
+#   3. Bump the image tag in this file, git push.
+#   4. ArgoCD ApplicationSet picks up within ~3 minutes and creates
+#      infra-knowledge.
+#
+# Initial-deploy state:
+#   The Longhorn PVC is empty on first deploy. Knowledge.Web's filesystem
+#   catalog will report zero editions until vector-store *.db files are
+#   pushed into /data/vector-stores. Initial population is a follow-up step
+#   (Phase 2.5+, Blazor admin UI's "Rebuild" button); for the first deploy
+#   the goal is just to prove the pod boots, /healthz returns 200, and the
+#   Traefik IngressRoute serves the Scalar UI.
+---
+apiVersion: v1
+kind: Namespace  # every other resource in this manifest sets namespace: knowledge
+metadata:
+  name: knowledge
+  labels:
+    app.kubernetes.io/part-of: bluejay-infra  # same part-of label as the Deployment and Service
+---
+apiVersion: v1
+kind: PersistentVolumeClaim  # backs /data/vector-stores in the knowledge-web Deployment
+metadata:
+  name: knowledge-vector-store
+  namespace: knowledge
+spec:
+  accessModes:
+    - ReadWriteOnce  # single-node attach; this is why the Deployment uses strategy Recreate
+  storageClassName: longhorn
+  resources:
+    requests:
+      storage: 20Gi  # sizing rationale: apps/knowledge/README.md, "Resource sizing"
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: knowledge-web
+  namespace: knowledge
+  labels:
+    app: knowledge-web
+    app.kubernetes.io/name: knowledge-web
+    app.kubernetes.io/part-of: bluejay-infra
+spec:
+  replicas: 1  # single replica: the vector-store PVC is ReadWriteOnce
+  revisionHistoryLimit: 3
+  # RWO Longhorn PVC blocks rolling updates (multi-attach error). Recreate
+  # is the canonical pattern (memory: feedback_rwo_pvc_blocks_rolling).
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: knowledge-web  # same label is set on the pod template and used by the Service selector
+  template:
+    metadata:
+      labels:
+        app: knowledge-web
+        app.kubernetes.io/name: knowledge-web
+        app.kubernetes.io/part-of: bluejay-infra
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "8080"
+        prometheus.io/path: "/metrics"
+    spec:
+      securityContext:
+        runAsNonRoot: true
+        fsGroup: 1654  # same GID as the container's runAsGroup
+        fsGroupChangePolicy: OnRootMismatch  # re-own the volume only when its root ownership differs
+      containers:
+        - name: web
+          # Placeholder tag — bump to the image you built + imported to ALL
+          # RKE2 nodes via scripts/deploy-knowledge.sh before applying.
+          image: localhost/fc-knowledge-web:v202604272200
+          imagePullPolicy: Never  # never pulled: the image must already be imported on the node
+          ports:
+            - containerPort: 8080
+              name: http
+          env:
+            - name: ASPNETCORE_URLS
+              value: "http://+:8080"  # plain HTTP in the pod; TLS is configured on the IngressRoute
+            - name: ASPNETCORE_ENVIRONMENT
+              value: "Production"
+            - name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
+              value: "false"
+            # Vector-store directory + embedding model + edition profile dir.
+            # Profile JSON is baked into the image at /app/editions via the
+            # csproj Content-link from FlowerCore.Common/editions/.
+            - name: Knowledge__VectorStoresDirectory
+              value: "/data/vector-stores"  # same path as the vector-store volumeMount
+            - name: Knowledge__EmbeddingModel
+              value: "nomic-embed-text"
+            - name: Knowledge__DefaultLimit
+              value: "5"
+            - name: Knowledge__MaxLimit
+              value: "50"
+            - name: FlowerCore__Editions__ProfileDirectory
+              value: "/app/editions"
+            # Embed via BLUEJAY-WS GPU (R9700, 32GB VRAM). Pi5 Ollama is
+            # ~4-5x slower; use the workstation while we have it.
+            # Memory: feedback_pi5_nomic_embed_slow.
+            - name: FlowerCore__Ollama__BaseUrl
+              value: "http://10.0.56.20:11434"
+          resources:
+            requests:
+              cpu: 100m
+              memory: 256Mi
+            limits:
+              cpu: 1000m
+              memory: 1Gi
+          # /healthz is mapped by HealthController (controller-based route).
+          # tcpSocket liveness is the defensive fallback in case middleware
+          # later gates /healthz behind auth (memory:
+          # feedback_k8s_probes_behind_auth_middleware).
+          startupProbe:
+            httpGet:
+              path: /healthz
+              port: 8080
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            failureThreshold: 30  # 30 checks x 5s: roughly 150s allowed for startup
+          readinessProbe:
+            httpGet:
+              path: /healthz
+              port: 8080
+            periodSeconds: 10
+            failureThreshold: 3
+          livenessProbe:
+            tcpSocket:
+              port: 8080
+            initialDelaySeconds: 30
+            periodSeconds: 30
+            failureThreshold: 3
+          securityContext:
+            runAsNonRoot: true
+            runAsUser: 1654
+            runAsGroup: 1654
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true  # the only volumes mounted are listed under volumeMounts
+            capabilities:
+              drop:
+                - ALL
+          volumeMounts:
+            - name: vector-store
+              mountPath: /data/vector-stores
+            - name: tmp
+              mountPath: /tmp
+            - name: logs
+              mountPath: /app/logs
+      volumes:
+        - name: vector-store
+          persistentVolumeClaim:
+            claimName: knowledge-vector-store
+        - name: tmp
+          emptyDir: {}  # pod-lifetime scratch space mounted at /tmp
+        - name: logs
+          emptyDir: {}  # pod-lifetime volume mounted at /app/logs
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: knowledge-web  # referenced by name from the IngressRoute
+  namespace: knowledge
+  labels:
+    app: knowledge-web
+    app.kubernetes.io/name: knowledge-web
+    app.kubernetes.io/part-of: bluejay-infra
+spec:
+  type: ClusterIP
+  selector:
+    app: knowledge-web  # matches the Deployment's pod template label
+  ports:
+    - name: http
+      port: 80  # port the IngressRoute targets
+      targetPort: 8080  # containerPort of the "web" container
+---
+apiVersion: cert-manager.io/v1
+kind: Certificate  # DNS for the name below must resolve before this is created (see README step 1)
+metadata:
+  name: knowledge-tls
+  namespace: knowledge
+spec:
+  secretName: knowledge-tls  # Secret consumed by the IngressRoute's tls.secretName
+  issuerRef:
+    name: step-ca-acme
+    kind: ClusterIssuer
+  dnsNames:
+    - knowledge.iamworkin.lan
+  duration: 2160h    # 90d; NOTE(review): ACME issuers ignore this unless enableDurationFeature is set — confirm
+  renewBefore: 720h  # 30d
+---
+apiVersion: traefik.io/v1alpha1
+kind: IngressRoute
+metadata:
+  name: knowledge
+  namespace: knowledge
+spec:
+  entryPoints:
+    - websecure  # the only entry point this route is bound to
+  routes:
+    - match: Host(`knowledge.iamworkin.lan`)  # same hostname as the Certificate's dnsNames entry
+      kind: Rule
+      services:
+        - name: knowledge-web
+          port: 80  # Service port, which forwards to targetPort 8080
+  tls:
+    secretName: knowledge-tls  # Secret written by the knowledge-tls Certificate
diff --git a/apps/knowledge/kustomization.yaml b/apps/knowledge/kustomization.yaml
new file mode 100644
index 0000000..5d3505c
--- /dev/null
+++ b/apps/knowledge/kustomization.yaml
@@ -0,0 +1,7 @@
+# `kubectl kustomize` preview file (fc-distribution shape); the bluejay-infra
+# ApplicationSet's directory generator does not require it. NOTE(review): Argo CD
+# renders a directory containing kustomization.yaml with Kustomize — confirm intended.
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - knowledge.yaml  # only files listed here are included in a Kustomize build