Fix edge1 Ollama IP (.15->.17), add monitoring ingress, add init container

2026-04-08 17:30:22 +00:00
parent f3919cf728
commit c9f07108bd
1 changed files with 368 additions and 325 deletions
--- a/apps/agent-zero/agent-zero.yaml
+++ b/apps/agent-zero/agent-zero.yaml
@@ -1,325 +1,368 @@
-# =============================================================================
-# Agent Zero AI Stack — NUC Deployment (RKE2 Bare-Metal)
-# =============================================================================
-# Deploys: AgentZero (agent UI) on RKE2 cluster
-# Ollama: edge1 Pi 5 at 10.0.57.15:11434 (qwen2.5-coder:7b, CPU)
-# Target: RKE2 bare-metal cluster, namespace: agent-zero
-#
-# Differences from LOCAL (WSL K3s):
-#   - Uses Longhorn StorageClass (not local-path)
-#   - Connects to edge1 Pi 5 Ollama (not workstation R9700)
-#   - NO Anthropic API key (free/local models only)
-#   - NO Piper TTS or Kiwix (edge1 handles TTS, no Wikipedia needed)
-#   - NO hostPath volumes (no access to Windows filesystem)
-#   - Traefik IngressRoute for LAN access at agent-zero.iamworkin.lan
-#   - Knowledge base loaded via ConfigMap (not hostPath)
-#
-# Available Ollama models on edge1:
-#   - qwen2.5-coder:7b   ~4.7 GB  Code generation (CPU, Q4_K_M)
-#
-# Apply: KUBECONFIG=~/.kube/rke2.yaml kubectl apply -f agent-zero-nuc.yaml
-# =============================================================================
-
---
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: agent-zero
-  labels:
-    app.kubernetes.io/part-of: agent-zero-stack
-
-# =============================================================================
-# Persistent Volume Claims (Longhorn)
-# =============================================================================
-
---
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: agent-zero-data
-  namespace: agent-zero
-spec:
-  accessModes: [ReadWriteOnce]
-  storageClassName: longhorn
-  resources:
-    requests:
-      storage: 5Gi
-
---
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: agent-zero-knowledge
-  namespace: agent-zero
-spec:
-  accessModes: [ReadWriteOnce]
-  storageClassName: longhorn
-  resources:
-    requests:
-      storage: 1Gi
-
-# =============================================================================
-# RBAC — Give Agent Zero kubectl access to the cluster
-# =============================================================================
-
---
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: agent-zero
-  namespace: agent-zero
-
---
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: agent-zero-cluster-admin
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: cluster-admin
-subjects:
-  - kind: ServiceAccount
-    name: agent-zero
-    namespace: agent-zero
-
-# =============================================================================
-# Agent Zero — AI Agent Web UI (NUC Edition)
-# =============================================================================
-# Connects to edge1 Pi 5 Ollama (free, local models only)
-# No paid API keys — uses qwen2.5-coder:7b for everything
-
---
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: agent-zero
-  namespace: agent-zero
-  labels:
-    app: agent-zero
-  annotations:
-    agent-zero/deployment: "nuc"
-    agent-zero/ollama: "edge1 Pi 5 (10.0.57.15:11434)"
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: agent-zero
-  strategy:
-    type: Recreate
-  template:
-    metadata:
-      labels:
-        app: agent-zero
-    spec:
-      serviceAccountName: agent-zero
-      containers:
-        - name: agent-zero
-          image: agent0ai/agent-zero:latest
-          command: ["/bin/bash", "-c"]
-          args:
-            - |
-              # Install kubectl if not cached
-              if [ -f /a0/work/kubectl ]; then
-                cp /a0/work/kubectl /usr/local/bin/kubectl
-              else
-                curl -sLO "https://dl.k8s.io/release/v1.32.0/bin/linux/amd64/kubectl" && \
-                chmod +x kubectl && mv kubectl /usr/local/bin/kubectl && \
-                cp /usr/local/bin/kubectl /a0/work/kubectl
-              fi
-              # Run the original entrypoint
-              exec /exe/initialize.sh $BRANCH
-          ports:
-            - containerPort: 80
-          env:
-            # Agent identity
-            - name: AGENT_NAME
-              value: "Blue Jay (NUC)"
-            # Chat model — qwen2.5-coder:7b on edge1 Pi 5
-            - name: A0_SET_chat_model_provider
-              value: "ollama"
-            - name: A0_SET_chat_model_name
-              value: "qwen2.5-coder:7b"
-            - name: A0_SET_chat_model_api_base
-              value: "http://10.0.57.15:11434"
-            - name: A0_SET_chat_model_ctx_length
-              value: "32768"
-            - name: A0_SET_chat_model_kwargs
-              value: '{"temperature": 0, "num_ctx": 32768}'
-            # Utility model — same as chat (only one model available)
-            - name: A0_SET_util_model_provider
-              value: "ollama"
-            - name: A0_SET_util_model_name
-              value: "qwen2.5-coder:7b"
-            - name: A0_SET_util_model_api_base
-              value: "http://10.0.57.15:11434"
-            - name: A0_SET_util_model_kwargs
-              value: '{"num_ctx": 8192}'
-            # Embedding model — nomic on edge1 (if installed, fallback to none)
-            - name: A0_SET_embed_model_provider
-              value: "ollama"
-            - name: A0_SET_embed_model_name
-              value: "nomic-embed-text"
-            - name: A0_SET_embed_model_api_base
-              value: "http://10.0.57.15:11434"
-            # Browser model — disabled (no vision model on Pi)
-            - name: A0_SET_browser_model_provider
-              value: "ollama"
-            - name: A0_SET_browser_model_name
-              value: "qwen2.5-coder:7b"
-            - name: A0_SET_browser_model_api_base
-              value: "http://10.0.57.15:11434"
-            - name: A0_SET_browser_model_vision
-              value: "false"
-            # Agent profile
-            - name: A0_SET_agent_profile
-              value: "default"
-            # Memory settings
-            - name: A0_SET_memory_memorize_enabled
-              value: "true"
-            - name: A0_SET_memory_memorize_consolidation
-              value: "true"
-            - name: A0_SET_memory_memorize_replace_threshold
-              value: "0.85"
-            - name: A0_SET_memory_recall_enabled
-              value: "true"
-            # Speech-to-text disabled (no GPU for Whisper)
-            - name: A0_SET_stt_model_size
-              value: "tiny"
-            # Kubernetes
-            - name: KUBERNETES_SERVICE_HOST
-              value: "kubernetes.default.svc"
-            - name: KUBERNETES_SERVICE_PORT
-              value: "443"
-          volumeMounts:
-            - name: workspace
-              mountPath: /a0/work
-            - name: knowledge
-              mountPath: /a0/knowledge/custom/main
-          resources:
-            requests:
-              memory: "2Gi"
-              cpu: "1000m"
-            limits:
-              memory: "3Gi"
-              cpu: "2000m"
-      volumes:
-        - name: workspace
-          persistentVolumeClaim:
-            claimName: agent-zero-data
-        - name: knowledge
-          persistentVolumeClaim:
-            claimName: agent-zero-knowledge
-
---
-apiVersion: v1
-kind: Service
-metadata:
-  name: agent-zero
-  namespace: agent-zero
-spec:
-  type: ClusterIP
-  selector:
-    app: agent-zero
-  ports:
-    - port: 80
-      targetPort: 80
-
-# =============================================================================
-# Traefik IngressRoute — LAN access at agent-zero.iamworkin.lan
-# =============================================================================
-
---
-apiVersion: traefik.io/v1alpha1
-kind: IngressRoute
-metadata:
-  name: agent-zero
-  namespace: agent-zero
-spec:
-  entryPoints:
-    - websecure
-  routes:
-    - match: Host(`agent-zero.iamworkin.lan`)
-      kind: Rule
-      services:
-        - name: agent-zero
-          port: 80
-  tls:
-    secretName: agent-zero-tls
-
-# =============================================================================
-# TLS Certificate via cert-manager (step-ca ACME)
-# =============================================================================
-
---
-apiVersion: cert-manager.io/v1
-kind: Certificate
-metadata:
-  name: agent-zero-tls
-  namespace: agent-zero
-spec:
-  secretName: agent-zero-tls
-  issuerRef:
-    name: step-ca-acme
-    kind: ClusterIssuer
-  dnsNames:
-    - agent-zero.iamworkin.lan
-  duration: 720h
-  renewBefore: 240h
-
-# =============================================================================
-# NetworkPolicy — Restrict traffic
-# =============================================================================
-
---
-apiVersion: networking.k8s.io/v1
-kind: NetworkPolicy
-metadata:
-  name: agent-zero-netpol
-  namespace: agent-zero
-spec:
-  podSelector:
-    matchLabels:
-      app: agent-zero
-  policyTypes:
-    - Ingress
-    - Egress
-  ingress:
-    # Allow from Traefik
-    - from:
-        - namespaceSelector:
-            matchLabels:
-              kubernetes.io/metadata.name: traefik-system
-      ports:
-        - port: 80
-  egress:
-    # DNS
-    - to:
-        - namespaceSelector:
-            matchLabels:
-              kubernetes.io/metadata.name: kube-system
-      ports:
-        - port: 53
-          protocol: UDP
-        - port: 53
-          protocol: TCP
-    # Ollama on edge1
-    - to:
-        - ipBlock:
-            cidr: 10.0.57.15/32
-      ports:
-        - port: 11434
-    # K8s API
-    - to:
-        - ipBlock:
-            cidr: 10.0.56.11/32
-      ports:
-        - port: 6443
-    # Allow internet (for kubectl image pull, etc)
-    - to:
-        - ipBlock:
-            cidr: 0.0.0.0/0
-            except:
-              - 10.0.0.0/8
-              - 172.16.0.0/12
-              - 192.168.0.0/16
+# =============================================================================
+# Agent Zero AI Stack — NUC Deployment (RKE2 Bare-Metal)
+# =============================================================================
+# Deploys: AgentZero (agent UI) on RKE2 cluster
+# Ollama: edge1 Pi 5 at 10.0.57.17:11434 (qwen2.5-coder:7b, CPU)
+# Target: RKE2 bare-metal cluster, namespace: agent-zero
+#
+# Differences from LOCAL (WSL K3s):
+#   - Uses Longhorn StorageClass (not local-path)
+#   - Connects to edge1 Pi 5 Ollama (not workstation R9700)
+#   - NO Anthropic API key (free/local models only)
+#   - NO Piper TTS or Kiwix (edge1 handles TTS, no Wikipedia needed)
+#   - NO hostPath volumes (no access to Windows filesystem)
+#   - Traefik IngressRoute for LAN access at agent-zero.iamworkin.lan
+#   - Knowledge base stored on a Longhorn PVC (agent-zero-knowledge), not hostPath
+#
+# Available Ollama models on edge1:
+#   - qwen2.5-coder:7b   ~4.7 GB  Code generation (CPU, Q4_K_M)
+#
+# Apply: KUBECONFIG=~/.kube/rke2.yaml kubectl apply -f apps/agent-zero/agent-zero.yaml
+# =============================================================================
+
+---
+# Dedicated namespace for the Agent Zero stack.
+apiVersion: v1
+kind: Namespace
+metadata:
+  labels:
+    app.kubernetes.io/part-of: agent-zero-stack
+  name: agent-zero
+
+# =============================================================================
+# Persistent Volume Claims (Longhorn)
+# =============================================================================
+
+---
+# 5Gi Longhorn volume; the agent-zero Deployment mounts it at /a0/work.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  namespace: agent-zero
+  name: agent-zero-data
+spec:
+  storageClassName: longhorn
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 5Gi
+
+---
+# 1Gi Longhorn volume; the agent-zero Deployment mounts it at
+# /a0/knowledge/custom/main.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  namespace: agent-zero
+  name: agent-zero-knowledge
+spec:
+  storageClassName: longhorn
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 1Gi
+
+# =============================================================================
+# RBAC — Give Agent Zero kubectl access to the cluster
+# =============================================================================
+
+---
+# Identity the agent-zero pod runs as (referenced via serviceAccountName).
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  namespace: agent-zero
+  name: agent-zero
+
+---
+# Binds the agent-zero ServiceAccount to the built-in cluster-admin ClusterRole.
+# NOTE(review): this is unrestricted, cluster-wide access for anything running
+# in the agent-zero pod — confirm that this breadth is intended.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: agent-zero-cluster-admin
+subjects:
+  - kind: ServiceAccount
+    namespace: agent-zero
+    name: agent-zero
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: cluster-admin
+
+# =============================================================================
+# Agent Zero — AI Agent Web UI (NUC Edition)
+# =============================================================================
+# Connects to edge1 Pi 5 Ollama (free, local models only)
+# No paid API keys — uses qwen2.5-coder:7b for everything
+
+---
+# Agent Zero web UI: one replica that talks to the Ollama server on edge1
+# (10.0.57.17:11434) for chat, utility, embedding and browser models.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: agent-zero
+  namespace: agent-zero
+  labels:
+    app: agent-zero
+  annotations:
+    agent-zero/deployment: "nuc"
+    agent-zero/ollama: "edge1 Pi 5 (10.0.57.17:11434)"
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: agent-zero
+  # Old pod is stopped before the new one starts — presumably because both
+  # volumes are ReadWriteOnce PVCs; confirm before switching to RollingUpdate.
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      labels:
+        app: agent-zero
+    spec:
+      serviceAccountName: agent-zero
+      initContainers:
+        # Wait for edge1 Pi 5 Ollama to be reachable before starting Agent Zero.
+        # Without this, the FAISS memory system crashes on embed_query()
+        # and Agent Zero enters a broken state on every message.
+        # The loop has no upper bound: the pod stays in Init until Ollama answers.
+        - name: wait-for-ollama
+          image: busybox:1.37
+          command: ["sh", "-c"]
+          args:
+            - |
+              echo "Waiting for Ollama at edge1 (10.0.57.17:11434)..."
+              until wget -qO- --timeout=2 http://10.0.57.17:11434/api/tags >/dev/null 2>&1; do
+                echo "edge1 Ollama not ready, retrying in 5s..."
+                sleep 5
+              done
+              echo "edge1 Ollama is reachable!"
+      containers:
+        - name: agent-zero
+          # NOTE(review): ":latest" is a floating tag; consider pinning a version
+          # so restarts cannot silently pull a different image.
+          image: agent0ai/agent-zero:latest
+          command: ["/bin/bash", "-c"]
+          args:
+            - |
+              # Install kubectl if not cached
+              if [ -f /a0/work/kubectl ]; then
+                cp /a0/work/kubectl /usr/local/bin/kubectl
+              else
+                # -f: fail on HTTP errors so an error page is never saved, made
+                # executable and cached on the PVC as "kubectl".
+                curl -fsSLO "https://dl.k8s.io/release/v1.32.0/bin/linux/amd64/kubectl" && \
+                chmod +x kubectl && mv kubectl /usr/local/bin/kubectl && \
+                cp /usr/local/bin/kubectl /a0/work/kubectl
+              fi
+              # Run the original entrypoint
+              exec /exe/initialize.sh $BRANCH
+          ports:
+            - containerPort: 80
+          env:
+            # Agent identity
+            - name: AGENT_NAME
+              value: "Blue Jay (NUC)"
+            # Chat model — qwen2.5-coder:7b on edge1 Pi 5
+            - name: A0_SET_chat_model_provider
+              value: "ollama"
+            - name: A0_SET_chat_model_name
+              value: "qwen2.5-coder:7b"
+            - name: A0_SET_chat_model_api_base
+              value: "http://10.0.57.17:11434"
+            - name: A0_SET_chat_model_ctx_length
+              value: "32768"
+            - name: A0_SET_chat_model_kwargs
+              value: '{"temperature": 0, "num_ctx": 32768}'
+            # Utility model — same as chat (only one model available)
+            - name: A0_SET_util_model_provider
+              value: "ollama"
+            - name: A0_SET_util_model_name
+              value: "qwen2.5-coder:7b"
+            - name: A0_SET_util_model_api_base
+              value: "http://10.0.57.17:11434"
+            - name: A0_SET_util_model_kwargs
+              value: '{"num_ctx": 8192}'
+            # Embedding model — nomic-embed-text served by edge1 Ollama.
+            # TODO confirm behaviour when the model has not been pulled on edge1.
+            - name: A0_SET_embed_model_provider
+              value: "ollama"
+            - name: A0_SET_embed_model_name
+              value: "nomic-embed-text"
+            - name: A0_SET_embed_model_api_base
+              value: "http://10.0.57.17:11434"
+            # Browser model — reuses the text model with vision switched off
+            # (no vision model on Pi)
+            - name: A0_SET_browser_model_provider
+              value: "ollama"
+            - name: A0_SET_browser_model_name
+              value: "qwen2.5-coder:7b"
+            - name: A0_SET_browser_model_api_base
+              value: "http://10.0.57.17:11434"
+            - name: A0_SET_browser_model_vision
+              value: "false"
+            # Agent profile
+            - name: A0_SET_agent_profile
+              value: "default"
+            # Memory settings
+            - name: A0_SET_memory_memorize_enabled
+              value: "true"
+            - name: A0_SET_memory_memorize_consolidation
+              value: "true"
+            - name: A0_SET_memory_memorize_replace_threshold
+              value: "0.85"
+            - name: A0_SET_memory_recall_enabled
+              value: "true"
+            # Speech-to-text — "tiny" model size selected (no GPU for Whisper)
+            - name: A0_SET_stt_model_size
+              value: "tiny"
+            # Kubernetes
+            # NOTE(review): these override the values Kubernetes injects into
+            # every pod with the in-cluster DNS name and port 443 — confirm the
+            # NetworkPolicy egress rules still let this traffic reach the API.
+            - name: KUBERNETES_SERVICE_HOST
+              value: "kubernetes.default.svc"
+            - name: KUBERNETES_SERVICE_PORT
+              value: "443"
+          volumeMounts:
+            - name: workspace
+              mountPath: /a0/work
+            - name: knowledge
+              mountPath: /a0/knowledge/custom/main
+          # Startup budget: 15s initial delay + 18 failures x 10s = up to 195s
+          # before the container is restarted.
+          startupProbe:
+            httpGet:
+              path: /
+              port: 80
+            initialDelaySeconds: 15
+            periodSeconds: 10
+            failureThreshold: 18
+          livenessProbe:
+            httpGet:
+              path: /
+              port: 80
+            periodSeconds: 30
+            failureThreshold: 3
+          # Ready only when the local UI answers AND edge1 Ollama is reachable.
+          # While Ollama is down the pod is taken out of the Service endpoints.
+          # timeoutSeconds must exceed the sum of the curl --max-time values
+          # (3s + 5s); the default of 1s would kill the probe before curl's own
+          # timeouts could apply.
+          readinessProbe:
+            exec:
+              command:
+                - /bin/bash
+                - -c
+                - "curl -sf --max-time 3 http://localhost:80/ > /dev/null && curl -sf --connect-timeout 3 --max-time 5 http://10.0.57.17:11434/api/tags > /dev/null"
+            periodSeconds: 30
+            timeoutSeconds: 10
+            failureThreshold: 2
+          resources:
+            requests:
+              memory: "2Gi"
+              cpu: "1000m"
+            limits:
+              memory: "3Gi"
+              cpu: "2000m"
+      volumes:
+        - name: workspace
+          persistentVolumeClaim:
+            claimName: agent-zero-data
+        - name: knowledge
+          persistentVolumeClaim:
+            claimName: agent-zero-knowledge
+
+---
+# Cluster-internal Service: port 80 -> port 80 on pods labelled app=agent-zero.
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: agent-zero
+  name: agent-zero
+spec:
+  type: ClusterIP
+  ports:
+    - targetPort: 80
+      port: 80
+  selector:
+    app: agent-zero
+
+# =============================================================================
+# Traefik IngressRoute — LAN access at agent-zero.iamworkin.lan
+# =============================================================================
+
+---
+# Routes requests for agent-zero.iamworkin.lan arriving on the websecure entry
+# point to the agent-zero Service (port 80), served with the agent-zero-tls secret.
+apiVersion: traefik.io/v1alpha1
+kind: IngressRoute
+metadata:
+  namespace: agent-zero
+  name: agent-zero
+spec:
+  tls:
+    secretName: agent-zero-tls
+  entryPoints: [websecure]
+  routes:
+    - kind: Rule
+      match: Host(`agent-zero.iamworkin.lan`)
+      services:
+        - port: 80
+          name: agent-zero
+
+# =============================================================================
+# TLS Certificate via cert-manager (step-ca ACME)
+# =============================================================================
+
+---
+# Certificate for agent-zero.iamworkin.lan issued by the step-ca-acme
+# ClusterIssuer and stored in the agent-zero-tls secret.
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  namespace: agent-zero
+  name: agent-zero-tls
+spec:
+  dnsNames:
+    - agent-zero.iamworkin.lan
+  # 720h = 30 days lifetime; 240h = renew 10 days before expiry.
+  duration: 720h
+  renewBefore: 240h
+  issuerRef:
+    kind: ClusterIssuer
+    name: step-ca-acme
+  secretName: agent-zero-tls
+
+# =============================================================================
+# NetworkPolicy — Restrict traffic
+# =============================================================================
+
+---
+# Ingress and egress allow-list for pods labelled app=agent-zero.
+# Anything not matched by a rule below is dropped in both directions.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  namespace: agent-zero
+  name: agent-zero-netpol
+spec:
+  podSelector:
+    matchLabels:
+      app: agent-zero
+  policyTypes:
+    - Ingress
+    - Egress
+  ingress:
+    # Web UI traffic from the Traefik namespace
+    - ports:
+        - port: 80
+          protocol: TCP
+      from:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: traefik-system
+    # Probes from the monitoring namespace (blackbox probe)
+    - ports:
+        - port: 80
+          protocol: TCP
+      from:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: monitoring
+  egress:
+    # DNS lookups against pods in kube-system (UDP and TCP 53)
+    - ports:
+        - port: 53
+          protocol: UDP
+        - port: 53
+          protocol: TCP
+      to:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: kube-system
+    # Ollama on edge1
+    - ports:
+        - port: 11434
+          protocol: TCP
+      to:
+        - ipBlock:
+            cidr: 10.0.57.17/32
+    # K8s API server at its node address
+    - ports:
+        - port: 6443
+          protocol: TCP
+      to:
+        - ipBlock:
+            cidr: 10.0.56.11/32
+    # Public internet on any port (kubectl binary download, etc.);
+    # the RFC 1918 private ranges are excluded.
+    - to:
+        - ipBlock:
+            cidr: 0.0.0.0/0
+            except:
+              - 10.0.0.0/8
+              - 172.16.0.0/12
+              - 192.168.0.0/16