From e2c71c2b8a658d2009ec56f4f62c207487c0b409 Mon Sep 17 00:00:00 2001
From: Andrew Stoltz <andrew@flowercore.io>
Date: Sun, 26 Apr 2026 13:31:14 -0500
Subject: [PATCH] fix agent-zero ollama-proxy crashloop + add Longhorn
 monitoring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

agent-zero ollama-proxy had 172 historic restarts (now stable).
Root cause: liveness/readiness probes hit /api/tags which proxies
through to BLUEJAY-WS Ollama (10.0.56.20:11434). When the workstation
Ollama is slow or offline, nginx fails over to the edge1 backup —
but the failover takes >1s and the kube-probe default timeoutSeconds=1
gives up first. Three failed probes → kubelet kills the container.

Fix:
- Add nginx local healthz endpoint (200, no upstream).
- Liveness probe → /healthz (proves nginx itself is alive).
- Readiness probe stays on /api/tags but with timeoutSeconds=5 so
  failover to backup completes before the probe times out.

This decouples liveness from upstream availability — kubelet only
restarts the proxy when nginx is genuinely dead, not when Ollama is
slow.

Longhorn coverage gap: K8s emits "snapshot becomes not ready to use"
events constantly during the hourly snapshot lifecycle (1047
snapshots, all readyToUse=true on inspect). Those events were the
only signal we had — purely transient lifecycle noise, not actionable.

Add:
- longhorn scrape job (longhorn-backend.longhorn-system.svc:9500)
- NetworkPolicy egress rule for longhorn-system port 9500
- 4 new alerts in 'longhorn-storage' group:
  - LonghornVolumeDegraded (>15m) — replica unhealthy, auto-rebuild
  - LonghornVolumeFaulted (>5m, critical, thermal print) — data loss
  - LonghornBackupStale (no completed backup in >36h) — recurring job
    silently failing
  - LonghornNodeUnhealthy (>5m) — node ready=false

zabbix-web 7 restarts and Print.Web 12:55 stop investigated — both
are stable now, no actionable cause found in journal/events. The
KubeContainerRestartingFrequently alert added in the previous commit
will catch recurrence of either.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 apps/agent-zero/agent-zero.yaml     | 23 +++++++++-
 apps/monitoring/noc-monitoring.yaml | 71 +++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/apps/agent-zero/agent-zero.yaml b/apps/agent-zero/agent-zero.yaml
index c795213..79fb95f 100644
--- a/apps/agent-zero/agent-zero.yaml
+++ b/apps/agent-zero/agent-zero.yaml
@@ -208,6 +208,15 @@ spec:
                 }
                 server {
                   listen 11434;
+                  # Local healthcheck for the liveness probe. Exact-match
+                  # location answered by nginx itself with a static 200, so
+                  # a slow/offline BLUEJAY-WS Ollama (or a failover to the
+                  # edge1 backup in progress) cannot fail liveness.
+                  location = /healthz {
+                    access_log off;
+                    return 200 'ok\n';
+                    default_type text/plain;
+                  }
                   location / {
                     proxy_http_version 1.1;
                     proxy_set_header Connection "";
@@ -224,18 +233,30 @@ spec:
               exec nginx -g 'daemon off;'
           ports:
             - containerPort: 11434
+          # Readiness probe DOES check upstream so K8s only routes traffic
+          # when at least one Ollama backend is reachable. timeoutSeconds=5
+          # allows nginx to fail over from BLUEJAY-WS primary to edge1
+          # backup before the probe fails. (The 172 historic restarts came
+          # from liveness on /api/tags with the 1s default, not readiness.)
           readinessProbe:
             httpGet:
               path: /api/tags
               port: 11434
             initialDelaySeconds: 5
             periodSeconds: 15
+            timeoutSeconds: 5
+            failureThreshold: 3
+          # Liveness probe hits ONLY the local /healthz — the container is
+          # restarted only when nginx itself stops answering (3 consecutive
+          # failures at 30s intervals), never because an upstream is down.
           livenessProbe:
             httpGet:
-              path: /api/tags
+              path: /healthz
               port: 11434
             initialDelaySeconds: 10
             periodSeconds: 30
+            timeoutSeconds: 3
+            failureThreshold: 3
         - name: agent-zero
           image: agent0ai/agent-zero:latest
           command: ["/bin/bash", "-c"]
diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml
index 412380d..ec79546 100644
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -433,6 +433,19 @@ data:
               service: "traefik"
               cluster: "rke2"
 
+      # Longhorn — exposes longhorn_volume_robustness, longhorn_backup_*,
+      # longhorn_node_status. Feeds the 'longhorn-storage' alert group;
+      # before this the only signal was noisy, transient K8s events.
+      # NOTE(review): longhorn-manager metrics are presumably per-pod; one
+      # Service target may see a different pod each scrape — confirm.
+      - job_name: "longhorn"
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["longhorn-backend.longhorn-system.svc:9500"]
+            labels:
+              service: "longhorn"
+              cluster: "rke2"
+
       # FC web services through Traefik — single probe surface to spot any
       # iamworkin.lan host returning non-200. Uses https_internal because all
       # certs are step-ca leaves; blackbox would x509-fail with http_2xx.
@@ -925,6 +938,56 @@ data:
               summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
               description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."
 
+      # Longhorn storage health alerts. Required: longhorn scrape job
+      # (added 2026-04-26 — see scrape_configs above). The K8s events
+      # for "snapshot becomes not ready to use" are transient lifecycle
+      # noise, not actionable — these alerts use the actual Longhorn
+      # gauges that reflect persistent state.
+      - name: longhorn-storage
+        rules:
+          # longhorn_volume_robustness is a per-volume gauge whose VALUE
+          # encodes the state: 0=unknown, 1=healthy, 2=degraded, 3=faulted.
+          # Detached volumes report 0, so matching on 2 skips them.
+          - alert: LonghornVolumeDegraded
+            expr: longhorn_volume_robustness == 2
+            for: 15m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Longhorn volume {{ $labels.volume }} degraded for >15m"
+              description: "Volume {{ $labels.volume }} on node {{ $labels.node }} has been degraded (one or more replicas unhealthy) for 15+ minutes. Auto-rebuild may need help — check 'kubectl describe volume.longhorn.io {{ $labels.volume }} -n longhorn-system'."
+
+          - alert: LonghornVolumeFaulted
+            expr: longhorn_volume_robustness == 3  # state is the gauge value: 3 = faulted
+            for: 5m
+            labels:
+              severity: critical
+              alert_channel: thermal_print
+            annotations:
+              summary: "Longhorn volume {{ $labels.volume }} FAULTED"
+              description: "Volume {{ $labels.volume }} on node {{ $labels.node }} is faulted — all replicas unavailable. Data inaccessible. Manual intervention required."
+
+          # Backup gauges carry no timestamp, so "stale" = the volume has
+          # Completed (value 3) backups but none newly completed in 36h.
+          - alert: LonghornBackupStale
+            expr: |
+              count by (volume) (longhorn_backup_state == 3) unless count by (volume) ((longhorn_backup_state == 3) unless on (backup, volume) (max_over_time(longhorn_backup_state[1h] offset 36h) == 3))
+            for: 1h
+            labels:
+              severity: warning
+            annotations:
+              summary: "Longhorn volume {{ $labels.volume }} has no completed backup in >36h"
+              description: "Daily backup recurringJob (cron 0 2 * * *) appears to have skipped this volume. Check 'kubectl get backups.longhorn.io -n longhorn-system' and the daily-backup CronJob logs."
+
+          - alert: LonghornNodeUnhealthy
+            expr: longhorn_node_status{condition="ready",condition_reason!=""} == 0  # only series with a reason, so the description is never blank
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Longhorn node {{ $labels.node }} not Ready"
+              description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."
+
 # =============================================================================
 # ConfigMap: Blackbox Exporter Configuration
 # =============================================================================
@@ -4219,6 +4282,14 @@ spec:
       ports:
         - port: 9402
           protocol: TCP
+    # Egress to longhorn-system on TCP 9500 (Longhorn metrics) — required for Longhorn* alerts.
+    - to:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: longhorn-system
+      ports:
+        - port: 9500
+          protocol: TCP
     # IRC (irc-notify → UnrealIRCd in irc namespace via K8s DNS)
     - to:
         - namespaceSelector: