From e2c71c2b8a658d2009ec56f4f62c207487c0b409 Mon Sep 17 00:00:00 2001 From: Andrew Stoltz Date: Sun, 26 Apr 2026 13:31:14 -0500 Subject: [PATCH] fix agent-zero ollama-proxy crashloop + add Longhorn monitoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit agent-zero ollama-proxy had 172 historic restarts (now stable). Root cause: liveness/readiness probes hit /api/tags which proxies through to BLUEJAY-WS Ollama (10.0.56.20:11434). When the workstation Ollama is slow or offline, nginx fails over to the edge1 backup — but the failover takes >1s and the kube-probe default timeoutSeconds=1 gives up first. Three failed probes → kubelet kills the container. Fix: - Add nginx local healthz endpoint (200, no upstream). - Liveness probe → /healthz (proves nginx itself is alive). - Readiness probe stays on /api/tags but with timeoutSeconds=5 so failover to backup completes before the probe times out. This decouples liveness from upstream availability — kubelet only restarts the proxy when nginx is genuinely dead, not when Ollama is slow. Longhorn coverage gap: K8s emits "snapshot becomes not ready to use" events constantly during the hourly snapshot lifecycle (1047 snapshots, all readyToUse=true on inspect). Those events were the only signal we had — purely transient lifecycle noise, not actionable. Add: - longhorn scrape job (longhorn-backend.longhorn-system.svc:9500) - NetworkPolicy egress rule for longhorn-system port 9500 - 4 new alerts in 'longhorn-storage' group: - LonghornVolumeDegraded (>15m) — replica unhealthy, auto-rebuild - LonghornVolumeFaulted (>5m, critical, thermal print) — data loss - LonghornBackupStale (no completed backup in >36h) — recurring job silently failing - LonghornNodeUnhealthy (>5m) — node ready=false zabbix-web 7 restarts and Print.Web 12:55 stop investigated — both are stable now, no actionable cause found in journal/events. 
Adding KubeContainerRestartingFrequently in the previous commit will catch recurrence of either. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/agent-zero/agent-zero.yaml | 23 +++++++++- apps/monitoring/noc-monitoring.yaml | 71 +++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/apps/agent-zero/agent-zero.yaml b/apps/agent-zero/agent-zero.yaml index c795213..79fb95f 100644 --- a/apps/agent-zero/agent-zero.yaml +++ b/apps/agent-zero/agent-zero.yaml @@ -208,6 +208,15 @@ spec: } server { listen 11434; + # Local healthcheck — proves nginx itself is alive. + # Must NOT depend on upstream so liveness doesn't restart + # the container when BLUEJAY-WS Ollama is slow/offline + # and nginx is mid-failover to the edge1 backup. + location = /healthz { + access_log off; + return 200 'ok\n'; + default_type text/plain; + } location / { proxy_http_version 1.1; proxy_set_header Connection ""; @@ -224,18 +233,30 @@ spec: exec nginx -g 'daemon off;' ports: - containerPort: 11434 + # Readiness probe DOES check upstream so K8s only routes traffic + # when at least one Ollama backend is reachable. timeoutSeconds=5 + # allows nginx to fail over from BLUEJAY-WS primary to edge1 + # backup before the probe fails (was timeoutSeconds=1 default → + # 172 historic restarts when workstation Ollama was down). readinessProbe: httpGet: path: /api/tags port: 11434 initialDelaySeconds: 5 periodSeconds: 15 + timeoutSeconds: 5 + failureThreshold: 3 + # Liveness probe hits ONLY local healthz — restarts the container + # only when nginx itself is dead. Decoupling liveness from upstream + # eliminates restart-loops caused by transient upstream outages. 
livenessProbe: httpGet: - path: /api/tags + path: /healthz port: 11434 initialDelaySeconds: 10 periodSeconds: 30 + timeoutSeconds: 3 + failureThreshold: 3 - name: agent-zero image: agent0ai/agent-zero:latest command: ["/bin/bash", "-c"] diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index 412380d..ec79546 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -433,6 +433,19 @@ data: service: "traefik" cluster: "rke2" + # Longhorn — exposes longhorn_volume_robustness, longhorn_backup_*, + # longhorn_node_status. Feeds the LonghornVolumeDegraded/Faulted, + # LonghornBackupStale and LonghornNodeUnhealthy alerts (previously no + # real Longhorn health visibility — we relied on K8s events, which are + # noisy transient lifecycle messages, not actionable signals). + - job_name: "longhorn" + scrape_interval: 30s + static_configs: + - targets: ["longhorn-backend.longhorn-system.svc:9500"] + labels: + service: "longhorn" + cluster: "rke2" + # FC web services through Traefik — single probe surface to spot any # iamworkin.lan host returning non-200. Uses https_internal because all # certs are step-ca leaves; blackbox would x509-fail with http_2xx. @@ -925,6 +938,56 @@ data: summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch" description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC." + # Longhorn storage health alerts. Required: longhorn scrape job + # (added 2026-04-26 — see scrape_configs above). The K8s events + # for "snapshot becomes not ready to use" are transient lifecycle + # noise, not actionable — these alerts use the actual Longhorn + # gauges that reflect persistent state. + - name: longhorn-storage + rules: + # Volume robustness: 0=unknown, 1=healthy, 2=degraded, 3=faulted. + # Detached volumes report 0 — that's normal for unattached PVCs, + # so the rules below must never alert on 0. 
+ - alert: LonghornVolumeDegraded + expr: longhorn_volume_robustness == 2 # 2 = degraded (state is the sample value, not a label) + for: 15m + labels: + severity: warning + annotations: + summary: "Longhorn volume {{ $labels.volume }} degraded for >15m" + description: "Volume {{ $labels.volume }} on node {{ $labels.node }} has been degraded (one or more replicas unhealthy) for 15+ minutes. Auto-rebuild may need help — check 'kubectl describe volume.longhorn.io {{ $labels.volume }} -n longhorn-system'." + + - alert: LonghornVolumeFaulted + expr: longhorn_volume_robustness == 3 # 3 = faulted (state is the sample value, not a label) + for: 5m + labels: + severity: critical + alert_channel: thermal_print + annotations: + summary: "Longhorn volume {{ $labels.volume }} FAULTED" + description: "Volume {{ $labels.volume }} on node {{ $labels.node }} is faulted — all replicas unavailable. Data inaccessible. Manual intervention required." + + # Fires when a volume has Completed backups (longhorn_backup_state == 3) but none of them is a new (volume, backup) series vs 36h ago, + # i.e. the daily-backup recurringJob is silently failing. 36h allows one missed run plus slack; stays silent until 36h of history exists. + - alert: LonghornBackupStale + expr: | + count by (volume) (longhorn_backup_state == 3) unless on (volume) count by (volume) ((longhorn_backup_state == 3) unless on (volume, backup) (longhorn_backup_state offset 36h)) + for: 1h + labels: + severity: warning + annotations: + summary: "Longhorn volume {{ $labels.volume }} has no completed backup in >36h" + description: "Daily backup recurringJob (cron 0 2 * * *) appears to have skipped this volume. Check 'kubectl get backups.longhorn.io -n longhorn-system' and the daily-backup CronJob logs." + + - alert: LonghornNodeUnhealthy + expr: longhorn_node_status{condition="ready",condition_reason!=""} == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Longhorn node {{ $labels.node }} not Ready" + description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers." 
+ # ============================================================================= # ConfigMap: Blackbox Exporter Configuration # ============================================================================= @@ -4219,6 +4282,14 @@ spec: ports: - port: 9402 protocol: TCP + # Longhorn manager metrics — required for Longhorn* alerts. + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: longhorn-system + ports: + - port: 9500 + protocol: TCP # IRC (irc-notify → UnrealIRCd in irc namespace via K8s DNS) - to: - namespaceSelector: