fix agent-zero ollama-proxy crashloop + add Longhorn monitoring
agent-zero ollama-proxy had 172 historic restarts (now stable).
Root cause: liveness/readiness probes hit /api/tags which proxies
through to BLUEJAY-WS Ollama (10.0.56.20:11434). When the workstation
Ollama is slow or offline, nginx fails over to the edge1 backup —
but the failover takes >1s and the kube-probe default timeoutSeconds=1
gives up first. Three failed probes → kubelet kills the container.
Fix:
- Add nginx local healthz endpoint (200, no upstream).
- Liveness probe → /healthz (proves nginx itself is alive).
- Readiness probe stays on /api/tags but with timeoutSeconds=5 so
failover to backup completes before the probe times out.
This decouples liveness from upstream availability — kubelet only
restarts the proxy when nginx is genuinely dead, not when Ollama is
slow.
Longhorn coverage gap: K8s emits "snapshot becomes not ready to use"
events constantly during the hourly snapshot lifecycle (1047
snapshots, all readyToUse=true on inspect). Those events were the
only signal we had — purely transient lifecycle noise, not actionable.
Add:
- longhorn scrape job (longhorn-backend.longhorn-system.svc:9500)
- NetworkPolicy egress rule for longhorn-system port 9500
- 4 new alerts in 'longhorn-storage' group:
- LonghornVolumeDegraded (>15m) — replica unhealthy, auto-rebuild
- LonghornVolumeFaulted (>5m, critical, thermal print) — data loss
- LonghornBackupStale (no completed backup in >36h) — recurring job
silently failing
- LonghornNodeUnhealthy (>5m) — node ready=false
zabbix-web 7 restarts and Print.Web 12:55 stop investigated — both
are stable now, no actionable cause found in journal/events. The
KubeContainerRestartingFrequently alert added in the previous commit
will catch a recurrence of either.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -208,6 +208,15 @@ spec:
|
|||||||
}
|
}
|
||||||
server {
|
server {
|
||||||
listen 11434;
|
listen 11434;
|
||||||
|
# Local healthcheck — proves nginx itself is alive.
|
||||||
|
# Must NOT depend on upstream so liveness doesn't restart
|
||||||
|
# the container when BLUEJAY-WS Ollama is slow/offline
|
||||||
|
# and nginx is mid-failover to the edge1 backup.
|
||||||
|
location = /healthz {
|
||||||
|
access_log off;
|
||||||
|
return 200 'ok\n';
|
||||||
|
default_type text/plain;
|
||||||
|
}
|
||||||
location / {
|
location / {
|
||||||
proxy_http_version 1.1;
|
proxy_http_version 1.1;
|
||||||
proxy_set_header Connection "";
|
proxy_set_header Connection "";
|
||||||
@@ -224,18 +233,30 @@ spec:
|
|||||||
exec nginx -g 'daemon off;'
|
exec nginx -g 'daemon off;'
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 11434
|
- containerPort: 11434
|
||||||
|
# Readiness probe DOES check upstream so K8s only routes traffic
|
||||||
|
# when at least one Ollama backend is reachable. timeoutSeconds=5
|
||||||
|
# allows nginx to fail over from BLUEJAY-WS primary to edge1
|
||||||
|
# backup before the probe fails (the 1s default timed out mid-failover;
|
||||||
|
# on the old /api/tags liveness probe that meant 172 historic restarts).
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
httpGet:
|
httpGet:
|
||||||
path: /api/tags
|
path: /api/tags
|
||||||
port: 11434
|
port: 11434
|
||||||
initialDelaySeconds: 5
|
initialDelaySeconds: 5
|
||||||
periodSeconds: 15
|
periodSeconds: 15
|
||||||
|
timeoutSeconds: 5
|
||||||
|
failureThreshold: 3
|
||||||
|
# Liveness probe hits ONLY local healthz — restarts the container
|
||||||
|
# only when nginx itself is dead. Decoupling liveness from upstream
|
||||||
|
# eliminates restart-loops caused by transient upstream outages.
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
httpGet:
|
httpGet:
|
||||||
path: /api/tags
|
path: /healthz
|
||||||
port: 11434
|
port: 11434
|
||||||
initialDelaySeconds: 10
|
initialDelaySeconds: 10
|
||||||
periodSeconds: 30
|
periodSeconds: 30
|
||||||
|
timeoutSeconds: 3
|
||||||
|
failureThreshold: 3
|
||||||
- name: agent-zero
|
- name: agent-zero
|
||||||
image: agent0ai/agent-zero:latest
|
image: agent0ai/agent-zero:latest
|
||||||
command: ["/bin/bash", "-c"]
|
command: ["/bin/bash", "-c"]
|
||||||
|
|||||||
@@ -433,6 +433,19 @@ data:
|
|||||||
service: "traefik"
|
service: "traefik"
|
||||||
cluster: "rke2"
|
cluster: "rke2"
|
||||||
|
|
||||||
|
# Longhorn — exposes longhorn_volume_robustness, longhorn_backup_*,
|
||||||
|
# longhorn_node_status. Enables the LonghornVolumeDegraded/Faulted,
|
||||||
|
# LonghornBackupStale and LonghornNodeUnhealthy alerts (no real visibility into Longhorn
|
||||||
|
# health before this — was relying on K8s events which are noisy
|
||||||
|
# transient lifecycle messages, not actionable signals).
|
||||||
|
- job_name: "longhorn"
|
||||||
|
scrape_interval: 30s
|
||||||
|
static_configs:
|
||||||
|
- targets: ["longhorn-backend.longhorn-system.svc:9500"]
|
||||||
|
labels:
|
||||||
|
service: "longhorn"
|
||||||
|
cluster: "rke2"
|
||||||
|
|
||||||
# FC web services through Traefik — single probe surface to spot any
|
# FC web services through Traefik — single probe surface to spot any
|
||||||
# iamworkin.lan host returning non-200. Uses https_internal because all
|
# iamworkin.lan host returning non-200. Uses https_internal because all
|
||||||
# certs are step-ca leaves; blackbox would x509-fail with http_2xx.
|
# certs are step-ca leaves; blackbox would x509-fail with http_2xx.
|
||||||
@@ -925,6 +938,56 @@ data:
|
|||||||
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
|
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
|
||||||
description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."
|
description: "Spec wants {{ $labels.spec_replicas }} but only {{ $value }} available. Likely a rollout stuck on probe failure, scheduling, or PVC."
|
||||||
|
|
||||||
|
# Longhorn storage health alerts. Required: longhorn scrape job
|
||||||
|
# (added 2026-04-26 — see scrape_configs above). The K8s events
|
||||||
|
# for "snapshot becomes not ready to use" are transient lifecycle
|
||||||
|
# noise, not actionable — these alerts use the actual Longhorn
|
||||||
|
# gauges that reflect persistent state.
|
||||||
|
- name: longhorn-storage
|
||||||
|
rules:
|
||||||
|
# Volume robustness: 0=unknown, 1=healthy, 2=degraded, 3=faulted.
|
||||||
|
# The state is the gauge value itself — there is no 'robustness' label.
|
||||||
|
# Detached volumes report 0, so comparing against 2/3 already skips them.
|
||||||
|
- alert: LonghornVolumeDegraded
|
||||||
|
expr: longhorn_volume_robustness == 2
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Longhorn volume {{ $labels.volume }} degraded for >15m"
|
||||||
|
description: "Volume {{ $labels.volume }} on node {{ $labels.node }} has been degraded (one or more replicas unhealthy) for 15+ minutes. Auto-rebuild may need help — check 'kubectl describe volume.longhorn.io {{ $labels.volume }} -n longhorn-system'."
|
||||||
|
|
||||||
|
# longhorn_volume_robustness carries the state as its value (3 = faulted),
# not as a label, so compare the sample value directly.
- alert: LonghornVolumeFaulted
|
||||||
|
expr: longhorn_volume_robustness == 3
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
alert_channel: thermal_print
|
||||||
|
annotations:
|
||||||
|
summary: "Longhorn volume {{ $labels.volume }} FAULTED"
|
||||||
|
description: "Volume {{ $labels.volume }} on node {{ $labels.node }} is faulted — all replicas unavailable. Data inaccessible. Manual intervention required."
|
||||||
|
|
||||||
|
# No backup in 36h indicates the daily-backup recurringJob is
|
||||||
|
# silently failing. Allows for one missed run + slack.
# NOTE(review): Longhorn's metrics reference lists longhorn_backup_state as a
# numeric gauge (3=Completed) without a 'state' label, and
# longhorn_backup_actual_size_bytes is a size, not a timestamp — confirm this
# expression can actually match and measures backup age before relying on it.
|
||||||
|
- alert: LonghornBackupStale
|
||||||
|
expr: |
|
||||||
|
(time() - max by(volume) (longhorn_backup_state{state="Completed"} * on(backup) group_left() longhorn_backup_actual_size_bytes)) > 36 * 3600
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Longhorn volume {{ $labels.volume }} has no completed backup in >36h"
|
||||||
|
description: "Daily backup recurringJob (cron 0 2 * * *) appears to have skipped this volume. Check 'kubectl get backups.longhorn.io -n longhorn-system' and the daily-backup CronJob logs."
|
||||||
|
|
||||||
|
# Fires when a node's "ready" condition series has held the value 0 for 5 minutes.
# NOTE(review): the condition_reason!="" matcher drops series whose reason is
# empty — confirm Longhorn always sets a reason when ready is 0.
- alert: LonghornNodeUnhealthy
|
||||||
|
expr: longhorn_node_status{condition="ready",condition_reason!=""} == 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Longhorn node {{ $labels.node }} not Ready"
|
||||||
|
description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# ConfigMap: Blackbox Exporter Configuration
|
# ConfigMap: Blackbox Exporter Configuration
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -4219,6 +4282,14 @@ spec:
|
|||||||
ports:
|
ports:
|
||||||
- port: 9402
|
- port: 9402
|
||||||
protocol: TCP
|
protocol: TCP
|
||||||
|
# Longhorn manager metrics — required for Longhorn* alerts.
|
||||||
|
- to:
|
||||||
|
- namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
kubernetes.io/metadata.name: longhorn-system
|
||||||
|
ports:
|
||||||
|
- port: 9500
|
||||||
|
protocol: TCP
|
||||||
# IRC (irc-notify → UnrealIRCd in irc namespace via K8s DNS)
|
# IRC (irc-notify → UnrealIRCd in irc namespace via K8s DNS)
|
||||||
- to:
|
- to:
|
||||||
- namespaceSelector:
|
- namespaceSelector:
|
||||||
|
|||||||
Reference in New Issue
Block a user