From 55729a24f96e05614da5878bcf5b523d5b7155a5 Mon Sep 17 00:00:00 2001
From: Andrew Stoltz
Date: Tue, 19 May 2026 12:34:28 -0500
Subject: [PATCH] monitoring: alert on remotedesktop longhorn pvc growth

---
Reviewer notes (text in this section is not part of the commit message):
  * Line breaks and YAML indentation were reconstructed from a
    whitespace-collapsed copy of this patch; verify against the target file.
  * The hunk was edited by hand; the commit id above and the post-image
    blob id in the index line below were not regenerated.

 apps/monitoring/noc-monitoring.yaml | 75 +++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml
index 8ab91e1..35fb9bf 100644
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -824,6 +824,81 @@ data:
               summary: "desktop.iamworkin.lan TLS cert expires within 2 days"
               description: "The desktop.iamworkin.lan cert is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, and pfSense DNS overrides per the ACME DNS-01 gate."
 
+          # RemoteDesktop Longhorn PVC growth / fill-level alert.
+          #
+          # Fires once the condition has held for 5 minutes and a Longhorn volume
+          # backing a RemoteDesktop PVC either
+          #   * grew by more than 20% relative to its size one hour ago, or
+          #   * uses more than 80% of its capacity.
+          # clamp_min(..., 1) keeps both divisors at 1 or above, so a zero sample
+          # cannot cause a division by zero.
+          #
+          # PVC selection: longhorn-storageclass PVCs carrying the
+          # flowercore.io/managed-by=remotedesktop label (read through
+          # kube_persistentvolumeclaim_labels), plus a name-based fallback for
+          # fc-desktop PVCs matching fc-profile-.* or remotedesktop-data. The join
+          # on "volume" copies namespace and persistentvolumeclaim onto the
+          # Longhorn series so the annotations below can reference them.
+          # NOTE(review): the join multiplies each ratio by the PVC info series,
+          # presumed to have value 1 - confirm against kube-state-metrics.
+          #
+          # TODO 2026-05-19 metric gate: live noc1 Prometheus currently exposes
+          # kube_persistentvolumeclaim_info and
+          # kube_persistentvolumeclaim_resource_requests_storage_bytes, but not
+          # longhorn_volume_actual_size_bytes, longhorn_volume_capacity_bytes,
+          # kube_persistentvolumeclaim_labels, or kubelet_volume_stats_used_bytes.
+          # Keep the fc-desktop PVC fallback until kube-state-metrics label
+          # allowlist exposes flowercore.io/managed-by=remotedesktop.
+          # This note is a comment, not an annotation, so it is not rendered into
+          # alert notifications.
+          - alert: LonghornPVCGrowthRapid
+            expr: |
+              # Branch 1: relative growth over the last hour above 20%.
+              (
+                (
+                  (
+                    longhorn_volume_actual_size_bytes
+                    - (longhorn_volume_actual_size_bytes offset 1h)
+                  )
+                  / clamp_min(longhorn_volume_actual_size_bytes offset 1h, 1)
+                )
+                * on(volume) group_left(namespace, persistentvolumeclaim) (
+                  (
+                    label_replace(kube_persistentvolumeclaim_info{storageclass="longhorn"}, "volume", "$1", "volumename", "(.+)")
+                    * on(namespace, persistentvolumeclaim) group_left()
+                    kube_persistentvolumeclaim_labels{label_flowercore_io_managed_by="remotedesktop"}
+                  )
+                  or
+                  label_replace(kube_persistentvolumeclaim_info{namespace="fc-desktop", storageclass="longhorn", persistentvolumeclaim=~"fc-profile-.*|remotedesktop-data"}, "volume", "$1", "volumename", "(.+)")
+                )
+              ) > 0.20
+              or
+              # Branch 2: used fraction of capacity above 80%.
+              (
+                (
+                  longhorn_volume_actual_size_bytes
+                  / on(volume) clamp_min(longhorn_volume_capacity_bytes, 1)
+                )
+                * on(volume) group_left(namespace, persistentvolumeclaim) (
+                  (
+                    label_replace(kube_persistentvolumeclaim_info{storageclass="longhorn"}, "volume", "$1", "volumename", "(.+)")
+                    * on(namespace, persistentvolumeclaim) group_left()
+                    kube_persistentvolumeclaim_labels{label_flowercore_io_managed_by="remotedesktop"}
+                  )
+                  or
+                  label_replace(kube_persistentvolumeclaim_info{namespace="fc-desktop", storageclass="longhorn", persistentvolumeclaim=~"fc-profile-.*|remotedesktop-data"}, "volume", "$1", "volumename", "(.+)")
+                )
+              ) > 0.80
+            for: 5m
+            labels:
+              severity: warning
+              alert_channel: thermal_print
+              service: remotedesktop
+            annotations:
+              summary: "RemoteDesktop Longhorn PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} growing rapidly or above 80% capacity"
+              description: "Longhorn volume {{ $labels.volume }} backing RemoteDesktop PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} grew more than 20% in 1h or is over 80% capacity. Check for runaway SQLite/user-profile growth; this alert was added after the 2026-05-16 RemoteDesktop web SQLite Error 13 incident."
+              runbook: "1. kubectl -n {{ $labels.namespace }} describe pvc {{ $labels.persistentvolumeclaim }} 2. Open Longhorn UI volume {{ $labels.volume }} 3. Check RemoteDesktop web/user-volume SQLite files for permission or runaway growth 4. Expand PVC only after confirming the writer is healthy"
+
       - name: pi-fleet
         rules:
           - alert: PiManagerDown
-- 
2.49.1