diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml
index 1671d0d..972d2cf 100644
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -278,6 +278,35 @@ data:
           - target_label: __address__
             replacement: blackbox-exporter.monitoring.svc:9115
 
+      # FlowerCore.RemoteDesktop web health: blackbox http_2xx probe of /health (public cluster VIP)
+      - job_name: "probe-remotedesktop"
+        metrics_path: /probe
+        params:
+          module: [http_2xx]
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["https://desktop.iamworkin.lan/health"]
+            labels:
+              instance: "https://desktop.iamworkin.lan/health"  # pinned here; alert selectors on instance must use this exact URL
+              service: "remotedesktop-web"
+        relabel_configs:
+          - source_labels: [__address__]  # hand the target URL to the exporter as the ?target= parameter
+            target_label: __param_target
+          - target_label: __address__  # the scrape itself goes to the blackbox exporter, not the target
+            replacement: blackbox-exporter.monitoring.svc:9115
+
+      # FlowerCore.RemoteDesktop /metrics: direct HTTPS scrape of the app's own counters (no blackbox hop)
+      - job_name: "fc-remotedesktop"
+        metrics_path: /metrics
+        scheme: https
+        scrape_interval: 30s
+        tls_config:
+          insecure_skip_verify: true  # NOTE(review): disables certificate verification for this scrape — confirm intentional; a ca_file for the issuing CA would be safer
+        static_configs:
+          - targets: ["desktop.iamworkin.lan"]  # no explicit port; the scheme above decides the default
+            labels:
+              service: "remotedesktop-web"
+
       # CUPS web UI health (port 631)
       - job_name: "probe-cups"
         metrics_path: /probe
@@ -540,6 +569,71 @@ data:
               summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
               description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
 
+      - name: remote-desktop
+        rules:
+          - alert: RemoteDesktopWebDown
+            expr: probe_success{job="probe-remotedesktop",instance="https://desktop.iamworkin.lan/health"} == 0  # selector matches the instance label set on the probe-remotedesktop job
+            for: 3m
+            labels:
+              severity: warning
+            annotations:
+              summary: "FlowerCore RemoteDesktop web is down"
+              description: "https://desktop.iamworkin.lan/health probe has failed for 3 minutes. Catalog + session launch surface offline."
+
+          - alert: RemoteDesktopMetricsStale
+            expr: absent(fc_desktop_session_events_total)  # true only when no series of this name exists, from any job
+            for: 10m  # NOTE(review): would also trip if the app exports no series before its first session event — confirm
+            labels:
+              severity: warning
+            annotations:
+              summary: "RemoteDesktop /metrics scrape returning no data"
+              description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape target is misconfigured or the web deployment stopped exporting metrics. Zabbix template carries the same 10m no-data trigger for cross-monitor parity."
+
+          - alert: RemoteDesktopPoolDepleted
+            expr: fc_desktop_pool_depleted > 0  # presumably a per-pool gauge labelled pool/template (the summary reads both) — verify
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "RemoteDesktop pool {{ $labels.pool }} depleted ({{ $labels.template }})"
+              description: "Pool {{ $labels.pool }} has been depleted for 5 minutes. New launches will cold-start. Operator should check for pod-scheduling failures, image pull issues, or exhausted node capacity before warm-pool parity is expected back."
+
+          - alert: RemoteDesktopPoolDeficitSustained
+            expr: fc_desktop_pool_deficit > 0  # the description's $value is this series' current sample
+            for: 10m
+            labels:
+              severity: info
+            annotations:
+              summary: "RemoteDesktop pool {{ $labels.pool }} below desired for 10m"
+              description: "Pool {{ $labels.pool }} has a persistent deficit of {{ $value }} warm pods. The operator is reconciling but can't reach desired size — likely an image pull, NFS affinity, or claim-init issue."
+
+          - alert: RemoteDesktopSessionChurnSpike
+            expr: sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20  # per-second rate x 60 = launches/min, summed across all series
+            for: 5m
+            labels:
+              severity: info
+            annotations:
+              summary: "RemoteDesktop launch rate high ({{ $value | printf \"%.0f\" }}/min)"
+              description: "Launch events exceed 20/min for 5 minutes. Could be a user-facing feature launch, a pooled template thrashing, or a runaway automation loop."
+
+          - alert: RemoteDesktopRecordingEventsDropped
+            expr: absent_over_time(fc_desktop_session_events_total{event="recording"}[30m]) and on() (sum(increase(fc_desktop_session_events_total{event="launch"}[30m])) > 0)  # launch gate measures the same 30m window, not the lifetime counter
+            for: 15m
+            labels:
+              severity: info
+            annotations:
+              summary: "RemoteDesktop recording events silent for 30m despite active launches"
+              description: "No recording events in 30 minutes while launches are happening. Recording may be silently disabled on all templates (SessionRecordingEnabled=false), the guacd NFS mount may be unhealthy, or the retention sweep isn't emitting events. Not an error by itself — worth checking."
+
+          - alert: RemoteDesktopTlsExpiry
+            expr: probe_ssl_earliest_cert_expiry{job="probe-remotedesktop",instance="https://desktop.iamworkin.lan/health"} - time() < 2 * 86400  # instance must equal the probe job's label (the /health URL) or the selector matches nothing
+            for: 6h
+            labels:
+              severity: critical
+            annotations:
+              summary: "desktop.iamworkin.lan TLS cert expires within 2 days"
+              description: "The desktop.iamworkin.lan cert is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, and pfSense DNS overrides per the ACME DNS-01 gate."
+
       - name: pi-fleet
         rules:
           - alert: PiManagerDown
@@ -3122,6 +3216,9 @@ spec:
             - name: dashboards-infra-overview
               mountPath: /var/lib/grafana/dashboards/infra-overview
               readOnly: true
+            - name: dashboards-remotedesktop  # must match the volume name declared in the pod's volumes list
+              mountPath: /var/lib/grafana/dashboards/remotedesktop
+              readOnly: true
             - name: datasource-provisioning
               mountPath: /etc/grafana/provisioning/datasources
               readOnly: true
@@ -3172,6 +3269,9 @@ spec:
         - name: dashboards-infra-overview
           configMap:
             name: grafana-dashboard-infra-overview
+        - name: dashboards-remotedesktop
+          configMap:
+            name: grafana-dashboard-remotedesktop  # NOTE(review): this ConfigMap is not defined in the visible hunks — confirm it exists
         - name: datasource-provisioning
           configMap:
             name: grafana-datasource-provisioning