feat(monitoring): RemoteDesktop alerts + scrape jobs + dashboard mount

Three additions to the monitoring ConfigMap, each targeting
FlowerCore.RemoteDesktop:

- **Scrape jobs** (2 new):
  - probe-remotedesktop: blackbox http_2xx against
    https://desktop.iamworkin.lan/health every 30s. Feeds the
    RemoteDesktopWebDown alert.
  - fc-remotedesktop: direct /metrics scrape against
    desktop.iamworkin.lan for the fc_desktop_session_events_total
    and fc_desktop_pool_* series.

- **Alert group `remote-desktop`** (7 rules in alerts.yml):
  - RemoteDesktopWebDown (3m) — /health probe failing
  - RemoteDesktopMetricsStale (10m) — absent metrics series
  - RemoteDesktopPoolDepleted (5m) — pool deficit + depleted flag
  - RemoteDesktopPoolDeficitSustained (10m, info) — persistent
    below-desired pool size
  - RemoteDesktopSessionChurnSpike (5m, info) — launch rate
    >20/min
  - RemoteDesktopRecordingEventsDropped (15m, info) — 30m without
    recording events while launches active
  - RemoteDesktopTlsExpiry (6h, critical) — <2d cert renewal
    window; aligns with feedback_acme_expiry_alert_threshold

- **Grafana dashboard mount**: new volumeMounts + volumes entry for
  `dashboards-remotedesktop` backed by the grafana-dashboard-remotedesktop
  ConfigMap (previously added as a standalone file in d4210c8).
  Folder path /var/lib/grafana/dashboards/remotedesktop — picked up
  by the file provider with foldersFromFilesStructure: true, so the
  dashboard shows up in a "remotedesktop" folder in Grafana (the
  folder takes the directory's name as-is).

No CRLF churn; pure 100-line insertion into LF-normalized file.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Andrew Stoltz
2026-04-24 00:41:35 -05:00
parent 297a2a9bbc
commit e44e9a0062

View File

@@ -278,6 +278,35 @@ data:
- target_label: __address__
replacement: blackbox-exporter.monitoring.svc:9115
# FlowerCore.RemoteDesktop web health (public cluster VIP).
# Blackbox http_2xx probe of the /health URL, scraped every 30s.
- job_name: 'probe-remotedesktop'
  scrape_interval: 30s
  metrics_path: /probe
  params:
    module:
      - http_2xx
  static_configs:
    - targets:
        - 'https://desktop.iamworkin.lan/health'
      labels:
        service: 'remotedesktop-web'
        # Set explicitly so alert selectors can match on the probed URL.
        instance: 'https://desktop.iamworkin.lan/health'
  relabel_configs:
    # Hand the listed target to the exporter as its `target` URL parameter...
    - source_labels:
        - __address__
      target_label: __param_target
    # ...then send the scrape itself to the blackbox exporter.
    - target_label: __address__
      replacement: 'blackbox-exporter.monitoring.svc:9115'
# FlowerCore.RemoteDesktop /metrics (direct scrape for counters).
# Scraped over HTTPS every 30s straight from the service hostname.
- job_name: 'fc-remotedesktop'
  scheme: https
  metrics_path: /metrics
  scrape_interval: 30s
  static_configs:
    - targets:
        - 'desktop.iamworkin.lan'
      labels:
        service: 'remotedesktop-web'
  tls_config:
    # NOTE(review): server certificate verification is switched off for this
    # scrape. If the issuing CA can be mounted into Prometheus, a ca_file
    # would be the safer setting - confirm before changing.
    insecure_skip_verify: true
# CUPS web UI health (port 631)
- job_name: "probe-cups"
metrics_path: /probe
@@ -540,6 +569,71 @@ data:
summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
- name: remote-desktop
rules:
# Web health: the blackbox probe of the /health URL has reported
# probe_success == 0 continuously for 3 minutes.
- alert: RemoteDesktopWebDown
  expr: 'probe_success{job="probe-remotedesktop",instance="https://desktop.iamworkin.lan/health"} == 0'
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: 'FlowerCore RemoteDesktop web is down'
    description: >-
      https://desktop.iamworkin.lan/health probe has failed for 3 minutes.
      Catalog + session launch surface offline.
# Metrics freshness: no fc_desktop_session_events_total series has been
# present at all for 10 minutes.
# NOTE(review): the selector is not scoped to a job, so the same metric
# name arriving from any other target would keep this quiet - confirm.
- alert: RemoteDesktopMetricsStale
  expr: 'absent(fc_desktop_session_events_total)'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: 'RemoteDesktop /metrics scrape returning no data'
    description: >-
      No fc_desktop_session_events_total series for 10 minutes.
      Either the Prometheus scrape target is misconfigured or the web
      deployment stopped exporting metrics.
      Zabbix template carries the same 10m no-data trigger for
      cross-monitor parity.
# Pool exhaustion: fc_desktop_pool_depleted has been above zero for
# 5 minutes. The annotations read the series' `pool` and `template` labels.
- alert: RemoteDesktopPoolDepleted
  expr: 'fc_desktop_pool_depleted > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'RemoteDesktop pool {{ $labels.pool }} depleted ({{ $labels.template }})'
    description: >-
      Pool {{ $labels.pool }} has been depleted for 5 minutes.
      New launches will cold-start.
      Operator should check for pod-scheduling failures, image pull issues,
      or exhausted node capacity before warm-pool parity is expected back.
# Informational: fc_desktop_pool_deficit has stayed above zero for
# 10 minutes. The description reports the current deficit via $value.
- alert: RemoteDesktopPoolDeficitSustained
  expr: 'fc_desktop_pool_deficit > 0'
  for: 10m
  labels:
    severity: info
  annotations:
    summary: 'RemoteDesktop pool {{ $labels.pool }} below desired for 10m'
    description: >-
      Pool {{ $labels.pool }} has a persistent deficit of {{ $value }} warm pods.
      The operator is reconciling but can't reach desired size — likely an
      image pull, NFS affinity, or claim-init issue.
# Informational: the summed per-second launch rate over a 5m window,
# scaled by 60 to launches per minute, has exceeded 20 for 5 minutes.
- alert: RemoteDesktopSessionChurnSpike
  expr: 'sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60 > 20'
  for: 5m
  labels:
    severity: info
  annotations:
    summary: 'RemoteDesktop launch rate high ({{ $value | printf "%.0f" }}/min)'
    description: >-
      Launch events exceed 20/min for 5 minutes.
      Could be a user-facing feature launch, a pooled template thrashing,
      or a runaway automation loop.
# Informational: launch events were counted during the last 30 minutes but
# no recording events were, and that has held for 15 minutes.
#
# The series are counters, so recent activity has to be measured with
# increase(): an already-exported counter keeps being scraped whether or not
# it moves, which means a presence test (absent_over_time) goes quiet after
# the first recording event ever, and a "lifetime total > 0" test on launches
# stays true forever after the first launch.
# `unless` keeps the left-hand side both when the recording counter did not
# move and when the recording series does not exist at all.
- alert: RemoteDesktopRecordingEventsDropped
  expr: >-
    (sum(increase(fc_desktop_session_events_total{event="launch"}[30m])) > 0)
    unless
    (sum(increase(fc_desktop_session_events_total{event="recording"}[30m])) > 0)
  for: 15m
  labels:
    severity: info
  annotations:
    summary: "RemoteDesktop recording events silent for 30m despite active launches"
    description: "No recording events in 30 minutes while launches are happening. Recording may be silently disabled on all templates (SessionRecordingEnabled=false), the guacd NFS mount may be unhealthy, or the retention sweep isn't emitting events. Not an error by itself — worth checking."
# Critical: the certificate seen by the RemoteDesktop blackbox probe expires
# in under 2 days (2 * 86400 s) and has stayed inside that window for 6h.
#
# The series is selected by probe job rather than by a bare-host instance
# label: the probe-remotedesktop target is labelled
# instance="https://desktop.iamworkin.lan/health", so a matcher on
# instance="https://desktop.iamworkin.lan" selects nothing from that probe
# and the alert could never fire from it.
- alert: RemoteDesktopTlsExpiry
  expr: probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time() < 2 * 86400
  for: 6h
  labels:
    severity: critical
  annotations:
    summary: "desktop.iamworkin.lan TLS cert expires within 2 days"
    description: "The desktop.iamworkin.lan cert is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, and pfSense DNS overrides per the ACME DNS-01 gate."
- name: pi-fleet
rules:
- alert: PiManagerDown
@@ -3122,6 +3216,9 @@ spec:
# Dashboard sets: each one is mounted read-only into its own directory
# under /var/lib/grafana/dashboards.
- name: dashboards-infra-overview
  readOnly: true
  mountPath: /var/lib/grafana/dashboards/infra-overview
- name: dashboards-remotedesktop
  readOnly: true
  mountPath: /var/lib/grafana/dashboards/remotedesktop
# Datasource provisioning files, also read-only.
- name: datasource-provisioning
  readOnly: true
  mountPath: /etc/grafana/provisioning/datasources
@@ -3172,6 +3269,9 @@ spec:
# ConfigMap-backed volumes; each volume name is paired with the
# ConfigMap that supplies its contents.
- name: dashboards-infra-overview
  configMap:
    name: 'grafana-dashboard-infra-overview'
- name: dashboards-remotedesktop
  configMap:
    name: 'grafana-dashboard-remotedesktop'
- name: datasource-provisioning
  configMap:
    name: 'grafana-datasource-provisioning'