diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml
index 606776a..ea1b175 100644
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -966,6 +966,54 @@ data:
             annotations:
               summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
 
+      # Puppet agent + service alerts.
+      # Mirror of FlowerCore.Notes/scripts/monitoring/alerts.yml `puppet` group
+      # so a future migration to in-cluster Prometheus inherits the ruleset.
+      # Source-of-truth for the live Podman Prometheus on noc1 is the Notes file.
+      # See feedback_monitoring_k8s_target_vs_live_podman.
+      - name: puppet
+        rules:
+          # Warning tier: last-run age above 7200s (2h), held for 30m before firing.
+          - alert: PuppetAgentReportStale
+            expr: puppet_last_run_age_seconds > 7200
+            for: 30m
+            labels:
+              severity: warning
+              alert_channel: irc
+            annotations:
+              summary: "Puppet agent {{ $labels.instance }} hasn't reported in over 2h"
+              description: "Last run age: {{ $value | humanizeDuration }}. The puppet agent on {{ $labels.instance }} may be stopped, the node may be powered off, or noc1 may be unreachable from this node."
+              runbook: "1. SSH to node (via noc1 jumpbox if needed) 2. sudo systemctl status puppet 3. sudo puppet agent -t --noop to force a run 4. Check r10k: ssh fcadmin@10.0.56.10 'sudo podman logs openvoxserver --tail 50' 5. Verify noc1 reachability: ping puppet.iamworkin.lan"
+
+          # Critical tier: last-run age above 86400s (24h), held for 1h before firing.
+          - alert: PuppetAgentReportCritical
+            expr: puppet_last_run_age_seconds > 86400
+            for: 1h
+            labels:
+              severity: critical
+              alert_channel: irc
+            annotations:
+              summary: "Puppet agent {{ $labels.instance }} silent for over 24h — node is unmanaged"
+              description: "Last run age: {{ $value | humanizeDuration }}. Node {{ $labels.instance }} has not submitted a Puppet report in over 24 hours. Config drift is accumulating — investigate immediately. If intentional (maintenance), add to the exclusion filter or silence in Grafana."
+              runbook: "URGENT: 1. Check node power state 2. SSH via noc1 jumpbox: ssh fcadmin@10.0.56.10 then ssh to the affected node 3. sudo systemctl status puppet 4. sudo systemctl start puppet + sudo puppet agent -t 5. Check for network partitions (VLAN connectivity to 10.0.56.10) 6. If node was recently reimaged: sudo puppet agent -t to re-register with new SSL cert"
+
+          # Sprint 33 Cx-7 Phase B (2026-05-25 postmortem follow-up):
+          # Detects puppet.service in failed state — distinct from PuppetAgentReportStale
+          # which catches "agent hasn't run." This catches "systemd gave up restarting it"
+          # (CA-verify loop or other fatal exit). Requires node-exporter systemd collector
+          # enabled with --collector.systemd. If `node_systemd_unit_state` has no series
+          # for a node, the collector is disabled there — flag in postmortem follow-up.
+          - alert: PuppetServiceFailed
+            expr: node_systemd_unit_state{name="puppet.service",state="failed"} == 1
+            for: 5m
+            labels:
+              severity: warning
+              alert_channel: irc
+            annotations:
+              summary: "Puppet service failed on {{ $labels.instance }}"
+              description: "puppet.service on {{ $labels.instance }} has been in failed state for 5+ minutes. systemd has stopped auto-restarting (CA-verify-loop or other exit). Manual `systemctl status puppet` confirms. Run `sudo systemctl start puppet` to recover; investigate journal for root cause."
+              runbook_url: "https://github.com/astoltz/FlowerCore.Notes/blob/master/memory/feedback_puppet_service_dead_after_ca_loop_alert_misreads.md"
+
       # K8s pod-state alerts. Require kube-state-metrics scrape (added
       # 2026-04-26 — see scrape_configs above). Would have surfaced the
       # agent-zero ollama-proxy 172x crash-loop instead of letting it