From e641ceab4817f28f5ac1eeeaf412fd900177cf7b Mon Sep 17 00:00:00 2001 From: Andrew Stoltz Date: Tue, 19 May 2026 10:22:25 -0500 Subject: [PATCH] =?UTF-8?q?monitoring(irc-notify):=20criticals=20also=20ba?= =?UTF-8?q?tch=20hourly=20=E2=80=94=20fix=20per-fire=20spam?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first batching pass (bacac06) left critical-severity alerts on the immediate-print path. That's still per-event spam for any persistent critical (e.g. PrintPaperRollCritical fires on every 30s Grafana evaluation cycle when paper is <5%). Caught immediately after deploy: CUPS queue grew 0 → 8 jobs in 8 minutes from a single firing PrintPaperRollCritical. This commit aligns with the operator's verbatim ask ("one alert an hour"): - Critical-severity alerts now go into the digest buffer, NOT the immediate-print path. The digest payload already shows severity tags per alertname, so the operator still sees "[critical] X" in the printout. - The explicit `alert_channel=thermal_print_immediate` label still bypasses batching, but only on NEW fingerprint arrival — it triggers a flush of the CURRENT digest (with the new alert included), then clears. Repeat webhooks for the same fingerprint dedupe in the buffer until the next hourly tick OR until the alert resolves. No fingerprint can spam. - `add_to_digest` now returns bool (True = buffer grew, False = dedup / resolution / disabled) so the immediate-label path can flush only on state transitions. Net effect: max 1 thermal print per BATCH_INTERVAL_MIN per alert fingerprint, regardless of severity. Rules that genuinely need same-second paper opt in via `alert_channel=thermal_print_immediate` (currently zero rules use this). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/monitoring/noc-monitoring.yaml | 40 +++++++++++++++++------------ 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index 447f612..8ab91e1 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -1392,7 +1392,10 @@ data: return alert.get("labels", {}).get("alert_channel") == "thermal_print" def add_to_digest(alert): - if not THERMAL_PRINT_ENABLED: return + """Add an alert to the digest buffer. Returns True if the buffer GREW + (new fingerprint), False if it was a dedup, resolution, or no-op. + """ + if not THERMAL_PRINT_ENABLED: return False fp = fingerprint_of(alert) status = alert.get("status", "firing").lower() with _buffer_lock: @@ -1400,14 +1403,15 @@ data: if fp in _buffer: del _buffer[fp] _stats["buffer_resolved"] += 1 - return + return False if fp in _buffer: _buffer[fp]["last_seen"] = time.time() _buffer[fp]["alert"] = alert _stats["buffer_dedup"] += 1 - return + return False _buffer[fp] = {"alert": alert, "first_seen": time.time(), "last_seen": time.time()} _stats["buffer_added"] += 1 + return True def build_digest_payload(): with _buffer_lock: @@ -1495,23 +1499,27 @@ data: msg = f"{icon}{sev_tag} {name}: {summary}" if desc: msg += f"\n {desc}" send_irc(msg) - # Thermal routing + # Thermal routing — EVERYTHING (including criticals) goes into + # the hourly digest. Only the explicit `alert_channel=thermal_print_immediate` + # label bypasses, and even that flushes-the-current-digest rather + # than printing a standalone job, so the same fingerprint can't + # spam the printer per webhook cycle. 
if status == "RESOLVED": add_to_digest(alert) # removes from buffer continue - if is_critical(alert) or is_immediate_label(alert): - runbook = alert.get("annotations", {}).get("runbook", "") - parts = [summary] - if desc and desc != summary: parts.append(desc) - if runbook: parts.append("STEPS: " + runbook) - pl = {"title": name, "severity": (severity or "warning").capitalize(), - "host": labels.get("instance", labels.get("pod", labels.get("namespace", "unknown"))), - "message": " | ".join(parts), "eventId": alert.get("fingerprint", ""), - "source": "Grafana (immediate)", "status": "PROBLEM", "acknowledged": False} - post_thermal(pl, "immediate") - elif is_batched_label(alert): + if is_immediate_label(alert): + # Explicit opt-in for "paper this NOW" — first arrival of a + # new fingerprint triggers an immediate digest flush; repeat + # webhooks for the same fingerprint dedupe in the buffer + # until the next interval or until the alert resolves. + new_in_buffer = add_to_digest(alert) + if new_in_buffer: + global _last_flush_time + flush_digest() + _last_flush_time = time.time() + elif is_critical(alert) or is_batched_label(alert): add_to_digest(alert) - # else: IRC-only + # else: IRC-only (warnings without thermal_print label) self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers() self.wfile.write(b'{"status":"ok"}')