diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index 447f612..8ab91e1 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -1392,7 +1392,10 @@ data: return alert.get("labels", {}).get("alert_channel") == "thermal_print" def add_to_digest(alert): - if not THERMAL_PRINT_ENABLED: return + """Add an alert to the digest buffer. Returns True if the buffer GREW + (new fingerprint), False if it was a dedup, resolution, or no-op. + """ + if not THERMAL_PRINT_ENABLED: return False fp = fingerprint_of(alert) status = alert.get("status", "firing").lower() with _buffer_lock: @@ -1400,14 +1403,15 @@ data: if fp in _buffer: del _buffer[fp] _stats["buffer_resolved"] += 1 - return + return False if fp in _buffer: _buffer[fp]["last_seen"] = time.time() _buffer[fp]["alert"] = alert _stats["buffer_dedup"] += 1 - return + return False _buffer[fp] = {"alert": alert, "first_seen": time.time(), "last_seen": time.time()} _stats["buffer_added"] += 1 + return True def build_digest_payload(): with _buffer_lock: @@ -1495,23 +1499,27 @@ data: msg = f"{icon}{sev_tag} {name}: {summary}" if desc: msg += f"\n {desc}" send_irc(msg) - # Thermal routing + # Thermal routing — EVERYTHING (including criticals) goes into + # the hourly digest. Only the explicit `alert_channel=thermal_print_immediate` + # label bypasses, and even that flushes-the-current-digest rather + # than printing a standalone job, so the same fingerprint can't + # spam the printer per webhook cycle. if status == "RESOLVED": add_to_digest(alert) # removes from buffer continue - if is_critical(alert) or is_immediate_label(alert): - runbook = alert.get("annotations", {}).get("runbook", "") - parts = [summary] - if desc and desc != summary: parts.append(desc) - if runbook: parts.append("STEPS: " + runbook) - pl = {"title": name, "severity": (severity or "warning").capitalize(), - "host": labels.get("instance", labels.get("pod", labels.get("namespace", "unknown"))), - "message": " | ".join(parts), "eventId": alert.get("fingerprint", ""), - "source": "Grafana (immediate)", "status": "PROBLEM", "acknowledged": False} - post_thermal(pl, "immediate") - elif is_batched_label(alert): + if is_immediate_label(alert): + # Explicit opt-in for "paper this NOW" — first arrival of a + # new fingerprint triggers an immediate digest flush; repeat + # webhooks for the same fingerprint dedupe in the buffer + # until the next interval or until the alert resolves. + new_in_buffer = add_to_digest(alert) + if new_in_buffer: + global _last_flush_time + flush_digest() + _last_flush_time = time.time() + elif is_critical(alert) or is_batched_label(alert): add_to_digest(alert) - # else: IRC-only + # else: IRC-only (warnings without thermal_print label) self.send_response(200); self.send_header("Content-Type", "application/json"); self.end_headers() self.wfile.write(b'{"status":"ok"}')