diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml new file mode 100644 index 0000000..335f583 --- /dev/null +++ b/apps/monitoring/noc-monitoring.yaml @@ -0,0 +1,3788 @@ +# ============================================================================= +# NOC Monitoring Stack — K8s Migration Target +# ============================================================================= +# Migrates the noc1 Podman monitoring pod to RKE2 K8s. +# Source: noc1 (10.0.56.10) /opt/monitoring/ +# +# Components: +# - Prometheus (metrics, alerting) +# - Grafana (dashboards) +# - Blackbox Exporter (HTTP probes) +# - SNMP Exporter (network device metrics) +# - Node Exporter (host metrics, DaemonSet) +# - IRC Notify (alert relay to UnrealIRCd) +# +# Note: SNMP exporter config (snmp.yml) is ~2MB, exceeding the 1MB ConfigMap +# limit. It is stored in a separate file (snmp-config.yaml) and must be +# applied as a standalone ConfigMap or mounted via an init container that +# downloads it from Gitea. 
+# =============================================================================
+
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: monitoring
+
+# =============================================================================
+# ConfigMap: Prometheus Configuration
+# =============================================================================
+# Three keys, each mounted by the Prometheus Deployment under /etc/prometheus:
+#   prometheus.yml       scrape jobs (static hosts, SNMP and blackbox probes)
+#   recording-rules.yml  pre-computed aggregations
+#   alerts.yml           alerting rules
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-config
+  namespace: monitoring
+data:
+  prometheus.yml: |
+    global:
+      scrape_interval: 30s
+      evaluation_interval: 30s
+
+    rule_files:
+      - /etc/prometheus/alerts.yml
+      - /etc/prometheus/recording-rules.yml
+
+    scrape_configs:
+      # noc1 host metrics (external to cluster)
+      - job_name: "node-exporter"
+        static_configs:
+          - targets: ["10.0.56.10:9100"]
+            labels:
+              instance: "noc1"
+              vlan: "mgmt"
+
+      # RKE2 bare-metal cluster (openSUSE Leap 16 on NUCs)
+      - job_name: "rke2-nodes"
+        scrape_timeout: 15s
+        static_configs:
+          - targets: ["10.0.56.11:9100"]
+            labels:
+              instance: "rke2-server"
+              vlan: "mgmt"
+              cluster: "rke2"
+              role: "server"
+          - targets: ["10.0.56.12:9100"]
+            labels:
+              instance: "rke2-agent1"
+              vlan: "mgmt"
+              cluster: "rke2"
+              role: "agent"
+          - targets: ["10.0.56.13:9100"]
+            labels:
+              instance: "rke2-agent2"
+              vlan: "mgmt"
+              cluster: "rke2"
+              role: "agent"
+
+      # In-cluster node-exporter DaemonSet
+      - job_name: "k8s-node-exporter"
+        kubernetes_sd_configs:
+          - role: endpoints
+            namespaces:
+              names: ["monitoring"]
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_endpoints_name]
+            action: keep
+            regex: node-exporter
+          - source_labels: [__meta_kubernetes_endpoint_node_name]
+            target_label: instance
+
+      # pfSense SNMP via snmp-exporter
+      - job_name: "snmp-pfsense"
+        static_configs:
+          - targets: ["10.0.56.1"]
+        metrics_path: /snmp
+        params:
+          module: [if_mib]
+          auth: [bluejay_v2]
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: snmp-exporter.monitoring.svc:9116
+
+      # UniFi Cloud Key SNMP
+      - job_name: "snmp-cloudkey"
+        static_configs:
+          - targets: ["10.0.56.3"]
+        metrics_path: /snmp
+        params:
+          module: [if_mib]
+          auth: [bluejay_v2]
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: snmp-exporter.monitoring.svc:9116
+
+      # UniFi Switch SNMP
+      - job_name: "snmp-switch"
+        static_configs:
+          - targets: ["10.0.56.2"]
+        metrics_path: /snmp
+        params:
+          module: [if_mib]
+          auth: [bluejay_v2]
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: snmp-exporter.monitoring.svc:9116
+
+      # Synology NAS SNMP
+      - job_name: "snmp-nas"
+        static_configs:
+          - targets: ["10.0.58.3"]
+        metrics_path: /snmp
+        params:
+          module: [synology]
+          auth: [public_v2]
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: snmp-exporter.monitoring.svc:9116
+
+      # Prometheus self-monitoring
+      - job_name: "prometheus"
+        static_configs:
+          - targets: ["localhost:9090"]
+
+      # Edge nodes (PROD VLAN)
+      - job_name: "edge-nodes"
+        static_configs:
+          - targets: ["10.0.57.17:9100"]
+            labels:
+              instance: "edge1"
+              vlan: "prod"
+              arch: "arm64"
+              role: "ai-inference"
+              puppet_managed: "true"
+              puppet_server: "puppet.iamworkin.lan"
+          - targets: ["10.0.57.16:9100"]
+            labels:
+              instance: "edge2"
+              vlan: "prod"
+              arch: "arm64"
+              role: "ci-runner"
+              puppet_managed: "true"
+              puppet_server: "puppet.iamworkin.lan"
+          - targets: ["10.0.58.25:9100"]
+            labels:
+              instance: "piez"
+              vlan: "home"
+              arch: "arm64"
+              role: "prototyping"
+          - targets: ["10.0.58.113:9100"]
+            labels:
+              instance: "pirelay"
+              vlan: "home"
+              arch: "arm64"
+              role: "relay-controller"
+
+      # =======================================================================
+      # PiManager Application Metrics (relay states, temps, automation)
+      # =======================================================================
+
+      - job_name: "pimanager-app"
+        scrape_interval: 15s
+        metrics_path: /metrics
+        static_configs:
+          - targets: ["10.0.58.25:5000"]
+            labels:
+              instance: "piez"
+              service: "pimanager"
+              vlan: "home"
+              device: "pi4-ezconnect"
+          - targets: ["10.0.58.113:5100"]
+            labels:
+              instance: "pirelay"
+              service: "pimanager"
+              vlan: "home"
+              device: "pi3-ks0212"
+
+      # Epson ET-3750 EcoTank Printer SNMP
+      # NOTE(review): here and in every probe-* job below, the relabel step
+      # `__param_target -> instance` runs after the static labels are attached,
+      # so the static `instance:` value is replaced by the target address/URL.
+      # Drop either the static label or that relabel step, whichever is wanted.
+      - job_name: "snmp-printer"
+        scrape_interval: 5m
+        scrape_timeout: 30s
+        static_configs:
+          - targets: ["10.0.58.107"]
+            labels:
+              instance: "epson-ecotank"
+              vlan: "home"
+              device_type: "printer"
+        metrics_path: /snmp
+        params:
+          module: [printer_mib]
+          auth: [public_v2]
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: snmp-exporter.monitoring.svc:9116
+
+      # =============================================================================
+      # Print Services (CUPS + Print.Web on edge2)
+      # =============================================================================
+
+      # CUPS Prometheus exporter (cups_exporter on edge2:9628)
+      - job_name: "cups"
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["10.0.57.16:9628"]
+            labels:
+              instance: "edge2"
+              service: "cups"
+              device_type: "printer"
+              printer_model: "NuPrint 210"
+
+      # Print.Web OTEL metrics (counters: jobs enqueued/completed/failed, bytes, duration histograms)
+      - job_name: "printweb-otel"
+        scrape_interval: 30s
+        metrics_path: /metrics/prometheus
+        static_configs:
+          - targets: ["10.0.57.16:5200"]
+            labels:
+              instance: "print-web"
+              service: "print-web"
+              device_type: "printer"
+              printer_model: "NuPrint 210"
+
+      # Print.Web health (Blazor app on edge2:5200)
+      - job_name: "probe-printweb"
+        metrics_path: /probe
+        params:
+          module: [http_2xx]
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["http://10.0.57.16:5200/"]
+            labels:
+              instance: "print-web"
+              service: "print-web"
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: blackbox-exporter.monitoring.svc:9115
+
+      # CUPS web UI health (port 631)
+      - job_name: "probe-cups"
+        metrics_path: /probe
+        params:
+          module: [http_2xx]
+        scrape_interval: 60s
+        static_configs:
+          - targets: ["http://10.0.57.16:631/"]
+            labels:
+              instance: "cups-edge2"
+              service: "cups"
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: blackbox-exporter.monitoring.svc:9115
+
+      # =============================================================================
+      # AI Stack Health Probes (Blackbox Exporter)
+      # =============================================================================
+
+      # Ollama API — workstation (LOCAL Agent Zero)
+      - job_name: "probe-ollama-local"
+        metrics_path: /probe
+        params:
+          module: [http_ollama]
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["http://10.0.58.100:11434/api/tags"]
+            labels:
+              instance: "ollama-local"
+              service: "ollama"
+              deployment: "local"
+              gpu: "r9700"
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: blackbox-exporter.monitoring.svc:9115
+
+      # Ollama API — edge1 Pi 5 (NUC Agent Zero)
+      - job_name: "probe-ollama-edge1"
+        metrics_path: /probe
+        params:
+          module: [http_ollama]
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["http://10.0.57.17:11434/api/tags"]
+            labels:
+              instance: "ollama-edge1"
+              service: "ollama"
+              deployment: "nuc"
+              gpu: "cpu"
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: blackbox-exporter.monitoring.svc:9115
+
+      # Agent Zero Web UI — local (K3s)
+      - job_name: "probe-agentzero-local"
+        metrics_path: /probe
+        params:
+          module: [http_2xx]
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["http://10.0.58.100:30050/"]
+            labels:
+              instance: "agent-zero-local"
+              service: "agent-zero"
+              deployment: "local"
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: blackbox-exporter.monitoring.svc:9115
+
+      # Agent Zero Web UI — NUC (RKE2 via Traefik)
+      - job_name: "probe-agentzero-nuc"
+        metrics_path: /probe
+        params:
+          module: [http_2xx]
+        scrape_interval: 30s
+        static_configs:
+          - targets: ["http://agent-zero.agent-zero.svc.cluster.local/"]
+            labels:
+              instance: "agent-zero-nuc"
+              service: "agent-zero"
+              deployment: "nuc"
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: __param_target
+          - source_labels: [__param_target]
+            target_label: instance
+          - target_label: __address__
+            replacement: blackbox-exporter.monitoring.svc:9115
+
+      # =============================================================================
+      # Self-monitoring (K8s monitoring namespace)
+      # =============================================================================
+
+      - job_name: "monitoring-grafana"
+        metrics_path: /metrics
+        static_configs:
+          - targets: ["grafana.monitoring.svc:3000"]
+            labels:
+              instance: "grafana-k8s"
+              service: "grafana"
+
+      - job_name: "monitoring-blackbox"
+        static_configs:
+          - targets: ["blackbox-exporter.monitoring.svc:9115"]
+            labels:
+              instance: "blackbox-k8s"
+              service: "blackbox"
+
+  recording-rules.yml: |
+    groups:
+      - name: node-aggregations
+        interval: 30s
+        rules:
+          - record: instance:node_cpu_usage:avg5m
+            expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
+          - record: instance:node_memory_usage:percent
+            expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
+          - record: instance:node_disk_usage:percent
+            expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100
+          - record: instance:node_network_receive:rate5m
+            expr: rate(node_network_receive_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8
+          - record: instance:node_network_transmit:rate5m
+            expr: rate(node_network_transmit_bytes_total{device!~"lo|veth.*|cali.*|flannel.*"}[5m]) * 8
+      - name: probe-aggregations
+        interval: 30s
+        rules:
+          - record: service:probe_success:min
+            expr: min by(service) (probe_success)
+          - record: service:probe_duration:avg
+            expr: avg by(service) (probe_duration_seconds)
+      - name: print-rates
+        interval: 30s
+        rules:
+          - record: print:jobs_per_minute:rate5m
+            expr: rate(print_jobs_enqueued_total[5m]) * 60
+          - record: print:success_rate:ratio5m
+            expr: rate(print_jobs_completed_total[5m]) / rate(print_jobs_enqueued_total[5m])
+          - record: print:job_duration_p95:5m
+            expr: histogram_quantile(0.95, rate(print_job_duration_ms_bucket[5m]))
+      - name: relay-rates
+        interval: 15s
+        rules:
+          - record: relay:state_changes:1h
+            expr: changes(pimanager_relay_state[1h])
+          - record: epson:pages_per_day:rate24h
+            expr: increase(prtMarkerLifeCount{job="snmp-printer"}[24h])
+
+  alerts.yml: |
+    groups:
+      - name: ai-stack
+        rules:
+          - alert: OllamaDown
+            expr: probe_success{service="ollama"} == 0
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Ollama is down on {{ $labels.deployment }}"
+              description: "Ollama API at {{ $labels.instance }} has been unreachable for 2 minutes. Agent Zero FAISS memory will fail."
+
+          - alert: AgentZeroDown
+            expr: probe_success{service="agent-zero"} == 0
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Agent Zero is down on {{ $labels.deployment }}"
+              description: "Agent Zero web UI at {{ $labels.instance }} has been unreachable for 2 minutes."
+
+          - alert: OllamaSlowResponse
+            expr: probe_duration_seconds{service="ollama"} > 3
+            for: 5m
+            labels:
+              severity: info
+            annotations:
+              summary: "Ollama responding slowly on {{ $labels.deployment }}"
+              description: "Ollama API response time exceeds 3s for 5 minutes. GPU may be overloaded."
+
+      - name: print-services
+        rules:
+          - alert: CUPSExporterDown
+            expr: up{job="cups"} == 0
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "CUPS exporter unreachable on edge2"
+              description: "cups_exporter at edge2:9628 has been down for 2 minutes. CUPS monitoring offline."
+
+          - alert: CUPSWebUIDown
+            expr: probe_success{job="probe-cups"} == 0
+            for: 3m
+            labels:
+              severity: warning
+            annotations:
+              summary: "CUPS web UI down on edge2"
+              description: "CUPS port 631 unreachable for 3 minutes. Network printing unavailable."
+
+          - alert: PrintWebDown
+            expr: probe_success{job="probe-printweb"} == 0
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Print.Web is down on edge2"
+              description: "FlowerCore Print.Web at edge2:5200 unreachable. API/MCP/Blazor printing unavailable."
+
+          - alert: CUPSPrinterStopped
+            expr: cups_printer_state_total{state="stopped"} > 0
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "CUPS printer stopped on edge2"
+              description: "A CUPS printer has been in stopped state for 5 minutes. Check USB connection or paper."
+
+          - alert: CUPSJobBacklog
+            expr: cups_job_active_total > 10
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Print queue backlog on edge2 ({{ $value }} active jobs)"
+              description: "CUPS has {{ $value }} active jobs queued. Possible printer jam, USB disconnect, or paper out."
+
+          - alert: CUPSHighJobRate
+            expr: rate(cups_job_total[5m]) * 60 > 30
+            for: 5m
+            labels:
+              severity: info
+            annotations:
+              summary: "High print volume on edge2 ({{ $value | printf \"%.0f\" }} jobs/min)"
+              description: "Print rate exceeds 30 jobs/min for 5 minutes. Possible runaway print loop."
+
+      - name: pi-fleet
+        rules:
+          - alert: PiManagerDown
+            expr: up{job="pimanager-app"} == 0
+            for: 3m
+            labels:
+              severity: warning
+            annotations:
+              summary: "PiManager down on {{ $labels.instance }}"
+              description: "PiManager app on {{ $labels.instance }} ({{ $labels.device }}) unreachable for 3 minutes."
+
+          - alert: PiCpuTempHigh
+            expr: pimanager_cpu_temperature_celsius > 75
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Pi CPU temperature high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"
+
+          - alert: PiCpuTempCritical
+            expr: pimanager_cpu_temperature_celsius > 82
+            for: 2m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Pi CPU temperature CRITICAL on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}C)"
+
+          - alert: PiMemoryHigh
+            expr: pimanager_memory_usage_percent > 90
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Pi memory usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
+
+          - alert: PiDiskHigh
+            expr: pimanager_disk_usage_percent > 85
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Pi disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
+
+          # `on (instance)` is required: the aggregated left side carries only the
+          # `instance` label, while the right side still has job/service/vlan/device,
+          # so a plain `and` (which matches on all labels) would never produce a result.
+          - alert: RelayAllOff
+            expr: sum by (instance) (pimanager_relay_state) == 0 and on (instance) pimanager_relay_channel_count > 0
+            for: 0m
+            labels:
+              severity: info
+            annotations:
+              summary: "All relay channels OFF on {{ $labels.instance }}"
+
+          - alert: PiWifiWeak
+            expr: pimanager_wifi_signal_dbm < -75 and pimanager_wifi_signal_dbm != 0
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Weak WiFi signal on {{ $labels.instance }} ({{ $value }}dBm)"
+
+      - name: snmp-devices
+        rules:
+          - alert: EpsonInkLow
+            expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 15 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0
+            for: 0m
+            labels:
+              severity: warning
+              alert_channel: thermal_print
+            annotations:
+              summary: "Epson ink low: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
+
+          - alert: EpsonInkCritical
+            expr: prtMarkerSuppliesLevel{job="snmp-printer"} < 5 and prtMarkerSuppliesLevel{job="snmp-printer"} > 0
+            for: 0m
+            labels:
+              severity: critical
+              alert_channel: thermal_print
+            annotations:
+              summary: "Epson ink CRITICAL: {{ $labels.prtMarkerSuppliesDescription }} at {{ $value }}%"
+
+          - alert: EpsonPrinterDown
+            expr: up{job="snmp-printer"} == 0
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Epson ET-3750 SNMP unreachable"
+
+          - alert: SynologyDiskLow
+            expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85
+            for: 10m
+            labels:
+              severity: warning
+              alert_channel: thermal_print
+            annotations:
+              summary: "Synology NAS storage high ({{ $value | printf \"%.1f\" }}%)"
+
+          - alert: SynologyDown
+            expr: up{job="snmp-nas"} == 0
+            for: 3m
+            labels:
+              severity: critical
+              alert_channel: thermal_print
+            annotations:
+              summary: "Synology NAS SNMP unreachable"
+
+      - name: infrastructure
+        rules:
+          - alert: NodeDown
+            expr: up{job=~"node-exporter|rke2-nodes|edge-nodes|k8s-node-exporter"} == 0
+            for: 2m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Node {{ $labels.instance }} is down"
+
+          - alert: HighCPU
+            expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "High CPU on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
+
+          - alert: HighMemory
+            expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "High memory usage on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
+
+          - alert: DiskSpaceLow
+            expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 85
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Disk usage high on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
+
+# =============================================================================
+# ConfigMap: Blackbox Exporter Configuration
+# =============================================================================
+# Two HTTP probe modules referenced by the probe-* scrape jobs:
+#   http_2xx     accepts only status 200
+#   http_ollama  additionally requires the body to contain "models"
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: blackbox-config
+  namespace: monitoring
+data:
+  blackbox.yml: |
+    modules:
+      http_2xx:
+        prober: http
+        timeout: 5s
+        http:
+          valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
+          valid_status_codes: [200]
+          method: GET
+          fail_if_body_not_matches_regexp: []
+          preferred_ip_protocol: ip4
+      http_ollama:
+        prober: http
+        timeout: 5s
+        http:
+          valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
+          valid_status_codes: [200]
+          method: GET
+          fail_if_body_not_matches_regexp:
+            - '"models"'
+          preferred_ip_protocol: ip4
+
+# =============================================================================
+# ConfigMap: IRC Notify Script
+# =============================================================================
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: irc-notify-script
+  namespace: monitoring
+data:
+  notify.py: |
+    #!/usr/bin/env python3
+    """HTTP->IRC alert relay with thermal printer forwarding for Grafana webhooks.
+    Listens on :9119, posts to #alerts on UnrealIRCd via raw IRC protocol.
+    Alerts tagged alert_channel=thermal_print also POST to Print.Web /api/print/alert.
+    """
+    import json, socket, sys, time
+    from http.server import HTTPServer, BaseHTTPRequestHandler
+    from urllib.request import Request, urlopen
+    from urllib.error import URLError
+
+    IRC_HOST = "unrealircd.irc.svc.cluster.local"
+    IRC_PORT = 6667
+    IRC_NICK = "grafana-bot"
+    IRC_CHANNEL = "#alerts"
+    PRINT_WEB_URL = "http://10.0.57.16:5200/api/print/alert"
+    PRINT_ENABLED = True
+
+    def send_irc(message):
+        """Register, join IRC_CHANNEL, send each non-blank line of message, quit.
+
+        Returns True once QUIT has been sent; False if registration does not
+        complete before the deadline or any socket error occurs.
+        """
+        try:
+            # The with-block closes the socket on every exit path, including errors.
+            with socket.create_connection((IRC_HOST, IRC_PORT), timeout=15) as sock:
+                sock.sendall(f"NICK {IRC_NICK}\r\n".encode())
+                sock.sendall(f"USER {IRC_NICK} 0 * :Grafana Alert Bot\r\n".encode())
+                registered = False
+                deadline = time.time() + 10
+                buf = ""
+                while not registered and time.time() < deadline:
+                    try:
+                        data = sock.recv(4096).decode("utf-8", errors="replace")
+                    except socket.timeout:
+                        break
+                    if not data:
+                        break
+                    buf += data
+                    # Handle complete lines only and keep the partial tail for the
+                    # next recv, so each PING is answered exactly once.
+                    *lines, buf = buf.split("\r\n")
+                    for line in lines:
+                        if line.startswith("PING"):
+                            # A PING may arrive without a token; echo whatever was sent.
+                            token = line.partition(" ")[2]
+                            sock.sendall(f"PONG {token}\r\n".encode())
+                        elif " 001 " in line:
+                            # Numeric 001 (welcome) marks a completed registration.
+                            registered = True
+                if not registered:
+                    return False
+                sock.sendall(f"JOIN {IRC_CHANNEL}\r\n".encode())
+                time.sleep(0.5)
+                sock.recv(4096)
+                for line in message.split("\n"):
+                    # Strip CR/NUL so alert text cannot end the PRIVMSG early and
+                    # smuggle additional IRC commands onto the connection.
+                    line = line.replace("\r", " ").replace("\x00", "")
+                    if line.strip():
+                        sock.sendall(f"PRIVMSG {IRC_CHANNEL} :{line}\r\n".encode())
+                        time.sleep(0.3)
+                time.sleep(0.5)
+                sock.sendall(b"QUIT :alert delivered\r\n")
+                return True
+        except Exception as e:
+            print(f"[irc-notify] IRC send failed: {e}", file=sys.stderr)
+            return False
+
+    def send_thermal_print(alert):
+        """POST one alert to PRINT_WEB_URL as JSON; failures are logged, never raised."""
+        if not PRINT_ENABLED: return
+        labels = alert.get("labels", {})
+        annotations = alert.get("annotations", {})
+        status = alert.get("status", "firing").upper()
+        summary = annotations.get("summary", "")
+        description = annotations.get("description", "")
+        runbook = annotations.get("runbook", "")
+        # Build a useful message: summary + description + runbook steps
+        parts = []
+        if summary: parts.append(summary)
+        if description and description != summary: parts.append(description)
+        if runbook: parts.append("STEPS: " + runbook)
+        message = " | ".join(parts) if parts else labels.get("alertname", "Unknown alert")
+        payload = {
+            "title": labels.get("alertname", "Unknown"),
+            "severity": labels.get("severity", "warning").capitalize(),
+            "host": labels.get("instance", labels.get("host", "unknown")),
+            "message": message,
+            "eventId": alert.get("fingerprint", ""),
+            "source": "Grafana",
+            "status": "RESOLVED" if status == "RESOLVED" else "PROBLEM",
+            "acknowledged": False
+        }
+        try:
+            req = Request(PRINT_WEB_URL, data=json.dumps(payload).encode("utf-8"),
+                          headers={"Content-Type": "application/json"}, method="POST")
+            resp = urlopen(req, timeout=10)
+            print(f"[irc-notify] Thermal print sent: {resp.read().decode()}", file=sys.stderr)
+        except Exception as e:
+            print(f"[irc-notify] Thermal print failed: {e}", file=sys.stderr)
+
+    def should_print(alert):
+        """True for alerts tagged alert_channel=thermal_print or with critical/disaster severity.
+
+        Alert status is not consulted: resolved notifications that match are
+        printed too (send_thermal_print marks them RESOLVED).
+        """
+        labels = alert.get("labels", {})
+        if labels.get("alert_channel") == "thermal_print": return True
+        return labels.get("severity", "").lower() in ("critical", "disaster")
+
+    class Handler(BaseHTTPRequestHandler):
+        def do_POST(self):
+            """Relay every alert of a webhook payload to IRC and, when selected, the printer."""
+            try:
+                length = int(self.headers.get("Content-Length", 0))
+                # read(-1) would block until the client closes the connection.
+                if length < 0: raise ValueError("negative Content-Length")
+                body = json.loads(self.rfile.read(length)) if length else {}
+            except ValueError:
+                # Non-numeric Content-Length, malformed JSON or invalid UTF-8.
+                body = None
+            alerts = body.get("alerts", []) if isinstance(body, dict) else None
+            if not isinstance(alerts, list):
+                self.send_response(400)
+                self.send_header("Content-Type", "application/json")
+                self.end_headers()
+                self.wfile.write(b'{"status":"bad request"}')
+                return
+            for alert in alerts:
+                if not isinstance(alert, dict): continue
+                status = alert.get("status", "unknown").upper()
+                labels = alert.get("labels", {})
+                name = labels.get("alertname", "Unknown")
+                summary = alert.get("annotations", {}).get("summary", "")
+                desc = alert.get("annotations", {}).get("description", "")
+                severity = labels.get("severity", "")
+                icon = "\x0304[FIRING]\x03" if status == "FIRING" else "\x0303[RESOLVED]\x03"
+                sev_tag = f" [{severity}]" if severity else ""
+                msg = f"{icon}{sev_tag} {name}: {summary}"
+                if desc: msg += f"\n  {desc}"
+                send_irc(msg)
+                if should_print(alert): send_thermal_print(alert)
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.end_headers()
+            self.wfile.write(b'{"status":"ok"}')
+        def do_GET(self):
+            """Health/info endpoint: reports the service name and whether printing is enabled."""
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.end_headers()
+            self.wfile.write(json.dumps({"service":"irc-notify","thermal_print":PRINT_ENABLED}).encode())
+        def log_message(self, format, *args):
+            # Render the full message; printing args[0] alone dropped status codes
+            # and error text, and failed when no args were supplied.
+            print(f"[irc-notify] {format % args}", file=sys.stderr)
+
+    if __name__ == "__main__":
+        server = HTTPServer(("0.0.0.0", 9119), Handler)
+        print(f"IRC alert relay :9119 -> {IRC_HOST}:{IRC_PORT} {IRC_CHANNEL} (thermal: {PRINT_ENABLED})")
+        server.serve_forever()
+
+# =============================================================================
+# SNMP Exporter Auth Secret
+# =============================================================================
+# The full snmp.yml (~2MB, auto-generated) exceeds the 1MB ConfigMap limit.
+# Strategy: store SNMP auth credentials in a Secret, and use an init container
+# to download the full snmp.yml from Gitea, or split into multiple ConfigMaps.
+# For now, we mount a minimal auth-only config and rely on the default modules
+# bundled in the snmp-exporter image. To use custom modules, apply
+# snmp-config.yaml separately (see comments in that file).
+---
+# NOTE(review): these credentials are committed in plaintext. The Grafana
+# credentials below are synced from 1Password via OnePasswordItem; consider
+# moving this Secret to the same mechanism and rotating the values.
+apiVersion: v1
+kind: Secret
+metadata:
+  name: snmp-auth
+  namespace: monitoring
+type: Opaque
+stringData:
+  # SNMP v2 community string used by prometheus scrape configs
+  SNMP_COMMUNITY_BLUEJAY: bluejay_monitor
+  SNMP_V3_USER: bluejay_snmpv3
+  SNMP_V3_AUTH_PASS: BlueJay-SNMP-Auth-2026
+  SNMP_V3_PRIV_PASS: BlueJay-SNMP-Priv-2026
+
+# =============================================================================
+# Grafana Credentials — synced from 1Password via Operator
+# =============================================================================
+# 1Password vault: IAmWorkin > "Grafana"
+# Creates K8s Secret "grafana-credentials" with fields: username, password
+# Grafana Deployment uses secretKeyRef to map username→GF_SECURITY_ADMIN_USER, password→GF_SECURITY_ADMIN_PASSWORD
+---
+apiVersion: onepassword.com/v1
+kind: OnePasswordItem
+metadata:
+  name: grafana-credentials
+  namespace: monitoring
+spec:
+  itemPath: vaults/IAmWorkin/items/Grafana
+
+# =============================================================================
+# RBAC: ServiceAccount + ClusterRole for Prometheus K8s SD
+# =============================================================================
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: prometheus
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: prometheus
+rules:
+  - apiGroups: [""]
+    resources: ["nodes", "nodes/proxy", "services", "endpoints", "pods"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["extensions", "networking.k8s.io"]
+    resources: ["ingresses"]
+    verbs: ["get", "list", "watch"]
+  - nonResourceURLs: ["/metrics"]
+    verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: prometheus
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus
+subjects:
+  - kind: ServiceAccount
+    name: prometheus
+    namespace: monitoring
+
+# =============================================================================
+# PVC: Prometheus Data (10Gi, Longhorn)
+# =============================================================================
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: prometheus-data
+  namespace: monitoring
+spec:
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: longhorn
+  resources:
+    requests:
+      storage: 10Gi
+
+# =============================================================================
+# PVC: Grafana Data (2Gi, Longhorn)
+# =============================================================================
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: grafana-data
+  namespace: monitoring
+spec:
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: longhorn
+  resources:
+    requests:
+      storage: 2Gi
+
+# =============================================================================
+# Deployment: Prometheus
+# =============================================================================
+# Single replica with Recreate strategy: the ReadWriteOnce data volume can only
+# be attached to one pod at a time.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus
+  namespace: monitoring
+  labels:
+    app: prometheus
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: prometheus
+  template:
+    metadata:
+      labels:
+        app: prometheus
+    spec:
+      serviceAccountName: prometheus
+      securityContext:
+        fsGroup: 65534  # nobody
+        runAsUser: 65534
+        runAsGroup: 65534
+      containers:
+        - name: prometheus
+          # NOTE(review): `latest` is a floating tag; pin a specific release so
+          # pod restarts cannot silently change the Prometheus version.
+          image: docker.io/prom/prometheus:latest
+          args:
+            - "--config.file=/etc/prometheus/prometheus.yml"
+            - "--storage.tsdb.path=/prometheus"
+            - "--storage.tsdb.retention.time=90d"
+            - "--web.enable-lifecycle"
+          ports:
+            - containerPort: 9090
+              name: http
+          volumeMounts:
+            # Mount the ConfigMap as a directory instead of three subPath files:
+            # subPath mounts never receive ConfigMap updates, so a reload via
+            # /-/reload (--web.enable-lifecycle) would re-read stale files.
+            # All three keys land at the same /etc/prometheus/<key> paths.
+            - name: config
+              mountPath: /etc/prometheus
+              readOnly: true
+            - name: data
+              mountPath: /prometheus
+          resources:
+            requests:
+              cpu: 200m
+              memory: 512Mi
+            limits:
+              cpu: "1"
+              memory: 2Gi
+          livenessProbe:
+            httpGet:
+              path: /-/healthy
+              port: 9090
+            initialDelaySeconds: 15
+            periodSeconds: 30
+          readinessProbe:
+            httpGet:
+              path: /-/ready
+              port: 9090
+            initialDelaySeconds: 5
+            periodSeconds: 10
+      volumes:
+        - name: config
+          configMap:
+            name: prometheus-config
+        - name: data
+          persistentVolumeClaim:
+            claimName: prometheus-data
+
+# =============================================================================
+# ConfigMap: Grafana Dashboard Provider
+# =============================================================================
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-provider
+  namespace: monitoring
+data:
+  default.yml: |
+    apiVersion: 1
+    providers:
+      - name: 'default'
+        orgId: 1
+        folder: ''
+        type: file
+        disableDeletion: false
+        updateIntervalSeconds: 30
+        options:
+          path: /var/lib/grafana/dashboards
+          foldersFromFilesStructure: true
+
+# =============================================================================
+# ConfigMap: Grafana Dashboards (AI Stack Health)
+# =============================================================================
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboards
+  namespace: monitoring
+data:
+  ai-stack-health.json: |
+    {
+      "id": null,
+      "panels": [
+        {
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "color": "red", "text": "DOWN" },
+                    "1": { "color": "green", "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
+          "id": 1,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{job=\"probe-ollama-local\"}",
+              "legendFormat": "Status"
+            }
+          ],
+          "title": "Ollama (Local)",
+          "type": "stat"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "color": "red", "text": "DOWN" },
+                    "1": { "color": "green", "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
+          "id": 2,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{job=\"probe-ollama-edge1\"}",
+              "legendFormat": "Status"
+            }
+          ],
+          "title": "Ollama (Edge1)",
+          "type": "stat"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "color": "red", "text": "DOWN" },
+                    "1": { "color": "green", "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
+          "id": 3,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{job=\"probe-agentzero-local\"}",
+              "legendFormat": "Status"
+            }
+          ],
+          "title": "Agent Zero (Local)",
+          "type": "stat"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "color": "red", "text": "DOWN" },
+                    "1": { "color": "green", "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
+          "id": 4,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{job=\"probe-agentzero-nuc\"}",
+              "legendFormat": "Status"
+            }
+          ],
+          "title": "Agent Zero (NUC)",
+          "type": "stat"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "fillOpacity": 15, "lineWidth": 2 },
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 1 },
+                  { "color": "red", "value": 3 }
+                ]
+              },
+              "unit": "s"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
+          "id": 5,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_duration_seconds{service=\"ollama\"}",
+              "legendFormat": "{{ deployment }}"
+            }
+          ],
+          "title": "Ollama Response Time",
+          "type": "timeseries"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "fillOpacity": 15, "lineWidth": 2 },
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 1 },
+                  { "color": "red", "value": 3 }
+                ]
+              },
+              "unit": "s"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
+          "id": 6,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_duration_seconds{service=\"agent-zero\"}",
+              "legendFormat": "{{ deployment }}"
+            }
+          ],
+          "title": "Agent Zero Response Time",
+          "type": "timeseries"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "fillOpacity": 30, "lineWidth": 1, "stacking": { "mode": "none" } },
+              "mappings": [
+                {
+                  "options": {
+                    "0": { "text": "DOWN" },
+                    "1": { "text": "UP" }
+                  },
+                  "type": "value"
+                }
+              ],
+              "max": 1,
+              "min": 0,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "red", "value": null },
+                  { "color": "green", "value": 1 }
+                ]
+              }
+            }
+          },
+          "gridPos": { "h": 6, "w": 24, "x": 0, "y": 12 },
+          "id": 7,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{service=\"ollama\"}",
+              "legendFormat": "Ollama ({{ deployment }})"
+            },
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_success{service=\"agent-zero\"}",
+              "legendFormat": "Agent Zero ({{ deployment }})"
+            }
+          ],
+          "title": "Uptime History",
+          "type": "timeseries"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "fillOpacity": 10, "lineWidth": 2 },
+              "max": 100,
+              "min": 0,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 75 },
+                  { "color": "red", "value": 90 }
+                ]
+              },
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
+          "id": 8,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "100 - (avg(rate(node_cpu_seconds_total{instance=\"edge1\",mode=\"idle\"}[5m])) * 100)",
+              "legendFormat": "CPU %"
+            },
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "(1 - node_memory_MemAvailable_bytes{instance=\"edge1\"} / node_memory_MemTotal_bytes{instance=\"edge1\"}) * 100",
+              "legendFormat": "Memory %"
+            },
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "(1 - node_filesystem_avail_bytes{instance=\"edge1\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=\"edge1\",mountpoint=\"/\"}) * 100",
+              "legendFormat": "Disk %"
+            }
+          ],
+          "title": "Edge1 Pi 5 \u2014 CPU / Memory / Disk",
+          "type": "timeseries"
+        },
+        {
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "fillOpacity": 10, "lineWidth": 2 },
+              "unit": "s"
+            }
+          },
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
+          "id": 9,
+          "targets": [
+            {
+              "datasource": { "type": "prometheus", "uid": "prometheus" },
+              "expr": "probe_dns_lookup_time_seconds",
+              "legendFormat": "{{ job }}"
+            }
+          ],
+          "title": "Probe DNS Lookup Time",
+          "type": "timeseries"
+        }
+      ],
+      "refresh": "30s",
+      "schemaVersion": 39,
+      "tags": ["ai", "ollama", "agent-zero", "blue-jay"],
+      "time": { "from": "now-1h", "to": "now" },
+      "timezone": "browser",
+      "title": "AI Stack Health",
+      "uid": "ai-stack-health",
+      "version": 1
+    }
+
+# =============================================================================
+# ConfigMap: Grafana Dashboard — Edge Nodes
+#
============================================================================= +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-edge-nodes + namespace: monitoring +data: + bluejay-edge-nodes.json: | + { + "id": null, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "DOWN" + }, + "1": { + "color": "green", + "text": "UP" + } + }, + "type": "value" + } + ] + } + }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 }, + "targets": [ + { + "expr": "up{instance=~\"edge.*\"}", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Edge Node Status", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "unit": "percent" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "targets": [ + { + "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge1.*\"}[5m])) * 100)", + "legendFormat": "CPU %", + "refId": "A" + }, + { + "expr": "node_load1{instance=~\"edge1.*\"}", + "legendFormat": "Load 1m", + "refId": "B" + } + ], + "title": "edge1 (Pi5 + Hailo) CPU", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "unit": "percent" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "targets": [ + { + "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"edge2.*\"}[5m])) * 100)", + "legendFormat": "CPU %", + "refId": "A" + }, + { + "expr": "node_load1{instance=~\"edge2.*\"}", + "legendFormat": "Load 1m", + "refId": "B" + } + ], + "title": "edge2 (Pi4) CPU", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "unit": 
"percent" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "targets": [ + { + "expr": "(1 - node_memory_MemAvailable_bytes{instance=~\"edge.*\"} / node_memory_MemTotal_bytes{instance=~\"edge.*\"}) * 100", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Edge Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "unit": "percent" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{instance=~\"edge.*\",mountpoint=\"/\"} / node_filesystem_size_bytes{instance=~\"edge.*\",mountpoint=\"/\"} * 100)", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Edge Disk Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "celsius" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 }, + "targets": [ + { + "expr": "node_hwmon_temp_celsius{instance=~\"edge.*\"}", + "legendFormat": "{{instance}} {{chip}} {{sensor}}", + "refId": "A" + } + ], + "title": "Edge CPU Temperature", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "bps" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }, + "targets": [ + { + "expr": "rate(node_network_receive_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8", + "legendFormat": "{{instance}} {{device}} RX", + "refId": "A" + }, + { + "expr": "-rate(node_network_transmit_bytes_total{instance=~\"edge.*\",device!~\"lo|docker.*|veth.*\"}[5m]) * 8", + "legendFormat": "{{instance}} {{device}} TX", + "refId": "B" + } + ], + "title": "Edge Network Traffic", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 40, + "tags": ["bluejay", "edge"], + "timezone": "browser", + 
"title": "BlueJay Edge Nodes", + "uid": "7e1603b9-e918-4b3f-a22b-163132fd5cee" + } + +# ============================================================================= +# ConfigMap: Grafana Dashboard — Network Overview +# ============================================================================= +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-network-overview + namespace: monitoring +data: + bluejay-network-overview.json: | + { + "id": null, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { "color": "green", "value": null } + ] + } + } + }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 }, + "targets": [ + { + "expr": "count(up == 1)", + "legendFormat": "Up", + "refId": "A" + }, + { + "expr": "count(up == 0)", + "legendFormat": "Down", + "refId": "B" + } + ], + "title": "Target Health", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "max": 4, + "min": 0, + "thresholds": { + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 2 }, + { "color": "red", "value": 3 } + ] + } + } + }, + "gridPos": { "h": 6, "w": 6, "x": 0, "y": 4 }, + "targets": [ + { + "expr": "laLoad{instance=\"10.0.56.1\", laNames=\"Load-1\"}", + "refId": "A" + } + ], + "title": "pfSense CPU Load (1m)", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "thresholds": { + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ] + }, + "unit": "percent" + } + }, + "gridPos": { "h": 6, "w": 6, "x": 6, "y": 4 }, + "targets": [ + { + "expr": "100 - (memAvailReal{instance=\"10.0.56.1\"} / memTotalReal{instance=\"10.0.56.1\"} * 100)", + "refId": "A" + } + ], + "title": "pfSense Memory Used 
%", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "unit": "percent" + } + }, + "gridPos": { "h": 6, "w": 12, "x": 12, "y": 4 }, + "targets": [ + { + "expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=\"noc1\"}[5m])) * 100)", + "legendFormat": "CPU %", + "refId": "A" + } + ], + "title": "noc1 CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "unit": "percent" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }, + "targets": [ + { + "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Node Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "thresholds": { + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ] + }, + "unit": "percent" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Node Disk Usage %", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "bps" + } + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 }, + "targets": [ + { + "expr": "rate(node_network_receive_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8", + "legendFormat": "{{instance}} {{device}} RX", + "refId": "A" + }, + { + "expr": 
"-rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|cali.*|flannel.*\"}[5m]) * 8", + "legendFormat": "{{instance}} {{device}} TX", + "refId": "B" + } + ], + "title": "Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 26 }, + "targets": [ + { + "expr": "up", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "title": "Prometheus Targets", + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 40, + "tags": ["bluejay", "network"], + "timezone": "browser", + "title": "BlueJay Network Overview", + "uid": "b83a122c-6eb5-4fae-a632-77cbf753ad05" + } + +# ============================================================================= +# ConfigMap: Grafana Dashboard — Operations +# ============================================================================= +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-operations + namespace: monitoring +data: + bluejay-operations.json: | + { + "annotations": { + "list": [] + }, + "id": null, + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "title": "Infrastructure Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "noValue": "0", + "thresholds": { + "steps": [ + { "color": "green", "value": null } + ] + } + } + }, + "gridPos": { "h": 4, "w": 8, "x": 0, "y": 1 }, + "targets": [ + { + "expr": "count(up == 1)", + "legendFormat": "Up", + "refId": "A" + }, + { + "expr": "count(up == 0)", + "legendFormat": "Down", + "refId": "B" + } + ], + "title": "All Targets Up/Down", + "type": "stat" + }, + { + "datasource": { + "type": "alexanderzobnin-zabbix-datasource", + "uid": "bffjila3zkdfka" + }, + "gridPos": { "h": 8, "w": 16, "x": 8, "y": 1 }, + "targets": [ + { + "application": { "filter": "" }, + "group": { "filter": "/.*/" }, + "host": { 
"filter": "/.*/" }, + "queryType": 5, + "refId": "A", + "trigger": { "filter": "/.*/" } + } + ], + "title": "Zabbix Active Problems", + "type": "alexanderzobnin-zabbix-triggers-panel" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 }, + "targets": [ + { + "expr": "node_load1{instance=\"noc1\"}", + "legendFormat": "1m", + "refId": "A" + }, + { + "expr": "node_load5{instance=\"noc1\"}", + "legendFormat": "5m", + "refId": "B" + }, + { + "expr": "node_load15{instance=\"noc1\"}", + "legendFormat": "15m", + "refId": "C" + } + ], + "title": "noc1 Load Average", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }, + "title": "Kubernetes & Services", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Value" }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { "color": "red", "text": "DOWN" }, + "1": { "color": "green", "text": "UP" } + }, + "type": "value" + } + ] + } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 }, + "targets": [ + { + "expr": "up", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "title": "K8s Services Uptime (Prometheus Targets)", + "type": "table" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }, + "title": "Network & SNMP", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "bps" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 }, + "targets": [ + { + "expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8", + "legendFormat": "WAN In", + "refId": "A" + }, + { + "expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias=~\".*WAN.*\"}[5m]) * 8", + "legendFormat": 
"WAN Out", + "refId": "B" + } + ], + "title": "pfSense WAN Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "bps" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 }, + "targets": [ + { + "expr": "rate(ifHCInOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8", + "legendFormat": "{{ifAlias}} In", + "refId": "A" + }, + { + "expr": "-rate(ifHCOutOctets{instance=\"10.0.56.1\", ifAlias!~\".*WAN.*\"}[5m]) * 8", + "legendFormat": "{{ifAlias}} Out", + "refId": "B" + } + ], + "title": "pfSense LAN Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "unit": "percent" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }, + "targets": [ + { + "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "All Nodes Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "unit": "percent" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "All Nodes Disk", + "type": "timeseries" + } + ], + "refresh": "1m", + "schemaVersion": 40, + "tags": ["bluejay", "operations", "zabbix"], + "timezone": "browser", + "title": "BlueJay Operations", + "uid": "a781c4e4-c3fe-4ac7-be93-21363a41b97d" + } + +# ============================================================================= +# ConfigMap: Grafana Dashboard — Epson Printer +# 
============================================================================= +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-printer + namespace: monitoring +data: + epson-ecotank-printer.json: | + { + "id": null, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "orange", "value": 10 }, + { "color": "yellow", "value": 20 }, + { "color": "green", "value": 40 } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Black Ink Bottle" }, + "properties": [ + { "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Cyan Ink Bottle" }, + "properties": [ + { "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Magenta Ink Bottle" }, + "properties": [ + { "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Yellow Ink Bottle" }, + "properties": [ + { "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } } + ] + } + ] + }, + "gridPos": { "h": 10, "w": 12, "x": 0, "y": 0 }, + "id": 1, + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"] + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}", + "legendFormat": "{{prtMarkerSuppliesDescription}}", + "refId": "A" + } + ], + "title": "Ink Levels", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "lineWidth": 2, + "spanNulls": true + }, + "max": 100, + "min": 0, + "unit": 
"percent" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Black Ink Bottle" }, + "properties": [ + { "id": "color", "value": { "fixedColor": "dark-blue", "mode": "fixed" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Cyan Ink Bottle" }, + "properties": [ + { "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Magenta Ink Bottle" }, + "properties": [ + { "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Yellow Ink Bottle" }, + "properties": [ + { "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } } + ] + } + ] + }, + "gridPos": { "h": 10, "w": 12, "x": 12, "y": 0 }, + "id": 2, + "targets": [ + { + "expr": "prtMarkerSuppliesLevel{job=\"snmp-printer\"}", + "legendFormat": "{{prtMarkerSuppliesDescription}}", + "refId": "A" + } + ], + "title": "Ink Level History", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 10000 }, + { "color": "red", "value": 50000 } + ] + }, + "unit": "short" + } + }, + "gridPos": { "h": 5, "w": 6, "x": 0, "y": 10 }, + "id": 3, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": ["lastNotNull"] + }, + "textMode": "value_and_name" + }, + "targets": [ + { + "expr": "prtMarkerLifeCount{job=\"snmp-printer\"}", + "legendFormat": "Pages", + "refId": "A" + } + ], + "title": "Lifetime Page Count", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "1": { "text": "Online" } + }, + "type": "value" + } + ], + "thresholds": { + "steps": [ + { "color": "blue", "value": null } + ] + } + } + }, + "gridPos": { 
"h": 5, "w": 6, "x": 6, "y": 10 }, + "id": 4, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": ["lastNotNull"] + }, + "textMode": "name" + }, + "targets": [ + { + "expr": "prtGeneralPrinterName{job=\"snmp-printer\"}", + "legendFormat": "{{prtGeneralPrinterName}}", + "refId": "A" + } + ], + "title": "Printer Model", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short" + } + }, + "gridPos": { "h": 5, "w": 6, "x": 12, "y": 10 }, + "id": 5, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": ["lastNotNull"] + } + }, + "targets": [ + { + "expr": "prtAlertCriticalEvents{job=\"snmp-printer\"}", + "legendFormat": "Critical Alerts", + "refId": "A" + } + ], + "title": "Critical Events", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { "color": "blue", "value": null } + ] + } + } + }, + "gridPos": { "h": 5, "w": 6, "x": 18, "y": 10 }, + "id": 6, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": ["lastNotNull"] + }, + "textMode": "name" + }, + "targets": [ + { + "expr": "prtGeneralSerialNumber{job=\"snmp-printer\"}", + "legendFormat": "{{prtGeneralSerialNumber}}", + "refId": "A" + } + ], + "title": "Serial Number", + "type": "stat" + } + ], + "refresh": "5m", + "schemaVersion": 39, + "tags": ["printer", "snmp", "bluejay"], + "time": { "from": "now-24h", "to": "now" }, + "timezone": "browser", + "title": "Epson ET-3750 EcoTank Printer", + "uid": "epson-ecotank" + } + +# ============================================================================= +# ConfigMap: Grafana Dashboard — Infrastructure Overview +# ============================================================================= 
+--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-infra-overview + namespace: monitoring +data: + infra-overview.json: | + { + "id": null, + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "title": "AI Stack", + "type": "row" + }, + { + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { "color": "red", "text": "DOWN" }, + "1": { "color": "green", "text": "UP" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, + "id": 1, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "probe_success{job=\"probe-ollama-local\"}", + "legendFormat": "Status" + } + ], + "title": "Ollama (Local)", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { "color": "red", "text": "DOWN" }, + "1": { "color": "green", "text": "UP" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, + "id": 2, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "probe_success{job=\"probe-ollama-edge1\"}", + "legendFormat": "Status" + } + ], + "title": "Ollama (Edge1)", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { "color": "red", "text": "DOWN" }, + "1": { "color": "green", "text": "UP" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "id": 3, + "targets": [ + { + "datasource": { "type": "prometheus", 
"uid": "prometheus" }, + "expr": "probe_success{job=\"probe-agentzero-local\"}", + "legendFormat": "Status" + } + ], + "title": "Agent Zero (Local)", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { "color": "red", "text": "DOWN" }, + "1": { "color": "green", "text": "UP" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "id": 4, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "probe_success{job=\"probe-agentzero-nuc\"}", + "legendFormat": "Status" + } + ], + "title": "Agent Zero (NUC)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 101, + "title": "K8s Cluster", + "type": "row" + }, + { + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 30 }, + { "color": "red", "value": 50 } + ] + } + } + }, + "gridPos": { "h": 6, "w": 8, "x": 0, "y": 6 }, + "id": 5, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "count(up{job=\"node-exporter\"} == 1)", + "legendFormat": "Nodes Up" + } + ], + "title": "Nodes Up (node-exporter)", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "custom": { "fillOpacity": 15, "lineWidth": 2 }, + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ] + }, + "unit": "percent" + } + }, + "gridPos": { "h": 6, "w": 8, "x": 8, "y": 6 }, + "id": 6, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) by (instance) * 100)", + 
"legendFormat": "{{ instance }}" + } + ], + "title": "Node CPU Usage %", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "custom": { "fillOpacity": 15, "lineWidth": 2 }, + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ] + }, + "unit": "percent" + } + }, + "gridPos": { "h": 6, "w": 8, "x": 16, "y": 6 }, + "id": 7, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100", + "legendFormat": "{{ instance }}" + } + ], + "title": "Node Memory Usage %", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 }, + "id": 102, + "title": "Network", + "type": "row" + }, + { + "fieldConfig": { + "defaults": { + "custom": { "fillOpacity": 10, "lineWidth": 2 }, + "unit": "Bps" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 }, + "id": 8, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(ifHCInOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])", + "legendFormat": "WAN In" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(ifHCOutOctets{job=\"snmp-pfsense\",ifDescr=\"igb0\"}[5m])", + "legendFormat": "WAN Out" + } + ], + "title": "pfSense WAN Bandwidth", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 }, + "id": 9, + "options": { + "showHeader": true, + "sortBy": [{ "displayName": "Value", "desc": false }] + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "up", + "format": "table", + "instant": true, + 
"legendFormat": "" + } + ], + "title": "Target Health (up)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true }, + "renameByName": { "job": "Job", "instance": "Instance", "Value": "Up" } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }, + "id": 103, + "title": "Services", + "type": "row" + }, + { + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 22 }, + "id": 10, + "options": { + "content": "### ArgoCD Applications\n\nView sync status and health of all 11 ArgoCD-managed applications:\n\n[Open ArgoCD Dashboard](https://argocd.iamworkin.lan)\n\n| App | Expected Status |\n|-----|----------------|\n| gitea | Healthy / Synced |\n| zabbix | Healthy / Synced |\n| guacamole | Healthy / Synced |\n| irc | Healthy / Synced |\n| mail | Healthy / Synced |\n| matrix | Healthy / Synced |\n| teamspeak | Healthy / Synced |\n| pki-web | Healthy / Synced |\n| intranet | Healthy / Synced |\n| telephony | Healthy / Synced |\n| infra-noc-services | Healthy / Synced |", + "mode": "markdown" + }, + "title": "ArgoCD App Status", + "type": "text" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }, + "id": 104, + "title": "Alerting", + "type": "row" + }, + { + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 1 }, + { "color": "red", "value": 3 } + ] + } + } + }, + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 27 }, + "id": 11, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "count(ALERTS{alertstate=\"firing\"}) or vector(0)", + "legendFormat": "Firing Alerts" + } + ], + "title": "Firing Alerts", + "type": "stat" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["infrastructure", "blue-jay", "overview"], + "time": { "from": "now-1h", "to": "now" }, + "timezone": "browser", + "title": 
"Infrastructure Overview", + "uid": "infra-overview", + "version": 1 + } + +# ============================================================================= +# ConfigMap: Grafana Datasource Provisioning +# ============================================================================= +# The datasource uid is pinned to "prometheus" because the dashboards and alert +# rules in this file reference the datasource by that uid; without it Grafana +# generates its own uid and those references do not resolve. +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-datasource-provisioning + namespace: monitoring +data: + datasource.yml: | + apiVersion: 1 + datasources: + - name: Prometheus + uid: prometheus + type: prometheus + access: proxy + url: http://prometheus.monitoring.svc:9090 + isDefault: true + editable: true + +# ============================================================================= +# ConfigMap: Grafana Alerting Provisioning (contact points + policies + rules) +# ============================================================================= +# Makes alert rules declarative — survives pod rebuilds without API recreation +# "IRC #alerts" is quoted: an unquoted " #" starts a YAML comment, which would +# truncate the contact point name and the policy receiver to "IRC". +# NOTE(review): both contact points post to the same irc-notify URL — confirm +# that the Thermal Printer webhook target is intended. +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-alerting-provisioning + namespace: monitoring +data: + alerting.yml: | + apiVersion: 1 + contactPoints: + - orgId: 1 + name: "IRC #alerts" + receivers: + - uid: irc-alerts-webhook + type: webhook + settings: + url: http://irc-notify.monitoring.svc:9119 + httpMethod: POST + disableResolveMessage: false + - orgId: 1 + name: Thermal Printer + receivers: + - uid: thermal-print-001 + type: webhook + settings: + url: http://irc-notify.monitoring.svc:9119 + httpMethod: POST + disableResolveMessage: true + policies: + - orgId: 1 + receiver: "IRC #alerts" + group_by: ['alertname'] + group_wait: 30s + group_interval: 5m + repeat_interval: 1h + routes: + - receiver: Thermal Printer + matchers: ['alert_channel = thermal_print'] + group_wait: 1m + group_interval: 10m + repeat_interval: 4h + continue: true + groups: + - orgId: 1 + name: AI Stack + folder: AI Stack Alerts + interval: 1m + rules: + - uid: ollama-down-local + title: Ollama DOWN (Local) + condition: C + for: 2m + noDataState: Alerting + execErrState: OK + 
annotations:
              summary: Ollama DOWN on workstation (R9700)
              description: Agent Zero LOCAL cannot reach Ollama. FAISS memory will fail.
              runbook: "1. Check PC power/wake 2. ssh stoltz@10.0.58.100 3. systemctl status ollama 4. journalctl -u ollama --since 5min"
            labels:
              severity: warning
              service: ollama
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-ollama-local"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: ollama-down-edge1
            title: Ollama DOWN (Edge1)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Ollama DOWN on edge1 Pi 5
              description: Agent Zero NUC cannot reach Ollama.
              runbook: "1. ssh stoltz@10.0.57.17 2. systemctl status ollama 3. journalctl -u ollama --since 5min 4. cat /sys/class/thermal/thermal_zone0/temp"
            labels:
              severity: warning
              service: ollama
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-ollama-edge1"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: a0-down-local
            title: Agent Zero DOWN (Local)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero LOCAL DOWN
              description: K3s web UI unreachable.
              runbook: "1. ssh stoltz@10.0.58.100 2. docker ps | grep agent-zero 3. docker logs agent-zero --tail 20 4. Check Ollama first (A0 crashes if Ollama down)"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-agentzero-local"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: a0-down-nuc
            title: Agent Zero DOWN (NUC)
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Agent Zero NUC DOWN
              description: RKE2 web UI unreachable.
              runbook: "1. ssh root@10.0.56.200 via noc1 2. kubectl get pods -n agent-zero 3. kubectl logs -n agent-zero deploy/agent-zero --tail 20"
            labels:
              severity: warning
              service: agent-zero
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'probe_success{job="probe-agentzero-nuc"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
      - orgId: 1
        name: Infrastructure
        folder: AI Stack Alerts
        interval: 1m
        rules:
          - uid: node-down
            title: Node DOWN
            condition: C
            for: 2m
            noDataState: Alerting
            execErrState: OK
            annotations:
              summary: Node down
              description: Node exporter unreachable for 2 minutes. Host may be down or network issue.
              runbook: "1. Ping host IP 2. SSH via noc1 jumpbox (root@10.0.56.10) 3. systemctl status node_exporter 4. Check network/VLAN 5. Physical power check if unreachable"
            labels:
              severity: critical
              service: infrastructure
              alert_channel: thermal_print
            data:
              - refId: A
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: prometheus
                model: {expr: 'up{job=~"node-exporter|rke2-nodes|edge-nodes"}', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 120, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
          - uid: high-cpu
            title: High CPU (>85%)
            condition: C
            for: 10m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: High CPU
              description: CPU above 85% for 10 minutes. Performance degradation likely.
              runbook: "1. SSH to host 2. top -b -n1 | head -20 3. Check for runaway process 4. kubectl top pods (if K8s node)"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_cpu_usage:avg5m', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
          - uid: high-memory
            title: High Memory (>90%)
            condition: C
            for: 5m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: High memory usage
              description: Memory above 90% for 5 minutes. OOM kills imminent.
              runbook: "1. SSH to host 2. free -h 3. ps aux --sort=-%mem | head 10 4. Check for memory leak (growing RSS)"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_memory_usage:percent', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [90], type: gt}}], refId: C}
          - uid: disk-low
            title: Disk Space Low (>85%)
            condition: C
            for: 10m
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: Disk usage high
              description: Root disk above 85% for 10 minutes. Service disruption if full.
              runbook: "1. SSH to host 2. df -h / 3. du -sh /* | sort -rh | head 4. journalctl --vacuum-size=100M 5. podman system prune"
            labels:
              severity: warning
              service: infrastructure
            data:
              - refId: A
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: prometheus
                model: {expr: 'instance:node_disk_usage:percent', instant: true, refId: A}
              - refId: B
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: reduce, expression: A, reducer: last, refId: B}
              - refId: C
                relativeTimeRange: {from: 600, to: 0}
                datasourceUid: __expr__
                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}

# =============================================================================
# Deployment: Grafana
# =============================================================================
# Single replica, recreated (not rolled) on update. Admin credentials come from
# the "grafana-credentials" Secret; dashboards, datasources and alerting are
# provisioned from ConfigMaps mounted read-only.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        fsGroup: 472  # grafana group
        runAsUser: 472
        runAsGroup: 472
      containers:
        - name: grafana
          # NOTE(review): ":latest" is a floating tag — consider pinning.
          image: docker.io/grafana/grafana:latest
          env:
            # Credentials from 1Password Operator
            # (OnePasswordItem → Secret "grafana-credentials")
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: username
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: password
            - name: GF_SERVER_ROOT_URL
              value: "https://grafana.iamworkin.lan"
            - name: GF_SERVER_SERVE_FROM_SUB_PATH
              value: "false"
            # Zabbix plugin: install manually after first boot if needed.
            # GF_INSTALL_PLUGINS requires internet on startup — breaks with
            # restrictive NetworkPolicy:
            #   kubectl exec -n monitoring deploy/grafana -- grafana cli plugins install alexanderzobnin-zabbix-app
          ports:
            - containerPort: 3000
              name: http
          volumeMounts:
            # Persistent Grafana state
            - name: data
              mountPath: /var/lib/grafana
            # Dashboard provider definition + one directory per dashboard set
            - name: dashboard-provider
              mountPath: /etc/grafana/provisioning/dashboards
              readOnly: true
            - name: dashboards-ai-stack
              mountPath: /var/lib/grafana/dashboards/ai-stack
              readOnly: true
            - name: dashboards-edge-nodes
              mountPath: /var/lib/grafana/dashboards/edge-nodes
              readOnly: true
            - name: dashboards-network
              mountPath: /var/lib/grafana/dashboards/network
              readOnly: true
            - name: dashboards-operations
              mountPath: /var/lib/grafana/dashboards/operations
              readOnly: true
            - name: dashboards-printer
              mountPath: /var/lib/grafana/dashboards/printer
              readOnly: true
            - name: dashboards-infra-overview
              mountPath: /var/lib/grafana/dashboards/infra-overview
              readOnly: true
            # Datasource and alerting provisioning
            - name: datasource-provisioning
              mountPath: /etc/grafana/provisioning/datasources
              readOnly: true
            - name: alerting-provisioning
              mountPath: /etc/grafana/provisioning/alerting
              readOnly: true
          resources:
            requests:
              cpu: 100m
              memory: 128Mi
            limits:
              cpu: 500m
              memory: 512Mi
          livenessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 30
periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 10
            periodSeconds: 10
      volumes:
        # Persistent state
        - name: data
          persistentVolumeClaim:
            claimName: grafana-data
        # Dashboard provisioning (provider + one ConfigMap per dashboard set)
        - name: dashboard-provider
          configMap:
            name: grafana-dashboard-provider
        - name: dashboards-ai-stack
          configMap:
            name: grafana-dashboards
        - name: dashboards-edge-nodes
          configMap:
            name: grafana-dashboard-edge-nodes
        - name: dashboards-network
          configMap:
            name: grafana-dashboard-network-overview
        - name: dashboards-operations
          configMap:
            name: grafana-dashboard-operations
        - name: dashboards-printer
          configMap:
            name: grafana-dashboard-printer
        - name: dashboards-infra-overview
          configMap:
            name: grafana-dashboard-infra-overview
        # Datasource and alerting provisioning
        - name: datasource-provisioning
          configMap:
            name: grafana-datasource-provisioning
        - name: alerting-provisioning
          configMap:
            name: grafana-alerting-provisioning

# =============================================================================
# Deployment: Blackbox Exporter
# =============================================================================
# Single replica; blackbox.yml is mounted from the "blackbox-config" ConfigMap
# as a single file (subPath) and passed via --config.file.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: blackbox-exporter
  namespace: monitoring
  labels:
    app: blackbox-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      containers:
        - name: blackbox-exporter
          # NOTE(review): ":latest" is a floating tag — consider pinning.
          image: quay.io/prometheus/blackbox-exporter:latest
          args:
            - "--config.file=/config/blackbox.yml"
          ports:
            - containerPort: 9115
              name: http
          volumeMounts:
            - name: config
              mountPath: /config/blackbox.yml
              subPath: blackbox.yml
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 32Mi
            limits:
              cpu: 200m
              memory: 128Mi
          livenessProbe:
            httpGet:
              path: /
              port: 9115
            initialDelaySeconds: 5
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /
              port: 9115
            initialDelaySeconds: 3
            periodSeconds: 10
volumes:
        - name: config
          configMap:
            name: blackbox-config

# =============================================================================
# PVC: SNMP Exporter Config (100Mi, Longhorn)
# =============================================================================
# The custom snmp.yml (~2MB) exceeds the 1MB ConfigMap limit.
# This PVC stores the config file. To load a custom config:
#   kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
# Then restart the pod to pick up the new config.
# NOTE(review): the snmp-exporter container mounts /config with
# `readOnly: true`, so copying into that container is expected to fail on a
# read-only filesystem — confirm the intended upload path (e.g. a temporary pod
# that mounts this PVC read-write).
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: snmp-config
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      storage: 100Mi

# =============================================================================
# Deployment: SNMP Exporter
# =============================================================================
# Uses a PVC-mounted config at /config/snmp.yml. An init container copies the
# default config from the image if the PVC is empty (first deploy).
# To load the custom noc1 snmp.yml (~2MB):
#   kubectl cp snmp.yml monitoring/<snmp-exporter-pod>:/config/snmp.yml
# Then restart: kubectl rollout restart deploy/snmp-exporter -n monitoring
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: snmp-exporter
  namespace: monitoring
  labels:
    app: snmp-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: snmp-exporter
  template:
    metadata:
      labels:
        app: snmp-exporter
    spec:
      initContainers:
        # Copy default snmp.yml from image if PVC is empty (first deploy).
        # `set -e` makes a failed copy fail the init container instead of
        # printing a success message and letting the exporter start without a
        # config file.
        - name: init-config
          # NOTE(review): ":latest" is a floating tag — consider pinning.
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              set -e
              if [ ! -f /config/snmp.yml ]; then
                echo "No custom config found, copying default from image..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default snmp.yml copied to PVC."
              else
                echo "Custom snmp.yml already exists on PVC, skipping copy."
fi
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      containers:
        - name: snmp-exporter
          # NOTE(review): ":latest" is a floating tag — consider pinning.
          image: docker.io/prom/snmp-exporter:latest
          args:
            - "--config.file=/config/snmp.yml"
          ports:
            - containerPort: 9116
              name: http
          volumeMounts:
            # The exporter only reads the config; the init container writes it.
            - name: snmp-config
              mountPath: /config
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          livenessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 5
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /
              port: 9116
            initialDelaySeconds: 3
            periodSeconds: 10
      volumes:
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config

# =============================================================================
# Deployment: IRC Notify (alert relay)
# =============================================================================
# Runs notify.py (mounted from the "irc-notify-script" ConfigMap) on a stock
# Python image; listens on :9119.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: irc-notify
  namespace: monitoring
  labels:
    app: irc-notify
spec:
  replicas: 1
  selector:
    matchLabels:
      app: irc-notify
  template:
    metadata:
      labels:
        app: irc-notify
    spec:
      containers:
        - name: irc-notify
          image: docker.io/library/python:3.12-slim
          command:
            - "python3"
            - "/app/notify.py"
          ports:
            - containerPort: 9119
              name: http
          volumeMounts:
            - name: script
              mountPath: /app/notify.py
              subPath: notify.py
              readOnly: true
          resources:
            requests:
              cpu: 25m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
          livenessProbe:
            tcpSocket:
              port: 9119
            initialDelaySeconds: 5
            periodSeconds: 30
          readinessProbe:
            tcpSocket:
              port: 9119
            initialDelaySeconds: 3
            periodSeconds: 10
      volumes:
        - name: script
          configMap:
            name: irc-notify-script

# =============================================================================
# DaemonSet: Node Exporter (runs on every RKE2 node)
# =============================================================================
# Port 9101 avoids conflict with host-level node-exporters already on :9100.
# The rke2-nodes Prometheus job scrapes the host instances on :9100; this
# DaemonSet provides K8s service-discovery-based scraping on :9101.
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      hostPID: true
      hostNetwork: true
      # Tolerate every taint so the exporter lands on all nodes.
      tolerations:
        - operator: Exists
      securityContext:
        runAsNonRoot: false
        runAsUser: 0
      containers:
        - name: node-exporter
          # NOTE(review): ":latest" is a floating tag — consider pinning.
          image: docker.io/prom/node-exporter:latest
          args:
            # Host filesystem, /sys and /proc are mounted under /host (below).
            - "--path.rootfs=/host"
            - "--path.sysfs=/host/sys"
            - "--path.procfs=/host/proc"
            - "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/containers|run/containers|var/lib/rancher|var/lib/longhorn)($|/)"
            - "--collector.netclass.ignored-devices=^(veth|cali|flannel|cni).*$"
            - "--no-collector.btrfs"
            - "--web.listen-address=:9101"
          ports:
            - containerPort: 9101
              hostPort: 9101
              name: metrics
          securityContext:
            privileged: true
            readOnlyRootFilesystem: true
          volumeMounts:
            - name: rootfs
              mountPath: /host
              readOnly: true
              mountPropagation: HostToContainer
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 32Mi
            limits:
              cpu: 200m
              memory: 128Mi
      volumes:
        - name: rootfs
          hostPath:
            path: /
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys

# =============================================================================
# Service: Prometheus (ClusterIP :9090)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  type: ClusterIP
ports:
    - port: 9090
      targetPort: 9090
      protocol: TCP
      name: http
  selector:
    app: prometheus

# =============================================================================
# Service: Grafana (ClusterIP :3000)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  type: ClusterIP
  ports:
    - port: 3000
      targetPort: 3000
      protocol: TCP
      name: http
  selector:
    app: grafana

# =============================================================================
# Service: Blackbox Exporter (ClusterIP :9115)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: blackbox-exporter
  namespace: monitoring
  labels:
    app: blackbox-exporter
spec:
  type: ClusterIP
  ports:
    - port: 9115
      targetPort: 9115
      protocol: TCP
      name: http
  selector:
    app: blackbox-exporter

# =============================================================================
# Service: SNMP Exporter (ClusterIP :9116)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: snmp-exporter
  namespace: monitoring
  labels:
    app: snmp-exporter
spec:
  type: ClusterIP
  ports:
    - port: 9116
      targetPort: 9116
      protocol: TCP
      name: http
  selector:
    app: snmp-exporter

# =============================================================================
# Service: Node Exporter (headless for Prometheus SD)
# =============================================================================
# `clusterIP: None` makes this headless; the "k8s-node-exporter" scrape job
# keeps endpoints named "node-exporter" in this namespace.
---
apiVersion: v1
kind: Service
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - port: 9101
      targetPort: 9101
      protocol: TCP
      name: metrics
  selector:
    app: node-exporter

# =============================================================================
# Service: IRC Notify (ClusterIP :9119)
# =============================================================================
---
apiVersion: v1
kind: Service
metadata:
  name: irc-notify
  namespace: monitoring
  labels:
    app: irc-notify
spec:
  type: ClusterIP
  ports:
    - port: 9119
      targetPort: 9119
      protocol: TCP
      name: http
  selector:
    app: irc-notify

# =============================================================================
# TLS Certificates (cert-manager + step-ca ACME)
# =============================================================================
# One Certificate per hostname, both issued by the "step-ca-acme"
# ClusterIssuer; the resulting Secrets are referenced by the IngressRoutes.
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: grafana-tls
  namespace: monitoring
spec:
  secretName: grafana-tls
  issuerRef:
    name: step-ca-acme
    kind: ClusterIssuer
  dnsNames:
    - grafana.iamworkin.lan
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: prometheus-tls
  namespace: monitoring
spec:
  secretName: prometheus-tls
  issuerRef:
    name: step-ca-acme
    kind: ClusterIssuer
  dnsNames:
    - prometheus.iamworkin.lan

# =============================================================================
# Traefik IngressRoute: Grafana
# =============================================================================
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: grafana
  namespace: monitoring
spec:
  entryPoints:
    - websecure
  routes:
    - kind: Rule
      match: Host(`grafana.iamworkin.lan`)
      services:
        - name: grafana
          port: 3000
  tls:
    secretName: grafana-tls

# =============================================================================
# Traefik IngressRoute: Prometheus
# =============================================================================
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: prometheus
  namespace: monitoring
spec:
  entryPoints:
    - websecure
  routes:
    - kind: Rule
match: Host(`prometheus.iamworkin.lan`)
      services:
        - name: prometheus
          port: 9090
  tls:
    secretName: prometheus-tls

# =============================================================================
# NetworkPolicy: monitoring namespace
# =============================================================================
# Applies to every pod in the namespace (empty podSelector) and restricts both
# directions.
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: monitoring-netpol
  namespace: monitoring
spec:
  podSelector: {}
  policyTypes:
    - Ingress
    - Egress
  ingress:
    # Allow from Traefik (IngressRoutes AND ACME solver pods)
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: traefik-system
    # Allow intra-namespace (prometheus→exporters, grafana→prometheus, grafana→irc-notify)
    - from:
        - podSelector: {}
    # Allow from cert-manager (ACME HTTP-01 self-check)
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: cert-manager
  egress:
    # DNS
    - to:
        - namespaceSelector: {}
      ports:
        - port: 53
          protocol: UDP
        - port: 53
          protocol: TCP
    # MGMT VLAN (noc1, pfSense, switches, SNMP, node-exporter)
    - to:
        - ipBlock:
            cidr: 10.0.56.0/24
    # PROD VLAN (edge nodes)
    - to:
        - ipBlock:
            cidr: 10.0.57.0/24
    # HOME VLAN (workstation, printer, NAS)
    - to:
        - ipBlock:
            cidr: 10.0.58.0/24
    # Intra-namespace
    - to:
        - podSelector: {}
    # Blackbox probes to other namespaces (agent-zero, etc)
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: agent-zero
      ports:
        - port: 80
          protocol: TCP
    # IRC (irc-notify → UnrealIRCd in irc namespace via K8s DNS)
    - to:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: irc
      ports:
        - port: 6667
          protocol: TCP
        - port: 6697
          protocol: TCP
    # Step-CA ACME (cert renewal). 10.0.56.10 is already covered by the
    # MGMT VLAN rule above; kept as an explicit record of the dependency.
    - to:
        - ipBlock:
            cidr: 10.0.56.10/32
      ports:
        - port: 9443
          protocol: TCP
    # Internet (optional: Grafana plugin install, ACME)
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.0.0.0/8
              - 172.16.0.0/12
              - 192.168.0.0/16

# =============================================================================
# Job: SNMP Config Loader (ArgoCD PostSync hook)
# =============================================================================
# Runs after each sync to populate the SNMP config PVC.
# Attempts to download the custom snmp.yml from noc1. The download goes to a
# temporary file and replaces /config/snmp.yml only when it succeeded and is
# non-empty, so an unreachable noc1 never destroys a config that is already on
# the PVC. The default config bundled in the snmp-exporter image is copied only
# when no config exists at all.
# NOTE(review): snmp_exporter's /config endpoint may redact secrets (community
# strings, passwords) — verify the downloaded file is usable as-is.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: snmp-config-loader
  namespace: monitoring
  annotations:
    argocd.argoproj.io/hook: PostSync
    argocd.argoproj.io/hook-delete-policy: HookSucceeded
spec:
  backoffLimit: 0
  template:
    metadata:
      labels:
        app: snmp-config-loader
    spec:
      restartPolicy: Never
      # The snmp-config PVC is ReadWriteOnce and is already mounted by the
      # snmp-exporter pod, so this pod must run on the same node to be able to
      # mount it as well.
      affinity:
        podAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchLabels:
                  app: snmp-exporter
              topologyKey: kubernetes.io/hostname
      initContainers:
        # Try to download custom snmp.yml from noc1
        - name: download-config
          # NOTE(review): ":latest" is a floating tag — consider pinning.
          image: docker.io/curlimages/curl:latest
          command:
            - sh
            - -c
            - |
              set -eu
              tmp=/config/.snmp.yml.download
              echo "Attempting to download custom snmp.yml from noc1..."
              # -sS: no progress meter, but keep error messages in the pod log.
              if curl -sSf --connect-timeout 10 --max-time 30 \
                   http://10.0.56.10:9116/config -o "$tmp" \
                 && [ -s "$tmp" ]; then
                # Replace the live config only after a complete download.
                mv -f "$tmp" /config/snmp.yml
                echo "Custom snmp.yml downloaded from noc1 successfully."
              else
                echo "Download failed or empty, keeping any existing config (default is used only if none exists)."
                rm -f "$tmp"
              fi
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      containers:
        # If no config is present, copy the default config from the image
        - name: fallback-default
          # NOTE(review): ":latest" is a floating tag — consider pinning.
          image: docker.io/prom/snmp-exporter:latest
          command:
            - sh
            - -c
            - |
              # Fail the Job if the copy fails instead of reporting success.
              set -e
              if [ -f /config/snmp.yml ] && [ -s /config/snmp.yml ]; then
                echo "Custom config already present, nothing to do."
              else
                echo "Copying default snmp.yml from image to PVC..."
                cp /etc/snmp_exporter/snmp.yml /config/snmp.yml
                echo "Default config copied."
              fi
              echo "SNMP config loader complete."
          volumeMounts:
            - name: snmp-config
              mountPath: /config
      volumes:
        - name: snmp-config
          persistentVolumeClaim:
            claimName: snmp-config