diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml
index 09a6f83..0a513b3 100644
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -216,19 +216,24 @@ data:
       - job_name: "pimanager-app"
         scrape_interval: 15s
         metrics_path: /metrics
+        scheme: https
+        tls_config:
+          insecure_skip_verify: true  # NOTE(review): disables certificate verification; prefer ca_file for the LAN CA
         static_configs:
-          - targets: ["10.0.58.25:5000"]
+          - targets: ["piez.iamworkin.lan"]
             labels:
               instance: "piez"
-              service: "pimanager"
+              service: "signalcontrol"
               vlan: "home"
               device: "pi4-ezconnect"
-          - targets: ["10.0.58.113:5200"]
+              rig: "signal-b"
+          - targets: ["pirelay.iamworkin.lan"]
             labels:
               instance: "pirelay"
-              service: "pimanager"
+              service: "signalcontrol"
               vlan: "home"
               device: "pi3-ks0212"
+              rig: "signal-a"
 
       # Epson ET-3750 EcoTank Printer SNMP
       - job_name: "snmp-printer"
@@ -488,6 +493,12 @@ data:
               - "https://desktop.iamworkin.lan/"
               - "https://print.iamworkin.lan/healthz" # root 401 behind API key auth; /healthz anonymous 200
               - "https://dns.iamworkin.lan/healthz" # root auth-gated by OIDC; /healthz anonymous 200
+              - "https://signalcontrol.iamworkin.lan/health" # FlowerCore.SignalControl Pi control plane
+              - "https://flowercore.iamworkin.lan/healthz" # FlowerCore landing
+              - "https://replay.iamworkin.lan/healthz" # FlowerCore.Signage replay surface
+              - "https://worldbuilder.iamworkin.lan/healthz" # FlowerCore.WorldBuilder
+              - "https://updates.iamworkin.lan/api/v1/manifests/_schema" # UpdateCenter plural LAN alias
+              - "https://updatecenter-internal.iamworkin.lan/api/v1/manifests/_schema" # internal UC schema route
               - "https://chat.iamworkin.lan/healthz" # OIDC staged; keep blackbox off root before enforcement flips
               - "https://dist.iamworkin.lan/healthz" # root/admin auth-gated by OIDC; /healthz anonymous 200
               - "https://dms.iamworkin.lan/healthz" # future OIDC posture; health route is already anonymous/live
@@ -911,12 +922,15 @@ data:
           # of idle and SNMP times out, so 5m for: would page nightly. A
           # genuine printer outage (jam, disconnected) lasts well over 30m.
           - alert: EpsonPrinterDown
-            expr: up{job="snmp-printer"} == 0
+            # Only evaluate between 13:00 and 00:59 (hour() is in UTC). hour() has no
+            # labels, so on() is required for `and` to match the labelled up series.
+            expr: (max_over_time(up{job="snmp-printer"}[35m]) == bool 0) == 1 and on() (hour() >= 13 or hour() < 1)
             for: 30m
             labels:
-              severity: warning
+              severity: info
+              alert_channel: irc
             annotations:
-              summary: "Epson ET-3750 SNMP unreachable for >30m (likely actual fault, not sleep)"
+              summary: "Epson ET-3750 SNMP unreachable during waking hours (30m)"
 
           - alert: SynologyDiskLow
             expr: hrStorageUsed{job="snmp-nas"} / hrStorageSize{job="snmp-nas"} * 100 > 85