From b0a3ef7448fe525e5e5f0a99d62ccf151784011e Mon Sep 17 00:00:00 2001 From: Andrew Stoltz Date: Wed, 10 Jun 2026 16:23:49 -0500 Subject: [PATCH] monitoring: delay PiManagerDown duplicate pages --- apps/github-runner/README.md | 6 ++++++ apps/monitoring/noc-monitoring.yaml | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/apps/github-runner/README.md b/apps/github-runner/README.md index a2e69dc..e3756ef 100644 --- a/apps/github-runner/README.md +++ b/apps/github-runner/README.md @@ -24,6 +24,12 @@ original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses two replicas with per-pod `emptyDir` caches. That is the safe backlog-drain strategy: no two pods share one RWO PVC. +Ephemeral runner pods are expected to register, run one job, deregister, and +exit so the Deployment starts a fresh pod for the next registration token. A +small amount of exit-1/restart churn from token-expiry or no-work windows is +accepted operational noise as long as jobs are not stuck queued and the +repo-scoped runner-offline alerts stay quiet. + Sprint 32 final long-tail wave adds 16 two-replica Deployments: `FlowerCore.Knowledge`, `FlowerCore.LlmBridge`, `FlowerCore.Media`, `FlowerCore.Presentations`, `FlowerCore.RemoteDesktop`, `FlowerCore.DNS`, diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml index 0a513b3..c4a5ff3 100644 --- a/apps/monitoring/noc-monitoring.yaml +++ b/apps/monitoring/noc-monitoring.yaml @@ -843,7 +843,9 @@ data: rules: - alert: PiManagerDown expr: up{job="pimanager-app"} == 0 - for: 3m + # Sprint 67: delayed behind NodeDown's critical page so a powered-off + # Pi does not page ahead of NodeDown as a duplicate for the same host. + for: 8m labels: severity: warning annotations: