check-pfsense-dns: add live-cluster scan

Extends the pre-merge DNS gate to (optionally) scan live-cluster
Certificates + IngressRoutes via kubectl. Closes the coverage hole
where a service's IngressRoute gets deployed from its own repo (not
from bluejay-infra/apps/) and the manifests-only scan misses it —
fc-retail/retail-web-tls stuck Issuing for 15h on a missing pfSense
Unbound override was exactly this class of bug.

Auto mode: if kubectl is on PATH and usable, the live scan runs automatically and prints a one-line summary; otherwise it is skipped with a notice.
--live forces it (and errors out if kubectl is missing or can't reach the cluster).
--no-live skips the live scan entirely (CI path with no cluster access).

Immediate live-scan finding on 2026-04-23: 10 orphan *.iamworkin.lan
IngressRoutes from failed e2e / codex / smoke / deleteproof test runs
in fc-php + fc-tenant-default (2026-04-16/17). None have DNS overrides
so their Certificates have been failing to issue for 7 days — the new
CertManagerCertificateNotReady alert will catch them too. Cleanup
(delete abandoned IngressRoutes + Certificates + CertificateRequests)
is a separate task; this check now surfaces them.
This commit is contained in:
Andrew Stoltz
2026-04-23 15:51:19 -05:00
parent 4da60820c6
commit 5ccf055465

View File

@@ -2,14 +2,25 @@
""" """
check-pfsense-dns.py check-pfsense-dns.py
Fails if any apps/*/*.yaml references an iamworkin.lan hostname in a Fails if any *.iamworkin.lan hostname referenced by a cert-manager Certificate
cert-manager Certificate `spec.dnsNames` or a Traefik IngressRoute `spec.dnsNames` or a Traefik IngressRoute `Host(...)` match rule does NOT
`Host(...)` match rule that does NOT resolve via the system DNS resolver resolve via the system DNS resolver (pfSense Unbound at 10.0.56.1 on this LAN).
(which on this LAN is pfSense Unbound at 10.0.56.1).
Run from anywhere that uses pfSense as a resolver (LAN hosts, noc1, BLUEJAY-WS): Two sources are scanned:
python scripts/check-pfsense-dns.py 1. apps/*/*.yaml in this bluejay-infra checkout — the pre-merge gate.
2. Live-cluster Certificates + IngressRoutes (opt-in with --live, or auto when
kubectl is on PATH AND kubeconfig is usable). This catches hostnames that
exist in the running cluster but aren't (yet) tracked in bluejay-infra —
e.g. services deployed via their own repo's deploy script. Retail.Web on
2026-04-23 was stuck Issuing for 15h because of exactly this gap.
Run from anywhere that uses pfSense as a resolver (LAN hosts, noc1,
BLUEJAY-WS):
python scripts/check-pfsense-dns.py # auto live scan if kubectl works
python scripts/check-pfsense-dns.py --live # require live scan
python scripts/check-pfsense-dns.py --no-live # manifests only (CI default)
Exit code 0: all referenced hosts resolve. 1: at least one doesn't. Exit code 0: all referenced hosts resolve. 1: at least one doesn't.
@@ -19,9 +30,13 @@ server-link names, Docker image tags, comments, etc. are ignored.
""" """
from __future__ import annotations from __future__ import annotations
import argparse
import json
import os import os
import re import re
import shutil
import socket import socket
import subprocess
import sys import sys
from pathlib import Path from pathlib import Path
@@ -35,6 +50,8 @@ APPS_DIR = REPO_ROOT / "apps"
HOST_RE = re.compile(r"Host\(`([^`]+)`\)") HOST_RE = re.compile(r"Host\(`([^`]+)`\)")
LIVE_SOURCE = "live-cluster"
def extract_hosts_from_doc(doc: dict) -> set[str]: def extract_hosts_from_doc(doc: dict) -> set[str]:
"""Pull iamworkin.lan hostnames from a single K8s manifest doc.""" """Pull iamworkin.lan hostnames from a single K8s manifest doc."""
@@ -60,7 +77,7 @@ def extract_hosts_from_doc(doc: dict) -> set[str]:
return out return out
def collect_hosts() -> dict[str, list[str]]: def collect_hosts_from_manifests() -> dict[str, list[str]]:
"""hostname -> [list of manifest files that referenced it].""" """hostname -> [list of manifest files that referenced it]."""
index: dict[str, list[str]] = {} index: dict[str, list[str]] = {}
for path in sorted(APPS_DIR.rglob("*.yaml")): for path in sorted(APPS_DIR.rglob("*.yaml")):
@@ -74,6 +91,68 @@ def collect_hosts() -> dict[str, list[str]]:
return index return index
def _kubectl_json(args: list[str]) -> dict | None:
    """Run ``kubectl <args> -o json`` and return the parsed JSON object.

    Args:
        args: kubectl arguments, without the executable name and without the
            output flag (``-o json`` is appended here).

    Returns:
        The decoded JSON object, or ``None`` when kubectl cannot be started,
        does not finish within 20 seconds, exits non-zero, or prints
        something that is not a JSON object.
    """
    try:
        proc = subprocess.run(
            ["kubectl", *args, "-o", "json"],
            capture_output=True,
            text=True,
            timeout=20,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers a missing binary (FileNotFoundError) as well as a
        # kubectl that exists but cannot be executed (PermissionError).
        return None
    if proc.returncode != 0:
        return None
    try:
        parsed = json.loads(proc.stdout)
    except json.JSONDecodeError:
        return None
    # Callers call .get() on the result right away, so only a JSON object
    # counts as success; a list or scalar is treated like any other failure.
    return parsed if isinstance(parsed, dict) else None
def _live_ref(kind: str, item: dict) -> str:
    """Build the "<LIVE_SOURCE> <kind> <namespace>/<name>" label for one cluster object."""
    # `or {}` mirrors the handling of `spec`: a present-but-null key must not crash.
    meta = item.get("metadata") or {}
    return f"{LIVE_SOURCE} {kind} {meta.get('namespace', '?')}/{meta.get('name', '?')}"


def collect_hosts_from_cluster() -> tuple[dict[str, list[str]], bool]:
    """Scan the live cluster for *.iamworkin.lan hostnames.

    Returns:
        ``(host_index, ok)``. ``host_index`` maps each hostname to the list of
        cluster objects that reference it. ``ok`` is ``False`` when kubectl is
        not on PATH or the Certificate query failed; the caller decides
        whether that is fatal (--live) or just a warning (auto mode).
    """
    if not shutil.which("kubectl"):
        return {}, False

    # Certificates (cert-manager.io/v1) — spec.dnsNames
    certs = _kubectl_json(["get", "certificate", "-A"])
    if certs is None:
        return {}, False

    index: dict[str, list[str]] = {}

    for item in certs.get("items") or []:
        if not isinstance(item, dict):
            continue
        ref = _live_ref("Certificate", item)
        for dn in (item.get("spec") or {}).get("dnsNames") or []:
            if isinstance(dn, str) and dn.endswith(".iamworkin.lan"):
                index.setdefault(dn, []).append(ref)

    # IngressRoutes (traefik.io/v1alpha1) — spec.routes[].match Host(...)
    # The CRD may or may not be installed. Silent skip when it isn't.
    irs = _kubectl_json(["get", "ingressroute", "-A"])
    if irs is not None:
        for item in irs.get("items") or []:
            if not isinstance(item, dict):
                continue
            ref = _live_ref("IngressRoute", item)
            for route in (item.get("spec") or {}).get("routes") or []:
                match = route.get("match") if isinstance(route, dict) else None
                # Same type guard the Certificate loop applies to dnsNames:
                # a null or non-string match would make HOST_RE.findall raise.
                if not isinstance(match, str):
                    continue
                for host in HOST_RE.findall(match):
                    if host.endswith(".iamworkin.lan"):
                        index.setdefault(host, []).append(ref)

    return index, True
def resolves(host: str) -> str | None: def resolves(host: str) -> str | None:
try: try:
return socket.gethostbyname(host) return socket.gethostbyname(host)
@@ -82,9 +161,43 @@ def resolves(host: str) -> str | None:
def main() -> int: def main() -> int:
hosts = collect_hosts() parser = argparse.ArgumentParser(description=__doc__.splitlines()[1] if __doc__ else None)
parser.add_argument(
"--live",
dest="live",
action="store_true",
default=None,
help="Require a live-cluster scan (fail if kubectl unreachable).",
)
parser.add_argument(
"--no-live",
dest="live",
action="store_false",
help="Skip live-cluster scan (manifests only).",
)
args = parser.parse_args()
hosts = collect_hosts_from_manifests()
live_requested = args.live is True
live_auto = args.live is None # neither --live nor --no-live
if live_requested or live_auto:
live_hosts, live_ok = collect_hosts_from_cluster()
if live_requested and not live_ok:
print("ERROR: --live requested but kubectl is not available or auth failed.", file=sys.stderr)
return 2
if live_ok:
before = len(hosts)
for host, refs in live_hosts.items():
hosts.setdefault(host, []).extend(refs)
new_hosts = len(hosts) - before
print(f"(live scan: {len(live_hosts)} cluster host(s); {new_hosts} not covered by manifests)")
elif live_auto:
print("(kubectl not reachable — skipping live scan; run from a workstation with cluster access to catch retail-style drift)")
if not hosts: if not hosts:
print(f"No iamworkin.lan hostnames found in {APPS_DIR} — nothing to check.") print(f"No iamworkin.lan hostnames found in manifests or cluster — nothing to check.")
return 0 return 0
failed: list[tuple[str, list[str]]] = [] failed: list[tuple[str, list[str]]] = []
@@ -98,9 +211,11 @@ def main() -> int:
if failed: if failed:
print() print()
print(f"ERROR: {len(failed)} host(s) referenced in manifests but not in pfSense Unbound.") print(f"ERROR: {len(failed)} host(s) referenced but not in pfSense Unbound.")
for host, files in failed: for host, refs in failed:
print(f" {host} (referenced in: {', '.join(sorted(set(files)))})") print(f" {host}")
for ref in sorted(set(refs)):
print(f" via: {ref}")
print() print()
print("Add them before merging — see README.md step 1.") print("Add them before merging — see README.md step 1.")
print() print()