#!/usr/bin/env python3
"""
check-pfsense-dns.py

Historical name retained for continuity, but the check now runs through the
public FlowerCore.DNS preflight API instead of a raw local resolver lookup.

Fails if any *.iamworkin.lan hostname referenced by a cert-manager Certificate
`spec.dnsNames` or a Traefik IngressRoute `Host(...)` match rule is NOT
resolvable via FlowerCore.DNS:

    GET /api/v1/zones/{zone}/resolve-preflight?hostname={hostname}

Two sources are scanned:
  1. apps/**/*.yaml (recursive) in this bluejay-infra checkout — the
     pre-merge gate.
  2. Live-cluster Certificates + IngressRoutes (opt-in with --live, or auto
     when kubectl is on PATH AND kubeconfig is usable). This catches hostnames
     that exist in the running cluster but aren't (yet) tracked in
     bluejay-infra — e.g. services deployed via their own repo's deploy
     script. Retail.Web on 2026-04-23 was stuck Issuing for 15h because of
     exactly this gap.

Run from anywhere that can reach the FlowerCore.DNS host:

    python scripts/check-pfsense-dns.py            # auto live scan if kubectl works
    python scripts/check-pfsense-dns.py --live     # require live scan
    python scripts/check-pfsense-dns.py --no-live  # manifests only (CI default)

Exit code 0: all referenced hosts pass FlowerCore.DNS preflight.
Exit code 1: at least one host fails preflight.
Exit code 2: --live requested but kubectl was unusable.

This is intentionally narrow: it only flags hostnames that cert-manager will
actually try to validate or that Traefik will route. IRC server-link names,
Docker image tags, comments, etc. are ignored.
"""

from __future__ import annotations

import argparse
import json
import os
import re
import shutil
import ssl
import subprocess
import sys
import urllib.error
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path

try:
    import yaml  # PyYAML
except ImportError:
    # Deferred failure: collect_hosts_from_manifests() exits with the install
    # hint when it actually needs PyYAML. Exiting here would also kill
    # `--help` and any process that merely imports this module.
    yaml = None

REPO_ROOT = Path(__file__).resolve().parent.parent
APPS_DIR = REPO_ROOT / "apps"

# Kept for backward compatibility: matches only the single-argument
# Host(`name`) form. Extraction below uses the two patterns that follow.
HOST_RE = re.compile(r"Host\(`([^`]+)`\)")

# Argument list of a Host(...) matcher. The word boundary plus the literal
# "(" keeps HostSNI(...) / HostRegexp(...) from matching.
_HOST_CALL_RE = re.compile(r"\bHost\(([^)]*)\)")
# One quoted value inside a matcher's argument list; Traefik rules accept
# backticks or double quotes.
_QUOTED_ARG_RE = re.compile(r'[`"]([^`"]+)[`"]')
# Only hostnames under this suffix are checked.
_HOST_SUFFIX = ".iamworkin.lan"

LIVE_SOURCE = "live-cluster"

DEFAULT_BASE_URL = os.environ.get("FLOWERCORE_DNS_BASE_URL", "https://dns.iamworkin.lan")
DEFAULT_ZONE = os.environ.get("FLOWERCORE_DNS_ZONE", "iamworkin.lan")
DEFAULT_TIMEOUT_SECONDS = float(os.environ.get("FLOWERCORE_DNS_TIMEOUT_SECONDS", "20"))
DEFAULT_WORKERS = max(1, int(os.environ.get("FLOWERCORE_DNS_WORKERS", "8")))


@dataclass(frozen=True)
class PreflightResult:
    """Outcome of one FlowerCore.DNS resolve-preflight call for one hostname."""

    host: str  # hostname that was checked
    ok: bool  # True when the host passed preflight
    resolved_zone: str | None  # "resolvedZone" from the response, if any
    server_name: str | None  # "serverName" from the response, if any
    provider: str | None  # "provider" from the response, if any
    addresses: list[str]  # string entries of the response's "addresses"
    challenge_fqdn: str  # ACME DNS-01 challenge record name for the host
    message: str  # human-readable detail, shown when the check fails


def _hosts_in_match_rule(match: object) -> list[str]:
    """Return the zone hostnames named by Host(...) matchers in a Traefik rule.

    Handles both the single-host and the multi-host ``Host(`a`, `b`)`` forms.
    Anything that is not a string (e.g. a missing/null ``match``) yields [].
    """
    if not isinstance(match, str):
        return []
    found: list[str] = []
    for call_args in _HOST_CALL_RE.findall(match):
        for name in _QUOTED_ARG_RE.findall(call_args):
            if name.endswith(_HOST_SUFFIX):
                found.append(name)
    return found


def _hosts_from_spec(kind: object, spec: object) -> list[str]:
    """Return zone hostnames referenced by a Certificate/IngressRoute ``spec``.

    Shared by the manifest scan and the live-cluster scan so both apply the
    same rules. Order and duplicates are preserved; other kinds and malformed
    specs yield [].
    """
    if not isinstance(spec, dict):
        return []
    found: list[str] = []
    if kind == "Certificate":
        names = spec.get("dnsNames")
        if isinstance(names, list):
            for name in names:
                if isinstance(name, str) and name.endswith(_HOST_SUFFIX):
                    found.append(name)
    elif kind == "IngressRoute":
        routes = spec.get("routes")
        if isinstance(routes, list):
            for route in routes:
                if isinstance(route, dict):
                    found.extend(_hosts_in_match_rule(route.get("match")))
    return found


def extract_hosts_from_doc(doc: dict) -> set[str]:
    """Pull iamworkin.lan hostnames from a single K8s manifest doc.

    Returns an empty set for non-dict documents (e.g. an empty YAML document)
    and for kinds other than Certificate / IngressRoute.
    """
    if not isinstance(doc, dict):
        return set()
    return set(_hosts_from_spec(doc.get("kind", ""), doc.get("spec")))


def collect_hosts_from_manifests() -> dict[str, list[str]]:
    """hostname -> [list of manifest files that referenced it].

    Recursively scans ``APPS_DIR`` for ``*.yaml`` files. Files that cannot be
    read or parsed are reported on stderr and skipped; hosts found in earlier
    documents of such a file are kept. Exits the process with an install hint
    when PyYAML is not available.
    """
    if yaml is None:
        sys.exit("PyYAML required: pip install pyyaml")

    index: dict[str, list[str]] = {}
    for path in sorted(APPS_DIR.rglob("*.yaml")):
        rel_path = str(path.relative_to(REPO_ROOT))
        try:
            with path.open("r", encoding="utf-8") as f:
                for doc in yaml.safe_load_all(f):
                    for host in extract_hosts_from_doc(doc):
                        index.setdefault(host, []).append(rel_path)
        except yaml.YAMLError as e:
            print(f"warn: could not parse {path}: {e}", file=sys.stderr)
        except (OSError, UnicodeDecodeError) as e:
            # e.g. a directory named *.yaml or a non-UTF-8 file: warn like a
            # parse error instead of aborting the whole scan.
            print(f"warn: could not read {path}: {e}", file=sys.stderr)
    return index


def _kubectl_json(args: list[str]) -> dict | None:
    """Run `kubectl ... -o json` and return the parsed result, or None on failure.

    Failure means: kubectl could not be started, timed out after 20 s, exited
    non-zero, or printed something that is not a JSON object.
    """
    try:
        r = subprocess.run(
            ["kubectl", *args, "-o", "json"],
            capture_output=True,
            text=True,
            timeout=20,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers FileNotFoundError as well as a non-executable kubectl.
        return None
    if r.returncode != 0:
        return None
    try:
        data = json.loads(r.stdout)
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None


def _index_live_items(index: dict[str, list[str]], kind: str, listing: dict) -> None:
    """Add every zone hostname in a kubectl list result to ``index`` (in place)."""
    items = listing.get("items")
    if not isinstance(items, list):
        return
    for item in items:
        if not isinstance(item, dict):
            continue
        meta = item.get("metadata")
        if not isinstance(meta, dict):
            meta = {}
        ns = meta.get("namespace", "?")
        name = meta.get("name", "?")
        ref = f"{LIVE_SOURCE} {kind} {ns}/{name}"
        for host in _hosts_from_spec(kind, item.get("spec")):
            index.setdefault(host, []).append(ref)


def collect_hosts_from_cluster() -> tuple[dict[str, list[str]], bool]:
    """
    Scan live cluster. Returns (host_index, ok). ok=False means kubectl wasn't
    usable (not on PATH, or listing Certificates failed); the caller decides
    whether that's fatal (--live) or just a warning (auto mode).

    Listing IngressRoutes is best-effort: if that call fails, the Certificate
    hosts are still returned with ok=True.
    """
    if not shutil.which("kubectl"):
        return {}, False

    certs = _kubectl_json(["get", "certificate", "-A"])
    if certs is None:
        return {}, False

    index: dict[str, list[str]] = {}
    _index_live_items(index, "Certificate", certs)

    irs = _kubectl_json(["get", "ingressroute", "-A"])
    if irs is not None:
        _index_live_items(index, "IngressRoute", irs)

    return index, True


def _ssl_context(insecure: bool) -> ssl.SSLContext:
    """Return the TLS context for FlowerCore.DNS calls.

    With ``insecure`` set, certificate and hostname verification are turned
    off using the public ssl API rather than a private helper.
    """
    context = ssl.create_default_context()
    if insecure:
        # check_hostname must be cleared before verify_mode can be CERT_NONE.
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE
    return context
def _failed_preflight(host: str, message: str) -> PreflightResult:
    """Build the failure result used when no usable preflight answer exists."""
    return PreflightResult(
        host=host,
        ok=False,
        resolved_zone=None,
        server_name=None,
        provider=None,
        addresses=[],
        challenge_fqdn=f"_acme-challenge.{host.rstrip('.')}.",
        message=message,
    )


def preflight_host(
    base_url: str,
    zone: str,
    host: str,
    timeout_seconds: float,
    insecure: bool,
) -> PreflightResult:
    """Ask FlowerCore.DNS whether ``host`` passes resolve-preflight in ``zone``.

    Issues ``GET {base_url}/api/v1/zones/{zone}/resolve-preflight?hostname={host}``
    and maps the JSON answer onto a PreflightResult. The host is OK only when
    the response reports ACME DNS-01 support, reports the host as resolvable
    and names a resolved zone.

    Never raises for transport or payload problems: HTTP errors, network
    errors, timeouts, undecodable bodies and unexpected JSON shapes all come
    back as a failed result whose ``message`` carries the detail. This
    matters because the function runs inside a worker thread and an escaping
    exception would abort the whole gate.
    """
    path = (
        f"/api/v1/zones/{urllib.parse.quote(zone, safe='')}/resolve-preflight"
        f"?hostname={urllib.parse.quote(host, safe='')}"
    )
    url = urllib.parse.urljoin(base_url.rstrip("/") + "/", path.lstrip("/"))
    request = urllib.request.Request(url, headers={"Accept": "application/json"})

    try:
        with urllib.request.urlopen(
            request,
            timeout=timeout_seconds,
            context=_ssl_context(insecure),
        ) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        body = exc.read().decode("utf-8", errors="replace").strip()
        # Cap the echoed body so an HTML error page does not flood the report.
        detail = body[:200] if body else exc.reason
        return _failed_preflight(host, f"HTTP {exc.code}: {detail}")
    except Exception as exc:  # noqa: BLE001 - surfaced as preflight failure detail
        return _failed_preflight(host, f"{type(exc).__name__}: {exc}")

    if not isinstance(payload, dict):
        # Valid JSON but not an object (list, null, string, ...).
        return _failed_preflight(
            host,
            f"Unexpected preflight response: expected a JSON object, got {type(payload).__name__}",
        )

    resolved_zone = payload.get("resolvedZone")
    server_name = payload.get("serverName")
    provider = payload.get("provider")
    raw_addresses = payload.get("addresses")
    addresses = (
        [value for value in raw_addresses if isinstance(value, str)]
        if isinstance(raw_addresses, list)
        else []
    )
    supports_acme = bool(payload.get("supportsAcmeDns01"))
    resolvable = bool(payload.get("resolvable"))
    # `or` rather than a .get default: a JSON null must not become "None".
    challenge_fqdn = str(payload.get("challengeFqdn") or f"_acme-challenge.{host.rstrip('.')}.")
    message = str(payload.get("message") or "").strip()

    ok = supports_acme and resolvable and bool(resolved_zone)
    if not message:
        # Make sure a failure never prints an empty reason.
        if not supports_acme:
            message = "Matched DNS server does not advertise ACME DNS-01 support."
        elif not resolvable:
            message = "FlowerCore.DNS preflight did not report the hostname as resolvable."
        elif not resolved_zone:
            message = "FlowerCore.DNS preflight response did not include a resolved zone."

    return PreflightResult(
        host=host,
        ok=ok,
        resolved_zone=resolved_zone,
        server_name=server_name,
        provider=provider,
        addresses=addresses,
        challenge_fqdn=challenge_fqdn,
        message=message,
    )


def run_preflight(
    hosts: list[str],
    base_url: str,
    zone: str,
    timeout_seconds: float,
    insecure: bool,
    workers: int,
) -> dict[str, PreflightResult]:
    """Run preflight_host for every host in parallel and return host -> result.

    Uses a thread pool of at most ``workers`` threads (never more than the
    number of hosts, never fewer than one). Returns an empty dict when there
    is nothing to check.
    """
    if not hosts:
        return {}

    max_workers = max(1, min(workers, len(hosts)))
    results: dict[str, PreflightResult] = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        future_map = {
            pool.submit(preflight_host, base_url, zone, host, timeout_seconds, insecure): host
            for host in hosts
        }
        for future in as_completed(future_map):
            results[future_map[future]] = future.result()
    return results


def _doc_summary() -> str | None:
    """Return the first non-empty line of the module docstring, if there is one."""
    for line in (__doc__ or "").splitlines():
        if line.strip():
            return line.strip()
    return None


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the preflight gate."""
    parser = argparse.ArgumentParser(description=_doc_summary())
    # --live / --no-live share one dest; None (neither given) means "auto".
    parser.add_argument(
        "--live",
        dest="live",
        action="store_true",
        default=None,
        help="Require a live-cluster scan (fail if kubectl unreachable).",
    )
    parser.add_argument(
        "--no-live",
        dest="live",
        action="store_false",
        help="Skip live-cluster scan (manifests only).",
    )
    parser.add_argument(
        "--base-url",
        default=DEFAULT_BASE_URL,
        help=f"FlowerCore.DNS base URL (default: {DEFAULT_BASE_URL}).",
    )
    parser.add_argument(
        "--zone",
        default=DEFAULT_ZONE,
        help=f"Zone passed to resolve-preflight (default: {DEFAULT_ZONE}).",
    )
    parser.add_argument(
        "--timeout-seconds",
        type=float,
        default=DEFAULT_TIMEOUT_SECONDS,
        help=f"Per-host resolve-preflight timeout (default: {DEFAULT_TIMEOUT_SECONDS}).",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=DEFAULT_WORKERS,
        help=f"Parallel preflight workers (default: {DEFAULT_WORKERS}).",
    )
    parser.add_argument(
        "--insecure",
        action="store_true",
        help="Skip TLS verification when calling FlowerCore.DNS.",
    )
    return parser


def _print_failure_report(
    failed: list[tuple[str, list[str], PreflightResult]],
    base_url: str,
    zone: str,
) -> None:
    """Print the per-host failure details followed by an example fix."""
    api_root = base_url.rstrip("/")

    print()
    print(f"ERROR: {len(failed)} host(s) failed FlowerCore.DNS preflight.")
    for host, refs, result in failed:
        print(f" {host}")
        print(f" preflight: {result.message}")
        print(f" challenge: {result.challenge_fqdn}")
        for ref in sorted(set(refs)):
            print(f" via: {ref}")
    print()
    print("Fix the DNS record in FlowerCore.DNS before merging, then rerun this gate.")
    print()
    print("Example:")
    print(f" curl -sk {api_root}/api/v1/servers")
    # SERVER_ID / RECORD_NAME are placeholders for the operator to fill in;
    # plain tokens are used so a literal copy-paste is not parsed by the shell
    # as a redirection.
    print(
        " curl -sk -X POST "
        f"{api_root}/api/v1/servers/SERVER_ID/zones/{zone}/records "
        "-H 'Content-Type: application/json' "
        "-d '{\"name\":\"RECORD_NAME\",\"type\":\"A\",\"data\":\"10.0.56.200\",\"ttl\":300}'"
    )


def main() -> int:
    """Run the gate and return the process exit code.

    0: every referenced host passed preflight (or there was nothing to check).
    1: at least one host failed preflight.
    2: --live was requested but the live-cluster scan was not possible.
    """
    args = _build_parser().parse_args()

    hosts = collect_hosts_from_manifests()

    live_requested = args.live is True
    live_auto = args.live is None
    if live_requested or live_auto:
        live_hosts, live_ok = collect_hosts_from_cluster()
        if live_requested and not live_ok:
            print("ERROR: --live requested but kubectl is not available or auth failed.", file=sys.stderr)
            return 2
        if live_ok:
            before = len(hosts)
            for host, refs in live_hosts.items():
                hosts.setdefault(host, []).extend(refs)
            new_hosts = len(hosts) - before
            print(f"(live scan: {len(live_hosts)} cluster host(s); {new_hosts} not covered by manifests)")
        elif live_auto:
            print("(kubectl not reachable — skipping live scan; run from a workstation with cluster access to catch retail-style drift)")

    if not hosts:
        print("No iamworkin.lan hostnames found in manifests or cluster — nothing to check.")
        return 0

    print(
        f"(preflight: {len(hosts)} host(s) via {args.base_url.rstrip('/')}"
        f"/api/v1/zones/{args.zone}/resolve-preflight)"
    )
    results = run_preflight(
        sorted(hosts),
        base_url=args.base_url,
        zone=args.zone,
        timeout_seconds=args.timeout_seconds,
        insecure=args.insecure,
        workers=args.workers,
    )

    failed: list[tuple[str, list[str], PreflightResult]] = []
    for host in sorted(hosts):
        result = results[host]
        if result.ok:
            addresses = ", ".join(result.addresses) if result.addresses else "(no A/AAAA answers)"
            zone_label = result.resolved_zone or args.zone
            server_label = result.server_name or "unknown-server"
            print(f"OK {host:<45} -> {addresses} via {server_label} [{zone_label}]")
        else:
            print(f"FAIL {host:<45} ({result.message})")
            failed.append((host, hosts[host], result))

    if failed:
        _print_failure_report(failed, args.base_url, args.zone)
        return 1

    print()
    print(f"All {len(hosts)} iamworkin.lan host(s) passed FlowerCore.DNS preflight. Safe to deploy.")
    return 0


if __name__ == "__main__":
    sys.exit(main())