400 lines
14 KiB
Python
400 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
check-pfsense-dns.py
|
|
|
|
Historical name retained for continuity, but the check now runs through the
|
|
public FlowerCore.DNS preflight API instead of a raw local resolver lookup.
|
|
|
|
Fails if any *.iamworkin.lan hostname referenced by a cert-manager Certificate
|
|
`spec.dnsNames` or a Traefik IngressRoute `Host(...)` match rule is NOT
|
|
resolvable via FlowerCore.DNS:
|
|
|
|
GET /api/v1/zones/{zone}/resolve-preflight?hostname=<host>
|
|
|
|
Two sources are scanned:
|
|
|
|
1. apps/**/*.yaml (searched recursively) in this bluejay-infra checkout — the pre-merge gate.
|
|
2. Live-cluster Certificates + IngressRoutes (opt-in with --live, or auto when
|
|
kubectl is on PATH AND kubeconfig is usable). This catches hostnames that
|
|
exist in the running cluster but aren't (yet) tracked in bluejay-infra —
|
|
e.g. services deployed via their own repo's deploy script. Retail.Web on
|
|
2026-04-23 was stuck Issuing for 15h because of exactly this gap.
|
|
|
|
Run from anywhere that can reach the FlowerCore.DNS host:
|
|
|
|
python scripts/check-pfsense-dns.py # auto live scan if kubectl works
|
|
python scripts/check-pfsense-dns.py --live # require live scan
|
|
python scripts/check-pfsense-dns.py --no-live # manifests only (CI default)
|
|
|
|
Exit code 0: all referenced hosts pass FlowerCore.DNS preflight.
|
|
Exit code 1: at least one host fails preflight.
|
|
Exit code 2: --live requested but kubectl was unusable.
|
|
|
|
This is intentionally narrow: it only flags hostnames that cert-manager will
|
|
actually try to validate or that Traefik will route. IRC server-link names,
|
|
Docker image tags, comments, etc. are ignored.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import shutil
|
|
import ssl
|
|
import subprocess
|
|
import sys
|
|
import urllib.error
|
|
import urllib.parse
|
|
import urllib.request
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
try:
|
|
import yaml # PyYAML
|
|
except ImportError:
|
|
sys.exit("PyYAML required: pip install pyyaml")
|
|
|
|
# The repository root is taken to be the grandparent directory of this file;
# the manifests to scan are looked up under <root>/apps.
REPO_ROOT = Path(__file__).resolve().parent.parent
APPS_DIR = REPO_ROOT.joinpath("apps")

# Captures the hostname from a Traefik matcher written as Host(`name`).
# NOTE: only the single-argument form matches; a rule such as
# Host(`a`, `b`) yields nothing because a closing parenthesis must directly
# follow the first back-quoted name.
HOST_RE = re.compile(r"Host\(`([^`]+)`\)")

# Prefix used when labelling references that came from the running cluster.
LIVE_SOURCE = "live-cluster"

# Defaults for the command-line options; each can be overridden through the
# environment. The numeric ones are parsed at import time, so a malformed
# value raises ValueError before any argument parsing happens.
DEFAULT_BASE_URL = os.getenv("FLOWERCORE_DNS_BASE_URL", "https://dns.iamworkin.lan")
DEFAULT_ZONE = os.getenv("FLOWERCORE_DNS_ZONE", "iamworkin.lan")
DEFAULT_TIMEOUT_SECONDS = float(os.getenv("FLOWERCORE_DNS_TIMEOUT_SECONDS", "20"))
DEFAULT_WORKERS = max(1, int(os.getenv("FLOWERCORE_DNS_WORKERS", "8")))  # never below one
|
|
|
|
|
|
@dataclass(frozen=True)
class PreflightResult:
    """Outcome of one resolve-preflight lookup for a single hostname.

    Instances are immutable. Because ``addresses`` is a list, instances
    compare by value but cannot be hashed.
    """

    # Hostname that was checked.
    host: str
    # True when the host passed the preflight check.
    ok: bool
    # Zone reported by the service, or None when it was not reported.
    resolved_zone: str | None
    # Name of the DNS server reported by the service, if any.
    server_name: str | None
    # Provider reported by the service, if any.
    provider: str | None
    # Addresses reported for the host; may be empty.
    addresses: list[str]
    # Fully qualified name of the ACME challenge record for this host.
    challenge_fqdn: str
    # Human-readable detail; explains the failure when ok is False.
    message: str
|
|
|
|
|
|
def extract_hosts_from_doc(doc: dict) -> set[str]:
    """Pull iamworkin.lan hostnames from a single K8s manifest doc.

    Only two kinds are inspected: ``Certificate`` (entries of
    ``spec.dnsNames``) and ``IngressRoute`` (names captured by ``HOST_RE``
    from each ``spec.routes[].match`` rule). Every other kind yields an
    empty set.

    Malformed input never raises: a non-mapping ``doc`` or ``spec``, a
    non-list ``dnsNames``/``routes``, non-mapping routes and non-string
    ``match`` values are all ignored, so one odd manifest cannot abort the
    whole scan.

    Args:
        doc: One parsed YAML/JSON document (anything else is tolerated).

    Returns:
        The set of referenced hostnames ending in ``.iamworkin.lan``.
    """
    if not isinstance(doc, dict):
        return set()

    spec = doc.get("spec")
    if not isinstance(spec, dict):
        # Covers a missing/null spec as well as a scalar or list spec.
        return set()

    kind = doc.get("kind", "")
    if kind == "Certificate":
        names = spec.get("dnsNames")
        candidates = names if isinstance(names, list) else []
    elif kind == "IngressRoute":
        routes = spec.get("routes")
        candidates = [
            host
            for route in (routes if isinstance(routes, list) else [])
            # `match: null` or a non-string rule would make findall() raise.
            if isinstance(route, dict) and isinstance(route.get("match"), str)
            for host in HOST_RE.findall(route["match"])
        ]
    else:
        return set()

    return {
        name
        for name in candidates
        if isinstance(name, str) and name.endswith(".iamworkin.lan")
    }
|
|
|
|
|
|
def collect_hosts_from_manifests() -> dict[str, list[str]]:
    """Index the hostnames referenced by the YAML manifests under APPS_DIR.

    Every ``*.yaml`` file below ``APPS_DIR`` (searched recursively, in sorted
    order) is parsed as a multi-document stream and each document is handed
    to ``extract_hosts_from_doc``.

    A file that fails to parse produces a warning on stderr and is otherwise
    skipped; hostnames from documents read before the error are kept.

    Returns:
        Mapping of hostname -> repo-relative paths of the manifests that
        reference it. A path appears once per referencing document.
    """
    found: dict[str, list[str]] = {}
    for manifest in sorted(APPS_DIR.rglob("*.yaml")):
        try:
            with manifest.open("r", encoding="utf-8") as handle:
                for document in yaml.safe_load_all(handle):
                    for hostname in extract_hosts_from_doc(document):
                        relative = str(manifest.relative_to(REPO_ROOT))
                        found.setdefault(hostname, []).append(relative)
        except yaml.YAMLError as err:
            print(f"warn: could not parse {manifest}: {err}", file=sys.stderr)
    return found
|
|
|
|
|
|
def _kubectl_json(args: list[str]) -> dict | None:
    """Run `kubectl ... -o json` and return the parsed result, or None on failure.

    Args:
        args: kubectl arguments, without the trailing ``-o json``.

    Returns:
        The decoded JSON object, or None when kubectl cannot be started,
        does not finish within 20 seconds, exits non-zero, or prints
        something that is not a JSON object.
    """
    try:
        completed = subprocess.run(
            ["kubectl", *args, "-o", "json"],
            capture_output=True,
            text=True,
            timeout=20,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers a missing binary (FileNotFoundError) and also a
        # kubectl that exists but cannot be executed (e.g. PermissionError).
        return None

    if completed.returncode != 0:
        return None

    try:
        parsed = json.loads(completed.stdout)
    except json.JSONDecodeError:
        return None

    # Callers use .get() on the result, so anything but an object is a failure.
    return parsed if isinstance(parsed, dict) else None
|
|
|
|
|
|
def collect_hosts_from_cluster() -> tuple[dict[str, list[str]], bool]:
    """Index the iamworkin.lan hostnames referenced by the live cluster.

    Certificates and IngressRoutes are listed across all namespaces through
    kubectl.

    Returns:
        ``(host_index, ok)``. ``host_index`` maps hostname -> labels of the
        live objects referencing it. ``ok`` is False when kubectl is not on
        PATH or the Certificate listing failed; the caller decides whether
        that is fatal (--live) or only worth a notice (auto mode). A failed
        IngressRoute listing is tolerated: the Certificate results are still
        returned with ``ok=True``.
    """
    if shutil.which("kubectl") is None:
        return {}, False

    cert_list = _kubectl_json(["get", "certificate", "-A"])
    if cert_list is None:
        return {}, False

    found: dict[str, list[str]] = {}

    def label(kind: str, item: dict) -> str:
        # Human-readable origin, e.g. "live-cluster Certificate ns/name".
        meta = item.get("metadata", {})
        namespace = meta.get("namespace", "?")
        obj_name = meta.get("name", "?")
        return f"{LIVE_SOURCE} {kind} {namespace}/{obj_name}"

    for cert in cert_list.get("items", []):
        origin = label("Certificate", cert)
        dns_names = (cert.get("spec", {}) or {}).get("dnsNames", []) or []
        for dns_name in dns_names:
            if isinstance(dns_name, str) and dns_name.endswith(".iamworkin.lan"):
                found.setdefault(dns_name, []).append(origin)

    route_list = _kubectl_json(["get", "ingressroute", "-A"])
    if route_list is None:
        return found, True

    for ingress in route_list.get("items", []):
        origin = label("IngressRoute", ingress)
        routes = (ingress.get("spec", {}) or {}).get("routes", []) or []
        for route in routes:
            rule = route.get("match", "") if isinstance(route, dict) else ""
            for hostname in HOST_RE.findall(rule):
                if hostname.endswith(".iamworkin.lan"):
                    found.setdefault(hostname, []).append(origin)

    return found, True
|
|
|
|
|
|
def _ssl_context(insecure: bool) -> ssl.SSLContext:
    """Return the TLS context used for FlowerCore.DNS requests.

    Args:
        insecure: When True, certificate and hostname verification are both
            disabled (the --insecure flag). When False, the standard
            verifying default context is returned.

    Returns:
        A client-side ``ssl.SSLContext``.
    """
    context = ssl.create_default_context()
    if insecure:
        # Built from public API instead of the private
        # ssl._create_unverified_context(). Order matters: check_hostname
        # must be switched off before verify_mode may become CERT_NONE.
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE
    return context
|
|
|
|
|
|
def _failed_preflight(host: str, message: str) -> PreflightResult:
    """Build the failure result used when no usable preflight payload was obtained."""
    return PreflightResult(
        host=host,
        ok=False,
        resolved_zone=None,
        server_name=None,
        provider=None,
        addresses=[],
        challenge_fqdn=f"_acme-challenge.{host.rstrip('.')}.",
        message=message,
    )


def preflight_host(
    base_url: str,
    zone: str,
    host: str,
    timeout_seconds: float,
    insecure: bool,
) -> PreflightResult:
    """Ask FlowerCore.DNS whether ``host`` passes resolve-preflight.

    Issues ``GET {base_url}/api/v1/zones/{zone}/resolve-preflight?hostname={host}``
    and interprets the JSON answer.

    Args:
        base_url: Base URL of the FlowerCore.DNS service.
        zone: Zone name placed in the request path.
        host: Hostname to check.
        timeout_seconds: Timeout passed to the HTTP request.
        insecure: When True, TLS verification is skipped.

    Returns:
        A PreflightResult. ``ok`` is True only when the payload reports
        ACME DNS-01 support, reports the host as resolvable, and names a
        resolved zone. This function does not raise for transport, HTTP,
        decoding or payload-shape problems; they are returned as a failed
        result whose ``message`` carries the detail.
    """
    path = (
        f"/api/v1/zones/{urllib.parse.quote(zone, safe='')}/resolve-preflight"
        f"?hostname={urllib.parse.quote(host, safe='')}"
    )
    url = urllib.parse.urljoin(base_url.rstrip("/") + "/", path.lstrip("/"))
    request = urllib.request.Request(url, headers={"Accept": "application/json"})

    try:
        with urllib.request.urlopen(
            request,
            timeout=timeout_seconds,
            context=_ssl_context(insecure),
        ) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        # Must precede the generic handler so the status code and the first
        # part of the error body are reported.
        body = exc.read().decode("utf-8", errors="replace").strip()
        detail = body[:200] if body else exc.reason
        return _failed_preflight(host, f"HTTP {exc.code}: {detail}")
    except Exception as exc:  # noqa: BLE001 - surfaced as preflight failure detail
        return _failed_preflight(host, f"{type(exc).__name__}: {exc}")

    if not isinstance(payload, dict):
        # Valid JSON that is not an object (null, list, ...) would otherwise
        # raise below, outside the try, and abort the whole run.
        return _failed_preflight(
            host,
            f"Unexpected preflight response: expected a JSON object, got {type(payload).__name__}",
        )

    resolved_zone = payload.get("resolvedZone")
    server_name = payload.get("serverName")
    provider = payload.get("provider")
    raw_addresses = payload.get("addresses")
    addresses = (
        [value for value in raw_addresses if isinstance(value, str)]
        if isinstance(raw_addresses, list)
        else []
    )
    supports_acme = bool(payload.get("supportsAcmeDns01"))
    resolvable = bool(payload.get("resolvable"))
    # `or` rather than a .get() default, so an explicit null does not turn
    # into the literal text "None".
    challenge_fqdn = str(
        payload.get("challengeFqdn") or f"_acme-challenge.{host.rstrip('.')}."
    )
    message = str(payload.get("message") or "").strip()

    ok = supports_acme and resolvable and bool(resolved_zone)

    if not message:
        # Make sure a failure never reaches the report with an empty reason.
        if not supports_acme:
            message = "Matched DNS server does not advertise ACME DNS-01 support."
        elif not resolvable:
            message = "Hostname is not resolvable via FlowerCore.DNS."
        elif not resolved_zone:
            message = "FlowerCore.DNS did not report a resolved zone."

    return PreflightResult(
        host=host,
        ok=ok,
        resolved_zone=resolved_zone,
        server_name=server_name,
        provider=provider,
        addresses=addresses,
        challenge_fqdn=challenge_fqdn,
        message=message,
    )
|
|
|
|
|
|
def run_preflight(
    hosts: list[str],
    base_url: str,
    zone: str,
    timeout_seconds: float,
    insecure: bool,
    workers: int,
) -> dict[str, PreflightResult]:
    """Run ``preflight_host`` for every host on a thread pool.

    Args:
        hosts: Hostnames to check.
        base_url: FlowerCore.DNS base URL, forwarded to ``preflight_host``.
        zone: Zone name, forwarded to ``preflight_host``.
        timeout_seconds: Per-request timeout, forwarded to ``preflight_host``.
        insecure: TLS-verification switch, forwarded to ``preflight_host``.
        workers: Requested parallelism; clamped to between 1 and the number
            of hosts.

    Returns:
        Mapping of hostname -> result, filled in completion order. Empty
        when ``hosts`` is empty (no pool is created in that case).
    """
    if not hosts:
        return {}

    pool_size = max(1, min(workers, len(hosts)))
    with ThreadPoolExecutor(max_workers=pool_size) as executor:
        pending = {}
        for name in hosts:
            job = executor.submit(
                preflight_host, base_url, zone, name, timeout_seconds, insecure
            )
            pending[job] = name
        return {pending[done]: done.result() for done in as_completed(pending)}
|
|
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Assemble the command-line interface of the preflight gate."""
    summary = __doc__.splitlines()[1] if __doc__ else None
    cli = argparse.ArgumentParser(description=summary)
    # --live and --no-live write to the same destination; it stays None when
    # neither flag is given, which main() treats as "auto".
    cli.add_argument(
        "--live",
        dest="live",
        action="store_true",
        default=None,
        help="Require a live-cluster scan (fail if kubectl unreachable).",
    )
    cli.add_argument(
        "--no-live",
        dest="live",
        action="store_false",
        help="Skip live-cluster scan (manifests only).",
    )
    cli.add_argument(
        "--base-url",
        default=DEFAULT_BASE_URL,
        help=f"FlowerCore.DNS base URL (default: {DEFAULT_BASE_URL}).",
    )
    cli.add_argument(
        "--zone",
        default=DEFAULT_ZONE,
        help=f"Zone passed to resolve-preflight (default: {DEFAULT_ZONE}).",
    )
    cli.add_argument(
        "--timeout-seconds",
        type=float,
        default=DEFAULT_TIMEOUT_SECONDS,
        help=f"Per-host resolve-preflight timeout (default: {DEFAULT_TIMEOUT_SECONDS}).",
    )
    cli.add_argument(
        "--workers",
        type=int,
        default=DEFAULT_WORKERS,
        help=f"Parallel preflight workers (default: {DEFAULT_WORKERS}).",
    )
    cli.add_argument(
        "--insecure",
        action="store_true",
        help="Skip TLS verification when calling FlowerCore.DNS.",
    )
    return cli


def _print_failure_report(
    failed: list[tuple[str, list[str], PreflightResult]],
    base_url: str,
    zone: str,
) -> None:
    """Print the per-host failure details followed by the remediation hint."""
    endpoint = base_url.rstrip("/")

    print()
    print(f"ERROR: {len(failed)} host(s) failed FlowerCore.DNS preflight.")
    for host, refs, result in failed:
        print(f" {host}")
        print(f" preflight: {result.message}")
        print(f" challenge: {result.challenge_fqdn}")
        # The same reference can be recorded more than once; show it once.
        for ref in sorted(set(refs)):
            print(f" via: {ref}")
    print()
    print("Fix the DNS record in FlowerCore.DNS before merging, then rerun this gate.")
    print()
    print("Example:")
    print(f" curl -sk {endpoint}/api/v1/servers")
    print(
        " curl -sk -X POST "
        f"{endpoint}/api/v1/servers/<serverId>/zones/{zone}/records "
        "-H 'Content-Type: application/json' "
        "-d '{\"name\":\"<host>\",\"type\":\"A\",\"data\":\"10.0.56.200\",\"ttl\":300}'"
    )


def main() -> int:
    """Run the gate: collect hostnames, preflight them, and report.

    Returns:
        0 when every host passes (or there is nothing to check), 1 when at
        least one host fails, 2 when --live was given but the cluster scan
        was not possible.
    """
    args = _build_arg_parser().parse_args()

    hosts = collect_hosts_from_manifests()

    live_requested = args.live is True
    live_auto = args.live is None

    if live_requested or live_auto:
        live_hosts, live_ok = collect_hosts_from_cluster()
        if not live_ok and live_requested:
            print("ERROR: --live requested but kubectl is not available or auth failed.", file=sys.stderr)
            return 2
        if live_ok:
            known_before = len(hosts)
            for host, refs in live_hosts.items():
                hosts.setdefault(host, []).extend(refs)
            new_hosts = len(hosts) - known_before
            print(f"(live scan: {len(live_hosts)} cluster host(s); {new_hosts} not covered by manifests)")
        elif live_auto:
            print("(kubectl not reachable — skipping live scan; run from a workstation with cluster access to catch retail-style drift)")

    if not hosts:
        print("No iamworkin.lan hostnames found in manifests or cluster — nothing to check.")
        return 0

    print(
        f"(preflight: {len(hosts)} host(s) via {args.base_url.rstrip('/')}"
        f"/api/v1/zones/{args.zone}/resolve-preflight)"
    )
    results = run_preflight(
        sorted(hosts),
        base_url=args.base_url,
        zone=args.zone,
        timeout_seconds=args.timeout_seconds,
        insecure=args.insecure,
        workers=args.workers,
    )

    failed: list[tuple[str, list[str], PreflightResult]] = []
    for host in sorted(hosts):
        outcome = results[host]
        if not outcome.ok:
            print(f"FAIL {host:<45} ({outcome.message})")
            failed.append((host, hosts[host], outcome))
            continue
        addresses = ", ".join(outcome.addresses) if outcome.addresses else "(no A/AAAA answers)"
        zone_label = outcome.resolved_zone or args.zone
        server_label = outcome.server_name or "unknown-server"
        print(f"OK {host:<45} -> {addresses} via {server_label} [{zone_label}]")

    if failed:
        _print_failure_report(failed, args.base_url, args.zone)
        return 1

    print()
    print(f"All {len(hosts)} iamworkin.lan host(s) passed FlowerCore.DNS preflight. Safe to deploy.")
    return 0
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|