feat(infra): route dns preflight through flowercore dns

This commit is contained in:
Andrew Stoltz
2026-04-23 17:03:22 -05:00
parent f9593e494a
commit 407d473b71
4 changed files with 256 additions and 66 deletions

View File

@@ -2,9 +2,14 @@
"""
check-pfsense-dns.py
Historical name retained for continuity, but the check now runs through the
public FlowerCore.DNS preflight API instead of a raw local resolver lookup.
Fails if any *.iamworkin.lan hostname referenced by a cert-manager Certificate
`spec.dnsNames` or a Traefik IngressRoute `Host(...)` match rule does NOT
resolve via the system DNS resolver (pfSense Unbound at 10.0.56.1 on this LAN).
`spec.dnsNames` or a Traefik IngressRoute `Host(...)` match rule is NOT
resolvable via FlowerCore.DNS:
GET /api/v1/zones/{zone}/resolve-preflight?hostname=<host>
Two sources are scanned:
@@ -15,18 +20,19 @@ Two sources are scanned:
e.g. services deployed via their own repo's deploy script. Retail.Web on
2026-04-23 was stuck Issuing for 15h because of exactly this gap.
Run from anywhere that uses pfSense as a resolver (LAN hosts, noc1,
BLUEJAY-WS):
Run from anywhere that can reach the FlowerCore.DNS host:
python scripts/check-pfsense-dns.py # auto live scan if kubectl works
python scripts/check-pfsense-dns.py --live # require live scan
python scripts/check-pfsense-dns.py --no-live # manifests only (CI default)
Exit code 0: all referenced hosts resolve. 1: at least one doesn't.
Exit code 0: all referenced hosts pass FlowerCore.DNS preflight.
Exit code 1: at least one host fails preflight.
Exit code 2: --live requested but kubectl was unusable.
This is intentionally narrow: it only flags hostnames that cert-manager will
actually try to validate via HTTP-01, or that Traefik will route. IRC
server-link names, Docker image tags, comments, etc. are ignored.
actually try to validate or that Traefik will route. IRC server-link names,
Docker image tags, comments, etc. are ignored.
"""
from __future__ import annotations
@@ -35,9 +41,14 @@ import json
import os
import re
import shutil
import socket
import ssl
import subprocess
import sys
import urllib.error
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
try:
@@ -51,6 +62,22 @@ APPS_DIR = REPO_ROOT / "apps"
HOST_RE = re.compile(r"Host\(`([^`]+)`\)")
LIVE_SOURCE = "live-cluster"
# Defaults for the FlowerCore.DNS preflight call. Each value can be overridden
# through the environment variable named in its os.environ.get(...) lookup.
DEFAULT_BASE_URL = os.environ.get("FLOWERCORE_DNS_BASE_URL", "https://dns.iamworkin.lan")
DEFAULT_ZONE = os.environ.get("FLOWERCORE_DNS_ZONE", "iamworkin.lan")
# Parsed with float() at import time, so a non-numeric value raises ValueError
# before any argument parsing happens.
DEFAULT_TIMEOUT_SECONDS = float(os.environ.get("FLOWERCORE_DNS_TIMEOUT_SECONDS", "20"))
# Parsed with int() at import time and clamped so there is always at least one
# worker, even if the variable is set to zero or a negative number.
DEFAULT_WORKERS = max(1, int(os.environ.get("FLOWERCORE_DNS_WORKERS", "8")))
@dataclass(frozen=True)
class PreflightResult:
    """Outcome of one resolve-preflight check for a single hostname.

    Instances are immutable and hashable. ``addresses`` may be supplied as any
    iterable of strings (a list is fine); it is stored as a tuple so that the
    frozen dataclass really is immutable and ``hash()`` does not raise
    ``TypeError`` on an unhashable list field.
    """

    # Hostname that was checked.
    host: str
    # True when the host passed preflight.
    ok: bool
    # Zone, server name and provider as reported by the preflight response;
    # None when the response did not supply them or the call failed.
    resolved_zone: str | None
    server_name: str | None
    provider: str | None
    # Address strings reported for the host; empty when there were none.
    addresses: tuple[str, ...]
    # ACME DNS-01 challenge record name associated with the host.
    challenge_fqdn: str
    # Human-readable detail, printed next to failures.
    message: str

    def __post_init__(self) -> None:
        # frozen=True blocks normal attribute assignment, so the coercion has
        # to go through object.__setattr__.
        object.__setattr__(self, "addresses", tuple(self.addresses))
def extract_hosts_from_doc(doc: dict) -> set[str]:
@@ -122,7 +149,6 @@ def collect_hosts_from_cluster() -> tuple[dict[str, list[str]], bool]:
index: dict[str, list[str]] = {}
# Certificates (cert-manager.io/v1) — spec.dnsNames
certs = _kubectl_json(["get", "certificate", "-A"])
if certs is None:
return {}, False
@@ -135,8 +161,6 @@ def collect_hosts_from_cluster() -> tuple[dict[str, list[str]], bool]:
if isinstance(dn, str) and dn.endswith(".iamworkin.lan"):
index.setdefault(dn, []).append(ref)
# IngressRoutes (traefik.io/v1alpha1) — spec.routes[].match Host(...)
# The CRD may or may not be installed. Silent skip when it isn't.
irs = _kubectl_json(["get", "ingressroute", "-A"])
if irs is not None:
for item in irs.get("items", []):
@@ -153,11 +177,103 @@ def collect_hosts_from_cluster() -> tuple[dict[str, list[str]], bool]:
return index, True
def resolves(host: str) -> str | None:
def _ssl_context(insecure: bool) -> ssl.SSLContext:
return ssl._create_unverified_context() if insecure else ssl.create_default_context()
def _default_challenge_fqdn(host: str) -> str:
    """Return the fallback ACME DNS-01 challenge name for *host*."""
    return f"_acme-challenge.{host.rstrip('.')}."


def _failed_preflight(host: str, message: str) -> PreflightResult:
    """Build the failing result used when no usable API answer was obtained."""
    return PreflightResult(
        host=host,
        ok=False,
        resolved_zone=None,
        server_name=None,
        provider=None,
        addresses=[],
        challenge_fqdn=_default_challenge_fqdn(host),
        message=message,
    )


def preflight_host(
    base_url: str,
    zone: str,
    host: str,
    timeout_seconds: float,
    insecure: bool,
) -> PreflightResult:
    """Run the resolve-preflight check for one hostname.

    Issues ``GET {base_url}/api/v1/zones/{zone}/resolve-preflight?hostname={host}``
    and converts the JSON answer into a ``PreflightResult``. This function
    never raises for request or response problems: HTTP errors, transport
    errors, undecodable bodies and non-object payloads are all reported as a
    failing result whose ``message`` carries the detail.

    Args:
        base_url: Base URL of the FlowerCore.DNS service.
        zone: Zone name placed in the URL path (percent-encoded here).
        host: Hostname to check (percent-encoded into the query string).
        timeout_seconds: Timeout passed to ``urlopen``.
        insecure: Skip TLS verification when true.

    Returns:
        A ``PreflightResult``. ``ok`` is true only when the payload reports
        ACME DNS-01 support, reports the host as resolvable, and names a
        resolved zone.
    """
    path = (
        f"/api/v1/zones/{urllib.parse.quote(zone, safe='')}/resolve-preflight"
        f"?hostname={urllib.parse.quote(host, safe='')}"
    )
    url = urllib.parse.urljoin(base_url.rstrip("/") + "/", path.lstrip("/"))
    request = urllib.request.Request(url, headers={"Accept": "application/json"})

    try:
        with urllib.request.urlopen(
            request,
            timeout=timeout_seconds,
            context=_ssl_context(insecure),
        ) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        body = exc.read().decode("utf-8", errors="replace").strip()
        # Keep the detail short so one bad host cannot flood the report.
        detail = body[:200] if body else exc.reason
        return _failed_preflight(host, f"HTTP {exc.code}: {detail}")
    except Exception as exc:  # noqa: BLE001 - surfaced as preflight failure detail
        return _failed_preflight(host, f"{type(exc).__name__}: {exc}")

    # Valid JSON is not necessarily an object; anything else would make the
    # .get() calls below raise outside the try block and abort the whole run.
    if not isinstance(payload, dict):
        return _failed_preflight(
            host,
            f"Unexpected preflight response: expected a JSON object, got {type(payload).__name__}.",
        )

    resolved_zone = payload.get("resolvedZone")
    server_name = payload.get("serverName")
    provider = payload.get("provider")

    # An explicit JSON null (or any non-list) is treated as "no addresses".
    raw_addresses = payload.get("addresses")
    if isinstance(raw_addresses, list):
        addresses = [value for value in raw_addresses if isinstance(value, str)]
    else:
        addresses = []

    supports_acme = bool(payload.get("supportsAcmeDns01"))
    resolvable = bool(payload.get("resolvable"))

    # "or" rather than a .get() default so a JSON null does not turn into the
    # literal string "None".
    challenge_fqdn = str(payload.get("challengeFqdn") or _default_challenge_fqdn(host))
    message = str(payload.get("message") or "").strip()

    ok = supports_acme and resolvable and bool(resolved_zone)

    # Every failure needs a reason, otherwise the report shows an empty "()".
    if not message:
        if not supports_acme:
            message = "Matched DNS server does not advertise ACME DNS-01 support."
        elif not resolvable:
            message = "FlowerCore.DNS reported the hostname as not resolvable."
        elif not resolved_zone:
            message = "FlowerCore.DNS did not report a resolved zone for the hostname."

    return PreflightResult(
        host=host,
        ok=ok,
        resolved_zone=resolved_zone,
        server_name=server_name,
        provider=provider,
        addresses=addresses,
        challenge_fqdn=challenge_fqdn,
        message=message,
    )
def run_preflight(
    hosts: list[str],
    base_url: str,
    zone: str,
    timeout_seconds: float,
    insecure: bool,
    workers: int,
) -> dict[str, PreflightResult]:
    """Run ``preflight_host`` for every host on a thread pool.

    Args:
        hosts: Hostnames to check.
        base_url, zone, timeout_seconds, insecure: Forwarded unchanged to
            ``preflight_host`` for each host.
        workers: Requested pool size; the pool never exceeds the number of
            hosts and never drops below one thread.

    Returns:
        A mapping of hostname to its result; empty when ``hosts`` is empty.
    """
    outcome: dict[str, PreflightResult] = {}
    if not hosts:
        return outcome

    # Clamp the pool size to the range [1, len(hosts)].
    pool_size = min(workers, len(hosts))
    if pool_size < 1:
        pool_size = 1

    with ThreadPoolExecutor(max_workers=pool_size) as executor:
        pending = {}
        for name in hosts:
            job = executor.submit(
                preflight_host, base_url, zone, name, timeout_seconds, insecure
            )
            pending[job] = name
        # Collect results as each request finishes rather than in submit order.
        for job in as_completed(pending):
            outcome[pending[job]] = job.result()
    return outcome
def main() -> int:
@@ -175,12 +291,39 @@ def main() -> int:
action="store_false",
help="Skip live-cluster scan (manifests only).",
)
parser.add_argument(
"--base-url",
default=DEFAULT_BASE_URL,
help=f"FlowerCore.DNS base URL (default: {DEFAULT_BASE_URL}).",
)
parser.add_argument(
"--zone",
default=DEFAULT_ZONE,
help=f"Zone passed to resolve-preflight (default: {DEFAULT_ZONE}).",
)
parser.add_argument(
"--timeout-seconds",
type=float,
default=DEFAULT_TIMEOUT_SECONDS,
help=f"Per-host resolve-preflight timeout (default: {DEFAULT_TIMEOUT_SECONDS}).",
)
parser.add_argument(
"--workers",
type=int,
default=DEFAULT_WORKERS,
help=f"Parallel preflight workers (default: {DEFAULT_WORKERS}).",
)
parser.add_argument(
"--insecure",
action="store_true",
help="Skip TLS verification when calling FlowerCore.DNS.",
)
args = parser.parse_args()
hosts = collect_hosts_from_manifests()
live_requested = args.live is True
live_auto = args.live is None # neither --live nor --no-live
live_auto = args.live is None
if live_requested or live_auto:
live_hosts, live_ok = collect_hosts_from_cluster()
@@ -197,36 +340,58 @@ def main() -> int:
print("(kubectl not reachable — skipping live scan; run from a workstation with cluster access to catch retail-style drift)")
if not hosts:
print(f"No iamworkin.lan hostnames found in manifests or cluster — nothing to check.")
print("No iamworkin.lan hostnames found in manifests or cluster — nothing to check.")
return 0
failed: list[tuple[str, list[str]]] = []
print(
f"(preflight: {len(hosts)} host(s) via {args.base_url.rstrip('/')}"
f"/api/v1/zones/{args.zone}/resolve-preflight)"
)
results = run_preflight(
sorted(hosts),
base_url=args.base_url,
zone=args.zone,
timeout_seconds=args.timeout_seconds,
insecure=args.insecure,
workers=args.workers,
)
failed: list[tuple[str, list[str], PreflightResult]] = []
for host in sorted(hosts):
ip = resolves(host)
if ip:
print(f"OK {host:<45} -> {ip}")
result = results[host]
if result.ok:
addresses = ", ".join(result.addresses) if result.addresses else "(no A/AAAA answers)"
zone_label = result.resolved_zone or args.zone
server_label = result.server_name or "unknown-server"
print(f"OK {host:<45} -> {addresses} via {server_label} [{zone_label}]")
else:
print(f"FAIL {host:<45} (no pfSense Unbound override)")
failed.append((host, hosts[host]))
print(f"FAIL {host:<45} ({result.message})")
failed.append((host, hosts[host], result))
if failed:
print()
print(f"ERROR: {len(failed)} host(s) referenced but not in pfSense Unbound.")
for host, refs in failed:
print(f"ERROR: {len(failed)} host(s) failed FlowerCore.DNS preflight.")
for host, refs, result in failed:
print(f" {host}")
print(f" preflight: {result.message}")
print(f" challenge: {result.challenge_fqdn}")
for ref in sorted(set(refs)):
print(f" via: {ref}")
print()
print("Add them before merging — see README.md step 1.")
print("Fix the DNS record in FlowerCore.DNS before merging, then rerun this gate.")
print()
print("From FlowerCore.Notes:")
print(" # edit HOSTS list in scripts/pfsense-add-dns-overrides.py")
print(" export PFSENSE_PASS=$(get_cred 'pfSense Admin')")
print(" python scripts/pfsense-add-dns-overrides.py")
print("Example:")
print(f" curl -sk {args.base_url.rstrip('/')}/api/v1/servers")
print(
" curl -sk -X POST "
f"{args.base_url.rstrip('/')}/api/v1/servers/<serverId>/zones/{args.zone}/records "
"-H 'Content-Type: application/json' "
"-d '{\"name\":\"<host>\",\"type\":\"A\",\"data\":\"10.0.56.200\",\"ttl\":300}'"
)
return 1
print()
print(f"All {len(hosts)} iamworkin.lan host(s) resolve via pfSense. Safe to deploy.")
print(f"All {len(hosts)} iamworkin.lan host(s) passed FlowerCore.DNS preflight. Safe to deploy.")
return 0