From 407d473b71f90bc8616c8c26cad20140e6672652 Mon Sep 17 00:00:00 2001 From: Andrew Stoltz Date: Thu, 23 Apr 2026 17:03:22 -0500 Subject: [PATCH] feat(infra): route dns preflight through flowercore dns --- README.md | 39 ++++-- apps/fc-distribution/README.md | 22 +++- apps/fc-llm-bridge/README.md | 34 ++--- scripts/check-pfsense-dns.py | 227 ++++++++++++++++++++++++++++----- 4 files changed, 256 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index 0f5a4f5..137e184 100644 --- a/README.md +++ b/README.md @@ -6,28 +6,33 @@ Infrastructure manifests for ArgoCD. An `ApplicationSet` in `argocd` namespace w Follow these steps in order. **Step 1 must run before step 3** — if you skip it, cert-manager HTTP-01 will silently fail for ~2h per cert (exponential backoff) until someone diagnoses the DNS. -### 1. Add the pfSense Unbound DNS override (REQUIRED) +### 1. Create or verify the FlowerCore.DNS A record (REQUIRED for current HTTP-01 manifests) step-ca (the ACME CA on noc1) runs in a Podman container with host networking. Its container resolver uses pfSense Unbound (10.0.56.1), **not** cluster CoreDNS. So even though CoreDNS has a wildcard `*.iamworkin.lan → 10.0.56.200` for in-cluster lookups, step-ca cannot see it. Every new public hostname needs an explicit pfSense host override. -From `FlowerCore.Notes`: +The management path is now `FlowerCore.DNS`, not `FlowerCore.Notes/scripts/pfsense-add-dns-overrides.py`. Add or verify the public A record there before you apply the manifest: ```bash -# 1. Edit the HOSTS list in scripts/pfsense-add-dns-overrides.py -# Add: ("", "10.0.56.200", "cert-manager HTTP-01 target (Traefik VIP)") -# 2. Run: -source scripts/credential-helper.sh -export PFSENSE_PASS=$(get_cred "pfSense Admin") -python scripts/pfsense-add-dns-overrides.py +curl -sk https://dns.iamworkin.lan/api/v1/servers +# Find the pfSense serverId, then create the record using the host label only. +# Example: for foo.iamworkin.lan, use "name":"foo". 
+ +curl -sk -X POST https://dns.iamworkin.lan/api/v1/servers//zones/iamworkin.lan/records \ + -H "Content-Type: application/json" \ + -d '{"name":"","type":"A","data":"10.0.56.200","ttl":300}' ``` Verify all referenced iamworkin.lan hosts resolve (run from anywhere on LAN): ```bash python scripts/check-pfsense-dns.py -# Parses every apps/*/*.yaml, extracts hostnames from Certificate dnsNames -# and Traefik IngressRoute Host(...) rules, and fails if any don't resolve. -# Safe to run as a pre-merge / pre-sync check. +# Historical filename retained. The script now calls +# https://dns.iamworkin.lan/api/v1/zones/iamworkin.lan/resolve-preflight +# for every Certificate dnsName and Traefik Host(...) rule it finds. + +python scripts/check-pfsense-dns.py --live +# Optional stronger pass when kubectl access is available; also checks +# live-cluster Certificates and IngressRoutes for drift outside manifests. ``` **Symptom if you skip this:** the Certificate resource stays `Ready: False` with `status.reason: unexpected non-ACME API error: context deadline exceeded`. Recovery requires `kubectl -n delete order ` after adding the DNS to bypass cert-manager's backoff. @@ -43,7 +48,7 @@ Conventions: - Image: `localhost/:v`, `imagePullPolicy: Never`. Import the image to every RKE2 node (server + both agents) via `ctr images import` before applying — pods schedule anywhere. - If the app persists local state (SQLite, uploads), declare the `PersistentVolumeClaim` here with `storageClassName: longhorn` and `accessModes: [ReadWriteOnce]`. Add `strategy.type: Recreate` to the Deployment — RWO PVC blocks rolling updates. - Probes: use `tcpSocket` if the app has middleware that intercepts unauth requests (returns 404/401 for `/health`). Otherwise prefer `httpGet` against whatever the app exposes (verify the path isn't gated by auth). -- Certificate: `issuerRef.name: step-ca-acme`, `issuerRef.kind: ClusterIssuer`. `dnsNames` must match the hostname you added to pfSense in step 1. 
+- Certificate: `issuerRef.name: step-ca-acme`, `issuerRef.kind: ClusterIssuer`. `dnsNames` must match the hostname you created in FlowerCore.DNS in step 1. ### 3. Commit & push @@ -76,13 +81,18 @@ Before `git push`, always run: python scripts/check-pfsense-dns.py ``` -It's a ~3-second check that would have caught the entire 2026-04-22 cert-manager outage. Consider wiring it into a pre-commit hook or a Gitea Actions workflow. +It's a quick service-backed check that would have caught the entire 2026-04-22 cert-manager outage. Consider wiring it into a pre-commit hook or a Gitea Actions workflow. ## Retiring a service 1. `kubectl -n argocd delete application infra-` (cascade deletes the K8s resources via ArgoCD finalizers) 2. `git rm -r apps//` and push -3. Remove the pfSense Unbound override — edit `scripts/pfsense-add-dns-overrides.py` to remove from HOSTS, or delete manually via the pfSense UI (Services → DNS Resolver → Host Overrides) +3. Remove the FlowerCore.DNS record through the UI or API, for example: + +```bash +curl -sk https://dns.iamworkin.lan/api/v1/servers +curl -sk -X DELETE https://dns.iamworkin.lan/api/v1/servers//zones/iamworkin.lan/records/ +``` ## Known gotchas @@ -95,5 +105,6 @@ It's a ~3-second check that would have caught the entire 2026-04-22 cert-manager - Cert-manager recovery playbook: `FlowerCore.Notes/memory/project_cert_manager_recovery_2026_04_22.md` - Why pfSense DNS is required: `FlowerCore.Notes/memory/feedback_pfsense_dns_required_for_acme.md` +- Public DNS operator host: `https://dns.iamworkin.lan` - Canonical credential helper: `FlowerCore.Notes/scripts/credential-helper.sh` - pfSense admin automation: `FlowerCore.Notes/memory/feedback_pfsense_automation.md` diff --git a/apps/fc-distribution/README.md b/apps/fc-distribution/README.md index 9d946f5..f2a2725 100644 --- a/apps/fc-distribution/README.md +++ b/apps/fc-distribution/README.md @@ -15,14 +15,28 @@ Root CA` as the trust anchor; per-edition leaf signing material lives in ## 
Deployment order (do NOT skip / reorder) -### 1. pfSense Unbound DNS — DONE 2026-04-23 +### 1. FlowerCore.DNS preflight — VERIFIED 2026-04-23 -`dist.iamworkin.lan -> 10.0.56.200` was added to pfSense Unbound out of band. -Verify before push: +`dist.iamworkin.lan` already resolves to `10.0.56.200`, but keep the +FlowerCore.DNS preflight green before push: ```bash -nslookup dist.iamworkin.lan 10.0.56.1 # expect 10.0.56.200 +curl -sk "https://dns.iamworkin.lan/api/v1/zones/iamworkin.lan/resolve-preflight?hostname=dist.iamworkin.lan" +# Expect: "resolvable": true + python bluejay-infra/scripts/check-pfsense-dns.py +# Historical filename retained; implementation now calls FlowerCore.DNS +# resolve-preflight instead of raw resolver lookups. +``` + +If the record ever disappears, recreate it through FlowerCore.DNS before +push/apply: + +```bash +curl -sk https://dns.iamworkin.lan/api/v1/servers +curl -sk -X POST https://dns.iamworkin.lan/api/v1/servers//zones/iamworkin.lan/records \ + -H "Content-Type: application/json" \ + -d '{"name":"dist","type":"A","data":"10.0.56.200","ttl":300}' ``` If this is missing, cert-manager HTTP-01 will silently back off ~2h. See diff --git a/apps/fc-llm-bridge/README.md b/apps/fc-llm-bridge/README.md index 6213769..8b0391d 100644 --- a/apps/fc-llm-bridge/README.md +++ b/apps/fc-llm-bridge/README.md @@ -8,10 +8,10 @@ ADR: ADR-088 in [`../../../FlowerCore.Notes/ARCHITECTURE.md`](../../../FlowerCor ## Deployment order (do NOT skip / reorder) -### 1. pfSense Unbound DNS override — REQUIRED FIRST +### 1. FlowerCore.DNS preflight — REQUIRED FIRST -`fc-llm-bridge.iamworkin.lan` is not currently in pfSense Unbound. Verified -with `python bluejay-infra/scripts/check-pfsense-dns.py` at staging time. +`fc-llm-bridge.iamworkin.lan` must keep resolving to `10.0.56.200` through +FlowerCore.DNS before this manifest is applied. step-ca (the ACME CA on noc1) uses pfSense Unbound (10.0.56.1), **not** cluster CoreDNS. 
If you apply this manifest before adding the DNS override, @@ -19,29 +19,28 @@ cert-manager's HTTP-01 challenge silently fails for ~2h (exponential backoff) until someone manually runs `kubectl -n fc-llm-bridge delete order ` to bust the cache. See memory `feedback_pfsense_dns_required_for_acme.md`. -From `FlowerCore.Notes`: +Verify the record through the public preflight API: ```bash -# 1. Edit HOSTS list in scripts/pfsense-add-dns-overrides.py, append: -# ("fc-llm-bridge", "10.0.56.200", "cert-manager HTTP-01 target (Traefik VIP)"), -# 2. Source creds + run: -source scripts/credential-helper.sh -export PFSENSE_PASS=$(get_cred "pfSense Admin") -python scripts/pfsense-add-dns-overrides.py +curl -sk "https://dns.iamworkin.lan/api/v1/zones/iamworkin.lan/resolve-preflight?hostname=fc-llm-bridge.iamworkin.lan" +# Expect: "resolvable": true ``` Verify: ```bash -nslookup fc-llm-bridge.iamworkin.lan 10.0.56.1 -# Expect: Address: 10.0.56.200 +python scripts/check-pfsense-dns.py +# Historical filename retained; implementation now calls FlowerCore.DNS +# resolve-preflight instead of raw resolver lookups. ``` -Or run the full pre-merge gate from `bluejay-infra`: +If the record is missing, recreate it through FlowerCore.DNS before pushing: ```bash -python scripts/check-pfsense-dns.py -# Expect: OK fc-llm-bridge.iamworkin.lan -> 10.0.56.200 +curl -sk https://dns.iamworkin.lan/api/v1/servers +curl -sk -X POST https://dns.iamworkin.lan/api/v1/servers//zones/iamworkin.lan/records \ + -H "Content-Type: application/json" \ + -d '{"name":"fc-llm-bridge","type":"A","data":"10.0.56.200","ttl":300}' ``` ### 2. Create the `FC LLM Bridge API Keys` 1Password item @@ -154,8 +153,9 @@ bridge (the design doc describes this split as the preferred approach). ## Current state at staging time (2026-04-23) -- `fc-llm-bridge.iamworkin.lan` — NOT in pfSense Unbound (verified via - `nslookup fc-llm-bridge.iamworkin.lan 10.0.56.1`: NXDOMAIN). 
+- `fc-llm-bridge.iamworkin.lan` — public FlowerCore.DNS preflight is now + green and resolves to `10.0.56.200`; keep `python scripts/check-pfsense-dns.py` + green before push. - `FC LLM Bridge API Keys` — NOT created in 1Password (user action). - `Claude API Key` — already exists in `IAmWorkin` vault (`e5tth3y5mp3lhdavg35pxadzca`), also consumed by AiStation and Chat.Web. diff --git a/scripts/check-pfsense-dns.py b/scripts/check-pfsense-dns.py index ef9e04b..6acf640 100644 --- a/scripts/check-pfsense-dns.py +++ b/scripts/check-pfsense-dns.py @@ -2,9 +2,14 @@ """ check-pfsense-dns.py +Historical name retained for continuity, but the check now runs through the +public FlowerCore.DNS preflight API instead of a raw local resolver lookup. + Fails if any *.iamworkin.lan hostname referenced by a cert-manager Certificate -`spec.dnsNames` or a Traefik IngressRoute `Host(...)` match rule does NOT -resolve via the system DNS resolver (pfSense Unbound at 10.0.56.1 on this LAN). +`spec.dnsNames` or a Traefik IngressRoute `Host(...)` match rule is NOT +resolvable via FlowerCore.DNS: + + GET /api/v1/zones/{zone}/resolve-preflight?hostname= Two sources are scanned: @@ -15,18 +20,19 @@ Two sources are scanned: e.g. services deployed via their own repo's deploy script. Retail.Web on 2026-04-23 was stuck Issuing for 15h because of exactly this gap. -Run from anywhere that uses pfSense as a resolver (LAN hosts, noc1, -BLUEJAY-WS): +Run from anywhere that can reach the FlowerCore.DNS host: python scripts/check-pfsense-dns.py # auto live scan if kubectl works python scripts/check-pfsense-dns.py --live # require live scan python scripts/check-pfsense-dns.py --no-live # manifests only (CI default) -Exit code 0: all referenced hosts resolve. 1: at least one doesn't. +Exit code 0: all referenced hosts pass FlowerCore.DNS preflight. +Exit code 1: at least one host fails preflight. +Exit code 2: --live requested but kubectl was unusable. 
This is intentionally narrow: it only flags hostnames that cert-manager will -actually try to validate via HTTP-01, or that Traefik will route. IRC -server-link names, Docker image tags, comments, etc. are ignored. +actually try to validate or that Traefik will route. IRC server-link names, +Docker image tags, comments, etc. are ignored. """ from __future__ import annotations @@ -35,9 +41,14 @@ import json import os import re import shutil -import socket +import ssl import subprocess import sys +import urllib.error +import urllib.parse +import urllib.request +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass from pathlib import Path try: @@ -51,6 +62,22 @@ APPS_DIR = REPO_ROOT / "apps" HOST_RE = re.compile(r"Host\(`([^`]+)`\)") LIVE_SOURCE = "live-cluster" +DEFAULT_BASE_URL = os.environ.get("FLOWERCORE_DNS_BASE_URL", "https://dns.iamworkin.lan") +DEFAULT_ZONE = os.environ.get("FLOWERCORE_DNS_ZONE", "iamworkin.lan") +DEFAULT_TIMEOUT_SECONDS = float(os.environ.get("FLOWERCORE_DNS_TIMEOUT_SECONDS", "20")) +DEFAULT_WORKERS = max(1, int(os.environ.get("FLOWERCORE_DNS_WORKERS", "8"))) + + +@dataclass(frozen=True) +class PreflightResult: + host: str + ok: bool + resolved_zone: str | None + server_name: str | None + provider: str | None + addresses: list[str] + challenge_fqdn: str + message: str def extract_hosts_from_doc(doc: dict) -> set[str]: @@ -122,7 +149,6 @@ def collect_hosts_from_cluster() -> tuple[dict[str, list[str]], bool]: index: dict[str, list[str]] = {} - # Certificates (cert-manager.io/v1) — spec.dnsNames certs = _kubectl_json(["get", "certificate", "-A"]) if certs is None: return {}, False @@ -135,8 +161,6 @@ def collect_hosts_from_cluster() -> tuple[dict[str, list[str]], bool]: if isinstance(dn, str) and dn.endswith(".iamworkin.lan"): index.setdefault(dn, []).append(ref) - # IngressRoutes (traefik.io/v1alpha1) — spec.routes[].match Host(...) - # The CRD may or may not be installed. Silent skip when it isn't. 
irs = _kubectl_json(["get", "ingressroute", "-A"]) if irs is not None: for item in irs.get("items", []):
@@ -153,11 +177,129 @@ def collect_hosts_from_cluster() -> tuple[dict[str, list[str]], bool]:
     return index, True
 
 
-def resolves(host: str) -> str | None:
+def _ssl_context(insecure: bool) -> ssl.SSLContext:
+    """Return the TLS context for FlowerCore.DNS calls.
+
+    Certificates are verified unless ``insecure`` is true (``--insecure``).
+    """
+    context = ssl.create_default_context()
+    if insecure:
+        # Public-API replacement for the private ssl._create_unverified_context();
+        # check_hostname must be cleared before verify_mode can be CERT_NONE.
+        context.check_hostname = False
+        context.verify_mode = ssl.CERT_NONE
+    return context
+
+
+def _failed_result(host: str, message: str) -> PreflightResult:
+    """Build the failing result used when no usable preflight payload exists."""
+    return PreflightResult(
+        host=host,
+        ok=False,
+        resolved_zone=None,
+        server_name=None,
+        provider=None,
+        addresses=[],
+        challenge_fqdn=f"_acme-challenge.{host.rstrip('.')}.",
+        message=message,
+    )
+
+
+def preflight_host(
+    base_url: str,
+    zone: str,
+    host: str,
+    timeout_seconds: float,
+    insecure: bool,
+) -> PreflightResult:
+    """Ask FlowerCore.DNS resolve-preflight whether ``host`` is deployable.
+
+    Never raises: transport errors, HTTP errors and malformed payloads are all
+    returned as a failing PreflightResult whose ``message`` carries the detail,
+    so one bad response cannot abort the whole run with a traceback.
+    """
+    path = (
+        f"/api/v1/zones/{urllib.parse.quote(zone, safe='')}/resolve-preflight"
+        f"?hostname={urllib.parse.quote(host, safe='')}"
+    )
+    url = urllib.parse.urljoin(base_url.rstrip("/") + "/", path.lstrip("/"))
+    request = urllib.request.Request(url, headers={"Accept": "application/json"})
+
     try:
-        return socket.gethostbyname(host)
-    except OSError:
-        return None
+        with urllib.request.urlopen(
+            request,
+            timeout=timeout_seconds,
+            context=_ssl_context(insecure),
+        ) as response:
+            payload = json.loads(response.read().decode("utf-8"))
+    except urllib.error.HTTPError as exc:
+        body = exc.read().decode("utf-8", errors="replace").strip()
+        detail = body[:200] if body else exc.reason
+        return _failed_result(host, f"HTTP {exc.code}: {detail}")
+    except Exception as exc:  # noqa: BLE001 - surfaced as preflight failure detail
+        return _failed_result(host, f"{type(exc).__name__}: {exc}")
+
+    if not isinstance(payload, dict):
+        # Valid JSON that is not an object (list, string, null) has no fields to
+        # read; report it instead of letting .get() raise AttributeError.
+        kind = type(payload).__name__
+        return _failed_result(host, f"Unexpected JSON {kind} from resolve-preflight")
+
+    resolved_zone = payload.get("resolvedZone")
+    server_name = payload.get("serverName")
+    provider = payload.get("provider")
+    # A key that is present but null must behave like a missing key.
+    raw_addresses = payload.get("addresses")
+    if not isinstance(raw_addresses, list):
+        raw_addresses = []
+    addresses = [value for value in raw_addresses if isinstance(value, str)]
+    supports_acme = bool(payload.get("supportsAcmeDns01"))
+    resolvable = bool(payload.get("resolvable"))
+    challenge_fqdn = str(payload.get("challengeFqdn") or f"_acme-challenge.{host.rstrip('.')}.")
+    message = str(payload.get("message") or "").strip()
+
+    ok = supports_acme and resolvable and bool(resolved_zone)
+    if not message:
+        # Never leave a failure with an empty reason ("FAIL host ()").
+        if not supports_acme:
+            message = "Matched DNS server does not advertise ACME DNS-01 support."
+        elif not ok:
+            message = "FlowerCore.DNS did not report the hostname as resolvable."
+    return PreflightResult(
+        host=host,
+        ok=ok,
+        resolved_zone=resolved_zone,
+        server_name=server_name,
+        provider=provider,
+        addresses=addresses,
+        challenge_fqdn=challenge_fqdn,
+        message=message,
+    )
+
+
+def run_preflight(
+    hosts: list[str],
+    base_url: str,
+    zone: str,
+    timeout_seconds: float,
+    insecure: bool,
+    workers: int,
+) -> dict[str, PreflightResult]:
+    """Run preflight_host for every host in parallel; return results by host."""
+    if not hosts:
+        return {}
+
+    max_workers = max(1, min(workers, len(hosts)))
+    results: dict[str, PreflightResult] = {}
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        future_map = {
+            pool.submit(preflight_host, base_url, zone, host, timeout_seconds, insecure): host
+            for host in hosts
+        }
+        for future in as_completed(future_map):
+            host = future_map[future]
+            results[host] = future.result()
+    return results
 
 
 def main() -> int:
@@ -175,12 +317,39 @@ def main() -> int: action="store_false", help="Skip live-cluster scan (manifests only).", ) + parser.add_argument( + "--base-url", + default=DEFAULT_BASE_URL, + help=f"FlowerCore.DNS base URL (default: {DEFAULT_BASE_URL}).", + ) + parser.add_argument( + "--zone", + default=DEFAULT_ZONE, + help=f"Zone passed to resolve-preflight (default: {DEFAULT_ZONE}).", + ) + parser.add_argument( + "--timeout-seconds", + type=float, + default=DEFAULT_TIMEOUT_SECONDS, + help=f"Per-host resolve-preflight timeout (default: {DEFAULT_TIMEOUT_SECONDS}).", + ) + parser.add_argument( + "--workers", + type=int, + default=DEFAULT_WORKERS, + help=f"Parallel preflight workers (default: {DEFAULT_WORKERS}).", + ) + parser.add_argument( + "--insecure",
action="store_true", + help="Skip TLS verification when calling FlowerCore.DNS.", + ) args = parser.parse_args() hosts = collect_hosts_from_manifests() live_requested = args.live is True - live_auto = args.live is None # neither --live nor --no-live + live_auto = args.live is None if live_requested or live_auto: live_hosts, live_ok = collect_hosts_from_cluster() @@ -197,36 +340,58 @@ def main() -> int: print("(kubectl not reachable — skipping live scan; run from a workstation with cluster access to catch retail-style drift)") if not hosts: - print(f"No iamworkin.lan hostnames found in manifests or cluster — nothing to check.") + print("No iamworkin.lan hostnames found in manifests or cluster — nothing to check.") return 0 - failed: list[tuple[str, list[str]]] = [] + print( + f"(preflight: {len(hosts)} host(s) via {args.base_url.rstrip('/')}" + f"/api/v1/zones/{args.zone}/resolve-preflight)" + ) + results = run_preflight( + sorted(hosts), + base_url=args.base_url, + zone=args.zone, + timeout_seconds=args.timeout_seconds, + insecure=args.insecure, + workers=args.workers, + ) + + failed: list[tuple[str, list[str], PreflightResult]] = [] for host in sorted(hosts): - ip = resolves(host) - if ip: - print(f"OK {host:<45} -> {ip}") + result = results[host] + if result.ok: + addresses = ", ".join(result.addresses) if result.addresses else "(no A/AAAA answers)" + zone_label = result.resolved_zone or args.zone + server_label = result.server_name or "unknown-server" + print(f"OK {host:<45} -> {addresses} via {server_label} [{zone_label}]") else: - print(f"FAIL {host:<45} (no pfSense Unbound override)") - failed.append((host, hosts[host])) + print(f"FAIL {host:<45} ({result.message})") + failed.append((host, hosts[host], result)) if failed: print() - print(f"ERROR: {len(failed)} host(s) referenced but not in pfSense Unbound.") - for host, refs in failed: + print(f"ERROR: {len(failed)} host(s) failed FlowerCore.DNS preflight.") + for host, refs, result in failed: print(f" 
{host}") + print(f" preflight: {result.message}") + print(f" challenge: {result.challenge_fqdn}") for ref in sorted(set(refs)): print(f" via: {ref}") print() - print("Add them before merging — see README.md step 1.") + print("Fix the DNS record in FlowerCore.DNS before merging, then rerun this gate.") print() - print("From FlowerCore.Notes:") - print(" # edit HOSTS list in scripts/pfsense-add-dns-overrides.py") - print(" export PFSENSE_PASS=$(get_cred 'pfSense Admin')") - print(" python scripts/pfsense-add-dns-overrides.py") + print("Example:") + print(f" curl -sk {args.base_url.rstrip('/')}/api/v1/servers") + print( + " curl -sk -X POST " + f"{args.base_url.rstrip('/')}/api/v1/servers//zones/{args.zone}/records " + "-H 'Content-Type: application/json' " + "-d '{\"name\":\"\",\"type\":\"A\",\"data\":\"10.0.56.200\",\"ttl\":300}'" + ) return 1 print() - print(f"All {len(hosts)} iamworkin.lan host(s) resolve via pfSense. Safe to deploy.") + print(f"All {len(hosts)} iamworkin.lan host(s) passed FlowerCore.DNS preflight. Safe to deploy.") return 0