diff --git a/README.md b/README.md index 4d74f5c..0f5a4f5 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,99 @@ -# bluejay-infra - -Infrastructure manifests for ArgoCD \ No newline at end of file +# bluejay-infra + +Infrastructure manifests for ArgoCD. An `ApplicationSet` in the `argocd` namespace watches the `apps/*` directories in this repo and creates one `Application` per subdir (prefixed `infra-`). + +## Adding a new service to the cluster + +Follow these steps in order. **Step 1 must run before step 3** — if you skip it, cert-manager HTTP-01 will silently fail for ~2h per cert (exponential backoff) until someone diagnoses the DNS. + +### 1. Add the pfSense Unbound DNS override (REQUIRED) + +step-ca (the ACME CA on noc1) runs in a Podman container with host networking. Its container resolver uses pfSense Unbound (10.0.56.1), **not** cluster CoreDNS. So even though CoreDNS has a wildcard `*.iamworkin.lan → 10.0.56.200` for in-cluster lookups, step-ca cannot see it. Every new public hostname needs an explicit pfSense host override. + +From `FlowerCore.Notes`: + +```bash +# 1. Edit the HOSTS list in scripts/pfsense-add-dns-overrides.py +# Add: ("<fqdn>", "10.0.56.200", "cert-manager HTTP-01 target (Traefik VIP)") +# 2. Run: +source scripts/credential-helper.sh +export PFSENSE_PASS=$(get_cred "pfSense Admin") +python scripts/pfsense-add-dns-overrides.py +``` + +Verify all referenced iamworkin.lan hosts resolve (run from anywhere on LAN): + +```bash +python scripts/check-pfsense-dns.py +# Parses every apps/*/*.yaml, extracts hostnames from Certificate dnsNames +# and Traefik IngressRoute Host(...) rules, and fails if any don't resolve. +# Safe to run as a pre-merge / pre-sync check. +``` + +**Symptom if you skip this:** the Certificate resource stays `Ready: False` with `status.reason: unexpected non-ACME API error: context deadline exceeded`. Recovery requires `kubectl -n <namespace> delete order <order-name>` after adding the DNS to bypass cert-manager's backoff. + +### 2. 
Create the app manifest + +Create `apps/<name>/<name>.yaml` containing the Namespace, Deployment, Service, Certificate, and IngressRoute. Reference an existing directory (e.g. `apps/fc-messageboard/`) for the canonical shape. + +Conventions: + +- `Namespace` has label `app.kubernetes.io/part-of: bluejay-infra` +- `Deployment.spec.selector.matchLabels` and `Service.spec.selector` MUST use the same label key. The historical convention here is `app: <name>` (not `app.kubernetes.io/name`) — don't mix. +- Image: `localhost/<name>:v<version>`, `imagePullPolicy: Never`. Import the image to every RKE2 node (server + both agents) via `ctr images import` before applying — pods schedule anywhere. +- If the app persists local state (SQLite, uploads), declare the `PersistentVolumeClaim` here with `storageClassName: longhorn` and `accessModes: [ReadWriteOnce]`. Add `strategy.type: Recreate` to the Deployment — RWO PVC blocks rolling updates. +- Probes: use `tcpSocket` if the app has middleware that intercepts unauthenticated requests (returns 404/401 for `/health`). Otherwise prefer `httpGet` against whatever the app exposes (verify the path isn't gated by auth). +- Certificate: `issuerRef.name: step-ca-acme`, `issuerRef.kind: ClusterIssuer`. `dnsNames` must match the hostname you added to pfSense in step 1. + +### 3. Commit & push + +```bash +git add apps/<name>/ +git commit -m "<name>: initial deployment" +git push +``` + +ArgoCD's `ApplicationSet` picks up the new directory within ~3 minutes and creates `infra-<name>` with auto-sync + self-heal enabled. + +### 4. Verify + +```bash +# From noc1 +fcadmin_ssh noc1 ' + kubectl -n argocd get application infra-<name> + kubectl -n <namespace> get certificate,pod + curl -sk -m 8 -o /dev/null -w "HTTP %{http_code}\n" https://<name>.iamworkin.lan/ +' +``` + +Certificate should be `Ready: True` within ~60s. If it stalls `False` for >2m, the pfSense DNS step got skipped — go back to step 1, then `kubectl -n <namespace> delete order <order-name>` to bust the backoff. 
+ +### Pre-merge gate + +Before `git push`, always run: + +```bash +python scripts/check-pfsense-dns.py +``` + +It's a ~3-second check that would have caught the entire 2026-04-22 cert-manager outage. Consider wiring it into a pre-commit hook or a Gitea Actions workflow. + +## Retiring a service + +1. `kubectl -n argocd delete application infra-<name>` (cascade deletes the K8s resources via ArgoCD finalizers) +2. `git rm -r apps/<name>/` and push +3. Remove the pfSense Unbound override — edit `scripts/pfsense-add-dns-overrides.py` to remove the host's entry from HOSTS, or delete it manually via the pfSense UI (Services → DNS Resolver → Host Overrides) + +## Known gotchas + +- **CoreDNS template + ndots:5 collision**: inside pods, `<service>.<namespace>.svc.cluster.local` with <5 dots gets search-expanded through `iamworkin.lan` FIRST and hits the wildcard template → resolves to Traefik VIP, not the real ClusterIP. Use short service names (`<service>`) in K8s manifests. See memory `feedback_coredns_ndots_template_collision.md`. +- **Image not on node**: pods stuck `ErrImageNeverPull` means the image wasn't imported to the node Kubernetes scheduled the pod onto. `ctr images import` on all of rke2-server, rke2-agent1, rke2-agent2. +- **StatefulSet PVC drift**: `volumeClaimTemplates` needs explicit `volumeMode: Filesystem` or ArgoCD SSA self-heals forever. See memory `feedback_argocd_statefulset_pvc_drift.md`. +- **ArgoCD must use internal Gitea URL**: `http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git`, not the external HTTPS URL (step-ca cert isn't trusted by ArgoCD). The `ApplicationSet` and any hand-created `Application` must both use the internal URL. 
+ +## References + +- Cert-manager recovery playbook: `FlowerCore.Notes/memory/project_cert_manager_recovery_2026_04_22.md` +- Why pfSense DNS is required: `FlowerCore.Notes/memory/feedback_pfsense_dns_required_for_acme.md` +- Canonical credential helper: `FlowerCore.Notes/scripts/credential-helper.sh` +- pfSense admin automation: `FlowerCore.Notes/memory/feedback_pfsense_automation.md` diff --git a/scripts/check-pfsense-dns.py b/scripts/check-pfsense-dns.py new file mode 100644 index 0000000..e6793be --- /dev/null +++ b/scripts/check-pfsense-dns.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +""" +check-pfsense-dns.py + +Fails if any apps/*/*.yaml references an iamworkin.lan hostname in a +cert-manager Certificate `spec.dnsNames` or a Traefik IngressRoute +`Host(...)` match rule that does NOT resolve via the system DNS resolver +(which on this LAN is pfSense Unbound at 10.0.56.1). + +Run from anywhere that uses pfSense as a resolver (LAN hosts, noc1, BLUEJAY-WS): + + python scripts/check-pfsense-dns.py + +Exit code 0: all referenced hosts resolve. 1: at least one doesn't. + +This is intentionally narrow: it only flags hostnames that cert-manager will +actually try to validate via HTTP-01, or that Traefik will route. IRC +server-link names, Docker image tags, comments, etc. are ignored. 
"""  # closes the module docstring opened above; script code follows """
from __future__ import annotations

import os
import re
import socket
import sys
from pathlib import Path

try:
    import yaml  # PyYAML
except ImportError:
    # Do not exit at import time: collect_hosts() emits the same install hint
    # (and the same exit status) the moment the parser is actually needed,
    # which keeps the pure helpers importable on hosts without PyYAML.
    yaml = None

REPO_ROOT = Path(__file__).resolve().parent.parent
APPS_DIR = REPO_ROOT / "apps"

# Only hostnames under this zone are checked.
DOMAIN_SUFFIX = ".iamworkin.lan"

# ArgoCD directory sources apply both extensions, so both must be checked.
MANIFEST_SUFFIXES = (".yaml", ".yml")

# Original single-host pattern, kept so existing references keep working.
HOST_RE = re.compile(r"Host\(`([^`]+)`\)")

# A whole Host(...) matcher call; group 1 is its raw argument list. This also
# covers the multi-host form Host(`a`, `b`).
_HOST_CALL_RE = re.compile(r"Host\(([^)]*)\)")

# One quoted argument inside a Host(...) call (backticks or double quotes).
_HOST_ARG_RE = re.compile(r'[`"]([^`"]+)[`"]')


def _as_list(value: object) -> list:
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _hosts_in_rule(rule: str) -> list[str]:
    """Return every hostname named by a Host(...) matcher in a Traefik rule."""
    found: list[str] = []
    for arg_list in _HOST_CALL_RE.findall(rule):
        found.extend(_HOST_ARG_RE.findall(arg_list))
    return found


def extract_hosts_from_doc(doc: dict) -> set[str]:
    """Pull iamworkin.lan hostnames from a single K8s manifest doc.

    Only two sources are inspected: ``spec.dnsNames`` of a ``Certificate`` and
    the ``Host(...)`` matchers in ``spec.routes[*].match`` of an
    ``IngressRoute``. Any other kind yields an empty set.

    Malformed input (a non-mapping document or ``spec``, a non-list
    ``dnsNames``/``routes``, a non-string ``match``) is skipped rather than
    raising, so one odd manifest cannot abort the whole check.

    Args:
        doc: One parsed YAML document; may be ``None`` for an empty document.

    Returns:
        The set of hostnames ending in ``.iamworkin.lan`` found in *doc*.
    """
    out: set[str] = set()
    if not isinstance(doc, dict):
        return out

    spec = doc.get("spec")
    if not isinstance(spec, dict):
        return out

    kind = doc.get("kind", "")
    if kind == "Certificate":
        candidates = _as_list(spec.get("dnsNames"))
    elif kind == "IngressRoute":
        candidates = []
        for route in _as_list(spec.get("routes")):
            rule = route.get("match") if isinstance(route, dict) else None
            if isinstance(rule, str):
                candidates.extend(_hosts_in_rule(rule))
    else:
        return out

    for name in candidates:
        if isinstance(name, str) and name.endswith(DOMAIN_SUFFIX):
            out.add(name)
    return out


def collect_hosts() -> dict[str, list[str]]:
    """Map each referenced hostname to the manifest files that mention it.

    Walks ``APPS_DIR`` recursively, parses every ``.yaml``/``.yml`` file as a
    multi-document YAML stream and indexes the hostnames returned by
    ``extract_hosts_from_doc``. Files that cannot be read or parsed produce a
    warning on stderr and are skipped; documents read before a parse error in
    the same file are still indexed.

    Returns:
        ``{hostname: [repo-relative paths]}``, each path listed once per host.

    Raises:
        SystemExit: if PyYAML is not installed.
    """
    if yaml is None:
        sys.exit("PyYAML required: pip install pyyaml")

    manifests = sorted(
        p
        for p in APPS_DIR.rglob("*")
        if p.suffix in MANIFEST_SUFFIXES and p.is_file()
    )

    index: dict[str, list[str]] = {}
    for path in manifests:
        rel = str(path.relative_to(REPO_ROOT))
        try:
            with path.open("r", encoding="utf-8") as f:
                for doc in yaml.safe_load_all(f):
                    for host in extract_hosts_from_doc(doc):
                        files = index.setdefault(host, [])
                        if rel not in files:
                            files.append(rel)
        except yaml.YAMLError as e:
            print(f"warn: could not parse {path}: {e}", file=sys.stderr)
        except (OSError, UnicodeDecodeError) as e:
            print(f"warn: could not read {path}: {e}", file=sys.stderr)
    return index


def resolves(host: str) -> str | None:
    """Return the IPv4 address *host* resolves to, or ``None`` if it does not.

    Uses the system resolver via ``socket.gethostbyname``. A hostname that
    cannot be encoded for lookup (empty label, label longer than 63
    characters) raises ``UnicodeError`` rather than ``OSError``; it is
    reported as unresolved instead of crashing the checker.
    """
    try:
        return socket.gethostbyname(host)
    except (OSError, UnicodeError):
        return None


def main() -> int:
    """Check every referenced hostname and print a per-host report.

    Returns:
        0 when no hostnames are referenced or all of them resolve, 1 when at
        least one does not (the value is used as the process exit status).
    """
    hosts = collect_hosts()
    if not hosts:
        print(f"No iamworkin.lan hostnames found in {APPS_DIR} — nothing to check.")
        return 0

    failed: list[tuple[str, list[str]]] = []
    for host in sorted(hosts):
        ip = resolves(host)
        if ip:
            print(f"OK {host:<45} -> {ip}")
        else:
            print(f"FAIL {host:<45} (no pfSense Unbound override)")
            failed.append((host, hosts[host]))

    if failed:
        print()
        print(f"ERROR: {len(failed)} host(s) referenced in manifests but not in pfSense Unbound.")
        for host, files in failed:
            print(f" {host} (referenced in: {', '.join(sorted(set(files)))})")
        print()
        print("Add them before merging — see README.md step 1.")
        print()
        print("From FlowerCore.Notes:")
        print(" # edit HOSTS list in scripts/pfsense-add-dns-overrides.py")
        # get_cred is defined by the credential helper; without sourcing it
        # the export line below cannot work when followed literally.
        print(" source scripts/credential-helper.sh")
        print(" export PFSENSE_PASS=$(get_cred 'pfSense Admin')")
        print(" python scripts/pfsense-add-dns-overrides.py")
        return 1

    print()
    print(f"All {len(hosts)} iamworkin.lan host(s) resolve via pfSense. Safe to deploy.")
    return 0


if __name__ == "__main__":
    sys.exit(main())