docs(openvox): document quadlet durability smoke (#12)
This commit was merged in pull request #12.
This commit is contained in:
@@ -118,6 +118,7 @@ That test project sweeps `bluejay-infra/apps/**` plus the canonical sibling `Flo
|
||||
|
||||
## References
|
||||
|
||||
- OpenVox noc1 durability runbook: `docs/runbooks/openvoxserver-quadlet-durability.md`
|
||||
- Cert-manager recovery playbook: `FlowerCore.Notes/memory/project_cert_manager_recovery_2026_04_22.md`
|
||||
- Why pfSense DNS is required: `FlowerCore.Notes/memory/feedback_pfsense_dns_required_for_acme.md`
|
||||
- Public DNS operator host: `https://dns.iamworkin.lan`
|
||||
|
||||
84
docs/runbooks/openvoxserver-quadlet-durability.md
Normal file
84
docs/runbooks/openvoxserver-quadlet-durability.md
Normal file
@@ -0,0 +1,84 @@
|
||||
# openvoxserver Quadlet Durability
|
||||
|
||||
This runbook documents the noc1 `openvoxserver` durability fix for the Puppet control-repo deploy path. The service is a noc1 host artifact, not an ArgoCD application, so discovery always starts on noc1 rather than in `apps/*`.
|
||||
|
||||
## Current State
|
||||
|
||||
As of the Sprint 32 Cx-12 apply on 2026-05-17:
|
||||
|
||||
- `/etc/containers/systemd/openvoxserver.container` has a `GIT_SSH_COMMAND` environment entry that points at the persisted serverdata deploy key.
|
||||
- `/etc/systemd/system/openvoxserver-safeconfig.service` is enabled and active, and reapplies `git config --global --add safe.directory *` inside the running container.
|
||||
- `/opt/puppet/r10k-deploy.sh` self-heals before each fetch by setting `safe.directory`, the repo-local `core.sshCommand`, and the persisted `known_hosts` file when needed.
|
||||
- `puppet-deploy.service` exits `0/SUCCESS` after the apply and the control repo reports `HEAD == origin/master`.
|
||||
- `systemctl cat openvoxserver` does not currently resolve to a generated unit on noc1. The container is running through Podman with `restart=always`, so the destructive recreate smoke must not be run until the generated unit is present.
|
||||
|
||||
## Discovery
|
||||
|
||||
Run every command through noc1 as `fcadmin`; do not assume BLUEJAY-WS can reach container-local surfaces directly.
|
||||
|
||||
```bash
|
||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "hostname && sudo -n true"
|
||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo find /etc/containers/systemd /usr/share/containers/systemd /etc/systemd/system -name 'openvoxserver*' 2>/dev/null"
|
||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo sed -n '1,220p' /etc/containers/systemd/openvoxserver.container"
|
||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo systemctl cat puppet-deploy.service"
|
||||
```
|
||||
|
||||
If a future noc1 profile manages these files, update the Puppet control repo and let `puppet-deploy.service` apply the change. On 2026-05-17, `puppet` was not installed on the noc1 host itself, so Cx-12 applied the change as a direct host edit on noc1.
|
||||
|
||||
## Durable Fix Shape
|
||||
|
||||
The Quadlet keeps the deploy key as a path reference only:
|
||||
|
||||
```ini
|
||||
Environment=GIT_SSH_COMMAND=ssh -i /opt/puppetlabs/server/data/puppetserver/.puppet-deploy-key -o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o UserKnownHostsFile=/opt/puppetlabs/server/data/puppetserver/.known_hosts
|
||||
```
|
||||
|
||||
The safeconfig service is intentionally independent of `openvoxserver.service` until the generated unit exists. It waits for the `openvoxserver` container name and then runs:
|
||||
|
||||
```bash
|
||||
/usr/bin/podman exec openvoxserver git config --global --add safe.directory "*"
|
||||
```
|
||||
|
||||
The deploy script self-heals inside the container before it fetches the control repo:
|
||||
|
||||
```bash
|
||||
git config --global --add safe.directory "*" 2>/dev/null || true
|
||||
DEPLOY_KEY="/opt/puppetlabs/server/data/puppetserver/.puppet-deploy-key"
|
||||
KNOWN_HOSTS="/opt/puppetlabs/server/data/puppetserver/.known_hosts"
|
||||
REPO="/etc/puppetlabs/code/environments/production"
|
||||
export GIT_SSH_COMMAND="ssh -i $DEPLOY_KEY -o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o UserKnownHostsFile=$KNOWN_HOSTS"
|
||||
git -C "$REPO" config core.sshCommand "$GIT_SSH_COMMAND" 2>/dev/null || true
|
||||
```
|
||||
|
||||
## Validation
|
||||
|
||||
Non-destructive validation:
|
||||
|
||||
```bash
|
||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo grep -n 'GIT_SSH_COMMAND' /etc/containers/systemd/openvoxserver.container"
|
||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo systemctl status openvoxserver-safeconfig.service --no-pager -l"
|
||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo systemctl start puppet-deploy.service && sudo systemctl status puppet-deploy.service --no-pager -l"
|
||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "sudo podman exec openvoxserver git -C /etc/puppetlabs/code/environments/production config --get core.sshCommand"
|
||||
```
|
||||
|
||||
Destructive recreate smoke is opt-in only:
|
||||
|
||||
```bash
|
||||
scp -i ~/.ssh/fcadmin_ed25519 scripts/monitoring/openvox-recreate-smoke.sh fcadmin@10.0.56.10:/tmp/openvox-recreate-smoke.sh
|
||||
ssh -i ~/.ssh/fcadmin_ed25519 fcadmin@10.0.56.10 "chmod +x /tmp/openvox-recreate-smoke.sh && sudo OPENVOX_RECREATE_SMOKE=1 /tmp/openvox-recreate-smoke.sh"
|
||||
```
|
||||
|
||||
Do not run the smoke during normal sprint work. It stops and removes the production container before starting it again through systemd, and it now refuses to continue unless `systemctl cat openvoxserver` succeeds.
|
||||
|
||||
## Credential Rotation Note
|
||||
|
||||
When rotating the Puppet deploy key, update the persisted serverdata copy on noc1:
|
||||
|
||||
```bash
|
||||
sudo install -m 0600 -o root -g root <new-deploy-key> /opt/puppet/serverdata/.puppet-deploy-key
|
||||
sudo podman exec openvoxserver sh -c "ssh-keyscan github.com > /opt/puppetlabs/server/data/puppetserver/.known_hosts"
|
||||
sudo systemctl start openvoxserver-safeconfig.service
|
||||
sudo systemctl start puppet-deploy.service
|
||||
```
|
||||
|
||||
Never commit the deploy key or print it in logs.
|
||||
48
scripts/monitoring/openvox-recreate-smoke.sh
Executable file
48
scripts/monitoring/openvox-recreate-smoke.sh
Executable file
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env bash
# Destructive recreate smoke for the noc1 openvoxserver container.
#
# Recreates the container through its systemd unit, reruns the control-repo
# deploy, and verifies that git safety settings and the persisted deploy key
# reference survive the recreate.
#
# Environment:
#   OPENVOX_RECREATE_SMOKE  must be "1" to run; anything else exits 64.
#   SUDO                    privilege wrapper, default "sudo". Expanded unquoted
#                           on purpose so a value such as "sudo -n" word-splits.
#   OPENVOX_STARTUP_WAIT    seconds to wait after starting the unit, default 50.
#
# Exit codes: 64 = opt-in not set, 65 = no systemd unit to recreate from,
# 1 = a post-recreate check failed; any other failing command aborts the
# script with that command's status (set -e).
set -euo pipefail

if [ "${OPENVOX_RECREATE_SMOKE:-}" != "1" ]; then
  echo "SKIP: set OPENVOX_RECREATE_SMOKE=1 to run the destructive openvoxserver recreate smoke." >&2
  exit 64
fi

SUDO="${SUDO:-sudo}"
REPO="/etc/puppetlabs/code/environments/production"
CORE_SSH_COMMAND_FRAGMENT=".puppet-deploy-key"
STARTUP_WAIT="${OPENVOX_STARTUP_WAIT:-50}"

# Safety guard: without a unit that systemd can resolve there is no verified
# way to bring the container back, so bail out before touching anything.
if ! $SUDO systemctl cat openvoxserver >/dev/null 2>&1; then
  echo "SKIP: systemctl cat openvoxserver failed; refusing to remove a container without a verified systemd recreate path." >&2
  exit 65
fi

before="$($SUDO podman exec openvoxserver git -C "$REPO" rev-parse --short HEAD)"
echo "Before recreate: $before"

$SUDO systemctl stop openvoxserver
# The container may already be gone once the unit stops; a missing container is fine.
$SUDO podman rm openvoxserver 2>/dev/null || true
$SUDO systemctl start openvoxserver

# Give the freshly created container time to come up before deploying into it.
sleep "$STARTUP_WAIT"

$SUDO systemctl start puppet-deploy.service
sleep 5

# `systemctl status` exits non-zero (3) for a unit that is simply inactive,
# which is the normal end state of a finished run. Treat the output as
# informational and let `is-failed` decide whether the deploy actually failed.
$SUDO systemctl status puppet-deploy.service --no-pager -l || true
if $SUDO systemctl is-failed --quiet puppet-deploy.service; then
  echo "FAIL: puppet-deploy.service is in the failed state after the recreate." >&2
  exit 1
fi

after="$($SUDO podman exec openvoxserver git -C "$REPO" rev-parse --short origin/master)"
echo "After recreate origin/master: $after"

# The documented success criterion for the deploy is HEAD == origin/master.
head_after="$($SUDO podman exec openvoxserver git -C "$REPO" rev-parse --short HEAD)"
echo "After recreate HEAD: $head_after"
if [ "$head_after" != "$after" ]; then
  echo "FAIL: HEAD ($head_after) does not match origin/master ($after) after the deploy." >&2
  exit 1
fi

# Host-side view of the deployed environment must still contain the profile manifests.
$SUDO test -d /opt/puppet/code/environments/production/site-modules/profile/manifests

core_ssh="$($SUDO podman exec openvoxserver git -C "$REPO" config --get core.sshCommand)"
case "$core_ssh" in
  *"$CORE_SSH_COMMAND_FRAGMENT"*) ;;
  *)
    echo "FAIL: core.sshCommand does not reference the persisted deploy key." >&2
    exit 1
    ;;
esac

$SUDO podman exec openvoxserver git -C "$REPO" status --short --branch

echo "PASS: openvoxserver recreate smoke completed without git safety or deploy-key failure."
|
||||
99
tests/bluejay-infra-lint/OpenVoxServerDurabilityTests.cs
Normal file
99
tests/bluejay-infra-lint/OpenVoxServerDurabilityTests.cs
Normal file
@@ -0,0 +1,99 @@
|
||||
using FluentAssertions;
|
||||
using Xunit;
|
||||
|
||||
namespace BluejayInfraLint.Tests;
|
||||
|
||||
/// <summary>
/// Lint tests that pin the openvoxserver durability artifacts: the runbook must keep
/// documenting the host-artifact discovery path and the Cx-12 apply state, and the
/// recreate smoke script must stay opt-in and guarded before any destructive command.
/// </summary>
[Trait("Category", "Unit")]
public sealed class OpenVoxServerDurabilityTests
{
    // Root must be declared first: the two paths below are derived from it during static initialization.
    private static readonly string Root = FindRepoRoot();
    private static readonly string RunbookPath = Path.Combine(Root, "docs", "runbooks", "openvoxserver-quadlet-durability.md");
    private static readonly string SmokePath = Path.Combine(Root, "scripts", "monitoring", "openvox-recreate-smoke.sh");

    /// <summary>The runbook states that openvoxserver is a noc1 host artifact and names the discovery surfaces.</summary>
    [Fact]
    public void Runbook_DocumentsHostArtifactAndNonArgoPath()
    {
        var runbook = File.ReadAllText(RunbookPath);

        runbook.Should().Contain("noc1 host artifact");
        runbook.Should().Contain("not an ArgoCD application");
        runbook.Should().Contain("systemctl cat openvoxserver");
        runbook.Should().Contain("/etc/containers/systemd/openvoxserver.container");
    }

    /// <summary>The runbook records the state left behind by the Sprint 32 Cx-12 apply.</summary>
    [Fact]
    public void Runbook_DocumentsCx12LiveApplyState()
    {
        var runbook = File.ReadAllText(RunbookPath);

        runbook.Should().Contain("Sprint 32 Cx-12");
        runbook.Should().Contain("openvoxserver-safeconfig.service");
        runbook.Should().Contain("/opt/puppet/r10k-deploy.sh");
        runbook.Should().Contain("HEAD == origin/master");
    }

    /// <summary>The opt-in check must appear in the script before either destructive command.</summary>
    [Fact]
    public void SmokeScript_IsExplicitlyOptIn()
    {
        var smoke = File.ReadAllText(SmokePath);

        smoke.Should().Contain("OPENVOX_RECREATE_SMOKE");
        smoke.Should().Contain("exit 64");
        AssertAppearsBefore(smoke, "OPENVOX_RECREATE_SMOKE", "systemctl stop openvoxserver");
        AssertAppearsBefore(smoke, "OPENVOX_RECREATE_SMOKE", "podman rm openvoxserver");
    }

    /// <summary>The systemd-unit guard must appear before the container is stopped or removed.</summary>
    [Fact]
    public void SmokeScript_RequiresGeneratedSystemdUnitBeforeRemovingContainer()
    {
        var smoke = File.ReadAllText(SmokePath);

        smoke.Should().Contain("systemctl cat openvoxserver");
        smoke.Should().Contain("refusing to remove a container without a verified systemd recreate path");
        // Stopping the unit is the first destructive step, so the guard has to precede it as well.
        AssertAppearsBefore(smoke, "systemctl cat openvoxserver", "systemctl stop openvoxserver");
        AssertAppearsBefore(smoke, "systemctl cat openvoxserver", "podman rm openvoxserver");
    }

    /// <summary>Neither artifact may embed private-key material or hosted-runner labels.</summary>
    [Fact]
    public void Artifacts_DoNotStoreSecretsOrPaidRunnerLabels()
    {
        var forbidden = new[]
        {
            // PEM private-key headers: OpenSSH and RSA, plus PKCS#8 (plain and encrypted), EC and DSA.
            "BEGIN OPENSSH PRIVATE KEY",
            "BEGIN RSA PRIVATE KEY",
            "BEGIN PRIVATE KEY",
            "BEGIN ENCRYPTED PRIVATE KEY",
            "BEGIN EC PRIVATE KEY",
            "BEGIN DSA PRIVATE KEY",
            "ubuntu-latest",
            "windows-latest",
            "macos-latest",
        };

        var violations = new[] { RunbookPath, SmokePath }
            .SelectMany(path =>
            {
                var text = File.ReadAllText(path);
                return forbidden
                    .Where(token => text.Contains(token, StringComparison.OrdinalIgnoreCase))
                    .Select(token => $"{Path.GetRelativePath(Root, path)} contains forbidden token {token}");
            })
            .ToList();

        violations.Should().BeEmpty();
    }

    /// <summary>
    /// Asserts that both tokens occur in <paramref name="text"/> and that the first occurrence of
    /// <paramref name="earlier"/> comes before the first occurrence of <paramref name="later"/>.
    /// </summary>
    private static void AssertAppearsBefore(string text, string earlier, string later)
    {
        var earlierIndex = text.IndexOf(earlier, StringComparison.Ordinal);
        var laterIndex = text.IndexOf(later, StringComparison.Ordinal);

        // IndexOf returns -1 for a missing token; check presence explicitly so a missing
        // token is reported as such instead of as a confusing ordering failure.
        earlierIndex.Should().BeGreaterThanOrEqualTo(0, "the script must contain '" + earlier + "'");
        laterIndex.Should().BeGreaterThanOrEqualTo(0, "the script must contain '" + later + "'");
        earlierIndex.Should().BeLessThan(laterIndex, "'" + earlier + "' must come before '" + later + "'");
    }

    /// <summary>
    /// Walks up from the test binary directory until it finds a directory containing
    /// <c>apps/</c>, <c>scripts/</c> and <c>README.md</c>, which is treated as the repo root.
    /// </summary>
    /// <exception cref="DirectoryNotFoundException">No ancestor directory matches.</exception>
    private static string FindRepoRoot()
    {
        var current = new DirectoryInfo(AppContext.BaseDirectory);
        while (current is not null)
        {
            if (Directory.Exists(Path.Combine(current.FullName, "apps"))
                && Directory.Exists(Path.Combine(current.FullName, "scripts"))
                && File.Exists(Path.Combine(current.FullName, "README.md")))
            {
                return current.FullName;
            }

            current = current.Parent;
        }

        throw new DirectoryNotFoundException("Could not find bluejay-infra root.");
    }
}
|
||||
Reference in New Issue
Block a user