Compare commits

..

1 Commits

Author SHA1 Message Date
Andrew Stoltz
46bbd00d09 Add step-ca agent issuer manifest 2026-05-19 17:52:58 -05:00
3 changed files with 90 additions and 205 deletions

View File

@@ -0,0 +1,31 @@
# Step issuer for FlowerCore.DeviceManagement runtime mTLS leaves.
#
# Requires the smallstep step-issuer CRDs/controller:
#   stepclusterissuers.certmanager.step.sm
# The provisioner password lives in the live cert-manager Secret referenced by
# spec.provisioner.passwordRef below; do not commit the password or generated
# private key material to this repo.
apiVersion: certmanager.step.sm/v1beta1
kind: StepClusterIssuer
metadata:
  name: step-ca-agent
  labels:
    app.kubernetes.io/name: step-ca-agent
    app.kubernetes.io/component: pki
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/managed-by: argocd
    flowercore.io/tenant-id: system
    flowercore.io/created-by: bluejay-infra
  annotations:
    flowercore.io/traceability-standard: k8s-pod-ownership-and-traceability-standard
    flowercore.io/provisioner-source: profile::pki::stepca
    # <namespace>/<secret name>; mirrors spec.provisioner.passwordRef.
    flowercore.io/secret-source: cert-manager/step-ca-agent-provisioner-password
spec:
  # step-ca endpoint the issuer sends signing requests to.
  url: https://10.0.56.10:9443
  # Base64-encoded PEM certificate (decodes to a single BEGIN CERTIFICATE block).
  # NOTE(review): presumably the step-ca root used to verify `url` - confirm
  # against the live CA before rotating.
  caBundle: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJ4RENDQVdxZ0F3SUJBZ0lSQVBZMzU3RzZvdzZ6TUFMNSs0YlMya2t3Q2dZSUtvWkl6ajBFQXdJd1FERWEKTUJnR0ExVUVDaE1SU1VGdFYyOXlhMmx1SUVGRFRVVWdRMEV4SWpBZ0JnTlZCQU1UR1VsQmJWZHZjbXRwYmlCQgpRMDFGSUVOQklGSnZiM1FnUTBFd0hoY05Nall3TXpBNE1UZ3dOekV4V2hjTk16WXdNekExTVRnd056RXhXakJBCk1Sb3dHQVlEVlFRS0V4RkpRVzFYYjNKcmFXNGdRVU5OUlNCRFFURWlNQ0FHQTFVRUF4TVpTVUZ0VjI5eWEybHUKSUVGRFRVVWdRMEVnVW05dmRDQkRRVEJaTUJNR0J5cUdTTTQ5QWdFR0NDcUdTTTQ5QXdFSEEwSUFCSjJuMDRYMQpKWm81WmRxL2kxSWR2OCtmcXdaeUF6Qmg3d2hicWowU1dzSkw4VVdSYWJDTXFZQ3M3K2RYTzB4UlN6cWt3RkRMCngrdm9vT2FpOFJnUk5oYWpSVEJETUE0R0ExVWREd0VCL3dRRUF3SUJCakFTQmdOVkhSTUJBZjhFQ0RBR0FRSC8KQWdFQk1CMEdBMVVkRGdRV0JCUm51UFBRUjZpTS9INnZPbHVpVTNTeWdheXo4akFLQmdncWhrak9QUVFEQWdOSQpBREJGQWlFQXJRSzlkWVBHbUFac2RZbmp6aXVGVlZFNU5LWlVjY2VZdkdmR0MrdExYVXNDSUF1ZEYyekpyQ1JxCjNtSzUwWlpFVC9md1RrSndpRUY0ODI0bWpQOHAxQ0tNCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K
  provisioner:
    name: step-ca-agent
    # Key ID of the provisioner on the CA; must match the CA-side provisioner config.
    kid: RF3A9welUYVOWBX8tr19aWyA2kQlxoGZN1dRwTElUEM
    # Reference only: the secret value itself is never stored in this manifest.
    passwordRef:
      name: step-ca-agent-provisioner-password
      namespace: cert-manager
      key: password

View File

@@ -656,15 +656,14 @@ data:
summary: "Print queue backlog on edge2 ({{ $value }} active jobs)"
description: "CUPS has {{ $value }} active jobs queued. Possible printer jam, USB disconnect, or paper out."
# Printer hardware and paper-roll lifecycle alerts.
# print_printer_online: 1 when the transport is reachable/selected.
# print_printer_state enum: 0 unknown, 1 online, 2 offline,
# 3 paper_depleted, 4 jam, 5 head_error, 6 cover_open.
# Offline/jam/cover alerts stay IRC-only. Paper depleted and head
# error may route to the thermal digest only when the printer is
# online enough to make that useful.
# Paper roll lifecycle alerts (XL Track I, 2026-04-26).
# Source-of-truth gauge: print_paper_remaining_percent (Print.Web OTEL,
# hydrated on startup from the active PaperRoll row).
# alert_channel=thermal_print routes through irc-notify -> Print.Web
# /api/print/alert so the printer announces its own paper-out warning
# on its remaining paper. Self-referential humor + operator nudge.
- alert: PrintPaperRollLow
expr: (print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5) and print_printer_online{job="printweb-otel"} == 1
expr: print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5
for: 5m
labels:
severity: warning
@@ -673,59 +672,15 @@ data:
summary: "Print roll low on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
description: "NuPrint 210 paper roll has {{ $value | printf \"%.1f\" }}% remaining. Operator should load a fresh roll soon. Run /api/paper/status for the precise mm + estimated jobs left."
- alert: PrinterOfflineWarning
expr: print_printer_state{job="printweb-otel"} == 2
for: 2m
labels:
severity: warning
service: print-web
alert_channel: irc
annotations:
summary: "Print.Web printer offline on edge2"
description: "Print.Web reports the NuPrint 210 transport is offline or unreachable. IRC-only by design: do not thermal-print an alert when the thermal printer itself is offline."
- alert: PrintPaperRollCritical
expr: print_printer_state{job="printweb-otel"} == 3 and print_printer_online{job="printweb-otel"} == 1
expr: print_paper_remaining_percent{job="printweb-otel"} <= 5
for: 2m
labels:
severity: critical
alert_channel: thermal_print
annotations:
summary: "Print paper depleted on edge2"
description: "NuPrint 210 reports paper depleted while the printer is still online. Load a new roll, drain the hardware buffer if needed, then replay DeadLetter jobs from /print-log."
- alert: PrinterJamWarning
expr: print_printer_state{job="printweb-otel"} == 4
for: 2m
labels:
severity: warning
service: print-web
alert_channel: irc
annotations:
summary: "Print.Web printer jam on edge2"
description: "Print.Web reports a paper/cutter jam state. IRC-only: clear the jam, drain the hardware buffer if bytes were queued, then retry affected jobs."
- alert: PrinterHeadErrorCritical
expr: print_printer_state{job="printweb-otel"} == 5
for: 2m
labels:
severity: critical
service: print-web
alert_channel: thermal_print
annotations:
summary: "Print.Web printer head error on edge2"
description: "Print.Web reports a thermal head or unrecoverable printer error. Critical routing may enter the thermal digest per existing policy; IRC remains the primary triage stream."
- alert: PrinterCoverOpenWarning
expr: print_printer_state{job="printweb-otel"} == 6
for: 2m
labels:
severity: warning
service: print-web
alert_channel: irc
annotations:
summary: "Print.Web printer cover open on edge2"
description: "Print.Web reports the printer cover/lid is open. IRC-only: close the cover and verify /api/print/status before retrying jobs."
summary: "Print roll critical on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
description: "NuPrint 210 paper roll at {{ $value | printf \"%.1f\" }}% — load a new roll NOW. The 50ft roll has a ~12% red-stripe zone; once paper passes that, the printer can run dry mid-job."
- alert: PrintJobDeadLetter
expr: increase(print_jobs_dead_letter_total[15m]) > 0
@@ -3680,146 +3635,6 @@ data:
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
- orgId: 1
name: Print Services
folder: Print Alerts
interval: 1m
rules:
- uid: printer-offline-warning
title: PrinterOfflineWarning
condition: C
for: 2m
noDataState: OK
execErrState: OK
annotations:
summary: "Print.Web printer offline on edge2"
description: "Print.Web reports the NuPrint 210 transport is offline or unreachable. IRC-only by design: do not thermal-print an alert when the thermal printer itself is offline."
runbook: "1. Check edge2 power/network 2. Check USB/CUPS queue 3. Open https://print.iamworkin.lan/admin 4. Do not force thermal routing for offline alerts."
labels:
severity: warning
service: print-web
alert_channel: irc
data:
- refId: A
relativeTimeRange: {from: 120, to: 0}
datasourceUid: prometheus
model: {expr: 'print_printer_state{job="printweb-otel"} == 2', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: reduce, expression: A, reducer: last, refId: B}
- refId: C
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
- uid: print-paper-roll-critical
title: PrintPaperRollCritical
condition: C
for: 2m
noDataState: OK
execErrState: OK
annotations:
summary: "Print paper depleted on edge2"
description: "NuPrint 210 reports paper depleted while the printer is still online. Load a new roll, drain the hardware buffer if needed, then replay DeadLetter jobs from /print-log."
runbook: "1. Load a fresh roll 2. Drain the hardware buffer if paper-out happened mid-job 3. Open https://print.iamworkin.lan/print-log 4. Retry DeadLetter jobs after the state clears."
labels:
severity: critical
service: print-web
alert_channel: thermal_print
data:
- refId: A
relativeTimeRange: {from: 120, to: 0}
datasourceUid: prometheus
model: {expr: 'print_printer_state{job="printweb-otel"} == 3 and print_printer_online{job="printweb-otel"} == 1', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: reduce, expression: A, reducer: last, refId: B}
- refId: C
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
- uid: printer-jam-warning
title: PrinterJamWarning
condition: C
for: 2m
noDataState: OK
execErrState: OK
annotations:
summary: "Print.Web printer jam on edge2"
description: "Print.Web reports a paper/cutter jam state. IRC-only: clear the jam, drain the hardware buffer if bytes were queued, then retry affected jobs."
runbook: "1. Clear paper/cutter path 2. Drain hardware buffer if CUPS queued bytes 3. Verify /api/print/status 4. Retry affected jobs."
labels:
severity: warning
service: print-web
alert_channel: irc
data:
- refId: A
relativeTimeRange: {from: 120, to: 0}
datasourceUid: prometheus
model: {expr: 'print_printer_state{job="printweb-otel"} == 4', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: reduce, expression: A, reducer: last, refId: B}
- refId: C
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
- uid: printer-head-error-critical
title: PrinterHeadErrorCritical
condition: C
for: 2m
noDataState: OK
execErrState: OK
annotations:
summary: "Print.Web printer head error on edge2"
description: "Print.Web reports a thermal head or unrecoverable printer error. Critical routing may enter the thermal digest per existing policy; IRC remains the primary triage stream."
runbook: "1. Let the printer cool if overheated 2. Power-cycle only after checking queued jobs 3. Verify /api/print/status 4. Retry jobs after the state clears."
labels:
severity: critical
service: print-web
alert_channel: thermal_print
data:
- refId: A
relativeTimeRange: {from: 120, to: 0}
datasourceUid: prometheus
model: {expr: 'print_printer_state{job="printweb-otel"} == 5', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: reduce, expression: A, reducer: last, refId: B}
- refId: C
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
- uid: printer-cover-open-warning
title: PrinterCoverOpenWarning
condition: C
for: 2m
noDataState: OK
execErrState: OK
annotations:
summary: "Print.Web printer cover open on edge2"
description: "Print.Web reports the printer cover/lid is open. IRC-only: close the cover and verify /api/print/status before retrying jobs."
runbook: "1. Close the printer cover 2. Verify /api/print/status returns online 3. Retry affected jobs only after the state clears."
labels:
severity: warning
service: print-web
alert_channel: irc
data:
- refId: A
relativeTimeRange: {from: 120, to: 0}
datasourceUid: prometheus
model: {expr: 'print_printer_state{job="printweb-otel"} == 6', instant: true, refId: A}
- refId: B
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: reduce, expression: A, reducer: last, refId: B}
- refId: C
relativeTimeRange: {from: 120, to: 0}
datasourceUid: __expr__
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
- orgId: 1
name: CI Runners
folder: CI Alerts

View File

@@ -304,7 +304,7 @@ public sealed class FleetManifestLintTests
}
[Fact]
public void Monitoring_MustIncludeRequiredAlertRoutingGuards()
public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable()
{
var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml"));
@@ -315,15 +315,6 @@ public sealed class FleetManifestLintTests
monitoring.Should().Contain("folder: CI Alerts");
monitoring.Should().Contain("uid: linux-runner-offline");
monitoring.Should().Contain("alert_channel: irc");
monitoring.Should().Contain("PrinterOfflineWarning");
monitoring.Should().Contain("expr: print_printer_state{job=\"printweb-otel\"} == 2");
monitoring.Should().Contain("IRC-only by design: do not thermal-print an alert when the thermal printer itself is offline.");
monitoring.Should().Contain("PrintPaperRollCritical");
monitoring.Should().Contain("expr: print_printer_state{job=\"printweb-otel\"} == 3 and print_printer_online{job=\"printweb-otel\"} == 1");
monitoring.Should().Contain("PrinterJamWarning");
monitoring.Should().Contain("PrinterHeadErrorCritical");
monitoring.Should().Contain("PrinterCoverOpenWarning");
}
[Fact]
@@ -441,6 +432,7 @@ public sealed class FleetManifestLintTests
"1password-item.yaml",
"argocd-application.yaml",
"certificate-web.yaml",
"clusterissuer-step-ca-agent.yaml",
"clusterrole-operator.yaml",
"clusterrolebinding-operator.yaml",
"deployment-operator.yaml",
@@ -525,6 +517,53 @@ public sealed class FleetManifestLintTests
.ContainSingle("devices.iamworkin.lan");
}
// Pins the step-ca-agent StepClusterIssuer to its expected API version, CA URL,
// a non-empty CA bundle, and the provisioner name/kid.
[Fact]
public void FcDeviceManagement_StepCaAgentIssuerMustTargetNocProvisioner()
{
    const string issuerKind = "StepClusterIssuer";
    const string issuerName = "step-ca-agent";

    // Single(...) throws unless exactly one document matches kind and name.
    var issuer = FcDeviceManagementDocuments()
        .Single(doc => doc.Kind == issuerKind && doc.Name == issuerName);

    issuer.Scalar("apiVersion")
        .Should()
        .Be("certmanager.step.sm/v1beta1");
    issuer.Scalar("spec", "url")
        .Should()
        .Be("https://10.0.56.10:9443");

    // Only presence is checked here; the bundle contents are not validated.
    issuer.Scalar("spec", "caBundle")
        .Should()
        .NotBeNullOrWhiteSpace();

    issuer.Scalar("spec", "provisioner", "name")
        .Should()
        .Be("step-ca-agent");
    issuer.Scalar("spec", "provisioner", "kid")
        .Should()
        .Be("RF3A9welUYVOWBX8tr19aWyA2kQlxoGZN1dRwTElUEM");
}
// Verifies the issuer points at the provisioner password Secret by reference
// (name/namespace/key) and that the raw manifest text carries no inline secret markers.
[Fact]
public void FcDeviceManagement_StepCaAgentIssuerMustReferencePasswordSecretOnly()
{
    const string issuerKind = "StepClusterIssuer";
    const string issuerName = "step-ca-agent";

    // Single(...) throws unless exactly one document matches kind and name.
    var issuer = FcDeviceManagementDocuments()
        .Single(doc => doc.Kind == issuerKind && doc.Name == issuerName);

    issuer.Scalar("spec", "provisioner", "passwordRef", "name")
        .Should()
        .Be("step-ca-agent-provisioner-password");
    issuer.Scalar("spec", "provisioner", "passwordRef", "namespace")
        .Should()
        .Be("cert-manager");
    issuer.Scalar("spec", "provisioner", "passwordRef", "key")
        .Should()
        .Be("password");

    // Second pass over the raw file text: a plain substring scan, so comments
    // in the manifest are subject to the same checks as YAML keys.
    var issuerPath = Path.Combine(
        Inventory.BluejayRoot,
        "apps",
        "fc-devicemgmt",
        "clusterissuer-step-ca-agent.yaml");
    var issuerText = File.ReadAllText(issuerPath);

    // Checked in this order; the first hit fails the test.
    var forbiddenMarkers = new[] { "stringData:", "password:", "privateKey" };
    foreach (var marker in forbiddenMarkers)
    {
        issuerText.Should().NotContain(marker);
    }
}
// Checks the ownership/traceability labels and annotations on the
// step-ca-agent StepClusterIssuer against their pinned values.
[Fact]
public void FcDeviceManagement_StepCaAgentIssuerMustCarryTraceabilityMetadata()
{
    const string issuerKind = "StepClusterIssuer";
    const string issuerName = "step-ca-agent";

    // Single(...) throws unless exactly one document matches kind and name.
    var issuer = FcDeviceManagementDocuments()
        .Single(doc => doc.Kind == issuerKind && doc.Name == issuerName);

    // Labels.
    issuer.Scalar("metadata", "labels", "app.kubernetes.io/managed-by")
        .Should()
        .Be("argocd");
    issuer.Scalar("metadata", "labels", "flowercore.io/tenant-id")
        .Should()
        .Be("system");

    // Annotations.
    issuer.Scalar("metadata", "annotations", "flowercore.io/provisioner-source")
        .Should()
        .Be("profile::pki::stepca");
    issuer.Scalar("metadata", "annotations", "flowercore.io/secret-source")
        .Should()
        .Be("cert-manager/step-ca-agent-provisioner-password");
}
[Fact]
public void FcDeviceManagement_OperatorRbacMustCoverDevicesAndOwnerLookup()
{