Compare commits
9 Commits
sprint40/c
...
1f1f6823db
| Author | SHA1 | Date | |
|---|---|---|---|
| 1f1f6823db | |||
|
|
b92f74b63a | ||
|
|
cb7f7dbc4d | ||
|
|
03126d5584 | ||
|
|
495e884c41 | ||
|
|
65aa1e6104 | ||
|
|
7f2a3b76b4 | ||
| ea73f00461 | |||
|
|
25ace30a03 |
448
apps/authentik/authentik.yaml
Normal file
448
apps/authentik/authentik.yaml
Normal file
@@ -0,0 +1,448 @@
|
|||||||
|
# Authentik OIDC backend
|
||||||
|
# ArgoCD-managed. BlueJay Lab.
|
||||||
|
#
|
||||||
|
# Stack:
|
||||||
|
# - PostgreSQL 16 StatefulSet (single replica, Longhorn RWO 5Gi)
|
||||||
|
# - Redis 7 Deployment (no persistence — session/cache only)
|
||||||
|
# - Authentik server + worker Deployments (image ghcr.io/goauthentik/server:2024.12.3)
|
||||||
|
# - Media PVC shared between server + worker (Longhorn RWO 2Gi)
|
||||||
|
# - Certificate via step-ca-acme ClusterIssuer
|
||||||
|
# - Traefik IngressRoute at id.iamworkin.lan
|
||||||
|
#
|
||||||
|
# Secrets come from 1Password item "authentik-credentials" (IAmWorkin vault, id y6i74ch22q5wvm7znquq4nhhcu)
|
||||||
|
# via the OnePasswordItem CRD, materialized into k8s Secret authentik/authentik-credentials.
|
||||||
|
#
|
||||||
|
# Why the discovery URL is /application/o/pimanager/ : Authentik issues per-application OIDC providers.
|
||||||
|
# The pimanager OIDC application/provider is created after the cluster pods are healthy (manual or
|
||||||
|
# via API once the bootstrap token is available — see Notes substrate).
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: authentik
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/part-of: bluejay-infra
|
||||||
|
|
||||||
|
---
|
||||||
|
# 1Password operator pulls the authentik-credentials item into a k8s Secret of the same name.
|
||||||
|
# Field labels in 1P become Secret keys: AUTHENTIK_SECRET_KEY, POSTGRES_PASSWORD, REDIS_PASSWORD,
|
||||||
|
# BOOTSTRAP_ADMIN_PASSWORD, BOOTSTRAP_ADMIN_TOKEN, BOOTSTRAP_ADMIN_EMAIL.
|
||||||
|
apiVersion: onepassword.com/v1
|
||||||
|
kind: OnePasswordItem
|
||||||
|
metadata:
|
||||||
|
name: authentik-credentials
|
||||||
|
namespace: authentik
|
||||||
|
spec:
|
||||||
|
itemPath: "vaults/IAmWorkin/items/authentik-credentials"
|
||||||
|
|
||||||
|
---
|
||||||
|
# Shared media volume for server + worker pods.
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: authentik-media
|
||||||
|
namespace: authentik
|
||||||
|
spec:
|
||||||
|
storageClassName: longhorn
|
||||||
|
accessModes: [ReadWriteOnce]
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 2Gi
|
||||||
|
|
||||||
|
---
|
||||||
|
# PostgreSQL 16 StatefulSet — Authentik's primary store.
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: StatefulSet
|
||||||
|
metadata:
|
||||||
|
name: authentik-postgres
|
||||||
|
namespace: authentik
|
||||||
|
labels:
|
||||||
|
app: authentik-postgres
|
||||||
|
argocd.argoproj.io/instance: infra-authentik
|
||||||
|
spec:
|
||||||
|
persistentVolumeClaimRetentionPolicy:
|
||||||
|
whenDeleted: Retain
|
||||||
|
whenScaled: Retain
|
||||||
|
podManagementPolicy: OrderedReady
|
||||||
|
serviceName: authentik-postgres
|
||||||
|
replicas: 1
|
||||||
|
revisionHistoryLimit: 10
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: authentik-postgres
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: authentik-postgres
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: postgres
|
||||||
|
image: postgres:16-alpine
|
||||||
|
ports:
|
||||||
|
- containerPort: 5432
|
||||||
|
name: postgres
|
||||||
|
env:
|
||||||
|
- name: POSTGRES_USER
|
||||||
|
value: authentik
|
||||||
|
- name: POSTGRES_PASSWORD
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: authentik-credentials
|
||||||
|
key: POSTGRES_PASSWORD
|
||||||
|
- name: POSTGRES_DB
|
||||||
|
value: authentik
|
||||||
|
- name: POSTGRES_INITDB_ARGS
|
||||||
|
value: "--encoding=UTF-8 --lc-collate=C --lc-ctype=C"
|
||||||
|
- name: PGDATA
|
||||||
|
value: /var/lib/postgresql/data/pgdata
|
||||||
|
readinessProbe:
|
||||||
|
exec:
|
||||||
|
command: ["pg_isready", "-U", "authentik"]
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
livenessProbe:
|
||||||
|
exec:
|
||||||
|
command: ["pg_isready", "-U", "authentik"]
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 30
|
||||||
|
resources:
|
||||||
|
requests: { cpu: 100m, memory: 256Mi }
|
||||||
|
limits: { cpu: 1000m, memory: 1Gi }
|
||||||
|
volumeMounts:
|
||||||
|
- name: pgdata
|
||||||
|
mountPath: /var/lib/postgresql/data
|
||||||
|
volumeClaimTemplates:
|
||||||
|
- metadata:
|
||||||
|
name: pgdata
|
||||||
|
spec:
|
||||||
|
storageClassName: longhorn
|
||||||
|
accessModes: [ReadWriteOnce]
|
||||||
|
volumeMode: Filesystem
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 5Gi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: authentik-postgres
|
||||||
|
namespace: authentik
|
||||||
|
spec:
|
||||||
|
clusterIP: None
|
||||||
|
selector:
|
||||||
|
app: authentik-postgres
|
||||||
|
ports:
|
||||||
|
- name: postgres
|
||||||
|
port: 5432
|
||||||
|
targetPort: 5432
|
||||||
|
|
||||||
|
---
|
||||||
|
# Redis 7 — session storage + Celery broker. No persistence needed (cache).
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: authentik-redis
|
||||||
|
namespace: authentik
|
||||||
|
labels:
|
||||||
|
app: authentik-redis
|
||||||
|
argocd.argoproj.io/instance: infra-authentik
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
strategy:
|
||||||
|
type: Recreate
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: authentik-redis
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: authentik-redis
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: redis
|
||||||
|
image: redis:7-alpine
|
||||||
|
args:
|
||||||
|
- "--save"
|
||||||
|
- ""
|
||||||
|
- "--appendonly"
|
||||||
|
- "no"
|
||||||
|
- "--requirepass"
|
||||||
|
- "$(REDIS_PASSWORD)"
|
||||||
|
env:
|
||||||
|
- name: REDIS_PASSWORD
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: authentik-credentials
|
||||||
|
key: REDIS_PASSWORD
|
||||||
|
ports:
|
||||||
|
- containerPort: 6379
|
||||||
|
name: redis
|
||||||
|
readinessProbe:
|
||||||
|
tcpSocket: { port: 6379 }
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
livenessProbe:
|
||||||
|
tcpSocket: { port: 6379 }
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 30
|
||||||
|
resources:
|
||||||
|
requests: { cpu: 50m, memory: 64Mi }
|
||||||
|
limits: { cpu: 500m, memory: 256Mi }
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: authentik-redis
|
||||||
|
namespace: authentik
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
app: authentik-redis
|
||||||
|
ports:
|
||||||
|
- name: redis
|
||||||
|
port: 6379
|
||||||
|
targetPort: 6379
|
||||||
|
|
||||||
|
---
|
||||||
|
# Authentik server Deployment — HTTP frontend on :9000.
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: authentik-server
|
||||||
|
namespace: authentik
|
||||||
|
labels:
|
||||||
|
app: authentik-server
|
||||||
|
argocd.argoproj.io/instance: infra-authentik
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
strategy:
|
||||||
|
type: Recreate # shares /media RWO PVC with worker
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: authentik-server
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: authentik-server
|
||||||
|
spec:
|
||||||
|
securityContext:
|
||||||
|
# Authentik image runs as uid 1000 "authentik" but the Longhorn PVC mounts
|
||||||
|
# root:root by default. fsGroup recursively chgrp + chmod g+rwx so the
|
||||||
|
# non-root container can mkdir /media/public during the tenant_files migration.
|
||||||
|
fsGroup: 1000
|
||||||
|
containers:
|
||||||
|
- name: server
|
||||||
|
image: ghcr.io/goauthentik/server:2024.12.3
|
||||||
|
args: ["server"]
|
||||||
|
ports:
|
||||||
|
- containerPort: 9000
|
||||||
|
name: http
|
||||||
|
- containerPort: 9443
|
||||||
|
name: https
|
||||||
|
env:
|
||||||
|
- name: AUTHENTIK_SECRET_KEY
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: authentik-credentials
|
||||||
|
key: AUTHENTIK_SECRET_KEY
|
||||||
|
- name: AUTHENTIK_REDIS__HOST
|
||||||
|
value: authentik-redis
|
||||||
|
- name: AUTHENTIK_REDIS__PASSWORD
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: authentik-credentials
|
||||||
|
key: REDIS_PASSWORD
|
||||||
|
- name: AUTHENTIK_POSTGRESQL__HOST
|
||||||
|
value: authentik-postgres
|
||||||
|
- name: AUTHENTIK_POSTGRESQL__NAME
|
||||||
|
value: authentik
|
||||||
|
- name: AUTHENTIK_POSTGRESQL__USER
|
||||||
|
value: authentik
|
||||||
|
- name: AUTHENTIK_POSTGRESQL__PASSWORD
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: authentik-credentials
|
||||||
|
key: POSTGRES_PASSWORD
|
||||||
|
- name: AUTHENTIK_BOOTSTRAP_PASSWORD
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: authentik-credentials
|
||||||
|
key: BOOTSTRAP_ADMIN_PASSWORD
|
||||||
|
- name: AUTHENTIK_BOOTSTRAP_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: authentik-credentials
|
||||||
|
key: BOOTSTRAP_ADMIN_TOKEN
|
||||||
|
- name: AUTHENTIK_BOOTSTRAP_EMAIL
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: authentik-credentials
|
||||||
|
key: BOOTSTRAP_ADMIN_EMAIL
|
||||||
|
- name: AUTHENTIK_DISABLE_UPDATE_CHECK
|
||||||
|
value: "true"
|
||||||
|
- name: AUTHENTIK_ERROR_REPORTING__ENABLED
|
||||||
|
value: "false"
|
||||||
|
- name: AUTHENTIK_LOG_LEVEL
|
||||||
|
value: info
|
||||||
|
# First-boot Authentik can take 3+ min on the migration phase
|
||||||
|
# (waiting on DB lock while worker also runs migrations). Initial
|
||||||
|
# delays are generous so kubelet doesn't kill the pod mid-migration;
|
||||||
|
# periodSeconds keeps post-startup probing responsive.
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /-/health/ready/
|
||||||
|
port: 9000
|
||||||
|
initialDelaySeconds: 60
|
||||||
|
periodSeconds: 10
|
||||||
|
timeoutSeconds: 5
|
||||||
|
failureThreshold: 12
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /-/health/live/
|
||||||
|
port: 9000
|
||||||
|
initialDelaySeconds: 300
|
||||||
|
periodSeconds: 30
|
||||||
|
timeoutSeconds: 10
|
||||||
|
failureThreshold: 3
|
||||||
|
startupProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /-/health/live/
|
||||||
|
port: 9000
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 15
|
||||||
|
timeoutSeconds: 10
|
||||||
|
failureThreshold: 40 # 30s + 40*15s = 10.5 min budget
|
||||||
|
resources:
|
||||||
|
requests: { cpu: 150m, memory: 512Mi }
|
||||||
|
limits: { cpu: 1500m, memory: 1Gi }
|
||||||
|
volumeMounts:
|
||||||
|
- name: media
|
||||||
|
mountPath: /media
|
||||||
|
volumes:
|
||||||
|
- name: media
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: authentik-media
|
||||||
|
|
||||||
|
---
|
||||||
|
# Authentik worker Deployment — runs Celery background tasks.
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: authentik-worker
|
||||||
|
namespace: authentik
|
||||||
|
labels:
|
||||||
|
app: authentik-worker
|
||||||
|
argocd.argoproj.io/instance: infra-authentik
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
strategy:
|
||||||
|
type: Recreate # shares /media RWO PVC with server
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: authentik-worker
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: authentik-worker
|
||||||
|
spec:
|
||||||
|
securityContext:
|
||||||
|
# Same as server pod — non-root uid 1000 needs PVC group write.
|
||||||
|
fsGroup: 1000
|
||||||
|
containers:
|
||||||
|
- name: worker
|
||||||
|
image: ghcr.io/goauthentik/server:2024.12.3
|
||||||
|
args: ["worker"]
|
||||||
|
env:
|
||||||
|
- name: AUTHENTIK_SECRET_KEY
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: authentik-credentials
|
||||||
|
key: AUTHENTIK_SECRET_KEY
|
||||||
|
- name: AUTHENTIK_REDIS__HOST
|
||||||
|
value: authentik-redis
|
||||||
|
- name: AUTHENTIK_REDIS__PASSWORD
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: authentik-credentials
|
||||||
|
key: REDIS_PASSWORD
|
||||||
|
- name: AUTHENTIK_POSTGRESQL__HOST
|
||||||
|
value: authentik-postgres
|
||||||
|
- name: AUTHENTIK_POSTGRESQL__NAME
|
||||||
|
value: authentik
|
||||||
|
- name: AUTHENTIK_POSTGRESQL__USER
|
||||||
|
value: authentik
|
||||||
|
- name: AUTHENTIK_POSTGRESQL__PASSWORD
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: authentik-credentials
|
||||||
|
key: POSTGRES_PASSWORD
|
||||||
|
- name: AUTHENTIK_DISABLE_UPDATE_CHECK
|
||||||
|
value: "true"
|
||||||
|
- name: AUTHENTIK_ERROR_REPORTING__ENABLED
|
||||||
|
value: "false"
|
||||||
|
- name: AUTHENTIK_LOG_LEVEL
|
||||||
|
value: info
|
||||||
|
resources:
|
||||||
|
requests: { cpu: 100m, memory: 256Mi }
|
||||||
|
limits: { cpu: 1000m, memory: 768Mi }
|
||||||
|
volumeMounts:
|
||||||
|
- name: media
|
||||||
|
mountPath: /media
|
||||||
|
volumes:
|
||||||
|
- name: media
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: authentik-media
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: authentik-server
|
||||||
|
namespace: authentik
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
app: authentik-server
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
port: 9000
|
||||||
|
targetPort: 9000
|
||||||
|
- name: https
|
||||||
|
port: 9443
|
||||||
|
targetPort: 9443
|
||||||
|
|
||||||
|
---
|
||||||
|
# step-ca leaf certificate for id.iamworkin.lan.
|
||||||
|
# step-ca container resolver uses pfSense Unbound, so the public A record for id.iamworkin.lan
|
||||||
|
# MUST exist before this Certificate is applied (cert-manager HTTP-01 will silently 2h-backoff
|
||||||
|
# otherwise). Added 2026-05-25 via scripts/pfsense-add-id-host.py.
|
||||||
|
apiVersion: cert-manager.io/v1
|
||||||
|
kind: Certificate
|
||||||
|
metadata:
|
||||||
|
name: authentik-tls
|
||||||
|
namespace: authentik
|
||||||
|
spec:
|
||||||
|
secretName: authentik-tls
|
||||||
|
dnsNames:
|
||||||
|
- id.iamworkin.lan
|
||||||
|
issuerRef:
|
||||||
|
name: step-ca-acme
|
||||||
|
kind: ClusterIssuer
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: traefik.io/v1alpha1
|
||||||
|
kind: IngressRoute
|
||||||
|
metadata:
|
||||||
|
name: authentik
|
||||||
|
namespace: authentik
|
||||||
|
spec:
|
||||||
|
entryPoints: [websecure]
|
||||||
|
routes:
|
||||||
|
- match: Host(`id.iamworkin.lan`)
|
||||||
|
kind: Rule
|
||||||
|
services:
|
||||||
|
- name: authentik-server
|
||||||
|
port: 9000
|
||||||
|
tls:
|
||||||
|
secretName: authentik-tls
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
# Explicit ArgoCD Application shape for bootstrap/review.
|
|
||||||
#
|
|
||||||
# The live bluejay-infra ApplicationSet already discovers apps/* directories
|
|
||||||
# and creates this same Application name (`infra-fc-devicemgmt`) automatically.
|
|
||||||
# Keep repoURL on the internal Gitea ClusterIP URL; ArgoCD does not trust the
|
|
||||||
# external step-ca HTTPS endpoint.
|
|
||||||
apiVersion: argoproj.io/v1alpha1
|
|
||||||
kind: Application
|
|
||||||
metadata:
|
|
||||||
name: infra-fc-devicemgmt
|
|
||||||
namespace: argocd
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: fc-devicemgmt
|
|
||||||
app.kubernetes.io/part-of: flowercore
|
|
||||||
app.kubernetes.io/managed-by: argocd
|
|
||||||
flowercore.io/tenant-id: system
|
|
||||||
flowercore.io/created-by: bluejay-infra
|
|
||||||
spec:
|
|
||||||
project: default
|
|
||||||
source:
|
|
||||||
repoURL: http://gitea-clusterip.gitea.svc.cluster.local:3000/bluejay/bluejay-infra.git
|
|
||||||
targetRevision: main
|
|
||||||
path: apps/fc-devicemgmt
|
|
||||||
destination:
|
|
||||||
server: https://kubernetes.default.svc
|
|
||||||
namespace: fc-devicemgmt
|
|
||||||
syncPolicy:
|
|
||||||
automated:
|
|
||||||
prune: true
|
|
||||||
selfHeal: true
|
|
||||||
syncOptions:
|
|
||||||
- CreateNamespace=true
|
|
||||||
- ServerSideApply=true
|
|
||||||
2
apps/github-runner/.gitattributes
vendored
Normal file
2
apps/github-runner/.gitattributes
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
*.sh text eol=lf
|
||||||
|
Dockerfile text eol=lf
|
||||||
44
apps/github-runner/Dockerfile
Normal file
44
apps/github-runner/Dockerfile
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
FROM myoung34/github-runner:latest
|
||||||
|
|
||||||
|
ARG RUBY_VERSION=3.3.11
|
||||||
|
ARG RUBY_MINOR=3.3
|
||||||
|
ARG RUBY_BUILD_VERSION=v20260326
|
||||||
|
ARG RUNNER_UID=1001
|
||||||
|
ARG RUNNER_GID=1001
|
||||||
|
|
||||||
|
ENV RUNNER_TOOL_CACHE=/home/runner/_tool
|
||||||
|
ENV RUNNER_RUBY_TOOLCACHE=/opt/runner-toolcache
|
||||||
|
ENV PATH="/home/runner/_tool/Ruby/${RUBY_MINOR}/x64/bin:/opt/runner-toolcache/Ruby/${RUBY_MINOR}/x64/bin:${PATH}"
|
||||||
|
|
||||||
|
USER root
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||||
|
autoconf \
|
||||||
|
bison \
|
||||||
|
build-essential \
|
||||||
|
ca-certificates \
|
||||||
|
curl \
|
||||||
|
libdb-dev \
|
||||||
|
libffi-dev \
|
||||||
|
libgdbm-dev \
|
||||||
|
libgmp-dev \
|
||||||
|
libncurses-dev \
|
||||||
|
libreadline-dev \
|
||||||
|
libssl-dev \
|
||||||
|
libyaml-dev \
|
||||||
|
patch \
|
||||||
|
pkg-config \
|
||||||
|
uuid-dev \
|
||||||
|
zlib1g-dev \
|
||||||
|
&& curl -fsSL "https://github.com/rbenv/ruby-build/archive/refs/tags/${RUBY_BUILD_VERSION}.tar.gz" -o /tmp/ruby-build.tar.gz \
|
||||||
|
&& mkdir -p /tmp/ruby-build \
|
||||||
|
&& tar -xzf /tmp/ruby-build.tar.gz --strip-components=1 -C /tmp/ruby-build \
|
||||||
|
&& /tmp/ruby-build/install.sh \
|
||||||
|
&& rm -rf /tmp/ruby-build /tmp/ruby-build.tar.gz /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
COPY install-ruby-toolcache.sh /usr/local/bin/install-ruby-toolcache.sh
|
||||||
|
|
||||||
|
RUN chmod +x /usr/local/bin/install-ruby-toolcache.sh \
|
||||||
|
&& RUBY_VERSION="${RUBY_VERSION}" RUBY_MINOR="${RUBY_MINOR}" TOOLCACHE_ROOT="${RUNNER_RUBY_TOOLCACHE}" RUNNER_UID="${RUNNER_UID}" RUNNER_GID="${RUNNER_GID}" /usr/local/bin/install-ruby-toolcache.sh \
|
||||||
|
&& ruby -v
|
||||||
@@ -7,12 +7,17 @@ Deployments with `kubectl`; update this manifest and let ArgoCD reconcile.
|
|||||||
|
|
||||||
All repo-scoped Linux runners use:
|
All repo-scoped Linux runners use:
|
||||||
|
|
||||||
|
- `localhost/fc-github-runner:v20260520-ruby3.3.11`, derived from
|
||||||
|
`myoung34/github-runner:latest`
|
||||||
- `ACCESS_TOKEN` from the `github-runner-token` Secret
|
- `ACCESS_TOKEN` from the `github-runner-token` Secret
|
||||||
- `RUN_AS_ROOT=false`
|
- `RUN_AS_ROOT=false`
|
||||||
- `EPHEMERAL=true`
|
- `EPHEMERAL=true`
|
||||||
- `LABELS=self-hosted,linux,fc-build-linux`
|
- `LABELS=self-hosted,linux,fc-build-linux`
|
||||||
- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
|
- writable non-root paths under `/home/runner` for .NET, NuGet, XDG cache, and
|
||||||
Actions tool cache
|
Actions tool cache
|
||||||
|
- Ruby 3.3.11 seeded into `/home/runner/_tool/Ruby/3.3/x64` from the baked
|
||||||
|
`/opt/runner-toolcache` copy so `ruby/setup-ruby@v1` can discover it on
|
||||||
|
self-hosted `ubuntu-20.04-x64` runners
|
||||||
|
|
||||||
`github-runner` for `FlowerCore.Common` is single-replica because it retains the
|
`github-runner` for `FlowerCore.Common` is single-replica because it retains the
|
||||||
original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
|
original Longhorn ReadWriteOnce NuGet PVC. Every other repo-scoped runner uses
|
||||||
@@ -28,6 +33,34 @@ Sprint 32 final long-tail wave adds 16 two-replica Deployments:
|
|||||||
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
|
`FlowerCore.Provisioning`, `FlowerCore.Redis`, `FlowerCore.MessageBoard`, and
|
||||||
`FlowerCore.MenuBoard`.
|
`FlowerCore.MenuBoard`.
|
||||||
|
|
||||||
|
## Image Build
|
||||||
|
|
||||||
|
Ruby is baked with a pinned `ruby-build` release and Ruby patch version. The pod
|
||||||
|
still mounts an `emptyDir` over `/home/runner`, so the `setup-runner-home` init
|
||||||
|
container copies the baked toolcache from `/opt/runner-toolcache/Ruby` into
|
||||||
|
`/home/runner/_tool/Ruby` before the runner container starts.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd apps/github-runner
|
||||||
|
podman build -t localhost/fc-github-runner:v20260520-ruby3.3.11 .
|
||||||
|
podman run --rm localhost/fc-github-runner:v20260520-ruby3.3.11 ruby -v
|
||||||
|
podman run --rm localhost/fc-github-runner:v20260520-ruby3.3.11 \
|
||||||
|
test -f /opt/runner-toolcache/Ruby/3.3/x64.complete
|
||||||
|
podman save localhost/fc-github-runner:v20260520-ruby3.3.11 \
|
||||||
|
-o fc-github-runner-v20260520-ruby3.3.11.tar
|
||||||
|
```
|
||||||
|
|
||||||
|
Import the saved image on every schedulable RKE2 node before ArgoCD rolls the
|
||||||
|
Deployments:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
for node in rke2-server rke2-agent1 rke2-agent2; do
|
||||||
|
scp fc-github-runner-v20260520-ruby3.3.11.tar "$node:/tmp/"
|
||||||
|
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images rm localhost/fc-github-runner:v20260520-ruby3.3.11 || true'
|
||||||
|
ssh "$node" 'sudo ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-github-runner-v20260520-ruby3.3.11.tar'
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
## Post-Merge Proof
|
## Post-Merge Proof
|
||||||
|
|
||||||
After the PR is merged and ArgoCD syncs, verify the runner fleet:
|
After the PR is merged and ArgoCD syncs, verify the runner fleet:
|
||||||
@@ -36,6 +69,14 @@ After the PR is merged and ArgoCD syncs, verify the runner fleet:
|
|||||||
kubectl -n github-runner get deploy,pods,pvc
|
kubectl -n github-runner get deploy,pods,pvc
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Verify the Ruby toolcache in a fresh pod:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- ruby -v
|
||||||
|
kubectl -n github-runner exec deploy/github-runner-puppet -c runner -- sh -c \
|
||||||
|
'echo "$RUNNER_TOOL_CACHE" && test -f "$RUNNER_TOOL_CACHE/Ruby/3.3/x64.complete"'
|
||||||
|
```
|
||||||
|
|
||||||
Verify GitHub registration for the repo-scoped runners:
|
Verify GitHub registration for the repo-scoped runners:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -69,6 +110,10 @@ from GitHub Actions and verify it lands on an `rke2-linux-*` runner.
|
|||||||
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
|
- `actions/setup-dotnet` permission error at `/usr/share/dotnet`: check that
|
||||||
`DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
|
`DOTNET_INSTALL_DIR=/home/runner/.dotnet` and related cache env vars are
|
||||||
present on the runner pod.
|
present on the runner pod.
|
||||||
|
- `ruby/setup-ruby@v1` says self-hosted runners must install Ruby in
|
||||||
|
`$RUNNER_TOOL_CACHE`: check that the init container copied
|
||||||
|
`/opt/runner-toolcache/Ruby` into `/home/runner/_tool/Ruby` and that
|
||||||
|
`/home/runner/_tool/Ruby/3.3/x64.complete` exists.
|
||||||
- `404` during runner registration: the fine-grained PAT is valid but missing
|
- `404` during runner registration: the fine-grained PAT is valid but missing
|
||||||
repository access for that repo. Add the repo to the PAT access list; the PAT
|
repository access for that repo. Add the repo to the PAT access list; the PAT
|
||||||
value does not change.
|
value does not change.
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
19
apps/github-runner/install-ruby-toolcache.sh
Normal file
19
apps/github-runner/install-ruby-toolcache.sh
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
RUBY_VERSION="${RUBY_VERSION:-3.3.11}"
|
||||||
|
RUBY_MINOR="${RUBY_MINOR:-3.3}"
|
||||||
|
TOOLCACHE_ROOT="${TOOLCACHE_ROOT:-/opt/runner-toolcache}"
|
||||||
|
RUNNER_UID="${RUNNER_UID:-1001}"
|
||||||
|
RUNNER_GID="${RUNNER_GID:-1001}"
|
||||||
|
RUBY_PREFIX="${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64"
|
||||||
|
|
||||||
|
mkdir -p "${TOOLCACHE_ROOT}/Ruby"
|
||||||
|
RUBY_CONFIGURE_OPTS="${RUBY_CONFIGURE_OPTS:---disable-install-doc --disable-yjit}" ruby-build "${RUBY_VERSION}" "${RUBY_PREFIX}"
|
||||||
|
|
||||||
|
touch "${TOOLCACHE_ROOT}/Ruby/${RUBY_VERSION}/x64.complete"
|
||||||
|
ln -sfn "${RUBY_VERSION}" "${TOOLCACHE_ROOT}/Ruby/${RUBY_MINOR}"
|
||||||
|
|
||||||
|
"${RUBY_PREFIX}/bin/ruby" -v
|
||||||
|
chown -R "${RUNNER_UID}:${RUNNER_GID}" "${TOOLCACHE_ROOT}"
|
||||||
|
chmod -R a+rX "${TOOLCACHE_ROOT}"
|
||||||
@@ -280,13 +280,14 @@ data:
|
|||||||
printer_model: "NuPrint 210"
|
printer_model: "NuPrint 210"
|
||||||
|
|
||||||
# Print.Web health (Blazor app on edge2:5200)
|
# Print.Web health (Blazor app on edge2:5200)
|
||||||
|
# Target `/health` (anonymous) — root path requires API key auth and returns 401.
|
||||||
- job_name: "probe-printweb"
|
- job_name: "probe-printweb"
|
||||||
metrics_path: /probe
|
metrics_path: /probe
|
||||||
params:
|
params:
|
||||||
module: [http_2xx]
|
module: [http_2xx]
|
||||||
scrape_interval: 30s
|
scrape_interval: 30s
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ["http://10.0.57.16:5200/"]
|
- targets: ["http://10.0.57.16:5200/health"]
|
||||||
labels:
|
labels:
|
||||||
instance: "print-web"
|
instance: "print-web"
|
||||||
service: "print-web"
|
service: "print-web"
|
||||||
@@ -656,15 +657,14 @@ data:
|
|||||||
summary: "Print queue backlog on edge2 ({{ $value }} active jobs)"
|
summary: "Print queue backlog on edge2 ({{ $value }} active jobs)"
|
||||||
description: "CUPS has {{ $value }} active jobs queued. Possible printer jam, USB disconnect, or paper out."
|
description: "CUPS has {{ $value }} active jobs queued. Possible printer jam, USB disconnect, or paper out."
|
||||||
|
|
||||||
# Printer hardware and paper-roll lifecycle alerts.
|
# Paper roll lifecycle alerts (XL Track I, 2026-04-26).
|
||||||
# print_printer_online: 1 when the transport is reachable/selected.
|
# Source-of-truth gauge: print_paper_remaining_percent (Print.Web OTEL,
|
||||||
# print_printer_state enum: 0 unknown, 1 online, 2 offline,
|
# hydrated on startup from the active PaperRoll row).
|
||||||
# 3 paper_depleted, 4 jam, 5 head_error, 6 cover_open.
|
# alert_channel=thermal_print routes through irc-notify -> Print.Web
|
||||||
# Offline/jam/cover alerts stay IRC-only. Paper depleted and head
|
# /api/print/alert so the printer announces its own paper-out warning
|
||||||
# error may route to the thermal digest only when the printer is
|
# on its remaining paper. Self-referential humor + operator nudge.
|
||||||
# online enough to make that useful.
|
|
||||||
- alert: PrintPaperRollLow
|
- alert: PrintPaperRollLow
|
||||||
expr: (print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5) and print_printer_online{job="printweb-otel"} == 1
|
expr: print_paper_remaining_percent{job="printweb-otel"} < 10 and print_paper_remaining_percent{job="printweb-otel"} > 5
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -673,59 +673,15 @@ data:
|
|||||||
summary: "Print roll low on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
|
summary: "Print roll low on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
|
||||||
description: "NuPrint 210 paper roll has {{ $value | printf \"%.1f\" }}% remaining. Operator should load a fresh roll soon. Run /api/paper/status for the precise mm + estimated jobs left."
|
description: "NuPrint 210 paper roll has {{ $value | printf \"%.1f\" }}% remaining. Operator should load a fresh roll soon. Run /api/paper/status for the precise mm + estimated jobs left."
|
||||||
|
|
||||||
- alert: PrinterOfflineWarning
|
|
||||||
expr: print_printer_state{job="printweb-otel"} == 2
|
|
||||||
for: 2m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
service: print-web
|
|
||||||
alert_channel: irc
|
|
||||||
annotations:
|
|
||||||
summary: "Print.Web printer offline on edge2"
|
|
||||||
description: "Print.Web reports the NuPrint 210 transport is offline or unreachable. IRC-only by design: do not thermal-print an alert when the thermal printer itself is offline."
|
|
||||||
|
|
||||||
- alert: PrintPaperRollCritical
|
- alert: PrintPaperRollCritical
|
||||||
expr: print_printer_state{job="printweb-otel"} == 3 and print_printer_online{job="printweb-otel"} == 1
|
expr: print_paper_remaining_percent{job="printweb-otel"} <= 5
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
alert_channel: thermal_print
|
alert_channel: thermal_print
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Print paper depleted on edge2"
|
summary: "Print roll critical on edge2 ({{ $value | printf \"%.1f\" }}% remaining)"
|
||||||
description: "NuPrint 210 reports paper depleted while the printer is still online. Load a new roll, drain the hardware buffer if needed, then replay DeadLetter jobs from /print-log."
|
description: "NuPrint 210 paper roll at {{ $value | printf \"%.1f\" }}% — load a new roll NOW. The 50ft roll has a ~12% red-stripe zone; once paper passes that, the printer can run dry mid-job."
|
||||||
|
|
||||||
- alert: PrinterJamWarning
|
|
||||||
expr: print_printer_state{job="printweb-otel"} == 4
|
|
||||||
for: 2m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
service: print-web
|
|
||||||
alert_channel: irc
|
|
||||||
annotations:
|
|
||||||
summary: "Print.Web printer jam on edge2"
|
|
||||||
description: "Print.Web reports a paper/cutter jam state. IRC-only: clear the jam, drain the hardware buffer if bytes were queued, then retry affected jobs."
|
|
||||||
|
|
||||||
- alert: PrinterHeadErrorCritical
|
|
||||||
expr: print_printer_state{job="printweb-otel"} == 5
|
|
||||||
for: 2m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
service: print-web
|
|
||||||
alert_channel: thermal_print
|
|
||||||
annotations:
|
|
||||||
summary: "Print.Web printer head error on edge2"
|
|
||||||
description: "Print.Web reports a thermal head or unrecoverable printer error. Critical routing may enter the thermal digest per existing policy; IRC remains the primary triage stream."
|
|
||||||
|
|
||||||
- alert: PrinterCoverOpenWarning
|
|
||||||
expr: print_printer_state{job="printweb-otel"} == 6
|
|
||||||
for: 2m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
service: print-web
|
|
||||||
alert_channel: irc
|
|
||||||
annotations:
|
|
||||||
summary: "Print.Web printer cover open on edge2"
|
|
||||||
description: "Print.Web reports the printer cover/lid is open. IRC-only: close the cover and verify /api/print/status before retrying jobs."
|
|
||||||
|
|
||||||
- alert: PrintJobDeadLetter
|
- alert: PrintJobDeadLetter
|
||||||
expr: increase(print_jobs_dead_letter_total[15m]) > 0
|
expr: increase(print_jobs_dead_letter_total[15m]) > 0
|
||||||
@@ -3680,146 +3636,6 @@ data:
|
|||||||
relativeTimeRange: {from: 120, to: 0}
|
relativeTimeRange: {from: 120, to: 0}
|
||||||
datasourceUid: __expr__
|
datasourceUid: __expr__
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
|
||||||
- orgId: 1
|
|
||||||
name: Print Services
|
|
||||||
folder: Print Alerts
|
|
||||||
interval: 1m
|
|
||||||
rules:
|
|
||||||
- uid: printer-offline-warning
|
|
||||||
title: PrinterOfflineWarning
|
|
||||||
condition: C
|
|
||||||
for: 2m
|
|
||||||
noDataState: OK
|
|
||||||
execErrState: OK
|
|
||||||
annotations:
|
|
||||||
summary: "Print.Web printer offline on edge2"
|
|
||||||
description: "Print.Web reports the NuPrint 210 transport is offline or unreachable. IRC-only by design: do not thermal-print an alert when the thermal printer itself is offline."
|
|
||||||
runbook: "1. Check edge2 power/network 2. Check USB/CUPS queue 3. Open https://print.iamworkin.lan/admin 4. Do not force thermal routing for offline alerts."
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
service: print-web
|
|
||||||
alert_channel: irc
|
|
||||||
data:
|
|
||||||
- refId: A
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: prometheus
|
|
||||||
model: {expr: 'print_printer_state{job="printweb-otel"} == 2', instant: true, refId: A}
|
|
||||||
- refId: B
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
|
||||||
- refId: C
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
|
|
||||||
- uid: print-paper-roll-critical
|
|
||||||
title: PrintPaperRollCritical
|
|
||||||
condition: C
|
|
||||||
for: 2m
|
|
||||||
noDataState: OK
|
|
||||||
execErrState: OK
|
|
||||||
annotations:
|
|
||||||
summary: "Print paper depleted on edge2"
|
|
||||||
description: "NuPrint 210 reports paper depleted while the printer is still online. Load a new roll, drain the hardware buffer if needed, then replay DeadLetter jobs from /print-log."
|
|
||||||
runbook: "1. Load a fresh roll 2. Drain the hardware buffer if paper-out happened mid-job 3. Open https://print.iamworkin.lan/print-log 4. Retry DeadLetter jobs after the state clears."
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
service: print-web
|
|
||||||
alert_channel: thermal_print
|
|
||||||
data:
|
|
||||||
- refId: A
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: prometheus
|
|
||||||
model: {expr: 'print_printer_state{job="printweb-otel"} == 3 and print_printer_online{job="printweb-otel"} == 1', instant: true, refId: A}
|
|
||||||
- refId: B
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
|
||||||
- refId: C
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
|
|
||||||
- uid: printer-jam-warning
|
|
||||||
title: PrinterJamWarning
|
|
||||||
condition: C
|
|
||||||
for: 2m
|
|
||||||
noDataState: OK
|
|
||||||
execErrState: OK
|
|
||||||
annotations:
|
|
||||||
summary: "Print.Web printer jam on edge2"
|
|
||||||
description: "Print.Web reports a paper/cutter jam state. IRC-only: clear the jam, drain the hardware buffer if bytes were queued, then retry affected jobs."
|
|
||||||
runbook: "1. Clear paper/cutter path 2. Drain hardware buffer if CUPS queued bytes 3. Verify /api/print/status 4. Retry affected jobs."
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
service: print-web
|
|
||||||
alert_channel: irc
|
|
||||||
data:
|
|
||||||
- refId: A
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: prometheus
|
|
||||||
model: {expr: 'print_printer_state{job="printweb-otel"} == 4', instant: true, refId: A}
|
|
||||||
- refId: B
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
|
||||||
- refId: C
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
|
|
||||||
- uid: printer-head-error-critical
|
|
||||||
title: PrinterHeadErrorCritical
|
|
||||||
condition: C
|
|
||||||
for: 2m
|
|
||||||
noDataState: OK
|
|
||||||
execErrState: OK
|
|
||||||
annotations:
|
|
||||||
summary: "Print.Web printer head error on edge2"
|
|
||||||
description: "Print.Web reports a thermal head or unrecoverable printer error. Critical routing may enter the thermal digest per existing policy; IRC remains the primary triage stream."
|
|
||||||
runbook: "1. Let the printer cool if overheated 2. Power-cycle only after checking queued jobs 3. Verify /api/print/status 4. Retry jobs after the state clears."
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
service: print-web
|
|
||||||
alert_channel: thermal_print
|
|
||||||
data:
|
|
||||||
- refId: A
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: prometheus
|
|
||||||
model: {expr: 'print_printer_state{job="printweb-otel"} == 5', instant: true, refId: A}
|
|
||||||
- refId: B
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
|
||||||
- refId: C
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
|
|
||||||
- uid: printer-cover-open-warning
|
|
||||||
title: PrinterCoverOpenWarning
|
|
||||||
condition: C
|
|
||||||
for: 2m
|
|
||||||
noDataState: OK
|
|
||||||
execErrState: OK
|
|
||||||
annotations:
|
|
||||||
summary: "Print.Web printer cover open on edge2"
|
|
||||||
description: "Print.Web reports the printer cover/lid is open. IRC-only: close the cover and verify /api/print/status before retrying jobs."
|
|
||||||
runbook: "1. Close the printer cover 2. Verify /api/print/status returns online 3. Retry affected jobs only after the state clears."
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
service: print-web
|
|
||||||
alert_channel: irc
|
|
||||||
data:
|
|
||||||
- refId: A
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: prometheus
|
|
||||||
model: {expr: 'print_printer_state{job="printweb-otel"} == 6', instant: true, refId: A}
|
|
||||||
- refId: B
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: reduce, expression: A, reducer: last, refId: B}
|
|
||||||
- refId: C
|
|
||||||
relativeTimeRange: {from: 120, to: 0}
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
|
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
name: CI Runners
|
name: CI Runners
|
||||||
folder: CI Alerts
|
folder: CI Alerts
|
||||||
|
|||||||
@@ -304,7 +304,7 @@ public sealed class FleetManifestLintTests
|
|||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void Monitoring_MustIncludeRequiredAlertRoutingGuards()
|
public void Monitoring_MustAlertWhenLinuxRunnerDeploymentIsUnavailable()
|
||||||
{
|
{
|
||||||
var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml"));
|
var monitoring = File.ReadAllText(Path.Combine(Inventory.BluejayRoot, "apps", "monitoring", "noc-monitoring.yaml"));
|
||||||
|
|
||||||
@@ -315,15 +315,6 @@ public sealed class FleetManifestLintTests
|
|||||||
monitoring.Should().Contain("folder: CI Alerts");
|
monitoring.Should().Contain("folder: CI Alerts");
|
||||||
monitoring.Should().Contain("uid: linux-runner-offline");
|
monitoring.Should().Contain("uid: linux-runner-offline");
|
||||||
monitoring.Should().Contain("alert_channel: irc");
|
monitoring.Should().Contain("alert_channel: irc");
|
||||||
|
|
||||||
monitoring.Should().Contain("PrinterOfflineWarning");
|
|
||||||
monitoring.Should().Contain("expr: print_printer_state{job=\"printweb-otel\"} == 2");
|
|
||||||
monitoring.Should().Contain("IRC-only by design: do not thermal-print an alert when the thermal printer itself is offline.");
|
|
||||||
monitoring.Should().Contain("PrintPaperRollCritical");
|
|
||||||
monitoring.Should().Contain("expr: print_printer_state{job=\"printweb-otel\"} == 3 and print_printer_online{job=\"printweb-otel\"} == 1");
|
|
||||||
monitoring.Should().Contain("PrinterJamWarning");
|
|
||||||
monitoring.Should().Contain("PrinterHeadErrorCritical");
|
|
||||||
monitoring.Should().Contain("PrinterCoverOpenWarning");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
|
|||||||
Reference in New Issue
Block a user