Compare commits
14 Commits
claude/k8s
...
f78f8c8192
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f78f8c8192 | ||
|
|
9b255fefc1 | ||
|
|
6a89a76e39 | ||
|
|
2489464d4f | ||
|
|
4b777b16ac | ||
|
|
8c60e3a4d3 | ||
|
|
df02b4c3c3 | ||
|
|
c0dceafffd | ||
|
|
490db8f9e6 | ||
|
|
1926bdaf3b | ||
|
|
ca8d062826 | ||
|
|
1889462fc4 | ||
|
|
523ba61232 | ||
|
|
53f67c8713 |
@@ -532,7 +532,7 @@ spec:
|
||||
fsGroupChangePolicy: OnRootMismatch
|
||||
containers:
|
||||
- name: web
|
||||
image: localhost/fc-ttsreader-web:v202604301236-b6ca2d5
|
||||
image: localhost/fc-ttsreader-web:v202605061500
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 5217
|
||||
|
||||
@@ -46,7 +46,7 @@ spec:
|
||||
spec:
|
||||
containers:
|
||||
- name: intranet-web
|
||||
image: localhost/fc-intranet-web:v20260429-1646
|
||||
image: localhost/fc-intranet-web:v20260505-1108
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 5300
|
||||
|
||||
762
apps/monitoring/fc-updatecenter-dashboard.json
Normal file
762
apps/monitoring/fc-updatecenter-dashboard.json
Normal file
@@ -0,0 +1,762 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [
|
||||
{
|
||||
"icon": "external link",
|
||||
"includeVars": false,
|
||||
"keepTime": false,
|
||||
"targetBlank": true,
|
||||
"title": "Open Service",
|
||||
"type": "link",
|
||||
"url": "https://updatecenter.iamworkin.lan/"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"color": "#f87171",
|
||||
"index": 1,
|
||||
"text": "DOWN"
|
||||
},
|
||||
"1": {
|
||||
"color": "#4ade80",
|
||||
"index": 0,
|
||||
"text": "UP"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "#f87171",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#4ade80",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "probe_success{job=\"probe-traefik-services\",instance=\"updatecenter.iamworkin.lan\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "Availability"
|
||||
}
|
||||
],
|
||||
"title": "Service Availability",
|
||||
"transparent": true,
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"decimals": 2,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "#f87171",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#fbbf24",
|
||||
"value": 95
|
||||
},
|
||||
{
|
||||
"color": "#FFB300",
|
||||
"value": 99
|
||||
},
|
||||
{
|
||||
"color": "#4ade80",
|
||||
"value": 99.9
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "background_solid",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "avg_over_time(probe_success{job=\"probe-traefik-services\",instance=\"updatecenter.iamworkin.lan\"}[24h]) * 100",
|
||||
"refId": "A",
|
||||
"legendFormat": "24h Uptime"
|
||||
}
|
||||
],
|
||||
"title": "24-Hour Uptime",
|
||||
"transparent": true,
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"max": 30,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "#f87171",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#fbbf24",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "#4ade80",
|
||||
"value": 7
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "d"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 0
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"minVizHeight": 75,
|
||||
"minVizWidth": 75,
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "(probe_ssl_earliest_cert_expiry{job=\"probe-traefik-services\",instance=\"updatecenter.iamworkin.lan\"} - time()) / 86400",
|
||||
"refId": "A",
|
||||
"legendFormat": "Days Remaining"
|
||||
}
|
||||
],
|
||||
"title": "Cert Expiry (Days)",
|
||||
"transparent": true,
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "Response Time (seconds)",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 12,
|
||||
"gradientMode": "scheme",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 4,
|
||||
"showPoints": "never",
|
||||
"spanNulls": true,
|
||||
"thresholdsStyle": {
|
||||
"mode": "dashed"
|
||||
}
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "#4ade80",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "#fbbf24",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "#f87171",
|
||||
"value": 5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 14,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "probe_duration_seconds{job=\"probe-traefik-services\",instance=\"updatecenter.iamworkin.lan\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "Probe Duration"
|
||||
}
|
||||
],
|
||||
"timeFrom": "1h",
|
||||
"title": "Response Time (1h Trend)",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 10,
|
||||
"x": 14,
|
||||
"y": 4
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"alertInstanceLabelFilter": "{instance=\"updatecenter.iamworkin.lan\"}",
|
||||
"alertName": "",
|
||||
"dashboardAlerts": false,
|
||||
"groupBy": [],
|
||||
"groupMode": "default",
|
||||
"maxItems": 10,
|
||||
"sortOrder": 1,
|
||||
"stateFilter": {
|
||||
"error": true,
|
||||
"firing": true,
|
||||
"noData": true,
|
||||
"normal": false,
|
||||
"pending": true
|
||||
},
|
||||
"viewMode": "list"
|
||||
},
|
||||
"title": "Active Alerts",
|
||||
"type": "alertlist"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 12
|
||||
},
|
||||
"id": 20,
|
||||
"title": "OTEL Counters — Track 1D",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 13
|
||||
},
|
||||
"id": 21,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "sum by (status) (rate(updatecenter_manifest_requests_total[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "status={{status}}"
|
||||
}
|
||||
],
|
||||
"title": "Manifest Requests rate by status (5m)",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "Bps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 13
|
||||
},
|
||||
"id": 22,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "sum by (slug) (rate(updatecenter_bundle_download_bytes_total[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{slug}}"
|
||||
}
|
||||
],
|
||||
"title": "Bundle Download Throughput by slug (5m)",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 21
|
||||
},
|
||||
"id": 23,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "sum by (status) (rate(updatecenter_checkins_total[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "status={{status}}"
|
||||
}
|
||||
],
|
||||
"title": "Agent Check-in Rate by status (5m)",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "#4ade80", "value": null },
|
||||
{ "color": "#f87171", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"decimals": 2
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 21
|
||||
},
|
||||
"id": 24,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "increase(updatecenter_signature_verify_failures_total[1h])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Sig Verify Failures (1h)"
|
||||
}
|
||||
],
|
||||
"title": "Signature Verify Failures (1h)",
|
||||
"transparent": true,
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 21
|
||||
},
|
||||
"id": 25,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "sum by (slug, channel) (rate(updatecenter_release_publishes_total[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{slug}}/{{channel}}"
|
||||
}
|
||||
],
|
||||
"title": "Release Publishes rate by slug/channel (5m)",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 29
|
||||
},
|
||||
"id": 26,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "sum by (kind, status) (rate(updatecenter_bundle_downloads_total[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{kind}} / {{status}}"
|
||||
}
|
||||
],
|
||||
"title": "Bundle Download Requests by kind/status (5m)",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 20
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "#4ade80", "value": null },
|
||||
{ "color": "#f87171", "value": 0.01 }
|
||||
]
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 29
|
||||
},
|
||||
"id": 27,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "fffjikve8llhce"
|
||||
},
|
||||
"expr": "rate(updatecenter_signature_verify_failures_total[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Sig verify failures/s"
|
||||
}
|
||||
],
|
||||
"title": "Signature Verify Failure Rate (5m) — Critical if >0",
|
||||
"transparent": true,
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"blue-jay",
|
||||
"flowercore",
|
||||
"synthetic",
|
||||
"updatecenter",
|
||||
"otel"
|
||||
],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-24h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "browser",
|
||||
"title": "FlowerCore.UpdateCenter Dashboard",
|
||||
"uid": "fc-updatecenter",
|
||||
"version": 2
|
||||
}
|
||||
@@ -1024,6 +1024,72 @@ data:
|
||||
summary: "Longhorn node {{ $labels.node }} not Ready"
|
||||
description: "Node {{ $labels.node }} reports ready=false (reason: {{ $labels.condition_reason }}). Volumes scheduled to this node will be unavailable until it recovers."
|
||||
|
||||
# ============================================================
|
||||
# FC Signage Marquee Performance — Track 3 + 8 (2026-05-06)
|
||||
# Live-mirrored from FlowerCore.Notes/scripts/monitoring/alerts.yml.
|
||||
# Source-of-truth for the live Podman Prometheus on noc1 is the
|
||||
# Notes file; this K8s ConfigMap exists so a future migration to
|
||||
# in-cluster Prometheus inherits the ruleset automatically.
|
||||
# See feedback_monitoring_k8s_target_vs_live_podman.
|
||||
# ============================================================
|
||||
- name: fc-signage-marquee
|
||||
rules:
|
||||
- alert: MarqueeDroppedFramesHigh
|
||||
expr: |
|
||||
(
|
||||
sum by (renderer, phase, node_id) (rate(marquee_dropped_frames_total[5m]))
|
||||
/
|
||||
sum by (renderer, phase, node_id) (rate(marquee_render_latency_ms_count[5m]))
|
||||
) > 0.05
|
||||
unless on()
|
||||
absent_over_time(marquee_dropped_frames_total[7d])
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: signage
|
||||
alert_channel: irc
|
||||
annotations:
|
||||
summary: "Marquee dropped-frame rate >5% on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
|
||||
description: "Renderer {{ $labels.renderer }} on {{ $labels.node_id }} drops >5% of frames during {{ $labels.phase }}. Animation visibly stuttery."
|
||||
|
||||
- alert: MarqueeRenderLatencyP99High
|
||||
expr: |
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (renderer, phase, node_id, le) (rate(marquee_render_latency_ms_bucket[5m]))
|
||||
) > 16
|
||||
unless on()
|
||||
absent_over_time(marquee_render_latency_ms_bucket[7d])
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: signage
|
||||
alert_channel: irc
|
||||
annotations:
|
||||
summary: "Marquee render latency p99 > 16ms on {{ $labels.renderer }}/{{ $labels.node_id }} ({{ $labels.phase }})"
|
||||
description: "Per-frame render latency p99 has exceeded the Pi-class 16ms budget for 10 minutes."
|
||||
|
||||
- alert: MarqueeAnimationDurationDrift
|
||||
expr: |
|
||||
abs(
|
||||
histogram_quantile(0.5, sum by (renderer, phase, le) (rate(marquee_animation_duration_ms_bucket[15m])))
|
||||
-
|
||||
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
|
||||
)
|
||||
/
|
||||
on (phase) group_left() avg by (phase) (marquee_animation_duration_target_ms)
|
||||
> 0.10
|
||||
unless on()
|
||||
absent_over_time(marquee_animation_duration_ms_bucket[7d])
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
service: signage
|
||||
alert_channel: irc
|
||||
annotations:
|
||||
summary: "Marquee animation duration drifting > 10% on {{ $labels.renderer }} ({{ $labels.phase }})"
|
||||
description: "Median observed cycle duration deviates from target DurationMs by >10%. Could indicate browser tab throttling, GPU pressure, or phase-advancement bug."
|
||||
|
||||
# =============================================================================
|
||||
# ConfigMap: Blackbox Exporter Configuration
|
||||
# =============================================================================
|
||||
|
||||
60
apps/worldbuilder/README.md
Normal file
60
apps/worldbuilder/README.md
Normal file
@@ -0,0 +1,60 @@
|
||||
# FlowerCore.WorldBuilder
|
||||
|
||||
ArgoCD-managed manifest for FlowerCore.WorldBuilder.Web — comic / storyboard
|
||||
authoring service that drives ComfyUI for panel image generation and
|
||||
QuestPDF for letter / A4 export.
|
||||
|
||||
Source: `D:\git\FlowerCore\FlowerCore.WorldBuilder` (master)
|
||||
|
||||
## Deployment order
|
||||
|
||||
1. **DNS preflight** — `worldbuilder.iamworkin.lan -> 10.0.56.200` MUST exist
|
||||
in pfSense Unbound before this manifest is applied, or cert-manager
|
||||
HTTP-01 silently backs off exponentially (~2h).
|
||||
Memory: `feedback_pfsense_dns_required_for_acme`.
|
||||
2. **Image import to ALL RKE2 nodes** — the pod can be scheduled on any of
|
||||
`rke2-server` (10.0.56.11), `rke2-agent1` (10.0.56.12),
|
||||
`rke2-agent2` (10.0.56.13). Build with:
|
||||
```bash
|
||||
bash deploy/build.sh # in FlowerCore.WorldBuilder repo
|
||||
podman save localhost/fc-worldbuilder:v<TAG> -o /tmp/fc-worldbuilder-v<TAG>.tar
|
||||
for h in 10.0.56.11 10.0.56.12 10.0.56.13; do
|
||||
scp /tmp/fc-worldbuilder-v<TAG>.tar fcadmin@$h:/tmp/
|
||||
ssh fcadmin@$h \
|
||||
"sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock \
|
||||
-n k8s.io images import /tmp/fc-worldbuilder-v<TAG>.tar"
|
||||
done
|
||||
```
|
||||
Memory: `feedback_rke2_image_import_per_node_scp`.
|
||||
3. **Bump image tag** in `worldbuilder.yaml` and git push.
|
||||
ArgoCD ApplicationSet picks up within ~3 minutes.
|
||||
4. **First production render** — open `https://worldbuilder.iamworkin.lan`,
|
||||
create World → Character → Storyboard → ExportJob, confirm artifact
|
||||
downloads. ComfyUI lives on BLUEJAY-WS at `http://10.0.56.20:8188`.
|
||||
|
||||
## Health probes
|
||||
|
||||
- `startupProbe` + `readinessProbe`: `httpGet /healthz` (registered explicitly
|
||||
in Program.cs — anonymous, no DB or OpenAPI dependency).
|
||||
- `livenessProbe`: `tcpSocket` as a cheap fallback.
|
||||
Memory: `feedback_k8s_probes_must_not_hit_openapi`,
|
||||
`feedback_k8s_probes_behind_auth_middleware`.
|
||||
|
||||
## Storage
|
||||
|
||||
- Longhorn RWO PVC `worldbuilder-data` (5Gi) mounted at `/data`. SQLite DB
|
||||
lives at `/data/worldbuilder.db`, generated images under `/data/gallery/`,
|
||||
PDF/PNG exports under `/data/exports/`.
|
||||
- DataProtection keys persist to the same SQLite database via
|
||||
`AddFlowerCoreDataProtection<WorldBuilderDbContext>` — explicit migration
|
||||
`20260429133417_Initial` already creates `fc_dp_keys`.
|
||||
Memory: `feedback_dataprotection_keys_persist_to_app_dbcontext`,
|
||||
`feedback_intranet_dataprotection_table_must_have_explicit_migration`.
|
||||
|
||||
## Image generation backend
|
||||
|
||||
`FlowerCore:WorldBuilder:ImageGeneration:BaseUrl=http://10.0.56.20:8188` —
|
||||
ComfyUI runs on BLUEJAY-WS Windows (R9700 / gfx1201 / ROCm 7.2.1). The pod reaches
|
||||
the workstation directly across the 10.0.56.0/24 VLAN (no Podman-style host-
|
||||
filter issues — K8s pods route via Calico, which is L3-routed across the
|
||||
VLAN).
|
||||
208
apps/worldbuilder/worldbuilder.yaml
Normal file
208
apps/worldbuilder/worldbuilder.yaml
Normal file
@@ -0,0 +1,208 @@
|
||||
# FlowerCore.WorldBuilder — comic / storyboard authoring service.
|
||||
#
|
||||
# Deployment + Service + PVC + Certificate + IngressRoute. ArgoCD-managed
|
||||
# end-to-end. See apps/worldbuilder/README.md for the per-deploy runbook.
|
||||
#
|
||||
# Image build (BLUEJAY-WS):
|
||||
# bash deploy/build.sh # in FlowerCore.WorldBuilder repo
|
||||
# podman save localhost/fc-worldbuilder:v<TAG> -o /tmp/fc-worldbuilder-v<TAG>.tar
|
||||
# for h in 10.0.56.11 10.0.56.12 10.0.56.13; do
|
||||
# scp /tmp/fc-worldbuilder-v<TAG>.tar fcadmin@$h:/tmp/
|
||||
# ssh fcadmin@$h "sudo /var/lib/rancher/rke2/bin/ctr -a /run/k3s/containerd/containerd.sock -n k8s.io images import /tmp/fc-worldbuilder-v<TAG>.tar"
|
||||
# done
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: fc-worldbuilder
|
||||
labels:
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
---
|
||||
# SQLite DB + generated image gallery + PDF/PNG exports.
|
||||
# Longhorn RWO — single replica with `Recreate` rollout strategy keeps it safe.
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: worldbuilder-data
|
||||
namespace: fc-worldbuilder
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
storageClassName: longhorn
|
||||
resources:
|
||||
requests:
|
||||
storage: 5Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: worldbuilder-web
|
||||
namespace: fc-worldbuilder
|
||||
labels:
|
||||
app.kubernetes.io/name: worldbuilder-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
strategy:
|
||||
# RWO PVC + single replica. Recreate avoids multi-attach overlap.
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: worldbuilder-web
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: worldbuilder-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8080"
|
||||
prometheus.io/path: "/metrics/prometheus"
|
||||
spec:
|
||||
securityContext:
|
||||
fsGroup: 1654
|
||||
fsGroupChangePolicy: OnRootMismatch
|
||||
containers:
|
||||
- name: web
|
||||
# Bump tag for each rebuild. Initial deploy: v202605062048
|
||||
image: localhost/fc-worldbuilder:v202605062048
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: http
|
||||
env:
|
||||
- name: ASPNETCORE_URLS
|
||||
value: "http://+:8080"
|
||||
- name: ASPNETCORE_ENVIRONMENT
|
||||
value: "Production"
|
||||
- name: DOTNET_RUNNING_IN_CONTAINER
|
||||
value: "true"
|
||||
- name: DOTNET_SYSTEM_GLOBALIZATION_INVARIANT
|
||||
value: "false"
|
||||
# SQLite path overrides (default appsettings uses relative paths).
|
||||
- name: ConnectionStrings__DefaultConnection
|
||||
value: "Data Source=/data/worldbuilder.db"
|
||||
- name: FlowerCore__Database__Provider
|
||||
value: "Sqlite"
|
||||
- name: FlowerCore__Database__ConnectionStrings__Sqlite
|
||||
value: "Data Source=/data/worldbuilder.db"
|
||||
# Generated image gallery + exports persist on /data.
|
||||
- name: FlowerCore__WorldBuilder__ImageStore__RootPath
|
||||
value: "/data/gallery"
|
||||
- name: FlowerCore__WorldBuilder__Export__RootPath
|
||||
value: "/data/exports"
|
||||
# ComfyUI on BLUEJAY-WS (R9700 / gfx1201 / ROCm 7.2.1).
|
||||
- name: FlowerCore__WorldBuilder__ImageGeneration__BaseUrl
|
||||
value: "http://10.0.56.20:8188"
|
||||
- name: FlowerCore__WorldBuilder__ImageGeneration__ClientMode
|
||||
value: "comfyui"
|
||||
resources:
|
||||
# Cluster CPU-request budget runs hot (99% on all 3 nodes at deploy
|
||||
# time) while actual CPU usage is well below capacity. Idle Blazor
|
||||
# Server + SignalR + a single ComfyUI poller uses ~5m, so 25m is
|
||||
# generous. Re-evaluate if active rendering/export workers ever
|
||||
# push past the limit.
|
||||
requests:
|
||||
cpu: 25m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 1000m
|
||||
memory: 768Mi
|
||||
# /healthz is registered explicitly in Program.cs (anonymous, no DB
|
||||
# or OpenAPI dependency). Liveness uses tcpSocket as a cheap fallback
|
||||
# in case future middleware changes accidentally gate /healthz.
|
||||
# Memory: feedback_k8s_probes_must_not_hit_openapi,
|
||||
# feedback_k8s_probes_behind_auth_middleware.
|
||||
startupProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: 8080
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
failureThreshold: 30
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: 8080
|
||||
periodSeconds: 10
|
||||
failureThreshold: 3
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
failureThreshold: 3
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1654
|
||||
runAsGroup: 1654
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
- name: tmp
|
||||
mountPath: /tmp
|
||||
- name: logs
|
||||
mountPath: /app/logs
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: worldbuilder-data
|
||||
- name: tmp
|
||||
emptyDir: {}
|
||||
- name: logs
|
||||
emptyDir: {}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: worldbuilder-web
|
||||
namespace: fc-worldbuilder
|
||||
labels:
|
||||
app.kubernetes.io/name: worldbuilder-web
|
||||
app.kubernetes.io/part-of: flowercore
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app.kubernetes.io/name: worldbuilder-web
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: 8080
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: worldbuilder-web-tls
|
||||
namespace: fc-worldbuilder
|
||||
spec:
|
||||
secretName: worldbuilder-web-tls
|
||||
issuerRef:
|
||||
name: step-ca-acme
|
||||
kind: ClusterIssuer
|
||||
dnsNames:
|
||||
- worldbuilder.iamworkin.lan
|
||||
duration: 2160h # 90d
|
||||
renewBefore: 720h # 30d
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: worldbuilder-web
|
||||
namespace: fc-worldbuilder
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- match: Host(`worldbuilder.iamworkin.lan`)
|
||||
kind: Rule
|
||||
services:
|
||||
- name: worldbuilder-web
|
||||
port: 80
|
||||
tls:
|
||||
secretName: worldbuilder-web-tls
|
||||
Reference in New Issue
Block a user