Compare commits
2 Commits
cae03296f5...bb39a0c1fd
| Author | SHA1 | Date | |
|---|---|---|---|
| | bb39a0c1fd | | |
| | c23e903ba7 | | |
@@ -142,7 +142,7 @@ spec:
|
|||||||
fsGroupChangePolicy: OnRootMismatch
|
fsGroupChangePolicy: OnRootMismatch
|
||||||
containers:
|
containers:
|
||||||
- name: web
|
- name: web
|
||||||
image: localhost/fc-ttsreader-web:v202604240023
|
image: localhost/fc-ttsreader-web:v202604240053
|
||||||
imagePullPolicy: Never
|
imagePullPolicy: Never
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 5217
|
- containerPort: 5217
|
||||||
@@ -166,6 +166,12 @@ spec:
|
|||||||
value: "ttsreader-piper.fc-ttsreader.svc.cluster.local."
|
value: "ttsreader-piper.fc-ttsreader.svc.cluster.local."
|
||||||
- name: TtsReader__Piper__Port
|
- name: TtsReader__Piper__Port
|
||||||
value: "10200"
|
value: "10200"
|
||||||
|
- name: TtsReader__Kokoro__Enabled
|
||||||
|
value: "true"
|
||||||
|
- name: TtsReader__Kokoro__BaseUrl
|
||||||
|
value: "http://10.0.56.20:10401"
|
||||||
|
- name: TtsReader__Kokoro__TimeoutSeconds
|
||||||
|
value: "120"
|
||||||
- name: TtsReader__Ollama__BaseUrl
|
- name: TtsReader__Ollama__BaseUrl
|
||||||
value: "http://10.0.57.17:11434"
|
value: "http://10.0.57.17:11434"
|
||||||
- name: TtsReader__Ollama__DefaultModel
|
- name: TtsReader__Ollama__DefaultModel
|
||||||
|
|||||||
@@ -3139,6 +3139,172 @@ data:
|
|||||||
relativeTimeRange: {from: 600, to: 0}
|
relativeTimeRange: {from: 600, to: 0}
|
||||||
datasourceUid: __expr__
|
datasourceUid: __expr__
|
||||||
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [85], type: gt}}], refId: C}
|
||||||
|
- orgId: 1
|
||||||
|
name: RemoteDesktop
|
||||||
|
folder: AI Stack Alerts
|
||||||
|
interval: 1m
|
||||||
|
rules:
|
||||||
|
- uid: remotedesktop-web-down
|
||||||
|
title: RemoteDesktop Web DOWN
|
||||||
|
condition: C
|
||||||
|
for: 3m
|
||||||
|
noDataState: Alerting
|
||||||
|
execErrState: OK
|
||||||
|
annotations:
|
||||||
|
summary: FlowerCore RemoteDesktop /health probe failing
|
||||||
|
description: "https://desktop.iamworkin.lan/health has failed for 3 minutes. Catalog + session launch surface offline."
|
||||||
|
runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remotedesktop-web 2. kubectl -n fc-desktop logs deploy/remotedesktop-web --tail=50 3. Check Traefik IngressRoute + step-ca cert 4. Rollout restart if pod is stuck"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: remotedesktop
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange: {from: 180, to: 0}
|
||||||
|
datasourceUid: prometheus
|
||||||
|
model: {expr: 'probe_success{job="probe-remotedesktop"}', instant: true, refId: A}
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange: {from: 180, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 180, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||||
|
|
||||||
|
- uid: remotedesktop-metrics-stale
|
||||||
|
title: RemoteDesktop metrics stale
|
||||||
|
condition: C
|
||||||
|
for: 10m
|
||||||
|
noDataState: Alerting
|
||||||
|
execErrState: OK
|
||||||
|
annotations:
|
||||||
|
summary: RemoteDesktop /metrics returning no series
|
||||||
|
description: "No fc_desktop_session_events_total series for 10 minutes. Either the Prometheus scrape is misconfigured or the web deployment stopped exporting metrics. Cross-checked by Zabbix template's identical 10m no-data trigger."
|
||||||
|
runbook: "1. curl -sk https://desktop.iamworkin.lan/metrics | head 2. kubectl -n monitoring exec deploy/prometheus -- wget -qO- localhost:9090/api/v1/targets?scrapePool=fc-remotedesktop 3. Check monitoring-netpol egress allows to fc-desktop:8080"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: remotedesktop
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange: {from: 600, to: 0}
|
||||||
|
datasourceUid: prometheus
|
||||||
|
model: {expr: 'count(fc_desktop_session_events_total) or vector(0)', instant: true, refId: A}
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange: {from: 600, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 600, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [1], type: lt}}], refId: C}
|
||||||
|
|
||||||
|
- uid: remotedesktop-pool-depleted
|
||||||
|
title: RemoteDesktop pool depleted
|
||||||
|
condition: C
|
||||||
|
for: 5m
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: OK
|
||||||
|
annotations:
|
||||||
|
summary: RemoteDesktop warm pool depleted for 5m
|
||||||
|
description: "A RemoteDesktop warm pool has fc_desktop_pool_depleted=1 for 5 minutes. New launches will cold-start. Check pod scheduling, image pull, node capacity."
|
||||||
|
runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop --sort-by=.status.startTime 2. kubectl -n fc-desktop describe desktoppool <name> 3. Verify localhost/fc-desktop:* images imported on all 3 RKE2 nodes"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: remotedesktop
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange: {from: 300, to: 0}
|
||||||
|
datasourceUid: prometheus
|
||||||
|
model: {expr: 'max(fc_desktop_pool_depleted)', instant: true, refId: A}
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange: {from: 300, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 300, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.5], type: gt}}], refId: C}
|
||||||
|
|
||||||
|
- uid: remotedesktop-pool-deficit-sustained
|
||||||
|
title: RemoteDesktop pool below desired
|
||||||
|
condition: C
|
||||||
|
for: 10m
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: OK
|
||||||
|
annotations:
|
||||||
|
summary: RemoteDesktop pool sustained deficit
|
||||||
|
description: "A pool has fc_desktop_pool_deficit>0 for 10 minutes. Operator is reconciling but can't reach desired size — likely image pull, NFS affinity, or claim-init issue."
|
||||||
|
runbook: "1. kubectl -n fc-desktop get pods -l flowercore.io/pool=<pool> 2. kubectl logs -n fc-desktop deploy/remotedesktop-operator 3. Check claim-init hook env on template"
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
service: remotedesktop
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange: {from: 600, to: 0}
|
||||||
|
datasourceUid: prometheus
|
||||||
|
model: {expr: 'max(fc_desktop_pool_deficit)', instant: true, refId: A}
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange: {from: 600, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 600, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0], type: gt}}], refId: C}
|
||||||
|
|
||||||
|
- uid: remotedesktop-session-churn-spike
|
||||||
|
title: RemoteDesktop launch rate spike
|
||||||
|
condition: C
|
||||||
|
for: 5m
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: OK
|
||||||
|
annotations:
|
||||||
|
summary: RemoteDesktop launch rate exceeds 20/min
|
||||||
|
description: "Launch events >20/min for 5 minutes. Could be a user-facing feature launch, pooled template thrashing, or runaway automation loop."
|
||||||
|
runbook: "1. kubectl -n fc-desktop get pods -l app.kubernetes.io/name=remote-desktop -o wide | wc -l 2. curl -sk https://desktop.iamworkin.lan/api/sessions/active 3. Check operator logs for reconcile loops"
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
service: remotedesktop
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange: {from: 300, to: 0}
|
||||||
|
datasourceUid: prometheus
|
||||||
|
model: {expr: 'sum(rate(fc_desktop_session_events_total{event="launch"}[5m])) * 60', instant: true, refId: A}
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange: {from: 300, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 300, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [20], type: gt}}], refId: C}
|
||||||
|
|
||||||
|
- uid: remotedesktop-tls-expiry
|
||||||
|
title: RemoteDesktop TLS cert expiring
|
||||||
|
condition: C
|
||||||
|
for: 6h
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: OK
|
||||||
|
annotations:
|
||||||
|
summary: desktop.iamworkin.lan cert <2d to expiry
|
||||||
|
description: "The desktop.iamworkin.lan certificate is inside the 2-day renewal window and cert-manager has not renewed. Check cert-manager logs, step-ca reachability, FlowerCore.DNS preflight for dnsNames."
|
||||||
|
runbook: "1. kubectl -n fc-desktop get certificate remotedesktop-web-tls 2. kubectl -n cert-manager logs deploy/cert-manager --tail=50 3. Verify pfSense DNS override for desktop.iamworkin.lan"
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: remotedesktop
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange: {from: 21600, to: 0}
|
||||||
|
datasourceUid: prometheus
|
||||||
|
model: {expr: '(probe_ssl_earliest_cert_expiry{job="probe-remotedesktop"} - time()) / 86400', instant: true, refId: A}
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange: {from: 21600, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: reduce, expression: A, reducer: last, refId: B}
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 21600, to: 0}
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model: {type: threshold, expression: B, conditions: [{evaluator: {params: [2], type: lt}}], refId: C}
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Deployment: Grafana
|
# Deployment: Grafana
|
||||||
|
|||||||
Reference in New Issue
Block a user