# NOTE(review): line breaks and leading whitespace in this patch were rebuilt from a
# whitespace-collapsed copy — confirm context indentation against the target file before applying.
diff --git a/apps/monitoring/noc-monitoring.yaml b/apps/monitoring/noc-monitoring.yaml
index 8efd051..00c8020 100644
--- a/apps/monitoring/noc-monitoring.yaml
+++ b/apps/monitoring/noc-monitoring.yaml
@@ -316,6 +316,21 @@ data:
             labels:
               service: "remotedesktop-web"
 
+      # FlowerCore voice synthesis metrics, scraped straight from the apps that expose them.
+      # The live Chat service answered 404 on /metrics as of 2026-05-06, so this migration
+      # job scrapes only TtsReader until the Chat producer exports metrics.
+      - job_name: "fc-ttsreader-voice"
+        scrape_interval: 30s
+        metrics_path: /metrics
+        scheme: https
+        tls_config:
+          insecure_skip_verify: true  # NOTE(review): certificate checks are off — confirm a ca_file cannot be used instead
+        static_configs:
+          - targets: ["ttsreader.iamworkin.lan:443"]
+            labels:
+              service: "ttsreader-web"
+              namespace: "fc-ttsreader"
+
       # CUPS web UI health (port 631)
       - job_name: "probe-cups"
         metrics_path: /probe
@@ -697,6 +712,43 @@ data:
               summary: "Print.Web Ollama runner held for >10m ({{ $labels.model }})"
               description: "Print.Web reports model {{ $labels.model }} with {{ $value | printf \"%.0f\" }}s of keep-alive remaining. Check concurrent requests before the Pi 5 Ollama lane thrashes."
 
+      - name: fc-voice-synthesis
+        rules:
+          - alert: voice_synthesis_error_rate_high
+            expr: |
+              (
+                sum(rate(voice_synthesis_total{status=~"error|failed|failure|throttled"}[5m]))
+                /
+                clamp_min(sum(rate(voice_synthesis_total[5m])), 0.001)
+              ) > 0.05
+              unless on()
+                absent_over_time(voice_synthesis_total[7d])
+            for: 5m
+            labels:
+              severity: warning
+              service: voice-stack
+              alert_channel: irc
+            annotations:
+              summary: "Voice synthesis error rate above 5%"
+              description: "More than 5% of voice synthesis attempts ended in error, failure, or throttling over the last 5 minutes."
+
+          - alert: voice_synthesis_latency_p99_high
+            expr: |
+              histogram_quantile(
+                0.99,
+                sum by (engine, le) (rate(voice_synthesis_latency_ms_bucket[5m]))
+              ) > 10000
+              unless on()
+                absent_over_time(voice_synthesis_latency_ms_bucket[7d])
+            for: 2m  # NOTE(review): the description says "two evaluation cycles" — equals 2m only with a 1m evaluation interval; confirm
+            labels:
+              severity: warning
+              service: voice-stack
+              alert_channel: irc
+            annotations:
+              summary: "Voice synthesis p99 latency above 10s on {{ $labels.engine }}"
+              description: "Voice synthesis latency p99 for engine {{ $labels.engine }} exceeded 10 seconds for two evaluation cycles."
+
       - name: remote-desktop
         rules:
           - alert: RemoteDesktopWebDown
@@ -1425,6 +1477,19 @@ data:
           path: /var/lib/grafana/dashboards
           foldersFromFilesStructure: true
 
+# =============================================================================
+# ConfigMap: Grafana Dashboard (Voice Stack)
+# =============================================================================
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-voice-stack
+  namespace: monitoring
+data:
+  voice-stack.json: |
+    {"annotations":{"list":[]},"editable":true,"fiscalYearStartMonth":0,"graphTooltip":1,"id":null,"links":[{"icon":"external link","includeVars":false,"keepTime":false,"targetBlank":true,"title":"TtsReader","type":"link","url":"https://ttsreader.iamworkin.lan/"},{"icon":"external link","includeVars":false,"keepTime":false,"targetBlank":true,"title":"Chat","type":"link","url":"https://chat.iamworkin.lan/"}],"panels":[{"datasource":{"type":"prometheus","uid":"prometheus"},"fieldConfig":{"defaults":{"color":{"mode":"thresholds"},"thresholds":{"mode":"absolute","steps":[{"color":"#64748b","value":null},{"color":"#38bdf8","value":1},{"color":"#22c55e","value":25}]},"unit":"short"},"overrides":[]},"gridPos":{"h":4,"w":6,"x":0,"y":0},"id":1,"options":{"colorMode":"background","graphMode":"area","justifyMode":"center","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"showPercentChange":false,"textMode":"auto","wideLayout":true},"pluginVersion":"12.4.0","targets":[{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"sum(increase(voice_synthesis_total[1h])) or vector(0)","legendFormat":"synth/hr","range":true,"refId":"A"}],"title":"Total Syntheses / Hour","type":"stat"},{"datasource":{"type":"prometheus","uid":"prometheus"},"fieldConfig":{"defaults":{"color":{"mode":"thresholds"},"thresholds":{"mode":"absolute","steps":[{"color":"#22c55e","value":null},{"color":"#f59e0b","value":5},{"color":"#ef4444","value":10}]},"unit":"s"},"overrides":[]},"gridPos":{"h":4,"w":6,"x":6,"y":0},"id":2,"options":{"colorMode":"background","graphMode":"area","justifyMode":"center","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"showPercentChange":false,"textMode":"auto","wideLayout":true},"pluginVersion":"12.4.0","targets":[{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"(histogram_quantile(0.99, sum by (le) (rate(voice_synthesis_latency_ms_bucket[5m]))) / 1000) or vector(0)","legendFormat":"p99","range":true,"refId":"A"}],"title":"Latency p99","type":"stat"},{"datasource":{"type":"prometheus","uid":"prometheus"},"fieldConfig":{"defaults":{"color":{"mode":"thresholds"},"thresholds":{"mode":"absolute","steps":[{"color":"#22c55e","value":null},{"color":"#f59e0b","value":2},{"color":"#ef4444","value":5}]},"unit":"percent"},"overrides":[]},"gridPos":{"h":4,"w":6,"x":12,"y":0},"id":3,"options":{"colorMode":"background","graphMode":"area","justifyMode":"center","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"showPercentChange":false,"textMode":"auto","wideLayout":true},"pluginVersion":"12.4.0","targets":[{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"(100 * sum(rate(voice_synthesis_total{status=~\"error|failed|failure|throttled\"}[5m])) / clamp_min(sum(rate(voice_synthesis_total[5m])), 0.001)) or vector(0)","legendFormat":"error %","range":true,"refId":"A"}],"title":"Error Rate","type":"stat"},{"datasource":{"type":"prometheus","uid":"prometheus"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"fillOpacity":18,"lineWidth":2},"unit":"short"},"overrides":[]},"gridPos":{"h":4,"w":6,"x":18,"y":0},"id":4,"options":{"legend":{"calcs":["lastNotNull"],"displayMode":"table","placement":"bottom","showLegend":true},"tooltip":{"hideZeros":false,"mode":"multi","sort":"desc"}},"targets":[{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"sum by (status) (increase(voice_synthesis_total[1h]))","legendFormat":"{{status}}","range":true,"refId":"A"}],"title":"Status Counters","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"prometheus"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"fillOpacity":10,"lineWidth":2},"unit":"ms"},"overrides":[]},"gridPos":{"h":8,"w":12,"x":0,"y":4},"id":5,"options":{"legend":{"calcs":["lastNotNull","max"],"displayMode":"table","placement":"bottom","showLegend":true},"tooltip":{"hideZeros":false,"mode":"multi","sort":"desc"}},"targets":[{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"histogram_quantile(0.50, sum by (engine, le) (rate(voice_synthesis_latency_ms_bucket[5m])))","legendFormat":"p50 {{engine}}","range":true,"refId":"A"},{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"histogram_quantile(0.95, sum by (engine, le) (rate(voice_synthesis_latency_ms_bucket[5m])))","legendFormat":"p95 {{engine}}","range":true,"refId":"B"},{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"histogram_quantile(0.99, sum by (engine, le) (rate(voice_synthesis_latency_ms_bucket[5m])))","legendFormat":"p99 {{engine}}","range":true,"refId":"C"}],"title":"Latency p50 / p95 / p99 by Engine","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"prometheus"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"fillOpacity":25,"lineWidth":2,"stacking":{"mode":"normal"}},"unit":"short"},"overrides":[]},"gridPos":{"h":8,"w":12,"x":12,"y":4},"id":6,"options":{"legend":{"calcs":["lastNotNull"],"displayMode":"table","placement":"bottom","showLegend":true},"tooltip":{"hideZeros":false,"mode":"multi","sort":"desc"}},"targets":[{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"sum(increase(voice_synthesis_total{engine=~\"wyoming-piper|piper|piper-ft\"}[1h]))","legendFormat":"Tier P / Pi","range":true,"refId":"A"},{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"sum(increase(voice_synthesis_total{engine=~\"kokoro|gpu|openai|polly|edge\"}[1h]))","legendFormat":"Tier W / GPU+API","range":true,"refId":"B"},{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"sum(increase(voice_synthesis_total{engine=\"biblical-tts\"}[1h]))","legendFormat":"Biblical TTS","range":true,"refId":"C"}],"title":"Per-Tier Breakdown","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"prometheus"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"align":"auto","cellOptions":{"type":"auto"}},"unit":"short"},"overrides":[]},"gridPos":{"h":8,"w":12,"x":0,"y":12},"id":7,"options":{"cellHeight":"sm","footer":{"countRows":false,"fields":"","reducer":["sum"],"show":false},"showHeader":true},"pluginVersion":"12.4.0","targets":[{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"topk(10, sum by (personality) (increase(voice_synthesis_total[30d])))","format":"table","instant":true,"legendFormat":"{{personality}}","range":false,"refId":"A"}],"title":"Per-Personality Top 10 (30d)","type":"table"},{"datasource":{"type":"prometheus","uid":"prometheus"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"align":"auto","cellOptions":{"type":"auto"}},"unit":"currencyUSD"},"overrides":[]},"gridPos":{"h":8,"w":12,"x":12,"y":12},"id":8,"options":{"cellHeight":"sm","footer":{"countRows":false,"fields":"","reducer":["sum"],"show":false},"showHeader":true},"pluginVersion":"12.4.0","targets":[{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"sum by (personality, engine) (increase(voice_synthesis_total{engine=~\"openai|polly\"}[30d])) * 0.015","format":"table","instant":true,"legendFormat":"api {{personality}}","range":false,"refId":"A"},{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"sum by (personality, engine) (increase(voice_synthesis_total{engine=~\"kokoro|gpu\"}[30d])) * 0.0002","format":"table","instant":true,"legendFormat":"gpu {{personality}}","range":false,"refId":"B"},{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"sum by (personality, engine) (increase(voice_synthesis_total{engine=~\"wyoming-piper|piper|piper-ft|biblical-tts\"}[30d])) * 0","format":"table","instant":true,"legendFormat":"local {{personality}}","range":false,"refId":"C"}],"title":"Estimated Monthly Cost by Personality","type":"table"},{"datasource":{"type":"prometheus","uid":"prometheus"},"fieldConfig":{"defaults":{"color":{"mode":"thresholds"},"custom":{"fillOpacity":20,"lineWidth":2},"thresholds":{"mode":"absolute","steps":[{"color":"#22c55e","value":null},{"color":"#f59e0b","value":2},{"color":"#ef4444","value":5}]},"unit":"percent"},"overrides":[]},"gridPos":{"h":6,"w":24,"x":0,"y":20},"id":9,"options":{"legend":{"calcs":["lastNotNull"],"displayMode":"table","placement":"bottom","showLegend":true},"tooltip":{"hideZeros":false,"mode":"multi","sort":"desc"}},"targets":[{"datasource":{"type":"prometheus","uid":"prometheus"},"editorMode":"code","expr":"(100 * sum(rate(voice_synthesis_total{status=~\"error|failed|failure|throttled\"}[5m])) / clamp_min(sum(rate(voice_synthesis_total[5m])), 0.001)) or vector(0)","legendFormat":"error %","range":true,"refId":"A"}],"title":"Error-Rate Sparkline","type":"timeseries"}],"preload":false,"refresh":"30s","schemaVersion":42,"tags":["flowercore","voice","tts","observability"],"templating":{"list":[]},"time":{"from":"now-6h","to":"now"},"timepicker":{},"timezone":"browser","title":"Voice Stack","uid":"voice-stack","version":1}
+
 # =============================================================================
 # ConfigMap: Grafana Dashboards (AI Stack Health)
 # =============================================================================
@@ -3107,6 +3172,7 @@ data:
     apiVersion: 1
     datasources:
       - name: Prometheus
+        uid: prometheus  # fixed uid; the voice-stack dashboard and alert queries reference it
         type: prometheus
         access: proxy
         url: http://prometheus.monitoring.svc:9090
@@ -3296,6 +3362,68 @@ data:
                 relativeTimeRange: {from: 120, to: 0}
                 datasourceUid: __expr__
                 model: {type: threshold, expression: B, conditions: [{evaluator: {params: [600], type: gt}}], refId: C}
+
+          - uid: voice-synthesis-error-rate-high
+            title: voice_synthesis_error_rate_high
+            condition: C  # refId of the threshold step at the end of `data`
+            for: 5m
+            noDataState: OK
+            execErrState: OK
+            annotations:
+              summary: Voice synthesis error rate above 5%
+              description: More than 5% of voice synthesis attempts ended in error, failure, or throttling over the last 5 minutes.
+              runbook: "1. Open https://grafana-noc1.iamworkin.lan/d/voice-stack 2. Check status counters by engine/personality 3. Verify TtsReader quick synth 4. Check Piper/Kokoro/biblical-tts backend logs."
+            labels:
+              severity: warning
+              service: voice-stack
+              alert_channel: irc
+            data:
+              - refId: A
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  expr: '((sum(rate(voice_synthesis_total{status=~"error|failed|failure|throttled"}[5m])) / clamp_min(sum(rate(voice_synthesis_total[5m])), 0.001)) unless on() absent_over_time(voice_synthesis_total[7d]))'
+                  instant: true
+                  refId: A
+              - refId: B
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [0.05], type: gt}}], refId: C}  # ratio above 0.05
+
+          - uid: voice-synthesis-latency-p99-high
+            title: voice_synthesis_latency_p99_high
+            condition: C  # refId of the threshold step at the end of `data`
+            for: 2m
+            noDataState: OK
+            execErrState: OK
+            annotations:
+              summary: Voice synthesis p99 latency above 10s
+              description: Voice synthesis p99 latency has exceeded 10 seconds for two evaluation cycles.
+              runbook: "1. Open https://grafana-noc1.iamworkin.lan/d/voice-stack 2. Compare p50/p95/p99 by engine and tier 3. Check whether Piper/Kokoro/GPU/API backend is cold-starting or overloaded."
+            labels:
+              severity: warning
+              service: voice-stack
+              alert_channel: irc
+            data:
+              - refId: A
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  expr: '(histogram_quantile(0.99, sum by (engine, le) (rate(voice_synthesis_latency_ms_bucket[5m]))) unless on() absent_over_time(voice_synthesis_latency_ms_bucket[7d]))'
+                  instant: true
+                  refId: A
+              - refId: B
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: reduce, expression: A, reducer: last, refId: B}
+              - refId: C
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: __expr__
+                model: {type: threshold, expression: B, conditions: [{evaluator: {params: [10000], type: gt}}], refId: C}  # presumably milliseconds (metric is *_latency_ms) — confirm
       - orgId: 1
         name: Infrastructure
         folder: AI Stack Alerts
@@ -3652,6 +3780,9 @@ spec:
             - name: dashboards-remotedesktop
               mountPath: /var/lib/grafana/dashboards/remotedesktop
               readOnly: true
+            - name: dashboards-voice-stack
+              mountPath: /var/lib/grafana/dashboards/voice-stack
+              readOnly: true
             - name: datasource-provisioning
               mountPath: /etc/grafana/provisioning/datasources
               readOnly: true
@@ -3705,6 +3836,9 @@ spec:
         - name: dashboards-remotedesktop
           configMap:
             name: grafana-dashboard-remotedesktop
+        - name: dashboards-voice-stack
+          configMap:
+            name: grafana-dashboard-voice-stack
         - name: datasource-provisioning
           configMap:
             name: grafana-datasource-provisioning