fc-ttsreader: ship cluster-native fc-speech-align (faster-whisper) + bump web
- New ttsreader-align Deployment + Service + 5Gi PVC under apps/fc-ttsreader/. Wraps SYSTRAN/faster-whisper in a small FastAPI app exposing POST /align (fc-align contract used by Shared.Speech) AND POST /transcribe (audio-in feature consumed by ttsreader-web Lane G). Source: apps/fc-ttsreader/speech-align/ (Dockerfile + app.py + requirements.txt). Built locally (apt-get RUN steps need BLUEJAY-WS, not noc1) and ctr-imported to all 3 RKE2 nodes. - ttsreader-web env: flip Speech__Alignment__Enabled=true and point BaseUrl at http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200. Add new TtsReader__Transcription__* env triplet pointing at the same service (same /transcribe endpoint). - Bump ttsreader-web image to v202604251046 (carries the TranscriptionController + MCP tool + Quick.razor InputFile UI).
This commit is contained in:
@@ -112,6 +112,109 @@ spec:
|
||||
persistentVolumeClaim:
|
||||
claimName: ttsreader-piper-data
|
||||
---
|
||||
# fc-speech-align — cluster-native faster-whisper wrapper.
|
||||
# Exposes POST /align (fc-align contract used by FlowerCore.Shared.Speech) AND
|
||||
# POST /transcribe (audio-file-in feature). CPU model = base.en, int8 compute.
|
||||
# Source: bluejay-infra/apps/fc-ttsreader/speech-align/ (Dockerfile + app.py).
|
||||
# Persistent storage for the speech-align model cache. The ttsreader-align
# Deployment references this claim as its "models" volume and mounts it at
# /models, the same path it passes to the container as WHISPER_CACHE_DIR.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ttsreader-align-models
  namespace: fc-ttsreader
spec:
  accessModes:
    # Single-writer volume; the consuming Deployment runs one replica.
    - ReadWriteOnce
  resources:
    requests:
      storage: 5Gi
|
||||
---
|
||||
# ttsreader-align: single-replica faster-whisper wrapper listening on 9200.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ttsreader-align
  namespace: fc-ttsreader
  labels:
    app.kubernetes.io/name: ttsreader-align
    app.kubernetes.io/part-of: flowercore
spec:
  replicas: 1
  # Recreate stops the old pod before starting the new one, so two pods never
  # compete for the ReadWriteOnce model volume during a rollout.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: ttsreader-align
  template:
    metadata:
      labels:
        app.kubernetes.io/name: ttsreader-align
        app.kubernetes.io/part-of: flowercore
    spec:
      # UID 1654 matches the non-root user created in the image; fsGroup makes
      # the mounted model volume group-accessible to that user.
      securityContext:
        fsGroup: 1654
        runAsNonRoot: true
        runAsUser: 1654
      containers:
        - name: align
          # Locally built image imported onto the nodes; never pulled.
          image: localhost/fc-speech-align:v1
          imagePullPolicy: Never
          ports:
            - containerPort: 9200
              name: http
          env:
            - name: WHISPER_MODEL
              value: "Systran/faster-whisper-base.en"
            - name: WHISPER_DEVICE
              value: "cpu"
            - name: WHISPER_COMPUTE_TYPE
              value: "int8"
            - name: WHISPER_CACHE_DIR
              value: "/models"
            - name: DEFAULT_LANGUAGE
              value: "en"
          resources:
            requests:
              cpu: 250m
              memory: 512Mi
            limits:
              cpu: 2000m
              memory: 2Gi
          volumeMounts:
            - name: models
              mountPath: /models
          # The app loads (and on first start downloads) the model during
          # startup. The startup probe allows up to 60 x 10s = 600s for that;
          # liveness and readiness checks are held off until it succeeds, so
          # a slow first start is not killed by the liveness probe.
          startupProbe:
            httpGet:
              path: /health
              port: 9200
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 60
          readinessProbe:
            httpGet:
              path: /health
              port: 9200
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 18
          livenessProbe:
            httpGet:
              path: /health
              port: 9200
            initialDelaySeconds: 180
            periodSeconds: 30
            timeoutSeconds: 5
            failureThreshold: 3
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: ttsreader-align-models
|
||||
---
|
||||
# In-cluster Service for the speech-align pod. Selects pods labelled
# app.kubernetes.io/name=ttsreader-align and forwards port 9200 to the
# container's port 9200.
apiVersion: v1
kind: Service
metadata:
  name: ttsreader-align
  namespace: fc-ttsreader
spec:
  selector:
    app.kubernetes.io/name: ttsreader-align
  ports:
    - port: 9200
      targetPort: 9200
      name: http
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
@@ -142,7 +245,7 @@ spec:
|
||||
fsGroupChangePolicy: OnRootMismatch
|
||||
containers:
|
||||
- name: web
|
||||
image: localhost/fc-ttsreader-web:v202604251018
|
||||
image: localhost/fc-ttsreader-web:v202604251046
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 5217
|
||||
@@ -173,20 +276,24 @@ spec:
|
||||
- name: TtsReader__Kokoro__TimeoutSeconds
|
||||
value: "120"
|
||||
- name: Speech__Alignment__Enabled
|
||||
# Off until either:
|
||||
# (a) a native /align backend is deployed inside the cluster, or
|
||||
# (b) the BLUEJAY-WS host exposes the speaches container on the
|
||||
# LAN-routable bind (10.0.56.20:9200, not just 127.0.0.1)
|
||||
# AND Common ships the openai-compatible Backend support
|
||||
# (currently on feat/shared-indexing, not on master).
|
||||
# While disabled, /preview-with-timings still returns word timings
|
||||
# via EstimatedAlignmentClient — slightly less accurate, but the
|
||||
# UI can still drive word-level highlight playback.
|
||||
value: "false"
|
||||
# Cluster-native faster-whisper (Lane F, 2026-04-25). The
|
||||
# ttsreader-align deployment in this manifest wraps
|
||||
# SYSTRAN/faster-whisper with a /align endpoint matching the
|
||||
# FlowerCore.Shared.Speech master contract.
|
||||
value: "true"
|
||||
- name: Speech__Alignment__BaseUrl
|
||||
value: "http://10.0.56.20:9200"
|
||||
value: "http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200"
|
||||
- name: Speech__Alignment__TimeoutSeconds
|
||||
value: "120"
|
||||
# Cluster-native transcription endpoint shares the same pod
|
||||
# (POST /transcribe). Lane G consumes this from the
|
||||
# FlowerCore.TtsReader.Web AudioImport feature.
|
||||
- name: TtsReader__Transcription__Enabled
|
||||
value: "true"
|
||||
- name: TtsReader__Transcription__BaseUrl
|
||||
value: "http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200"
|
||||
- name: TtsReader__Transcription__TimeoutSeconds
|
||||
value: "300"
|
||||
- name: TtsReader__Ollama__BaseUrl
|
||||
value: "http://10.0.57.17:11434"
|
||||
- name: TtsReader__Ollama__DefaultModel
|
||||
|
||||
47
apps/fc-ttsreader/speech-align/Dockerfile
Normal file
47
apps/fc-ttsreader/speech-align/Dockerfile
Normal file
@@ -0,0 +1,47 @@
|
||||
# FlowerCore speech-align — wraps SYSTRAN/faster-whisper with /align +
|
||||
# /transcribe endpoints used by FlowerCore.TtsReader. CPU-only image; the
|
||||
# default int8 compute type runs base.en at ~real-time on a single core.
|
||||
#
|
||||
# Build: podman build -t localhost/fc-speech-align:<ver> .
|
||||
# Run: podman run --rm -p 9200:9200 -v fc-speech-align-models:/models localhost/fc-speech-align:<ver>
|
||||
|
||||
FROM python:3.12-slim AS base

# Python/pip behaviour flags followed by the service defaults that app.py reads
# from the environment. MAX_AUDIO_BYTES=52428800 is 50 * 1024 * 1024 bytes.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    WHISPER_MODEL=Systran/faster-whisper-base.en \
    WHISPER_CACHE_DIR=/models \
    WHISPER_DEVICE=cpu \
    WHISPER_COMPUTE_TYPE=int8 \
    DEFAULT_LANGUAGE=en \
    MAX_AUDIO_BYTES=52428800

# faster-whisper depends on libsndfile1 + libgomp1 (OpenMP runtime). ffmpeg is
# pulled in for non-WAV inputs (transcribe accepts any container).
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        libsndfile1 \
        libgomp1 \
        ffmpeg \
        ca-certificates \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Dependencies are installed before app.py is copied, so editing app.py does
# not invalidate the pip layer.
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py /app/

# Run as a non-root user to satisfy K8s securityContext.runAsNonRoot.
# UID 1654 also owns /models, the default WHISPER_CACHE_DIR.
RUN useradd --create-home --shell /usr/sbin/nologin --uid 1654 align \
    && mkdir -p /models \
    && chown -R 1654:1654 /models
USER 1654

EXPOSE 9200

# Container-level health check: GET /health on loopback with a 3s client
# timeout. Any exception makes the python process exit non-zero, which the
# trailing "|| exit 1" reports as unhealthy.
HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
    CMD python -c "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:9200/health',timeout=3); sys.exit(0)" || exit 1

# One uvicorn worker serving app:app on all interfaces, port 9200.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "9200", "--workers", "1"]
|
||||
174
apps/fc-ttsreader/speech-align/app.py
Normal file
174
apps/fc-ttsreader/speech-align/app.py
Normal file
@@ -0,0 +1,174 @@
|
||||
"""FlowerCore speech-align service.
|
||||
|
||||
Wraps SYSTRAN/faster-whisper (https://github.com/SYSTRAN/faster-whisper) in a
|
||||
small FastAPI app exposing two endpoints:
|
||||
|
||||
* POST /align — fc-align contract used by FlowerCore.Shared.Speech's
|
||||
FasterWhisperAlignmentClient on master. Multipart form
|
||||
(`audio`, `language`) returns
|
||||
`{text, words: [{word, startSeconds, endSeconds, confidence}],
|
||||
durationMs, language}`.
|
||||
* POST /transcribe — audio-file-in transcription used by the new TtsReader
|
||||
audio-import feature. Multipart form (`audio`, optional
|
||||
`language`) returns `{text, language, durationMs,
|
||||
segments: [{startSeconds, endSeconds, text}]}` so the
|
||||
UI can preview the transcript before piping it into
|
||||
Quick Read or saving as a project.
|
||||
|
||||
Both endpoints share the same WhisperModel instance (loaded once at startup).
|
||||
Model is pinned by the WHISPER_MODEL env var (defaults to base.en) and cached
|
||||
under WHISPER_CACHE_DIR (defaults to /models, backed by a PVC in K8s).
|
||||
|
||||
Health: GET /health → {status: ok | loading, model, device, computeType, defaultLanguage, maxBytes}.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
||||
from fastapi.responses import JSONResponse
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
# Module-level configuration. Everything here is read once from the
# environment at import time.
LOG = logging.getLogger("speech_align")
logging.basicConfig(
    # logging only recognises upper-case level names; normalise so that
    # LOG_LEVEL=info (or " debug ") does not abort startup with ValueError.
    level=os.environ.get("LOG_LEVEL", "INFO").strip().upper(),
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
)

# Model identifier and runtime settings handed to WhisperModel at startup.
MODEL_NAME = os.environ.get("WHISPER_MODEL", "Systran/faster-whisper-base.en")
DEVICE = os.environ.get("WHISPER_DEVICE", "cpu")
COMPUTE_TYPE = os.environ.get("WHISPER_COMPUTE_TYPE", "int8")
CACHE_DIR = os.environ.get("WHISPER_CACHE_DIR", "/models")
# Upper bound, in bytes, on an uploaded audio payload.
MAX_BYTES = int(os.environ.get("MAX_AUDIO_BYTES", str(50 * 1024 * 1024)))  # 50 MB
# Language used when a request supplies none (or only whitespace).
DEFAULT_LANGUAGE = os.environ.get("DEFAULT_LANGUAGE", "en")

# Process-wide holder for the loaded model, stored under the "model" key.
_state: dict[str, object] = {}
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Load the Whisper model before the app starts and drop it on shutdown.

    The model is constructed from the module-level settings and stored in
    ``_state["model"]``; after the ``yield`` (application shutdown) the
    state dict is emptied.
    """
    LOG.info(
        "Loading faster-whisper model %s (device=%s compute=%s cache=%s)",
        MODEL_NAME,
        DEVICE,
        COMPUTE_TYPE,
        CACHE_DIR,
    )
    load_began = time.time()
    _state["model"] = WhisperModel(
        MODEL_NAME,
        device=DEVICE,
        compute_type=COMPUTE_TYPE,
        download_root=CACHE_DIR,
    )
    LOG.info("Model loaded in %.2fs", time.time() - load_began)
    yield
    _state.clear()


app = FastAPI(title="FlowerCore speech-align", version="1.0.0", lifespan=lifespan)
|
||||
|
||||
|
||||
def _get_model() -> WhisperModel:
    """Return the model stored in ``_state``.

    Raises:
        HTTPException: 503 when no model is stored under ``"model"``.
    """
    loaded = _state.get("model")
    if loaded is not None:
        return loaded  # type: ignore[return-value]
    raise HTTPException(status_code=503, detail="Model not loaded yet")
|
||||
|
||||
|
||||
async def _read_upload(upload: UploadFile) -> bytes:
    """Read an uploaded audio file into memory, enforcing ``MAX_BYTES``.

    Args:
        upload: The multipart file part received by the endpoint.

    Returns:
        The upload's bytes (non-empty, at most ``MAX_BYTES`` long).

    Raises:
        HTTPException: 400 if the upload is empty, 413 if it exceeds
            ``MAX_BYTES``.
    """
    # When the framework already knows the part's size, reject oversized
    # uploads without reading them at all. ``size`` may be absent or None.
    declared_size = getattr(upload, "size", None)
    if declared_size is not None and declared_size > MAX_BYTES:
        raise HTTPException(
            status_code=413,
            detail=f"audio exceeds {MAX_BYTES} byte limit ({declared_size} bytes received)",
        )

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without copying an arbitrarily large body into memory.
    payload = await upload.read(MAX_BYTES + 1)
    if not payload:
        raise HTTPException(status_code=400, detail="audio is empty")
    if len(payload) > MAX_BYTES:
        raise HTTPException(
            status_code=413,
            detail=f"audio exceeds {MAX_BYTES} byte limit",
        )
    return payload
|
||||
|
||||
|
||||
def _normalize_language(value: Optional[str]) -> Optional[str]:
|
||||
if not value or not value.strip():
|
||||
return DEFAULT_LANGUAGE
|
||||
return value.strip().lower()
|
||||
|
||||
|
||||
def _transcribe_bytes(audio_bytes: bytes, language: Optional[str], word_timestamps: bool):
    """Run the loaded model over in-memory audio.

    Args:
        audio_bytes: Raw bytes of the audio file.
        language: Language code passed through to the model.
        word_timestamps: Whether per-word timings are requested.

    Returns:
        ``(segments, info, elapsed_ms)``: the segments as a list, the info
        object returned by ``model.transcribe``, and the wall-clock time of
        the call in whole milliseconds.

    Raises:
        HTTPException: 503 (from ``_get_model``) when no model is loaded.
    """
    whisper = _get_model()
    clock_start = time.time()
    lazy_segments, info = whisper.transcribe(
        io.BytesIO(audio_bytes),
        language=language,
        word_timestamps=word_timestamps,
        beam_size=1,
        vad_filter=True,
    )
    # Consume the iterator inside the timed region so elapsed_ms includes it.
    segments = [piece for piece in lazy_segments]
    elapsed_ms = int((time.time() - clock_start) * 1000)
    return segments, info, elapsed_ms
|
||||
|
||||
|
||||
# Health endpoint: reports "ok" once a model is stored in _state, otherwise
# "loading", together with the effective configuration values.
@app.get("/health")
def health():
    status = "loading" if _state.get("model") is None else "ok"
    report = {
        "status": status,
        "model": MODEL_NAME,
        "device": DEVICE,
        "computeType": COMPUTE_TYPE,
        "defaultLanguage": DEFAULT_LANGUAGE,
        "maxBytes": MAX_BYTES,
    }
    return report
|
||||
|
||||
|
||||
# POST /align: multipart form with `audio` (file) and `language` (defaults to
# DEFAULT_LANGUAGE). Responds with the joined transcript text, per-word
# timings (word, startSeconds, endSeconds, confidence), durationMs, language
# and elapsedMs. Errors: 400 empty upload, 413 oversized upload, 503 model
# not loaded.
@app.post("/align")
async def align(audio: UploadFile = File(...), language: str = Form(DEFAULT_LANGUAGE)):
    """fc-align contract — used by FlowerCore.Shared.Speech.FasterWhisperAlignmentClient."""
    import asyncio  # function-scope import keeps this handler self-contained

    payload = await _read_upload(audio)
    lang = _normalize_language(language)
    # Transcription is synchronous and CPU-bound. Running it directly here
    # would stall the event loop (including /health, which the container's
    # health checks poll) for the whole request, so hand it to a worker thread.
    segments, info, elapsed_ms = await asyncio.to_thread(
        _transcribe_bytes, payload, lang, True
    )

    text_parts: list[str] = []
    words: list[dict] = []
    for segment in segments:
        text_parts.append(segment.text.strip())
        # segment.words may be None; treat that as "no words".
        for word in (segment.words or []):
            words.append({
                "word": word.word.strip(),
                "startSeconds": float(word.start or 0.0),
                "endSeconds": float(word.end or 0.0),
                "confidence": float(getattr(word, "probability", 0.0) or 0.0),
            })

    duration_ms = int((info.duration or 0.0) * 1000)
    return JSONResponse({
        "text": " ".join(p for p in text_parts if p).strip(),
        "words": words,
        "durationMs": duration_ms,
        "language": info.language or lang,
        "elapsedMs": elapsed_ms,
    })
|
||||
|
||||
|
||||
# POST /transcribe: multipart form with `audio` (file) and optional
# `language`. Responds with the joined transcript text, the segment list
# (startSeconds, endSeconds, text), language, durationMs and elapsedMs.
# Errors: 400 empty upload, 413 oversized upload, 503 model not loaded.
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...), language: Optional[str] = Form(None)):
    """Audio-in transcription contract — used by the new TtsReader audio-import feature.

    Returns full segments (no per-word timestamps) so the UI can preview the
    transcript before piping it into Quick Read or saving as a project.
    """
    import asyncio  # function-scope import keeps this handler self-contained

    payload = await _read_upload(audio)
    lang = _normalize_language(language)
    # Transcription is synchronous and CPU-bound. Running it directly here
    # would stall the event loop (including /health, which the container's
    # health checks poll) for the whole request, so hand it to a worker thread.
    segments, info, elapsed_ms = await asyncio.to_thread(
        _transcribe_bytes, payload, lang, False
    )

    out_segments = [
        {
            "startSeconds": float(segment.start or 0.0),
            "endSeconds": float(segment.end or 0.0),
            "text": segment.text.strip(),
        }
        for segment in segments
    ]

    return JSONResponse({
        "text": " ".join(s["text"] for s in out_segments if s["text"]).strip(),
        "segments": out_segments,
        "language": info.language or lang,
        "durationMs": int((info.duration or 0.0) * 1000),
        "elapsedMs": elapsed_ms,
    })
|
||||
4
apps/fc-ttsreader/speech-align/requirements.txt
Normal file
4
apps/fc-ttsreader/speech-align/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
faster-whisper==1.0.3
|
||||
fastapi==0.115.0
|
||||
uvicorn[standard]==0.30.6
|
||||
python-multipart==0.0.10
|
||||
Reference in New Issue
Block a user