From df115e4d1ef881197a274ff8bdabd675fd529b09 Mon Sep 17 00:00:00 2001 From: Andrew Stoltz Date: Sat, 25 Apr 2026 10:50:45 -0500 Subject: [PATCH] fc-ttsreader: ship cluster-native fc-speech-align (faster-whisper) + bump web - New ttsreader-align Deployment + Service + 5Gi PVC under apps/fc-ttsreader/. Wraps SYSTRAN/faster-whisper in a small FastAPI app exposing POST /align (fc-align contract used by Shared.Speech) AND POST /transcribe (audio-in feature consumed by ttsreader-web Lane G). Source: apps/fc-ttsreader/speech-align/ (Dockerfile + app.py + requirements.txt). Built locally (apt-get RUN steps need BLUEJAY-WS, not noc1) and ctr-imported to all 3 RKE2 nodes. - ttsreader-web env: flip Speech__Alignment__Enabled=true and point BaseUrl at http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200. Add new TtsReader__Transcription__* env triplet pointing at the same service (same /transcribe endpoint). - Bump ttsreader-web image to v202604251046 (carries the TranscriptionController + MCP tool + Quick.razor InputFile UI). --- apps/fc-ttsreader/fc-ttsreader.yaml | 131 +++++++++++-- apps/fc-ttsreader/speech-align/Dockerfile | 47 +++++ apps/fc-ttsreader/speech-align/app.py | 174 ++++++++++++++++++ .../speech-align/requirements.txt | 4 + 4 files changed, 344 insertions(+), 12 deletions(-) create mode 100644 apps/fc-ttsreader/speech-align/Dockerfile create mode 100644 apps/fc-ttsreader/speech-align/app.py create mode 100644 apps/fc-ttsreader/speech-align/requirements.txt diff --git a/apps/fc-ttsreader/fc-ttsreader.yaml b/apps/fc-ttsreader/fc-ttsreader.yaml index 9d58115..fcc9c15 100644 --- a/apps/fc-ttsreader/fc-ttsreader.yaml +++ b/apps/fc-ttsreader/fc-ttsreader.yaml @@ -112,6 +112,109 @@ spec: persistentVolumeClaim: claimName: ttsreader-piper-data --- +# fc-speech-align — cluster-native faster-whisper wrapper. +# Exposes POST /align (fc-align contract used by FlowerCore.Shared.Speech) AND +# POST /transcribe (audio-file-in feature). CPU model = base.en, int8 compute. 
+# Source: bluejay-infra/apps/fc-ttsreader/speech-align/ (Dockerfile + app.py). +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ttsreader-align-models + namespace: fc-ttsreader +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ttsreader-align + namespace: fc-ttsreader + labels: + app.kubernetes.io/name: ttsreader-align + app.kubernetes.io/part-of: flowercore +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app.kubernetes.io/name: ttsreader-align + template: + metadata: + labels: + app.kubernetes.io/name: ttsreader-align + app.kubernetes.io/part-of: flowercore + spec: + securityContext: + fsGroup: 1654 + runAsNonRoot: true + runAsUser: 1654 + containers: + - name: align + image: localhost/fc-speech-align:v1 + imagePullPolicy: Never + ports: + - containerPort: 9200 + name: http + env: + - name: WHISPER_MODEL + value: "Systran/faster-whisper-base.en" + - name: WHISPER_DEVICE + value: "cpu" + - name: WHISPER_COMPUTE_TYPE + value: "int8" + - name: WHISPER_CACHE_DIR + value: "/models" + - name: DEFAULT_LANGUAGE + value: "en" + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + cpu: 2000m + memory: 2Gi + volumeMounts: + - name: models + mountPath: /models + readinessProbe: + httpGet: + path: /health + port: 9200 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 18 + livenessProbe: + httpGet: + path: /health + port: 9200 + initialDelaySeconds: 180 + periodSeconds: 30 + timeoutSeconds: 5 + failureThreshold: 3 + volumes: + - name: models + persistentVolumeClaim: + claimName: ttsreader-align-models +--- +apiVersion: v1 +kind: Service +metadata: + name: ttsreader-align + namespace: fc-ttsreader +spec: + selector: + app.kubernetes.io/name: ttsreader-align + ports: + - port: 9200 + targetPort: 9200 + name: http +--- apiVersion: apps/v1 kind: Deployment metadata: @@ -142,7 +245,7 @@ spec: 
fsGroupChangePolicy: OnRootMismatch containers: - name: web - image: localhost/fc-ttsreader-web:v202604251018 + image: localhost/fc-ttsreader-web:v202604251046 imagePullPolicy: Never ports: - containerPort: 5217 @@ -173,20 +276,24 @@ spec: - name: TtsReader__Kokoro__TimeoutSeconds value: "120" - name: Speech__Alignment__Enabled - # Off until either: - # (a) a native /align backend is deployed inside the cluster, or - # (b) the BLUEJAY-WS host exposes the speaches container on the - # LAN-routable bind (10.0.56.20:9200, not just 127.0.0.1) - # AND Common ships the openai-compatible Backend support - # (currently on feat/shared-indexing, not on master). - # While disabled, /preview-with-timings still returns word timings - # via EstimatedAlignmentClient — slightly less accurate, but the - # UI can still drive word-level highlight playback. - value: "false" + # Cluster-native faster-whisper (Lane F, 2026-04-25). The + # ttsreader-align deployment in this manifest wraps + # SYSTRAN/faster-whisper with a /align endpoint matching the + # FlowerCore.Shared.Speech master contract. + value: "true" - name: Speech__Alignment__BaseUrl - value: "http://10.0.56.20:9200" + value: "http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200" - name: Speech__Alignment__TimeoutSeconds value: "120" + # Cluster-native transcription endpoint shares the same pod + # (POST /transcribe). Lane G consumes this from the + # FlowerCore.TtsReader.Web AudioImport feature. 
+ - name: TtsReader__Transcription__Enabled + value: "true" + - name: TtsReader__Transcription__BaseUrl + value: "http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200" + - name: TtsReader__Transcription__TimeoutSeconds + value: "300" - name: TtsReader__Ollama__BaseUrl value: "http://10.0.57.17:11434" - name: TtsReader__Ollama__DefaultModel diff --git a/apps/fc-ttsreader/speech-align/Dockerfile b/apps/fc-ttsreader/speech-align/Dockerfile new file mode 100644 index 0000000..1be1d6d --- /dev/null +++ b/apps/fc-ttsreader/speech-align/Dockerfile @@ -0,0 +1,47 @@ +# FlowerCore speech-align — wraps SYSTRAN/faster-whisper with /align + +# /transcribe endpoints used by FlowerCore.TtsReader. CPU-only image; the +# default int8 compute type runs base.en at ~real-time on a single core. +# +# Build: podman build -t localhost/fc-speech-align:v1 . +# Run: podman run --rm -p 9200:9200 -v fc-speech-align-models:/models localhost/fc-speech-align:v1 + +FROM python:3.12-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_NO_CACHE_DIR=1 \ + WHISPER_MODEL=Systran/faster-whisper-base.en \ + WHISPER_CACHE_DIR=/models \ + WHISPER_DEVICE=cpu \ + WHISPER_COMPUTE_TYPE=int8 \ + DEFAULT_LANGUAGE=en \ + MAX_AUDIO_BYTES=52428800 + +# faster-whisper depends on libsndfile1 + libgomp1 (OpenMP runtime). ffmpeg is +# pulled in for non-WAV inputs (transcribe accepts any container). +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libsndfile1 \ + libgomp1 \ + ffmpeg \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app +COPY requirements.txt /app/ +RUN pip install --no-cache-dir -r requirements.txt + +COPY app.py /app/ + +# Run as a non-root user to satisfy K8s securityContext.runAsNonRoot. 
+RUN useradd --create-home --shell /usr/sbin/nologin --uid 1654 align \ + && mkdir -p /models \ + && chown -R 1654:1654 /models +USER 1654 + +EXPOSE 9200 +HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \ + CMD python -c "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:9200/health',timeout=3); sys.exit(0)" || exit 1 + +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "9200", "--workers", "1"] diff --git a/apps/fc-ttsreader/speech-align/app.py b/apps/fc-ttsreader/speech-align/app.py new file mode 100644 index 0000000..092bb48 --- /dev/null +++ b/apps/fc-ttsreader/speech-align/app.py @@ -0,0 +1,174 @@ +"""FlowerCore speech-align service. + +Wraps SYSTRAN/faster-whisper (https://github.com/SYSTRAN/faster-whisper) in a +small FastAPI app exposing two endpoints: + +* POST /align — fc-align contract used by FlowerCore.Shared.Speech's + FasterWhisperAlignmentClient on master. Multipart form + (`audio`, `language`) returns + `{text, words: [{word, startSeconds, endSeconds, confidence}], + durationMs, language, elapsedMs}`. +* POST /transcribe — audio-file-in transcription used by the new TtsReader + audio-import feature. Multipart form (`audio`, optional + `language`) returns `{text, language, durationMs, + elapsedMs, segments: [{startSeconds, endSeconds, text}]}` so the + UI can preview the transcript before piping it into + Quick Read or saving as a project. + +Both endpoints share the same WhisperModel instance (loaded once at startup). +Model is pinned by the WHISPER_MODEL env var (defaults to base.en) and cached +under WHISPER_CACHE_DIR (defaults to /models, backed by a PVC in K8s). + +Health: GET /health → {status: ok|loading, model, device, computeType, defaultLanguage, maxBytes}. 
+""" +from __future__ import annotations + +import io +import logging +import os +import time +from contextlib import asynccontextmanager +from typing import Optional + +from fastapi import FastAPI, File, Form, HTTPException, UploadFile +from fastapi.responses import JSONResponse +from faster_whisper import WhisperModel + +LOG = logging.getLogger("speech_align") +logging.basicConfig( + level=os.environ.get("LOG_LEVEL", "INFO"), + format="%(asctime)s %(levelname)s %(name)s %(message)s", +) + +MODEL_NAME = os.environ.get("WHISPER_MODEL", "Systran/faster-whisper-base.en") +DEVICE = os.environ.get("WHISPER_DEVICE", "cpu") +COMPUTE_TYPE = os.environ.get("WHISPER_COMPUTE_TYPE", "int8") +CACHE_DIR = os.environ.get("WHISPER_CACHE_DIR", "/models") +MAX_BYTES = int(os.environ.get("MAX_AUDIO_BYTES", str(50 * 1024 * 1024))) # 50 MB +DEFAULT_LANGUAGE = os.environ.get("DEFAULT_LANGUAGE", "en") + +_state: dict[str, object] = {} + + +@asynccontextmanager +async def lifespan(_app: FastAPI): + LOG.info("Loading faster-whisper model %s (device=%s compute=%s cache=%s)", MODEL_NAME, DEVICE, COMPUTE_TYPE, CACHE_DIR) + started = time.time() + model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE, download_root=CACHE_DIR) + _state["model"] = model + LOG.info("Model loaded in %.2fs", time.time() - started) + yield + _state.clear() + + +app = FastAPI(title="FlowerCore speech-align", version="1.0.0", lifespan=lifespan) + + +def _get_model() -> WhisperModel: + model = _state.get("model") + if model is None: + raise HTTPException(status_code=503, detail="Model not loaded yet") + return model # type: ignore[return-value] + + +async def _read_upload(upload: UploadFile) -> bytes: + payload = await upload.read() + if not payload: + raise HTTPException(status_code=400, detail="audio is empty") + if len(payload) > MAX_BYTES: + raise HTTPException( + status_code=413, + detail=f"audio exceeds {MAX_BYTES} byte limit ({len(payload)} bytes received)", + ) + return payload + + +def 
_normalize_language(value: Optional[str]) -> Optional[str]: + if not value or not value.strip(): + return DEFAULT_LANGUAGE + return value.strip().lower() + + +def _transcribe_bytes(audio_bytes: bytes, language: Optional[str], word_timestamps: bool): + model = _get_model() + started = time.time() + segments_iter, info = model.transcribe( + io.BytesIO(audio_bytes), + language=language, + word_timestamps=word_timestamps, + beam_size=1, + vad_filter=True, + ) + segments = list(segments_iter) + elapsed_ms = int((time.time() - started) * 1000) + return segments, info, elapsed_ms + + +@app.get("/health") +def health(): + return { + "status": "ok" if _state.get("model") is not None else "loading", + "model": MODEL_NAME, + "device": DEVICE, + "computeType": COMPUTE_TYPE, + "defaultLanguage": DEFAULT_LANGUAGE, + "maxBytes": MAX_BYTES, + } + + +@app.post("/align") +async def align(audio: UploadFile = File(...), language: str = Form(DEFAULT_LANGUAGE)): + """fc-align contract — used by FlowerCore.Shared.Speech.FasterWhisperAlignmentClient.""" + payload = await _read_upload(audio) + lang = _normalize_language(language) + segments, info, elapsed_ms = _transcribe_bytes(payload, lang, word_timestamps=True) + + text_parts: list[str] = [] + words: list[dict] = [] + for segment in segments: + text_parts.append(segment.text.strip()) + for word in (segment.words or []): + words.append({ + "word": word.word.strip(), + "startSeconds": float(word.start or 0.0), + "endSeconds": float(word.end or 0.0), + "confidence": float(getattr(word, "probability", 0.0) or 0.0), + }) + + duration_ms = int((info.duration or 0.0) * 1000) + return JSONResponse({ + "text": " ".join(p for p in text_parts if p).strip(), + "words": words, + "durationMs": duration_ms, + "language": info.language or lang, + "elapsedMs": elapsed_ms, + }) + + +@app.post("/transcribe") +async def transcribe(audio: UploadFile = File(...), language: Optional[str] = Form(None)): + """Audio-in transcription contract — used by the new 
TtsReader audio-import feature. + + Returns full segments (no per-word timestamps) so the UI can preview the + transcript before piping it into Quick Read or saving as a project. + """ + payload = await _read_upload(audio) + lang = _normalize_language(language) + segments, info, elapsed_ms = _transcribe_bytes(payload, lang, word_timestamps=False) + + out_segments = [ + { + "startSeconds": float(segment.start or 0.0), + "endSeconds": float(segment.end or 0.0), + "text": segment.text.strip(), + } + for segment in segments + ] + + return JSONResponse({ + "text": " ".join(s["text"] for s in out_segments if s["text"]).strip(), + "segments": out_segments, + "language": info.language or lang, + "durationMs": int((info.duration or 0.0) * 1000), + "elapsedMs": elapsed_ms, + }) diff --git a/apps/fc-ttsreader/speech-align/requirements.txt b/apps/fc-ttsreader/speech-align/requirements.txt new file mode 100644 index 0000000..47b0514 --- /dev/null +++ b/apps/fc-ttsreader/speech-align/requirements.txt @@ -0,0 +1,4 @@ +faster-whisper==1.0.3 +fastapi==0.115.0 +uvicorn[standard]==0.30.6 +python-multipart==0.0.10