fc-ttsreader: ship cluster-native fc-speech-align (faster-whisper) + bump web

- New ttsreader-align Deployment + Service + 5Gi PVC under apps/fc-ttsreader/. Wraps SYSTRAN/faster-whisper in a small FastAPI app exposing POST /align (fc-align contract used by Shared.Speech) AND POST /transcribe (audio-in feature consumed by ttsreader-web Lane G). Source: apps/fc-ttsreader/speech-align/ (Dockerfile + app.py + requirements.txt). Built locally (apt-get RUN steps need BLUEJAY-WS, not noc1) and ctr-imported to all 3 RKE2 nodes.
- ttsreader-web env: flip Speech__Alignment__Enabled=true and point BaseUrl at http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200. Add new TtsReader__Transcription__* env triplet pointing at the same service (its /transcribe endpoint).
- Bump ttsreader-web image to v202604251046 (carries the TranscriptionController + MCP tool + Quick.razor InputFile UI).
2026-04-25 10:50:45 -05:00
parent 9df26620b8
commit df115e4d1e
4 changed files with 344 additions and 12 deletions
--- a/apps/fc-ttsreader/fc-ttsreader.yaml
+++ b/apps/fc-ttsreader/fc-ttsreader.yaml
@@ -112,6 +112,109 @@ spec:
          persistentVolumeClaim:
            claimName: ttsreader-piper-data
 ---
+# fc-speech-align — cluster-native faster-whisper wrapper.
+# Exposes POST /align (fc-align contract used by FlowerCore.Shared.Speech) AND
+# POST /transcribe (audio-file-in feature). CPU model = base.en, int8 compute.
+# Source: bluejay-infra/apps/fc-ttsreader/speech-align/ (Dockerfile + app.py).
+# Model cache volume for the ttsreader-align Deployment, which mounts this
+# claim at /models (the service's WHISPER_CACHE_DIR).
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: ttsreader-align-models
+  namespace: fc-ttsreader
+spec:
+  # Single-node read/write access; the consuming Deployment runs one replica
+  # with the Recreate strategy, so only one pod mounts the claim at a time.
+  accessModes:
+    - ReadWriteOnce
+  # No storageClassName is set here.
+  # NOTE(review): presumably this relies on the cluster's default StorageClass — confirm.
+  resources:
+    requests:
+      storage: 5Gi
+---
+# ttsreader-align: single-replica faster-whisper wrapper serving /align,
+# /transcribe and /health on port 9200.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ttsreader-align
+  namespace: fc-ttsreader
+  labels:
+    app.kubernetes.io/name: ttsreader-align
+    app.kubernetes.io/part-of: flowercore
+spec:
+  replicas: 1
+  # Recreate (not RollingUpdate): the model cache PVC is ReadWriteOnce, so the
+  # old pod must release it before the replacement pod can mount it.
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: ttsreader-align
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: ttsreader-align
+        app.kubernetes.io/part-of: flowercore
+    spec:
+      # UID/GID 1654 matches the non-root user created in the image's
+      # Dockerfile; fsGroup makes the mounted /models volume writable for it.
+      securityContext:
+        fsGroup: 1654
+        runAsNonRoot: true
+        runAsUser: 1654
+      containers:
+        - name: align
+          # Locally built image imported onto the nodes; never pulled from a
+          # registry, hence imagePullPolicy: Never.
+          image: localhost/fc-speech-align:v1
+          imagePullPolicy: Never
+          ports:
+            - containerPort: 9200
+              name: http
+          env:
+            - name: WHISPER_MODEL
+              value: "Systran/faster-whisper-base.en"
+            - name: WHISPER_DEVICE
+              value: "cpu"
+            - name: WHISPER_COMPUTE_TYPE
+              value: "int8"
+            - name: WHISPER_CACHE_DIR
+              value: "/models"
+            - name: DEFAULT_LANGUAGE
+              value: "en"
+          resources:
+            requests:
+              cpu: 250m
+              memory: 512Mi
+            limits:
+              cpu: 2000m
+              memory: 2Gi
+          volumeMounts:
+            - name: models
+              mountPath: /models
+          # app.py loads the Whisper model in its startup hook, so a cold start
+          # (presumably including a model download when /models is empty) can
+          # be slow. The startup probe gives it up to 60 x 10s = 10 minutes
+          # before the kubelet gives up; liveness and readiness checks are held
+          # back until it succeeds, so a slow first boot is not killed by the
+          # liveness probe's fixed 180s initial delay.
+          startupProbe:
+            httpGet:
+              path: /health
+              port: 9200
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 60
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 9200
+            initialDelaySeconds: 30
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 18
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 9200
+            initialDelaySeconds: 180
+            periodSeconds: 30
+            timeoutSeconds: 5
+            failureThreshold: 3
+      volumes:
+        - name: models
+          persistentVolumeClaim:
+            claimName: ttsreader-align-models
+---
+# In-cluster endpoint for the ttsreader-align pods. No `type` is set, so the
+# Kubernetes default Service type applies. ttsreader-web reaches it at
+# http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200 for both
+# alignment and transcription.
+apiVersion: v1
+kind: Service
+metadata:
+  name: ttsreader-align
+  namespace: fc-ttsreader
+spec:
+  # Matches the pod template label of the ttsreader-align Deployment.
+  selector:
+    app.kubernetes.io/name: ttsreader-align
+  ports:
+    # Same port number on the Service and on the container (containerPort 9200).
+    - port: 9200
+      targetPort: 9200
+      name: http
+---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -142,7 +245,7 @@ spec:
        fsGroupChangePolicy: OnRootMismatch
      containers:
        - name: web
-          image: localhost/fc-ttsreader-web:v202604251018
+          image: localhost/fc-ttsreader-web:v202604251046
          imagePullPolicy: Never
          ports:
            - containerPort: 5217
@@ -173,20 +276,24 @@ spec:
            - name: TtsReader__Kokoro__TimeoutSeconds
              value: "120"
            - name: Speech__Alignment__Enabled
-              # Off until either:
-              #   (a) a native /align backend is deployed inside the cluster, or
-              #   (b) the BLUEJAY-WS host exposes the speaches container on the
-              #       LAN-routable bind (10.0.56.20:9200, not just 127.0.0.1)
-              #       AND Common ships the openai-compatible Backend support
-              #       (currently on feat/shared-indexing, not on master).
-              # While disabled, /preview-with-timings still returns word timings
-              # via EstimatedAlignmentClient — slightly less accurate, but the
-              # UI can still drive word-level highlight playback.
-              value: "false"
+              # Cluster-native faster-whisper (Lane F, 2026-04-25). The
+              # ttsreader-align deployment in this manifest wraps
+              # SYSTRAN/faster-whisper with a /align endpoint matching the
+              # FlowerCore.Shared.Speech master contract.
+              value: "true"
            - name: Speech__Alignment__BaseUrl
-              value: "http://10.0.56.20:9200"
+              value: "http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200"
            - name: Speech__Alignment__TimeoutSeconds
              value: "120"
+            # Cluster-native transcription endpoint shares the same pod
+            # (POST /transcribe). Lane G consumes this from the
+            # FlowerCore.TtsReader.Web AudioImport feature.
+            - name: TtsReader__Transcription__Enabled
+              value: "true"
+            - name: TtsReader__Transcription__BaseUrl
+              value: "http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200"
+            - name: TtsReader__Transcription__TimeoutSeconds
+              value: "300"
            - name: TtsReader__Ollama__BaseUrl
              value: "http://10.0.57.17:11434"
            - name: TtsReader__Ollama__DefaultModel
--- a/apps/fc-ttsreader/speech-align/Dockerfile
+++ b/apps/fc-ttsreader/speech-align/Dockerfile
@@ -0,0 +1,47 @@
+# FlowerCore speech-align — wraps SYSTRAN/faster-whisper with /align +
+# /transcribe endpoints used by FlowerCore.TtsReader. CPU-only image; the
+# default int8 compute type runs base.en at ~real-time on a single core.
+#
+# Build: podman build -t localhost/fc-speech-align:<ver> .
+# Run:   podman run --rm -p 9200:9200 -v fc-speech-align-models:/models localhost/fc-speech-align:<ver>
+
+FROM python:3.12-slim AS base
+
+# Python/pip hygiene flags plus the service defaults. The WHISPER_* and
+# DEFAULT_LANGUAGE values mirror the fallbacks hard-coded in app.py;
+# MAX_AUDIO_BYTES=52428800 is 50 * 1024 * 1024 (the same 50 MB cap).
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_NO_CACHE_DIR=1 \
+    WHISPER_MODEL=Systran/faster-whisper-base.en \
+    WHISPER_CACHE_DIR=/models \
+    WHISPER_DEVICE=cpu \
+    WHISPER_COMPUTE_TYPE=int8 \
+    DEFAULT_LANGUAGE=en \
+    MAX_AUDIO_BYTES=52428800
+
+# faster-whisper depends on libsndfile1 + libgomp1 (OpenMP runtime). ffmpeg is
+# pulled in for non-WAV inputs (transcribe accepts any container).
+# The apt package lists are removed in the same layer to keep the image small.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        libsndfile1 \
+        libgomp1 \
+        ffmpeg \
+        ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+# requirements.txt is copied and installed before app.py so the dependency
+# layer can be reused from the build cache when only app.py changes.
+COPY requirements.txt /app/
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY app.py /app/
+
+# Run as a non-root user to satisfy K8s securityContext.runAsNonRoot.
+# UID 1654 matches runAsUser/fsGroup in the ttsreader-align Deployment; /models
+# is pre-created and owned by that UID so the model cache is writable.
+RUN useradd --create-home --shell /usr/sbin/nologin --uid 1654 align \
+    && mkdir -p /models \
+    && chown -R 1654:1654 /models
+USER 1654
+
+EXPOSE 9200
+# Container-runtime health check against the service's own /health endpoint;
+# the 120s start period leaves time for the model to load.
+# NOTE(review): Kubernetes does not act on Dockerfile HEALTHCHECK; in-cluster
+# health is driven by the probes in the manifest.
+HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
+    CMD python -c "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:9200/health',timeout=3); sys.exit(0)" || exit 1
+
+# Single uvicorn worker: one process, one loaded model instance.
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "9200", "--workers", "1"]
--- a/apps/fc-ttsreader/speech-align/app.py
+++ b/apps/fc-ttsreader/speech-align/app.py
@@ -0,0 +1,174 @@
+"""FlowerCore speech-align service.
+
+Wraps SYSTRAN/faster-whisper (https://github.com/SYSTRAN/faster-whisper) in a
+small FastAPI app exposing two endpoints:
+
+* POST /align       — fc-align contract used by FlowerCore.Shared.Speech's
+                       FasterWhisperAlignmentClient on master. Multipart form
+                       (`audio`, `language`) returns
+                       `{text, words: [{word, startSeconds, endSeconds, confidence}],
+                         durationMs, language}`.
+* POST /transcribe  — audio-file-in transcription used by the new TtsReader
+                       audio-import feature. Multipart form (`audio`, optional
+                       `language`) returns `{text, language, durationMs,
+                       segments: [{startSeconds, endSeconds, text}]}` so the
+                       UI can preview the transcript before piping it into
+                       Quick Read or saving as a project.
+
+Both endpoints share the same WhisperModel instance (loaded once at startup).
+Model is pinned by the WHISPER_MODEL env var (defaults to base.en) and cached
+under WHISPER_CACHE_DIR (defaults to /models, backed by a PVC in K8s).
+
+Health: GET /health → {status, model, device, computeType, defaultLanguage,
+maxBytes}; status is `ok` once the model is loaded and `loading` otherwise.
+"""
+from __future__ import annotations
+
+import io
+import logging
+import os
+import time
+from contextlib import asynccontextmanager
+from typing import Optional
+
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi.responses import JSONResponse
+from faster_whisper import WhisperModel
+
+# Module logger. basicConfig sets the root log level from LOG_LEVEL
+# (default INFO) and a timestamped line format.
+LOG = logging.getLogger("speech_align")
+logging.basicConfig(
+    level=os.environ.get("LOG_LEVEL", "INFO"),
+    format="%(asctime)s %(levelname)s %(name)s %(message)s",
+)
+
+# Runtime configuration, read once from the environment at import time.
+# MODEL_NAME, DEVICE, COMPUTE_TYPE and CACHE_DIR are passed to WhisperModel
+# in lifespan() (CACHE_DIR as download_root).
+MODEL_NAME = os.environ.get("WHISPER_MODEL", "Systran/faster-whisper-base.en")
+DEVICE = os.environ.get("WHISPER_DEVICE", "cpu")
+COMPUTE_TYPE = os.environ.get("WHISPER_COMPUTE_TYPE", "int8")
+CACHE_DIR = os.environ.get("WHISPER_CACHE_DIR", "/models")
+# Upload size cap in bytes. int() raises ValueError at import time if
+# MAX_AUDIO_BYTES is set to something that is not an integer.
+MAX_BYTES = int(os.environ.get("MAX_AUDIO_BYTES", str(50 * 1024 * 1024)))  # 50 MB
+# Language applied when a request supplies none or only whitespace.
+DEFAULT_LANGUAGE = os.environ.get("DEFAULT_LANGUAGE", "en")
+
+# Process-wide holder for the loaded model under the "model" key; populated by
+# lifespan() at startup and cleared at shutdown.
+_state: dict[str, object] = {}
+
+
+@asynccontextmanager
+async def lifespan(_app: FastAPI):
+    LOG.info("Loading faster-whisper model %s (device=%s compute=%s cache=%s)", MODEL_NAME, DEVICE, COMPUTE_TYPE, CACHE_DIR)
+    started = time.time()
+    model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE, download_root=CACHE_DIR)
+    _state["model"] = model
+    LOG.info("Model loaded in %.2fs", time.time() - started)
+    yield
+    _state.clear()
+
+
+app = FastAPI(title="FlowerCore speech-align", version="1.0.0", lifespan=lifespan)
+
+
+def _get_model() -> WhisperModel:
+    model = _state.get("model")
+    if model is None:
+        raise HTTPException(status_code=503, detail="Model not loaded yet")
+    return model  # type: ignore[return-value]
+
+
+async def _read_upload(upload: UploadFile) -> bytes:
+    payload = await upload.read()
+    if not payload:
+        raise HTTPException(status_code=400, detail="audio is empty")
+    if len(payload) > MAX_BYTES:
+        raise HTTPException(
+            status_code=413,
+            detail=f"audio exceeds {MAX_BYTES} byte limit ({len(payload)} bytes received)",
+        )
+    return payload
+
+
+def _normalize_language(value: Optional[str]) -> Optional[str]:
+    if not value or not value.strip():
+        return DEFAULT_LANGUAGE
+    return value.strip().lower()
+
+
+def _transcribe_bytes(audio_bytes: bytes, language: Optional[str], word_timestamps: bool):
+    model = _get_model()
+    started = time.time()
+    segments_iter, info = model.transcribe(
+        io.BytesIO(audio_bytes),
+        language=language,
+        word_timestamps=word_timestamps,
+        beam_size=1,
+        vad_filter=True,
+    )
+    segments = list(segments_iter)
+    elapsed_ms = int((time.time() - started) * 1000)
+    return segments, info, elapsed_ms
+
+
+@app.get("/health")
+def health():
+    return {
+        "status": "ok" if _state.get("model") is not None else "loading",
+        "model": MODEL_NAME,
+        "device": DEVICE,
+        "computeType": COMPUTE_TYPE,
+        "defaultLanguage": DEFAULT_LANGUAGE,
+        "maxBytes": MAX_BYTES,
+    }
+
+
+@app.post("/align")
+async def align(audio: UploadFile = File(...), language: str = Form(DEFAULT_LANGUAGE)):
+    """fc-align contract — used by FlowerCore.Shared.Speech.FasterWhisperAlignmentClient."""
+    payload = await _read_upload(audio)
+    lang = _normalize_language(language)
+    segments, info, elapsed_ms = _transcribe_bytes(payload, lang, word_timestamps=True)
+
+    text_parts: list[str] = []
+    words: list[dict] = []
+    for segment in segments:
+        text_parts.append(segment.text.strip())
+        for word in (segment.words or []):
+            words.append({
+                "word": word.word.strip(),
+                "startSeconds": float(word.start or 0.0),
+                "endSeconds": float(word.end or 0.0),
+                "confidence": float(getattr(word, "probability", 0.0) or 0.0),
+            })
+
+    duration_ms = int((info.duration or 0.0) * 1000)
+    return JSONResponse({
+        "text": " ".join(p for p in text_parts if p).strip(),
+        "words": words,
+        "durationMs": duration_ms,
+        "language": info.language or lang,
+        "elapsedMs": elapsed_ms,
+    })
+
+
+@app.post("/transcribe")
+async def transcribe(audio: UploadFile = File(...), language: Optional[str] = Form(None)):
+    """Audio-in transcription contract — used by the new TtsReader audio-import feature.
+
+    Returns full segments (no per-word timestamps) so the UI can preview the
+    transcript before piping it into Quick Read or saving as a project.
+    """
+    payload = await _read_upload(audio)
+    lang = _normalize_language(language)
+    segments, info, elapsed_ms = _transcribe_bytes(payload, lang, word_timestamps=False)
+
+    out_segments = [
+        {
+            "startSeconds": float(segment.start or 0.0),
+            "endSeconds": float(segment.end or 0.0),
+            "text": segment.text.strip(),
+        }
+        for segment in segments
+    ]
+
+    return JSONResponse({
+        "text": " ".join(s["text"] for s in out_segments if s["text"]).strip(),
+        "segments": out_segments,
+        "language": info.language or lang,
+        "durationMs": int((info.duration or 0.0) * 1000),
+        "elapsedMs": elapsed_ms,
+    })
--- a/apps/fc-ttsreader/speech-align/requirements.txt
+++ b/apps/fc-ttsreader/speech-align/requirements.txt
@@ -0,0 +1,4 @@
+faster-whisper==1.0.3
+fastapi==0.115.0
+uvicorn[standard]==0.30.6
+python-multipart==0.0.10