fc-ttsreader: ship cluster-native fc-speech-align (faster-whisper) + bump web

- New ttsreader-align Deployment + Service + 5Gi PVC under
  apps/fc-ttsreader/. Wraps SYSTRAN/faster-whisper in a small FastAPI app
  exposing POST /align (fc-align contract used by Shared.Speech) AND
  POST /transcribe (audio-in feature consumed by ttsreader-web Lane G).
  Source: apps/fc-ttsreader/speech-align/ (Dockerfile + app.py +
  requirements.txt). Built locally (apt-get RUN steps need BLUEJAY-WS,
  not noc1) and ctr-imported to all 3 RKE2 nodes.
- ttsreader-web env: flip Speech__Alignment__Enabled=true and point
  BaseUrl at http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200.
  Add new TtsReader__Transcription__* env triplet pointing at the same
  service (same /transcribe endpoint).
- Bump ttsreader-web image to v202604251046 (carries the
  TranscriptionController + MCP tool + Quick.razor InputFile UI).
This commit is contained in:
Andrew Stoltz
2026-04-25 10:50:45 -05:00
parent 9df26620b8
commit df115e4d1e
4 changed files with 344 additions and 12 deletions

View File

@@ -112,6 +112,109 @@ spec:
persistentVolumeClaim:
claimName: ttsreader-piper-data
---
# fc-speech-align — cluster-native faster-whisper wrapper.
# Exposes POST /align (fc-align contract used by FlowerCore.Shared.Speech) AND
# POST /transcribe (audio-file-in feature). CPU model = base.en, int8 compute.
# Source: bluejay-infra/apps/fc-ttsreader/speech-align/ (Dockerfile + app.py).
# Persistent storage for the ttsreader-align Deployment, which mounts this
# claim at /models (the WHISPER_CACHE_DIR of the speech-align container).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: fc-ttsreader
  name: ttsreader-align-models
spec:
  resources:
    requests:
      storage: 5Gi
  accessModes:
    - ReadWriteOnce
---
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: fc-ttsreader
  name: ttsreader-align
  labels:
    app.kubernetes.io/part-of: flowercore
    app.kubernetes.io/name: ttsreader-align
spec:
  # One pod; Recreate stops the old pod before the new one starts
  # (presumably because the models PVC is ReadWriteOnce — confirm).
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: ttsreader-align
  template:
    metadata:
      labels:
        app.kubernetes.io/part-of: flowercore
        app.kubernetes.io/name: ttsreader-align
    spec:
      # UID 1654 is the non-root "align" user created in the image's
      # Dockerfile; fsGroup applies the same id to the mounted volume.
      securityContext:
        runAsNonRoot: true
        runAsUser: 1654
        fsGroup: 1654
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: ttsreader-align-models
      containers:
        - name: align
          # Never pull: the image has to be present on the node already.
          imagePullPolicy: Never
          image: localhost/fc-speech-align:v1
          ports:
            - name: http
              containerPort: 9200
          # Runtime configuration read by app.py via os.environ.
          env:
            - name: WHISPER_MODEL
              value: "Systran/faster-whisper-base.en"
            - name: WHISPER_DEVICE
              value: "cpu"
            - name: WHISPER_COMPUTE_TYPE
              value: "int8"
            - name: WHISPER_CACHE_DIR
              value: "/models"
            - name: DEFAULT_LANGUAGE
              value: "en"
          volumeMounts:
            # Same path as WHISPER_CACHE_DIR above.
            - name: models
              mountPath: /models
          resources:
            limits:
              cpu: 2000m
              memory: 2Gi
            requests:
              cpu: 250m
              memory: 512Mi
          # Readiness: first check after 30s, then every 10s; up to 18
          # consecutive failures are tolerated before marking unready.
          readinessProbe:
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 18
            httpGet:
              port: 9200
              path: /health
          # Liveness: starts after 180s; 3 consecutive failures at 30s
          # intervals restart the container.
          livenessProbe:
            initialDelaySeconds: 180
            periodSeconds: 30
            timeoutSeconds: 5
            failureThreshold: 3
            httpGet:
              port: 9200
              path: /health
---
# In-cluster endpoint for the ttsreader-align pods; port 9200 is forwarded
# unchanged to the container's HTTP port.
apiVersion: v1
kind: Service
metadata:
  namespace: fc-ttsreader
  name: ttsreader-align
spec:
  ports:
    - name: http
      port: 9200
      targetPort: 9200
  selector:
    app.kubernetes.io/name: ttsreader-align
---
apiVersion: apps/v1
kind: Deployment
metadata:
@@ -142,7 +245,7 @@ spec:
fsGroupChangePolicy: OnRootMismatch
containers:
- name: web
image: localhost/fc-ttsreader-web:v202604251018
image: localhost/fc-ttsreader-web:v202604251046
imagePullPolicy: Never
ports:
- containerPort: 5217
@@ -173,20 +276,24 @@ spec:
- name: TtsReader__Kokoro__TimeoutSeconds
value: "120"
- name: Speech__Alignment__Enabled
# Off until either:
# (a) a native /align backend is deployed inside the cluster, or
# (b) the BLUEJAY-WS host exposes the speaches container on the
# LAN-routable bind (10.0.56.20:9200, not just 127.0.0.1)
# AND Common ships the openai-compatible Backend support
# (currently on feat/shared-indexing, not on master).
# While disabled, /preview-with-timings still returns word timings
# via EstimatedAlignmentClient — slightly less accurate, but the
# UI can still drive word-level highlight playback.
value: "false"
# Cluster-native faster-whisper (Lane F, 2026-04-25). The
# ttsreader-align deployment in this manifest wraps
# SYSTRAN/faster-whisper with a /align endpoint matching the
# FlowerCore.Shared.Speech master contract.
value: "true"
- name: Speech__Alignment__BaseUrl
value: "http://10.0.56.20:9200"
value: "http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200"
- name: Speech__Alignment__TimeoutSeconds
value: "120"
# Cluster-native transcription endpoint shares the same pod
# (POST /transcribe). Lane G consumes this from the
# FlowerCore.TtsReader.Web AudioImport feature.
- name: TtsReader__Transcription__Enabled
value: "true"
- name: TtsReader__Transcription__BaseUrl
value: "http://ttsreader-align.fc-ttsreader.svc.cluster.local.:9200"
- name: TtsReader__Transcription__TimeoutSeconds
value: "300"
- name: TtsReader__Ollama__BaseUrl
value: "http://10.0.57.17:11434"
- name: TtsReader__Ollama__DefaultModel

View File

@@ -0,0 +1,47 @@
# FlowerCore speech-align image.
#
# Packages app.py, a FastAPI wrapper around SYSTRAN/faster-whisper that serves
# the /align and /transcribe endpoints used by FlowerCore.TtsReader. The image
# is CPU-only; per the original author, the default int8 compute type runs
# base.en at roughly real-time on a single core.
#
# Build: podman build -t localhost/fc-speech-align:<ver> .
# Run:   podman run --rm -p 9200:9200 -v fc-speech-align-models:/models localhost/fc-speech-align:<ver>
FROM python:3.12-slim AS base

# Interpreter and pip behaviour.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1

# Service defaults; app.py reads each of these from the environment, so they
# can be overridden at run time.
ENV WHISPER_MODEL=Systran/faster-whisper-base.en \
    WHISPER_CACHE_DIR=/models \
    WHISPER_DEVICE=cpu \
    WHISPER_COMPUTE_TYPE=int8 \
    DEFAULT_LANGUAGE=en \
    MAX_AUDIO_BYTES=52428800

# Native libraries: libsndfile1 and libgomp1 (OpenMP runtime) for
# faster-whisper, ffmpeg for non-WAV inputs (transcribe accepts any
# container format). The apt lists are removed in the same layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        libsndfile1 \
        libgomp1 \
        ffmpeg \
        ca-certificates \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Requirements are copied and installed before app.py so that editing the
# application does not invalidate the dependency layer.
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt
COPY app.py /app/

# Non-root user (uid 1654) that owns the model cache directory; needed for
# K8s securityContext.runAsNonRoot.
RUN useradd --create-home --shell /usr/sbin/nologin --uid 1654 align \
    && mkdir -p /models \
    && chown -R 1654:1654 /models
USER 1654

EXPOSE 9200

# Container-level health check against the app's own /health endpoint.
HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
    CMD python -c "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:9200/health',timeout=3); sys.exit(0)" || exit 1

# One uvicorn worker serving app.py's "app" object on port 9200.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "9200", "--workers", "1"]

View File

@@ -0,0 +1,174 @@
"""FlowerCore speech-align service.
Wraps SYSTRAN/faster-whisper (https://github.com/SYSTRAN/faster-whisper) in a
small FastAPI app exposing two endpoints:
* POST /align — fc-align contract used by FlowerCore.Shared.Speech's
FasterWhisperAlignmentClient on master. Multipart form
(`audio`, `language`) returns
`{text, words: [{word, startSeconds, endSeconds, confidence}],
durationMs, language, elapsedMs}`.
* POST /transcribe — audio-file-in transcription used by the new TtsReader
audio-import feature. Multipart form (`audio`, optional
`language`) returns `{text, language, durationMs, elapsedMs,
segments: [{startSeconds, endSeconds, text}]}` so the
UI can preview the transcript before piping it into
Quick Read or saving as a project.
Both endpoints share the same WhisperModel instance (loaded once at startup).
Model is pinned by the WHISPER_MODEL env var (defaults to base.en) and cached
under WHISPER_CACHE_DIR (defaults to /models, backed by a PVC in K8s).
Health: GET /health → {status ("ok" or "loading"), model, device, computeType,
defaultLanguage, maxBytes}.
"""
from __future__ import annotations
import io
import logging
import os
import time
from contextlib import asynccontextmanager
from typing import Optional
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from faster_whisper import WhisperModel
# Module logger; configured once at import time.
LOG = logging.getLogger("speech_align")
logging.basicConfig(
    # logging only recognises upper-case level names, so normalise the value:
    # LOG_LEVEL=debug would otherwise raise ValueError at import time.
    level=os.environ.get("LOG_LEVEL", "INFO").strip().upper(),
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
)

# Runtime configuration, all overridable through environment variables.
MODEL_NAME = os.environ.get("WHISPER_MODEL", "Systran/faster-whisper-base.en")
DEVICE = os.environ.get("WHISPER_DEVICE", "cpu")
COMPUTE_TYPE = os.environ.get("WHISPER_COMPUTE_TYPE", "int8")
# Passed to WhisperModel as download_root.
CACHE_DIR = os.environ.get("WHISPER_CACHE_DIR", "/models")
# Upper bound on an uploaded audio payload, in bytes.
MAX_BYTES = int(os.environ.get("MAX_AUDIO_BYTES", str(50 * 1024 * 1024)))  # 50 MB
# Language used when a request supplies none (or only whitespace).
DEFAULT_LANGUAGE = os.environ.get("DEFAULT_LANGUAGE", "en")

# Process-wide holder for the loaded model; lifespan() stores it under "model".
_state: dict[str, object] = {}
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Application lifespan hook: load the Whisper model, then release it.

    Everything before ``yield`` runs at startup: the model is constructed
    from the module-level configuration and stored in ``_state["model"]``.
    Everything after ``yield`` runs at shutdown and empties ``_state``.
    """
    LOG.info(
        "Loading faster-whisper model %s (device=%s compute=%s cache=%s)",
        MODEL_NAME,
        DEVICE,
        COMPUTE_TYPE,
        CACHE_DIR,
    )
    load_began = time.time()
    _state["model"] = WhisperModel(
        MODEL_NAME,
        device=DEVICE,
        compute_type=COMPUTE_TYPE,
        download_root=CACHE_DIR,
    )
    LOG.info("Model loaded in %.2fs", time.time() - load_began)
    yield
    _state.clear()
# ASGI application object; model loading and teardown are handled by lifespan().
app = FastAPI(title="FlowerCore speech-align", version="1.0.0", lifespan=lifespan)
def _get_model() -> WhisperModel:
    """Return the model stored by ``lifespan``.

    Raises:
        HTTPException: 503 when no model is present in ``_state``.
    """
    loaded = _state.get("model")
    if loaded is not None:
        return loaded  # type: ignore[return-value]
    raise HTTPException(status_code=503, detail="Model not loaded yet")
async def _read_upload(upload: UploadFile) -> bytes:
    """Read an uploaded audio file into memory, enforcing ``MAX_BYTES``.

    The size recorded on the upload (when available) is checked first, and
    the read itself is capped at ``MAX_BYTES + 1`` bytes, so an oversized
    upload is rejected without copying all of it into memory.

    Args:
        upload: The multipart file part received by the endpoint.

    Returns:
        The raw audio bytes (between 1 and ``MAX_BYTES`` bytes long).

    Raises:
        HTTPException: 400 if the upload is empty, 413 if it exceeds
            ``MAX_BYTES``.
    """
    # UploadFile.size may be None (or absent on older Starlette versions).
    declared_size = getattr(upload, "size", None)
    if declared_size is not None and declared_size > MAX_BYTES:
        raise HTTPException(
            status_code=413,
            detail=f"audio exceeds {MAX_BYTES} byte limit ({declared_size} bytes received)",
        )
    # One byte past the limit is enough to detect an oversized payload.
    payload = await upload.read(MAX_BYTES + 1)
    if not payload:
        raise HTTPException(status_code=400, detail="audio is empty")
    if len(payload) > MAX_BYTES:
        raise HTTPException(
            status_code=413,
            detail=f"audio exceeds {MAX_BYTES} byte limit (more than {MAX_BYTES} bytes received)",
        )
    return payload
def _normalize_language(value: Optional[str]) -> Optional[str]:
    """Return a trimmed, lower-cased language code.

    ``None``, an empty string, or a whitespace-only string all yield
    ``DEFAULT_LANGUAGE``.
    """
    trimmed = value.strip() if value else ""
    return trimmed.lower() if trimmed else DEFAULT_LANGUAGE
def _transcribe_bytes(audio_bytes: bytes, language: Optional[str], word_timestamps: bool):
    """Run the shared model over in-memory audio.

    Args:
        audio_bytes: Encoded audio, handed to the model as a ``BytesIO``.
        language: Language code passed straight to ``model.transcribe``.
        word_timestamps: Whether per-word timings are requested.

    Returns:
        ``(segments, info, elapsed_ms)`` — the fully materialised segment
        list, the info object returned by ``model.transcribe``, and the
        wall-clock time of the call in whole milliseconds.
    """
    whisper = _get_model()
    decode_options = {
        "language": language,
        "word_timestamps": word_timestamps,
        "beam_size": 1,
        "vad_filter": True,
    }
    clock_start = time.time()
    lazy_segments, info = whisper.transcribe(io.BytesIO(audio_bytes), **decode_options)
    # Draining the iterator inside the timed region makes elapsed_ms cover
    # the production of every segment.
    materialised = list(lazy_segments)
    return materialised, info, int((time.time() - clock_start) * 1000)
@app.get("/health")
def health():
    """Report service status together with the effective configuration.

    ``status`` is ``"ok"`` when a model is present in ``_state`` and
    ``"loading"`` otherwise.
    """
    if _state.get("model") is None:
        status = "loading"
    else:
        status = "ok"
    return dict(
        status=status,
        model=MODEL_NAME,
        device=DEVICE,
        computeType=COMPUTE_TYPE,
        defaultLanguage=DEFAULT_LANGUAGE,
        maxBytes=MAX_BYTES,
    )
@app.post("/align")
async def align(audio: UploadFile = File(...), language: str = Form(DEFAULT_LANGUAGE)):
    """fc-align contract — used by FlowerCore.Shared.Speech.FasterWhisperAlignmentClient.

    Transcribes the uploaded audio with word timestamps enabled.

    Args:
        audio: Multipart file part with the audio to align.
        language: Language code; blank values fall back to DEFAULT_LANGUAGE.

    Returns:
        JSON ``{text, words: [{word, startSeconds, endSeconds, confidence}],
        durationMs, language, elapsedMs}``.

    Raises:
        HTTPException: 400/413 from upload validation, 503 if the model is
            not loaded.
    """
    # Local import keeps this handler self-contained.
    import asyncio

    payload = await _read_upload(audio)
    lang = _normalize_language(language)
    # Transcription is CPU-bound and synchronous. Running it directly in this
    # coroutine would block the event loop, so /health could not answer while
    # a request is being processed. Hand it to a worker thread instead.
    segments, info, elapsed_ms = await asyncio.to_thread(
        _transcribe_bytes, payload, lang, word_timestamps=True
    )

    text_parts: list[str] = []
    words: list[dict] = []
    for segment in segments:
        text_parts.append(segment.text.strip())
        for word in (segment.words or []):
            words.append({
                "word": word.word.strip(),
                # Missing timings/probabilities are reported as 0.0.
                "startSeconds": float(word.start or 0.0),
                "endSeconds": float(word.end or 0.0),
                "confidence": float(getattr(word, "probability", 0.0) or 0.0),
            })

    duration_ms = int((info.duration or 0.0) * 1000)
    return JSONResponse({
        "text": " ".join(p for p in text_parts if p).strip(),
        "words": words,
        "durationMs": duration_ms,
        "language": info.language or lang,
        "elapsedMs": elapsed_ms,
    })
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...), language: Optional[str] = Form(None)):
    """Audio-in transcription contract — used by the new TtsReader audio-import feature.

    Returns full segments (no per-word timestamps) so the UI can preview the
    transcript before piping it into Quick Read or saving as a project.

    Args:
        audio: Multipart file part with the audio to transcribe.
        language: Optional language code; missing or blank values fall back
            to DEFAULT_LANGUAGE.

    Returns:
        JSON ``{text, segments: [{startSeconds, endSeconds, text}], language,
        durationMs, elapsedMs}``.

    Raises:
        HTTPException: 400/413 from upload validation, 503 if the model is
            not loaded.
    """
    # Local import keeps this handler self-contained.
    import asyncio

    payload = await _read_upload(audio)
    lang = _normalize_language(language)
    # Transcription is CPU-bound and synchronous. Running it directly in this
    # coroutine would block the event loop, so /health could not answer while
    # a request is being processed. Hand it to a worker thread instead.
    segments, info, elapsed_ms = await asyncio.to_thread(
        _transcribe_bytes, payload, lang, word_timestamps=False
    )

    out_segments = [
        {
            "startSeconds": float(segment.start or 0.0),
            "endSeconds": float(segment.end or 0.0),
            "text": segment.text.strip(),
        }
        for segment in segments
    ]
    return JSONResponse({
        "text": " ".join(s["text"] for s in out_segments if s["text"]).strip(),
        "segments": out_segments,
        "language": info.language or lang,
        "durationMs": int((info.duration or 0.0) * 1000),
        "elapsedMs": elapsed_ms,
    })

View File

@@ -0,0 +1,4 @@
# Runtime dependencies of the speech-align service (exact pins).
faster-whisper==1.0.3
# fastapi >= 0.115.3 requires Starlette >= 0.40.0, which bounds the size of
# multipart form parts (CVE-2024-47874). Both endpoints accept multipart uploads.
fastapi==0.115.6
uvicorn[standard]==0.30.6
# python-multipart >= 0.0.18 fixes a denial of service in multipart boundary
# parsing (CVE-2024-53981).
python-multipart==0.0.18