# FlowerCore biblical-tts — eSpeak-NG-backed TTS for Ancient Greek (grc) and
# Hebrew (he). Wraps the espeak-ng binary in a small FastAPI app exposing
# /tts (returns WAV) and /timings (returns estimated word timings as JSON).
# Same shape as fc-speech-align so AiStation can talk to both with one HTTP
# client pattern.
FROM python:3.12-slim

# Keep the image lean and container logs unbuffered; PIP_NO_CACHE_DIR already
# disables pip's download cache for every pip invocation in this image.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1

# espeak-ng is the synthesis engine app.py shells out to; the service exposes
# its grc (Ancient Greek), el (Modern Greek) and he (Hebrew) voices.
# NOTE(review): libsndfile1 was added "for the wav post-processing step", but
# app.py as shipped returns espeak-ng's WAV bytes untouched and
# requirements.txt pulls in no libsndfile binding — confirm it is still needed.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        espeak-ng \
        libsndfile1 \
        ca-certificates \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app
# Copy and install requirements before the application code so that editing
# app.py does not invalidate the dependency layer.
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py /app/

# Run as a dedicated unprivileged user with a fixed numeric UID.
RUN useradd --create-home --shell /usr/sbin/nologin --uid 1654 tts
USER 1654

# 10402 is the port uvicorn binds below and the one the health probe targets.
EXPOSE 10402
# Probe GET /health from inside the container using only the standard library.
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
    CMD python -c "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:10402/health',timeout=3); sys.exit(0)" || exit 1

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "10402", "--workers", "1"]
LOG = logging.getLogger("biblical_tts")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

app = FastAPI(title="FlowerCore biblical-tts", version="1.0.0")

# eSpeak-NG language codes we expose. Ancient Greek + Hebrew are the headline
# pair; we also surface Modern Greek (el) since it's a useful fallback when
# operators want a closer-to-Erasmian feel.
LANGUAGES = {
    "grc": {"label": "Ancient Greek (Erasmian)", "rtl": False, "default_voice": "grc"},
    "el": {"label": "Modern Greek", "rtl": False, "default_voice": "el"},
    "he": {"label": "Hebrew (Modern)", "rtl": True, "default_voice": "he"},
}


class TtsRequest(BaseModel):
    """Request body accepted by both POST /tts and POST /timings."""

    text: str
    language: str = "grc"
    # Explicit espeak-ng voice; when absent or blank the language's default
    # voice from LANGUAGES is used instead.
    voice: Optional[str] = None
    rate: int = 175  # words per minute, eSpeak default 175
    pitch: int = 50  # 0-99
    volume: int = 100  # 0-200


def _clamp(value: int, low: int, high: int) -> int:
    """Return *value* limited to the inclusive range [low, high]."""
    return max(low, min(high, value))


def _resolve_voice(req: TtsRequest) -> str:
    """Pick the espeak-ng voice name for *req*.

    An explicit, non-blank ``voice`` wins. Otherwise the request language is
    looked up in ``LANGUAGES``; unknown languages are passed through to
    espeak-ng unchanged so it can accept or reject them itself.
    """
    # Strip BEFORE the truthiness test: a whitespace-only voice used to pass
    # the check and then collapse to "", handing espeak-ng an empty -v value.
    explicit = (req.voice or "").strip()
    if explicit:
        return explicit
    lang = req.language.strip().lower()
    return LANGUAGES.get(lang, {}).get("default_voice", lang)


def _run_espeak(args: list[str], stdin_text: bytes) -> bytes:
    """Run ``espeak-ng`` with *args*, feeding *stdin_text* on stdin.

    Returns the process's raw stdout bytes.

    Raises:
        HTTPException: 504 when the process exceeds the 60 s timeout; 400 when
            an argument cannot be passed to a process at all (e.g. it embeds a
            NUL byte); 500 when the binary cannot be started or exits
            non-zero (the detail carries up to 512 characters of stderr).
    """
    cmd = ["espeak-ng", *args]
    LOG.info("espeak-ng %s", shlex.join(args))
    try:
        proc = subprocess.run(
            cmd,
            input=stdin_text,
            capture_output=True,
            timeout=60,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(status_code=504, detail="espeak-ng timed out") from exc
    except ValueError as exc:
        # subprocess rejects arguments it cannot hand to the OS; the voice
        # name is caller-supplied, so report this as a client error.
        raise HTTPException(status_code=400, detail=f"invalid espeak-ng argument: {exc}") from exc
    except OSError as exc:
        # Binary missing or not executable: surface a readable cause instead
        # of an unhandled exception.
        LOG.error("espeak-ng could not be started: %s", exc)
        raise HTTPException(status_code=500, detail=f"espeak-ng could not be started: {exc}") from exc

    if proc.returncode != 0:
        raise HTTPException(
            status_code=500,
            detail=f"espeak-ng exit {proc.returncode}: {proc.stderr.decode('utf-8', errors='replace')[:512]}",
        )
    return proc.stdout


@app.get("/health")
def health():
    """Fast readiness check; also lists the exposed language codes."""
    return {"status": "ok", "languages": list(LANGUAGES)}


@app.get("/voices")
def voices():
    """Describe one catalog entry per language in ``LANGUAGES``."""
    catalog = []
    for code, meta in LANGUAGES.items():
        catalog.append(
            {
                "name": code,
                "displayName": meta["label"],
                "language": code,
                "isRightToLeft": meta["rtl"],
                "engine": "espeak-ng",
            }
        )
    return {"voices": catalog}


@app.post("/tts")
def tts(req: TtsRequest) -> Response:
    """Synthesize ``req.text`` and return the WAV bytes as ``audio/wav``.

    Rate, pitch and volume are clamped to 80-450, 0-99 and 0-200 respectively
    before being handed to espeak-ng. The text travels on stdin, never on the
    command line.

    Raises:
        HTTPException: 400 for blank text; 500 when espeak-ng produces no
            output; plus anything ``_run_espeak`` raises.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="text is required")

    voice = _resolve_voice(req)
    args = [
        "--stdout",
        "-v", voice,
        "-s", str(_clamp(req.rate, 80, 450)),
        "-p", str(_clamp(req.pitch, 0, 99)),
        "-a", str(_clamp(req.volume, 0, 200)),
    ]
    wav = _run_espeak(args, req.text.encode("utf-8"))
    if not wav:
        raise HTTPException(status_code=500, detail="espeak-ng returned empty stdout")
    return Response(content=wav, media_type="audio/wav")
# --------------------------------------------------------------------------
# /timings — estimated word-level timing for read-along highlighting.
# --------------------------------------------------------------------------
#
# Approach: ask espeak-ng for its --pho phoneme listing, sum the integer in
# the second column of every line to get a total duration, then spread that
# total over the whitespace-split words of the input in proportion to their
# character counts. This is an estimate, not per-word timing from the engine.
#
# NOTE(review): --pho is documented as MBROLA phoneme output; confirm it emits
# durations for the plain grc/el/he voices. When it yields nothing, the
# words-per-minute fallback in _estimate_total_ms is what clients receive.

PHONEME_DURATION_RE = re.compile(r"^\s*\S+\s+(\d+)\s+", re.MULTILINE)


def _estimate_total_ms(req: TtsRequest, voice: str) -> int:
    """Estimate how long *req.text* takes to speak, in milliseconds.

    Sums the durations found in espeak-ng's ``--pho`` listing. If none are
    found, falls back to ``word_count / rate`` at the effective speech rate.
    """
    # Use the same 80-450 wpm clamp /tts applies; passing the raw rate made
    # the timings describe a different speed than the audio for out-of-range
    # requests.
    rate = max(80, min(450, req.rate))
    # -q is espeak-ng's documented "no audio" switch; the original used
    # --quiet, which does not appear among the documented long options.
    args = ["--pho", "-q", "-v", voice, "-s", str(rate)]
    listing = _run_espeak(args, req.text.encode("utf-8")).decode("utf-8", errors="replace")

    total = sum(int(hit.group(1)) for hit in PHONEME_DURATION_RE.finditer(listing))
    if total == 0:
        # Fallback: rough heuristic at the effective speech rate (words/minute).
        word_count = max(1, len(req.text.split()))
        total = int(word_count / rate * 60_000)
    return total


def _word_weight(token: str) -> int:
    """Return the timing weight of *token*: its length, or 0 if it is pure punctuation."""
    return len(token) if any(ch.isalnum() for ch in token) else 0


def _distribute_word_timings(words: list[str], total_ms: int) -> list[dict]:
    """Split *total_ms* across *words* in proportion to their weights.

    Boundaries are derived from the cumulative weight rather than from
    per-word rounded shares, so they never decrease, never exceed
    *total_ms*, and the final ``endMs`` equals *total_ms* exactly.
    """
    if not words:
        return []
    weights = [_word_weight(token) for token in words]
    if not any(weights):
        # Nothing but punctuation: fall back to plain length so the duration
        # is still spread across the tokens.
        weights = [max(1, len(token)) for token in words]
    weight_total = sum(weights)

    spans: list[dict] = []
    consumed = 0
    start = 0
    for token, weight in zip(words, weights):
        consumed += weight
        # Integer round-half-up of total_ms * consumed / weight_total.
        end = (total_ms * consumed + weight_total // 2) // weight_total
        spans.append({"text": token, "startMs": start, "endMs": end})
        start = end
    return spans


@app.post("/timings")
def timings(req: TtsRequest):
    """Return estimated start/end times for each whitespace-split word.

    The response always has one ``words`` entry per whitespace-separated
    token, in order. Punctuation-only tokens (e.g. a verse ending in " .")
    get a zero-length span so they do not claim a share of the duration.

    Raises:
        HTTPException: 400 for blank text; plus anything ``_run_espeak``
            raises.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="text is required")
    voice = _resolve_voice(req)
    total_ms = _estimate_total_ms(req, voice)

    return JSONResponse(
        {
            "text": req.text,
            "language": req.language,
            "voice": voice,
            "words": _distribute_word_timings(req.text.split(), total_ms),
            "durationMs": total_ms,
        }
    )