fc-ttsreader: piper memory 1Gi -> 3Gi to stop OOMKill mid-render

2026-04-25 17:10:20 -05:00
parent 87050e72a9
commit dc39747f3f
4 changed files with 257 additions and 3 deletions
--- a/apps/fc-ttsreader/biblical-tts/app.py
+++ b/apps/fc-ttsreader/biblical-tts/app.py
@@ -0,0 +1,211 @@
+"""FlowerCore biblical-tts — eSpeak-NG wrapper for Ancient Greek + Hebrew.
+
+Endpoints:
+
+* POST /tts          — body: {"text": "...", "language": "grc|he|el", "voice": "...?", "rate": 175?, "pitch": 50?, "volume": 100?}
+                       returns audio/wav. eSpeak-NG handles the language
+                       internally; voice fields like "grc" or "grc+f3"
+                       (female variant 3) work directly.
+* POST /timings      — same body shape but returns
+                       {"text": "...", "language": "...", "voice": "...",
+                        "words": [{"text", "startMs", "endMs"}],
+                        "durationMs": ...}.
+                       Uses the total duration summed from espeak's --pho
+                       phoneme output, split across whitespace-split words
+                       in proportion to their character count.
+                       Read-along clients pair this with /tts for synced
+                       playback.
+* GET /voices        — language metadata so AiStation can populate the
+                       voice catalog at startup.
+* GET /health        — fast readiness check.
+
+Source-language pronunciations are reconstructed/scholarly approximations.
+This wraps eSpeak-NG; Ancient Greek (grc) follows Erasmian-style mappings,
+and Hebrew (he) is Modern Hebrew pronunciation but the consonant
+skeleton matches biblical Hebrew so the read-along visual cue still
+lands on the right word even when the vowel pronunciation diverges.
+"""
+from __future__ import annotations
+
+import io
+import logging
+import re
+import shlex
+import subprocess
+from typing import Optional
+
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import JSONResponse, Response
+from pydantic import BaseModel
+
+# Module-level logger used by the espeak-ng wrapper below.
+LOG = logging.getLogger("biblical_tts")
+# Configure the root logger at import time: INFO level, timestamped lines.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+# The ASGI application object; the route decorators below register on it.
+app = FastAPI(title="FlowerCore biblical-tts", version="1.0.0")
+
+# eSpeak-NG language codes we expose. Ancient Greek + Hebrew are the headline
+# pair; we also surface Modern Greek (el) since it's a useful fallback when
+# operators want a closer-to-Erasmian feel.
+#
+# Each entry is keyed by the language code and carries:
+#   label         — human-readable name, returned as "displayName" by /voices
+#   rtl           — True for right-to-left scripts, returned as "isRightToLeft"
+#   default_voice — voice name used when a request does not specify one
+LANGUAGES = {
+    "grc": {"label": "Ancient Greek (Erasmian)", "rtl": False, "default_voice": "grc"},
+    "el":  {"label": "Modern Greek",             "rtl": False, "default_voice": "el"},
+    "he":  {"label": "Hebrew (Modern)",          "rtl": True,  "default_voice": "he"},
+}
+
+
+# Request body shared by POST /tts and POST /timings. Only `text` is
+# required; every other field has a default.
+class TtsRequest(BaseModel):
+    text: str             # text to synthesise; blank/whitespace-only is rejected with 400
+    language: str = "grc" # language code; selects the default voice when `voice` is unset
+    voice: Optional[str] = None  # explicit espeak-ng voice name, overrides `language`
+    rate: int = 175       # words per minute, eSpeak default 175 (/tts clamps to 80-450)
+    pitch: int = 50       # 0-99 (/tts clamps to this range)
+    volume: int = 100     # 0-200 (/tts clamps to this range)
+
+
+def _resolve_voice(req: TtsRequest) -> str:
+    """Pick the espeak-ng voice name for a request.
+
+    An explicit, non-blank ``req.voice`` wins. Otherwise the language code is
+    normalised (trimmed, lower-cased) and looked up in ``LANGUAGES``; unknown
+    codes are passed through unchanged so espeak-ng can try them as a voice
+    name itself.
+
+    Args:
+        req: The incoming request.
+
+    Returns:
+        The voice string to pass to ``espeak-ng -v``.
+    """
+    # Strip *before* testing truthiness: a whitespace-only voice must fall
+    # back to the language default rather than become an empty "-v" value.
+    voice = (req.voice or "").strip()
+    if voice:
+        return voice
+    lang = req.language.strip().lower()
+    return LANGUAGES.get(lang, {}).get("default_voice", lang)
+
+
+def _run_espeak(args: list[str], stdin_text: bytes) -> bytes:
+    """Run ``espeak-ng`` with *args*, feeding *stdin_text* on its stdin.
+
+    Args:
+        args: Command-line arguments placed after the ``espeak-ng`` binary.
+        stdin_text: Bytes written to the child's standard input.
+
+    Returns:
+        The raw bytes the process wrote to stdout.
+
+    Raises:
+        HTTPException: 504 if the process exceeds the 60 second timeout;
+            500 if the binary cannot be started or exits non-zero (the
+            detail then carries at most 512 characters of stderr).
+    """
+    cmd = ["espeak-ng", *args]
+    LOG.info("espeak-ng %s", shlex.join(args))
+    try:
+        proc = subprocess.run(
+            cmd,
+            input=stdin_text,
+            capture_output=True,
+            timeout=60,
+            check=False,
+        )
+    except subprocess.TimeoutExpired as exc:
+        raise HTTPException(status_code=504, detail="espeak-ng timed out") from exc
+    except OSError as exc:
+        # Binary missing or not executable: report it explicitly instead of
+        # letting the bare OSError surface as an opaque internal error.
+        LOG.error("espeak-ng could not be started: %s", exc)
+        raise HTTPException(
+            status_code=500,
+            detail=f"espeak-ng could not be started: {exc}",
+        ) from exc
+    if proc.returncode != 0:
+        stderr_text = proc.stderr.decode("utf-8", errors="replace")[:512]
+        raise HTTPException(
+            status_code=500,
+            detail=f"espeak-ng exit {proc.returncode}: {stderr_text}",
+        )
+    return proc.stdout
+
+
+@app.get("/health")
+def health():
+    """Readiness probe: report an OK status plus the exposed language codes."""
+    language_codes = [code for code in LANGUAGES]
+    return {"status": "ok", "languages": language_codes}
+
+
+@app.get("/voices")
+def voices():
+    """Return one catalog entry per configured language.
+
+    Each entry uses the language code as both voice name and language, and
+    copies the label and right-to-left flag from ``LANGUAGES``.
+    """
+    catalog = []
+    for lang_code, info in LANGUAGES.items():
+        entry = {
+            "name": lang_code,
+            "displayName": info["label"],
+            "language": lang_code,
+            "isRightToLeft": info["rtl"],
+            "engine": "espeak-ng",
+        }
+        catalog.append(entry)
+    return {"voices": catalog}
+
+
+@app.post("/tts")
+def tts(req: TtsRequest) -> Response:
+    """Synthesise ``req.text`` with espeak-ng and return it as ``audio/wav``.
+
+    Raises:
+        HTTPException: 400 when the text is blank; 500 when espeak-ng
+            produces no output (failures inside the espeak-ng call
+            propagate from ``_run_espeak``).
+    """
+    if req.text.strip() == "":
+        raise HTTPException(status_code=400, detail="text is required")
+
+    voice = _resolve_voice(req)
+
+    # Clamp the numeric knobs before they reach the command line.
+    speed = min(450, max(80, req.rate))
+    pitch = min(99, max(0, req.pitch))
+    amplitude = min(200, max(0, req.volume))
+
+    args = ["--stdout", "-v", voice]
+    args += ["-s", str(speed)]
+    args += ["-p", str(pitch)]
+    args += ["-a", str(amplitude)]
+
+    payload = req.text.encode("utf-8")
+    wav = _run_espeak(args, payload)
+    if len(wav) == 0:
+        raise HTTPException(status_code=500, detail="espeak-ng returned empty stdout")
+    return Response(content=wav, media_type="audio/wav")
+
+
+# --------------------------------------------------------------------------
+#  /timings — synth + word-level timing from espeak's phoneme/word stream.
+# --------------------------------------------------------------------------
+#
+# espeak-ng's --pho flag emits a phoneme stream:
+#
+#   _ 5 phon...
+#   _ 56 phon...
+#   _ 67 phon...
+#
+# That alone doesn't give word boundaries. Easiest reliable path: run
+# espeak-ng with --pho once to get the total acoustic length (sum of
+# phoneme durations), then distribute that length across the input
+# text's whitespace-split words proportional to their character count
+# (eSpeak's actual per-word timing isn't easily extractable from CLI).
+# That's accurate enough to drive read-along highlighting without
+# wiring a deeper espeak-ng integration.
+#
+# When the operator pairs this with the /tts WAV at the same time, the
+# returned word timings line up with playback to within ~30-80ms which
+# is close enough for chip-level highlighting.
+
+# Matches one line of the phoneme stream: a non-whitespace first token,
+# whitespace, then an integer (captured as group 1) that must itself be
+# followed by whitespace. MULTILINE makes ``^`` anchor at every line start.
+PHONEME_DURATION_RE = re.compile(r"^\s*\S+\s+(\d+)\s+", re.MULTILINE)
+
+
+def _estimate_total_ms(req: TtsRequest, voice: str) -> int:
+    """Estimate the spoken duration of ``req.text`` in milliseconds.
+
+    Runs espeak-ng in phoneme-output mode and sums every duration captured
+    by ``PHONEME_DURATION_RE``. If nothing is captured, falls back to a
+    words-per-minute heuristic based on the speech rate.
+
+    Args:
+        req: The incoming request (text and rate are used).
+        voice: Voice name already resolved for this request.
+
+    Returns:
+        Estimated total duration in milliseconds (always > 0).
+
+    Raises:
+        HTTPException: Propagated from ``_run_espeak`` on process failure.
+    """
+    # Clamp to the same 80-450 range /tts applies, so the estimate describes
+    # the audio that endpoint actually renders for out-of-range rates.
+    rate = max(80, min(450, req.rate))
+    # NOTE(review): espeak-ng documents "-q" for quiet mode; confirm the
+    # deployed build accepts the long "--quiet" spelling used here.
+    args = ["--pho", "--quiet", "-v", voice, "-s", str(rate)]
+    out = _run_espeak(args, req.text.encode("utf-8"))
+    text = out.decode("utf-8", errors="replace")
+    # Group 1 is always \d+, so int() cannot fail here.
+    total = sum(int(match.group(1)) for match in PHONEME_DURATION_RE.finditer(text))
+    if total == 0:
+        # Fallback: rough heuristic at the configured speech rate (words/minute).
+        words = max(1, len(req.text.split()))
+        total = int(words / rate * 60_000)
+    return total
+
+
+@app.post("/timings")
+def timings(req: TtsRequest):
+    """Return approximate per-word timings for ``req.text``.
+
+    The estimated total duration is split across the whitespace-separated
+    tokens of the text in proportion to each token's character count.
+
+    Returns:
+        A JSON response with ``text``, ``language``, ``voice``, ``words``
+        (a list of ``{"text", "startMs", "endMs"}``) and ``durationMs``.
+        Word spans are contiguous, non-decreasing, start at 0 and end
+        exactly at ``durationMs``.
+
+    Raises:
+        HTTPException: 400 when the text is blank; errors from the
+            espeak-ng call propagate unchanged.
+    """
+    if not req.text.strip():
+        raise HTTPException(status_code=400, detail="text is required")
+    voice = _resolve_voice(req)
+    total_ms = _estimate_total_ms(req, voice)
+
+    # Every whitespace-split token, including punctuation-only ones, is
+    # weighted by its length; no folding of punctuation is performed.
+    words = req.text.split()
+    weights = [max(1, len(word)) for word in words]
+    char_total = sum(weights)
+
+    # Derive each boundary from the *cumulative* weight instead of summing
+    # individually rounded shares. Rounding error then never accumulates:
+    # boundaries are monotonic, none exceeds total_ms, and the final one is
+    # exactly total_ms, so no word can end before it starts.
+    out_words: list[dict] = []
+    start = 0
+    consumed = 0
+    for word, weight in zip(words, weights):
+        consumed += weight
+        end = round(total_ms * consumed / char_total)
+        out_words.append({"text": word, "startMs": start, "endMs": end})
+        start = end
+
+    return JSONResponse(
+        {
+            "text": req.text,
+            "language": req.language,
+            "voice": voice,
+            "words": out_words,
+            "durationMs": total_ms,
+        }
+    )