fc-ttsreader: piper memory 1Gi -> 3Gi to stop OOMKill mid-render

This commit is contained in:
Andrew Stoltz
2026-04-25 17:10:20 -05:00
parent 87050e72a9
commit dc39747f3f
4 changed files with 257 additions and 3 deletions

View File

@@ -0,0 +1,211 @@
"""FlowerCore biblical-tts — eSpeak-NG wrapper for Ancient Greek + Hebrew.
Endpoints:
* POST /tts — body: {"text": "...", "language": "grc|he|el", "voice": "...?", "rate": 175?, "pitch": 50?, "volume": 100?}
returns audio/wav. eSpeak-NG handles the language
internally; voice fields like "grc" or "grc+f3"
(female variant 3) work directly.
* POST /timings — same body shape but returns
{"text": "...", "words": [{"text", "startMs", "endMs"}],
"durationMs": ...}.
Uses espeak's --pho phoneme output to estimate the total
duration, then divides it across whitespace-split words in
proportion to each word's character count.
Read-along clients pair this with /tts for synced
playback.
* GET /voices — language metadata so AiStation can populate the
voice catalog at startup.
* GET /health — fast readiness check.
Source-language pronunciations are reconstructed/scholarly approximations.
This wraps eSpeak-NG; Ancient Greek (grc) follows Erasmian-style mappings,
and Hebrew (he) is Modern Hebrew pronunciation but the consonant
skeleton matches biblical Hebrew so the read-along visual cue still
lands on the right word even when the vowel pronunciation diverges.
"""
from __future__ import annotations
import io
import logging
import re
import shlex
import subprocess
from typing import Optional
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse, Response
from pydantic import BaseModel
# Module-level logger; used by the espeak-ng runner to record each invocation.
LOG = logging.getLogger("biblical_tts")
# Root logging is configured at import time, so importing this module has a
# process-wide side effect.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
# ASGI application object; the route decorators in this module register on it.
app = FastAPI(title="FlowerCore biblical-tts", version="1.0.0")
# eSpeak-NG language codes we expose. Ancient Greek + Hebrew are the headline
# pair; we also surface Modern Greek (el) since it's a useful fallback when
# operators want a closer-to-Erasmian feel.
#
# Each entry maps a language code to:
#   label          - human-readable name returned by GET /voices
#   rtl            - whether the script is right-to-left (returned by GET /voices)
#   default_voice  - voice name used when a request does not name a voice
LANGUAGES = {
    "grc": {"label": "Ancient Greek (Erasmian)", "rtl": False, "default_voice": "grc"},
    "el": {"label": "Modern Greek", "rtl": False, "default_voice": "el"},
    "he": {"label": "Hebrew (Modern)", "rtl": True, "default_voice": "he"},
}
class TtsRequest(BaseModel):
    """Request body accepted by both POST /tts and POST /timings."""

    # Text to synthesise. Both endpoints reject blank/whitespace-only text
    # with HTTP 400.
    text: str
    # Language code; used to pick a default voice from LANGUAGES when `voice`
    # is not supplied.
    language: str = "grc"
    # Optional explicit voice name (e.g. "grc" or a variant such as "grc+f3");
    # takes precedence over the language default.
    voice: Optional[str] = None
    # /tts clamps the three knobs below to the stated ranges before they are
    # handed to espeak-ng; out-of-range values are not rejected.
    rate: int = 175  # words per minute, eSpeak default 175 (/tts clamps to 80-450)
    pitch: int = 50  # 0-99
    volume: int = 100  # 0-200
def _resolve_voice(req: TtsRequest) -> str:
    """Return the eSpeak-NG voice name to use for *req*.

    An explicit, non-blank ``req.voice`` wins and is returned with
    surrounding whitespace removed. Otherwise the language code is trimmed,
    lower-cased and looked up in ``LANGUAGES`` for its ``default_voice``;
    a code that is not in the table is returned unchanged as the voice name.
    """
    # Strip before the truthiness test: a whitespace-only voice must fall
    # back to the language default instead of reaching espeak-ng as `-v ""`.
    voice = (req.voice or "").strip()
    if voice:
        return voice
    lang = req.language.strip().lower()
    return LANGUAGES.get(lang, {}).get("default_voice", lang)
def _run_espeak(args: list[str], stdin_text: bytes, *, timeout: float = 60) -> bytes:
    """Run ``espeak-ng`` with *args*, feed *stdin_text* on stdin, return stdout.

    Args:
        args: Command-line arguments placed after the ``espeak-ng`` program
            name. Passed as an argv list, so no shell is involved.
        stdin_text: Bytes written to the process's standard input.
        timeout: Seconds to wait for the process before giving up.

    Returns:
        The raw bytes the process wrote to standard output.

    Raises:
        HTTPException: 504 if the process exceeds *timeout*; 500 if the
            process cannot be started or exits with a non-zero status (the
            first 512 characters of stderr are included in the detail).
    """
    cmd = ["espeak-ng", *args]
    LOG.info("espeak-ng %s", shlex.join(args))
    try:
        proc = subprocess.run(
            cmd,
            input=stdin_text,
            capture_output=True,
            timeout=timeout,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(status_code=504, detail="espeak-ng timed out") from exc
    except OSError as exc:
        # Binary missing or not executable: report it as a clean HTTP error
        # rather than letting the raw OSError escape the handler.
        LOG.error("espeak-ng could not be started: %s", exc)
        raise HTTPException(
            status_code=500,
            detail=f"espeak-ng could not be started: {exc}",
        ) from exc
    if proc.returncode != 0:
        stderr_head = proc.stderr.decode("utf-8", errors="replace")[:512]
        raise HTTPException(
            status_code=500,
            detail=f"espeak-ng exit {proc.returncode}: {stderr_head}",
        )
    return proc.stdout
@app.get("/health")
def health():
    """Readiness probe: report an ``ok`` status plus the exposed language codes."""
    language_codes = [*LANGUAGES]
    return {"status": "ok", "languages": language_codes}
@app.get("/voices")
def voices():
    """List one voice-catalog entry per language in ``LANGUAGES``.

    Each entry uses the language code as both ``name`` and ``language`` and
    carries the display label, the right-to-left flag and the engine name.
    """
    catalog = []
    for code, meta in LANGUAGES.items():
        entry = {
            "name": code,
            "displayName": meta["label"],
            "language": code,
            "isRightToLeft": meta["rtl"],
            "engine": "espeak-ng",
        }
        catalog.append(entry)
    return {"voices": catalog}
@app.post("/tts")
def tts(req: TtsRequest) -> Response:
    """Synthesise ``req.text`` with espeak-ng and return it as ``audio/wav``.

    Rate, pitch and volume are clamped to 80-450, 0-99 and 0-200
    respectively before being passed on.

    Raises:
        HTTPException: 400 when the text is blank; 500 when espeak-ng
            produces no output (plus whatever the espeak-ng runner raises).
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="text is required")

    def clamp(value: int, low: int, high: int) -> int:
        return max(low, min(high, value))

    voice_name = _resolve_voice(req)
    speed = clamp(req.rate, 80, 450)
    pitch = clamp(req.pitch, 0, 99)
    amplitude = clamp(req.volume, 0, 200)

    espeak_args = ["--stdout", "-v", voice_name]
    espeak_args += ["-s", str(speed)]
    espeak_args += ["-p", str(pitch)]
    espeak_args += ["-a", str(amplitude)]

    audio = _run_espeak(espeak_args, req.text.encode("utf-8"))
    if audio:
        return Response(content=audio, media_type="audio/wav")
    raise HTTPException(status_code=500, detail="espeak-ng returned empty stdout")
# --------------------------------------------------------------------------
# /timings — synth + word-level timing from espeak's phoneme/word stream.
# --------------------------------------------------------------------------
#
# espeak-ng's --pho flag emits a phoneme stream:
#
# _ 5 phon...
# _ 56 phon...
# _ 67 phon...
#
# That alone doesn't give word boundaries. Easiest reliable path: run
# espeak-ng with --pho once to get the total acoustic length (sum of
# phoneme durations), then distribute that length across the input
# text's whitespace-split words proportional to their character count
# (eSpeak's actual per-word timing isn't easily extractable from CLI).
# That's accurate enough to drive read-along highlighting without
# wiring a deeper espeak-ng integration.
#
# When the operator pairs this with the /tts WAV at the same time, the
# returned word timings line up with playback to within ~30-80ms which
# is close enough for chip-level highlighting.
# One line of phoneme output: a phoneme token, then an integer duration, which
# is captured. Only spaces/tabs are allowed between the two so a match can
# never straddle a line break, and the lookahead accepts a final line that has
# no trailing newline.
PHONEME_DURATION_RE = re.compile(r"^[ \t]*\S+[ \t]+(\d+)(?=\s|$)", re.MULTILINE)


def _estimate_total_ms(req: TtsRequest, voice: str) -> int:
    """Estimate the spoken length of ``req.text`` in milliseconds.

    Runs espeak-ng in phoneme-output mode and sums the per-phoneme durations
    it prints. If no durations can be parsed from the output, falls back to
    a words-per-minute estimate.

    Args:
        req: The request; ``text`` and ``rate`` are read.
        voice: The eSpeak-NG voice name to pass to ``-v``.

    Returns:
        The estimated duration in milliseconds.
    """
    # Use the same 80-450 clamp that /tts applies, so the estimate describes
    # the audio /tts would actually render for this request.
    rate = max(80, min(450, req.rate))
    # -q suppresses audio output; espeak-ng documents only this short form
    # for the quiet switch.
    args = ["--pho", "-q", "-v", voice, "-s", str(rate)]
    pho_text = _run_espeak(args, req.text.encode("utf-8")).decode(
        "utf-8", errors="replace"
    )
    total = sum(int(m.group(1)) for m in PHONEME_DURATION_RE.finditer(pho_text))
    if total == 0:
        # Fallback: rough heuristic at the configured speech rate (words/minute).
        word_count = max(1, len(req.text.split()))
        total = int(word_count / rate * 60_000)
    return total
@app.post("/timings")
def timings(req: TtsRequest):
    """Return estimated per-word start/end times for ``req.text``.

    The estimated total duration is divided across the whitespace-split
    tokens of the text in proportion to each token's character count. Every
    token (including a punctuation-only one) gets its own entry.

    Returns:
        A JSON response with ``text``, ``language``, ``voice``, ``words``
        (a list of ``{"text", "startMs", "endMs"}``) and ``durationMs``.

    Raises:
        HTTPException: 400 when the text is blank.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="text is required")
    voice = _resolve_voice(req)
    total_ms = _estimate_total_ms(req, voice)

    # The blank-text guard above guarantees at least one token, and split()
    # never yields empty strings, so char_total is always positive.
    words = req.text.split()
    char_total = sum(len(w) for w in words)

    # Place each boundary at the rounded *cumulative* share of the total
    # rather than summing individually rounded shares. Boundaries are then
    # monotonic (a word never starts after it ends) and the last one lands
    # exactly on total_ms, so the read-along loop never overshoots.
    out_words: list[dict] = []
    chars_seen = 0
    start = 0
    for word in words:
        chars_seen += len(word)
        # Integer round-half-up of total_ms * chars_seen / char_total.
        end = (2 * total_ms * chars_seen + char_total) // (2 * char_total)
        out_words.append({"text": word, "startMs": start, "endMs": end})
        start = end

    return JSONResponse(
        {
            "text": req.text,
            "language": req.language,
            "voice": voice,
            "words": out_words,
            "durationMs": total_ms,
        }
    )