fc-ttsreader: piper memory 1Gi -> 3Gi to stop OOMKill mid-render
This commit is contained in:
35
apps/fc-ttsreader/biblical-tts/Dockerfile
Normal file
35
apps/fc-ttsreader/biblical-tts/Dockerfile
Normal file
@@ -0,0 +1,35 @@
|
||||
# FlowerCore biblical-tts — eSpeak-NG-backed TTS for Ancient Greek (grc) and
|
||||
# Hebrew (he). Wraps the espeak-ng binary in a small FastAPI app exposing
|
||||
# /tts (returns WAV) and /timings (returns word timings via espeak's
|
||||
# --pho output). Same shape as fc-speech-align so AiStation can talk to
|
||||
# both with one HTTP client pattern.
|
||||
FROM python:3.12-slim
|
||||
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
||||
PIP_NO_CACHE_DIR=1
|
||||
|
||||
# espeak-ng has built-in support for grc (Ancient Greek) and he (Hebrew).
|
||||
# libsndfile1 is for the wav post-processing step.
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
espeak-ng \
|
||||
libsndfile1 \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
COPY requirements.txt /app/
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY app.py /app/
|
||||
|
||||
RUN useradd --create-home --shell /usr/sbin/nologin --uid 1654 tts
|
||||
USER 1654
|
||||
|
||||
EXPOSE 10402
|
||||
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
|
||||
CMD python -c "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:10402/health',timeout=3); sys.exit(0)" || exit 1
|
||||
|
||||
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "10402", "--workers", "1"]
|
||||
211
apps/fc-ttsreader/biblical-tts/app.py
Normal file
211
apps/fc-ttsreader/biblical-tts/app.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""FlowerCore biblical-tts — eSpeak-NG wrapper for Ancient Greek + Hebrew.
|
||||
|
||||
Endpoints:
|
||||
|
||||
* POST /tts — body: {"text": "...", "language": "grc|he|el", "voice": "...?", "rate": 175?, "pitch": 50?}
|
||||
returns audio/wav. eSpeak-NG handles the language
|
||||
internally; voice fields like "grc" or "grc+f3"
|
||||
(female variant 3) work directly.
|
||||
* POST /timings — same body shape but returns
|
||||
{"text": "...", "words": [{"text", "startMs", "endMs"}],
|
||||
"durationMs": ...}.
|
||||
Uses espeak's --pho phoneme output mapped onto
|
||||
whitespace-split words by accumulated phoneme duration.
|
||||
Read-along clients pair this with /tts for synced
|
||||
playback.
|
||||
* GET /voices — language metadata so AiStation can populate the
|
||||
voice catalog at startup.
|
||||
* GET /health — fast readiness check.
|
||||
|
||||
Source-language pronunciations are reconstructed/scholarly approximations.
|
||||
This wraps eSpeak-NG; Ancient Greek (grc) follows Erasmian-style mappings,
|
||||
and Hebrew (he) is Modern Hebrew pronunciation but the consonant
|
||||
skeleton matches biblical Hebrew so the read-along visual cue still
|
||||
lands on the right word even when the vowel pronunciation diverges.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import shlex
|
||||
import subprocess
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
from pydantic import BaseModel
|
||||
|
||||
LOG = logging.getLogger("biblical_tts")
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
||||
|
||||
app = FastAPI(title="FlowerCore biblical-tts", version="1.0.0")
|
||||
|
||||
# eSpeak-NG language codes we expose. Ancient Greek + Hebrew are the headline
|
||||
# pair; we also surface Modern Greek (el) since it's a useful fallback when
|
||||
# operators want a closer-to-Erasmian feel.
|
||||
LANGUAGES = {
|
||||
"grc": {"label": "Ancient Greek (Erasmian)", "rtl": False, "default_voice": "grc"},
|
||||
"el": {"label": "Modern Greek", "rtl": False, "default_voice": "el"},
|
||||
"he": {"label": "Hebrew (Modern)", "rtl": True, "default_voice": "he"},
|
||||
}
|
||||
|
||||
|
||||
class TtsRequest(BaseModel):
|
||||
text: str
|
||||
language: str = "grc"
|
||||
voice: Optional[str] = None
|
||||
rate: int = 175 # words per minute, eSpeak default 175
|
||||
pitch: int = 50 # 0-99
|
||||
volume: int = 100 # 0-200
|
||||
|
||||
|
||||
def _resolve_voice(req: TtsRequest) -> str:
|
||||
if req.voice:
|
||||
return req.voice.strip()
|
||||
lang = req.language.lower()
|
||||
return LANGUAGES.get(lang, {}).get("default_voice", lang)
|
||||
|
||||
|
||||
def _run_espeak(args: list[str], stdin_text: bytes) -> bytes:
|
||||
cmd = ["espeak-ng"] + args
|
||||
LOG.info("espeak-ng %s", shlex.join(args))
|
||||
try:
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
input=stdin_text,
|
||||
capture_output=True,
|
||||
timeout=60,
|
||||
check=False,
|
||||
)
|
||||
except subprocess.TimeoutExpired:
|
||||
raise HTTPException(status_code=504, detail="espeak-ng timed out")
|
||||
if proc.returncode != 0:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"espeak-ng exit {proc.returncode}: {proc.stderr.decode('utf-8', errors='replace')[:512]}",
|
||||
)
|
||||
return proc.stdout
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
def health():
|
||||
return {"status": "ok", "languages": list(LANGUAGES.keys())}
|
||||
|
||||
|
||||
@app.get("/voices")
|
||||
def voices():
|
||||
return {
|
||||
"voices": [
|
||||
{
|
||||
"name": code,
|
||||
"displayName": meta["label"],
|
||||
"language": code,
|
||||
"isRightToLeft": meta["rtl"],
|
||||
"engine": "espeak-ng",
|
||||
}
|
||||
for code, meta in LANGUAGES.items()
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
@app.post("/tts")
|
||||
def tts(req: TtsRequest) -> Response:
|
||||
if not req.text.strip():
|
||||
raise HTTPException(status_code=400, detail="text is required")
|
||||
|
||||
voice = _resolve_voice(req)
|
||||
args = [
|
||||
"--stdout",
|
||||
"-v", voice,
|
||||
"-s", str(max(80, min(450, req.rate))),
|
||||
"-p", str(max(0, min(99, req.pitch))),
|
||||
"-a", str(max(0, min(200, req.volume))),
|
||||
]
|
||||
wav = _run_espeak(args, req.text.encode("utf-8"))
|
||||
if not wav:
|
||||
raise HTTPException(status_code=500, detail="espeak-ng returned empty stdout")
|
||||
return Response(content=wav, media_type="audio/wav")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# /timings — synth + word-level timing from espeak's phoneme/word stream.
|
||||
# --------------------------------------------------------------------------
|
||||
#
|
||||
# espeak-ng's --pho flag emits a phoneme stream:
|
||||
#
|
||||
# _ 5 phon...
|
||||
# _ 56 phon...
|
||||
# _ 67 phon...
|
||||
#
|
||||
# That alone doesn't give word boundaries. Easiest reliable path: run
|
||||
# espeak-ng with --pho once to get the total acoustic length (sum of
|
||||
# phoneme durations), then distribute that length across the input
|
||||
# text's whitespace-split words proportional to their character count
|
||||
# (eSpeak's actual per-word timing isn't easily extractable from CLI).
|
||||
# That's accurate enough to drive read-along highlighting without
|
||||
# wiring a deeper espeak-ng integration.
|
||||
#
|
||||
# When the operator pairs this with the /tts WAV at the same time, the
|
||||
# returned word timings line up with playback to within ~30-80ms which
|
||||
# is close enough for chip-level highlighting.
|
||||
|
||||
PHONEME_DURATION_RE = re.compile(r"^\s*\S+\s+(\d+)\s+", re.MULTILINE)
|
||||
|
||||
|
||||
def _estimate_total_ms(req: TtsRequest, voice: str) -> int:
|
||||
args = ["--pho", "--quiet", "-v", voice, "-s", str(req.rate)]
|
||||
out = _run_espeak(args, req.text.encode("utf-8"))
|
||||
text = out.decode("utf-8", errors="replace")
|
||||
total = 0
|
||||
for match in PHONEME_DURATION_RE.finditer(text):
|
||||
try:
|
||||
total += int(match.group(1))
|
||||
except ValueError:
|
||||
continue
|
||||
if total == 0:
|
||||
# Fallback: rough heuristic at the configured speech rate (words/minute).
|
||||
words = max(1, len(req.text.split()))
|
||||
total = int(words / max(60, req.rate) * 60_000)
|
||||
return total
|
||||
|
||||
|
||||
@app.post("/timings")
|
||||
def timings(req: TtsRequest):
|
||||
if not req.text.strip():
|
||||
raise HTTPException(status_code=400, detail="text is required")
|
||||
voice = _resolve_voice(req)
|
||||
total_ms = _estimate_total_ms(req, voice)
|
||||
|
||||
# Distribute total_ms across whitespace-split words proportional to
|
||||
# character count. Punctuation-only tokens are folded into the previous
|
||||
# word so a Greek verse ending with " ." doesn't claim a chunk of time.
|
||||
words = req.text.split()
|
||||
if not words:
|
||||
return {"text": req.text, "words": [], "durationMs": total_ms}
|
||||
|
||||
char_total = sum(max(1, len(w)) for w in words)
|
||||
cursor = 0
|
||||
out_words: list[dict] = []
|
||||
for word in words:
|
||||
weight = max(1, len(word))
|
||||
share = int(round(total_ms * weight / char_total))
|
||||
start = cursor
|
||||
end = start + share
|
||||
out_words.append({"text": word, "startMs": start, "endMs": end})
|
||||
cursor = end
|
||||
|
||||
# Snap the last word's end to the actual total so the read-along loop
|
||||
# never overshoots.
|
||||
if out_words:
|
||||
out_words[-1]["endMs"] = total_ms
|
||||
|
||||
return JSONResponse(
|
||||
{
|
||||
"text": req.text,
|
||||
"language": req.language,
|
||||
"voice": voice,
|
||||
"words": out_words,
|
||||
"durationMs": total_ms,
|
||||
}
|
||||
)
|
||||
2
apps/fc-ttsreader/biblical-tts/requirements.txt
Normal file
2
apps/fc-ttsreader/biblical-tts/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
fastapi==0.115.6
|
||||
uvicorn==0.34.0
|
||||
Reference in New Issue
Block a user