# bluejay-infra/apps/fc-ttsreader/biblical-tts/app.py

"""FlowerCore biblical-tts — eSpeak-NG wrapper for Ancient Greek + Hebrew.

Endpoints:

* POST /tts          — body: {"text": "...", "language": "grc|he|el", "voice": "...?", "rate": 175?, "pitch": 50?, "volume": 100?}
                       returns audio/wav. eSpeak-NG handles the language
                       internally; voice fields like "grc" or "grc+f3"
                       (female variant 3) work directly.
* POST /timings      — same body shape but returns
                       {"text": "...", "words": [{"text", "startMs", "endMs"}],
                        "durationMs": ...}.
                       Uses espeak's --pho phoneme output to estimate the total
                       duration, then spreads that duration across the
                       whitespace-split words in proportion to their
                       character counts.
                       Read-along clients pair this with /tts for synced
                       playback.
* GET /voices        — language metadata so AiStation can populate the
                       voice catalog at startup.
* GET /health        — fast readiness check.

Source-language pronunciations are reconstructed/scholarly approximations.
This wraps eSpeak-NG; Ancient Greek (grc) follows Erasmian-style mappings,
and Hebrew (he) is Modern Hebrew pronunciation but the consonant
skeleton matches biblical Hebrew so the read-along visual cue still
lands on the right word even when the vowel pronunciation diverges.
When a "he" request contains Hebrew-script text, that text is first
transliterated to Latin letters and spoken with the en-us voice.
"""
from __future__ import annotations

import io
import logging
import re
import shlex
import subprocess
import unicodedata
from typing import Optional

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse, Response
from pydantic import BaseModel

# Service logger; _run_espeak logs every espeak-ng invocation through it.
LOG = logging.getLogger("biblical_tts")
# Configure root logging at import time: INFO level, timestamped lines.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# FastAPI application object; the route decorators below register on it.
app = FastAPI(title="FlowerCore biblical-tts", version="1.0.0")

# eSpeak-NG language codes we expose. Ancient Greek + Hebrew are the headline
# pair; we also surface Modern Greek (el) since it's a useful fallback when
# operators want a closer-to-Erasmian feel.
#
# Per-language metadata:
#   label         -> reported as "displayName" by GET /voices
#   rtl           -> reported as "isRightToLeft" by GET /voices
#   default_voice -> voice used when a request does not name one
LANGUAGES = {
    "grc": {"label": "Ancient Greek (Erasmian)", "rtl": False, "default_voice": "grc"},
    "el":  {"label": "Modern Greek",             "rtl": False, "default_voice": "el"},
    "he":  {"label": "Hebrew (Modern)",          "rtl": True,  "default_voice": "he"},
}


class TtsRequest(BaseModel):
    """Request body shared by POST /tts and POST /timings."""
    # Text to speak; both endpoints answer HTTP 400 when it is blank.
    text: str
    # Language code; lower-cased and looked up in LANGUAGES for a default voice.
    language: str = "grc"
    # Optional explicit eSpeak-NG voice; takes precedence over the language default.
    voice: Optional[str] = None
    # /tts clamps rate to 80-450, pitch to 0-99 and volume to 0-200 before
    # handing them to espeak-ng.
    rate: int = 175       # words per minute, eSpeak default 175
    pitch: int = 50       # 0-99
    volume: int = 100     # 0-200


# Both patterns cover the code point range U+0590-U+05FF.
# HEBREW_CHAR_RE answers "does this text contain any character in that range?";
# HEBREW_WORD_RE matches each maximal run of such characters so the run can be
# replaced by its transliteration.
HEBREW_CHAR_RE = re.compile(r"[\u0590-\u05FF]")
HEBREW_WORD_RE = re.compile(r"[\u0590-\u05FF]+")

# eSpeak-NG's Hebrew voice can spell unpointed Hebrew as Unicode character
# names on some builds. For source-text study reads, prefer a stable
# scholarly transliteration so words sound like words even without niqqud.
#
# Keys are unpointed spellings: lookups happen after _strip_hebrew_marks has
# removed combining marks, so pointed and unpointed input hit the same entry.
# Values are the Latin-letter strings handed to the synthesizer.
HEBREW_WORD_TRANSLITERATIONS = {
    "אב": "av",
    "אבא": "abba",
    "אברהם": "Avraham",
    "אדמה": "adamah",
    "אדני": "Adonai",
    "אדם": "adam",
    "אור": "or",
    "אלהים": "Elohim",
    "אלוהים": "Elohim",
    "אמן": "amen",
    "אם": "em",
    "אמת": "emet",
    "ארץ": "eretz",
    "אש": "esh",
    "את": "et",
    "בית": "beit",
    "בן": "ben",
    "ברא": "bara",
    "בראשית": "bereshit",
    "ברית": "berit",
    "ברוך": "barukh",
    "בת": "bat",
    "גוי": "goy",
    "גוים": "goyim",
    "גויים": "goyim",
    "דבר": "davar",
    "דברים": "devarim",
    "דוד": "David",
    "הלל": "hallel",
    "הארץ": "ha-aretz",
    "הברית": "ha-berit",
    "החדשה": "ha-chadashah",
    "השמים": "ha-shamayim",
    "השמיים": "ha-shamayim",
    "ויאמר": "vayomer",
    "יהוה": "Adonai",
    "יוסף": "Yosef",
    "יוחנן": "Yochanan",
    "ישראל": "Yisrael",
    "ישוע": "Yeshua",
    "יצחק": "Yitzchak",
    "יעקב": "Yaakov",
    "ירושלים": "Yerushalayim",
    "כהן": "kohen",
    "כהנים": "kohanim",
    "מים": "mayim",
    "מות": "mavet",
    "מושיע": "moshia",
    "מלך": "melekh",
    "מלכות": "malkhut",
    "מרים": "Miriam",
    "משה": "Moshe",
    "משיח": "Mashiach",
    "נביא": "navi",
    "נביאים": "neviim",
    "עם": "am",
    "עולם": "olam",
    "צדק": "tzedek",
    "קדוש": "qadosh",
    "קדושים": "qedoshim",
    "קול": "qol",
    "רוח": "ruach",
    "שאול": "Shaul",
    "שמים": "shamayim",
    "שמיים": "shamayim",
    "שמעון": "Shimon",
    "שלום": "Shalom",
    "תורה": "torah",
    "חכמה": "chokhmah",
    "חסד": "chesed",
    "חיים": "chayim",
    "חושך": "choshekh",
}

# Per-letter Latin values used by _fallback_hebrew_transliteration for words
# that are not in HEBREW_WORD_TRANSLITERATIONS. Final letter forms are listed
# as separate keys. Characters missing from this table are skipped by the
# fallback.
HEBREW_LETTERS = {
    "א": "a",
    "ב": "b",
    "ג": "g",
    "ד": "d",
    "ה": "h",
    "ו": "v",
    "ז": "z",
    "ח": "kh",
    "ט": "t",
    "י": "y",
    "כ": "kh",
    "ך": "kh",
    "ל": "l",
    "מ": "m",
    "ם": "m",
    "נ": "n",
    "ן": "n",
    "ס": "s",
    "ע": "a",
    "פ": "p",
    "ף": "f",
    "צ": "ts",
    "ץ": "ts",
    "ק": "q",
    "ר": "r",
    "ש": "sh",
    "ת": "t",
}

# Latin vowel letters; the fallback transliteration inserts an "a" between two
# adjacent tokens when neither side of the seam is one of these.
HEBREW_VOWELISH = {"a", "e", "i", "o", "u"}


# Hebrew punctuation that can be attached to a word inside a U+0590-U+05FF run.
_HEBREW_PUNCTUATION = frozenset(
    {
        "\u05be",  # maqaf
        "\u05c0",  # paseq
        "\u05c3",  # sof pasuq
        "\u05c6",  # nun hafukha
        "\u05f3",  # geresh
        "\u05f4",  # gershayim
    }
)


def _strip_hebrew_marks(value: str) -> str:
    """Return ``value`` reduced to its bare letters.

    The text is NFD-decomposed, then every non-spacing mark (Unicode category
    ``Mn``: vowel points, cantillation, dagesh, ...) and every Hebrew
    punctuation sign is dropped. Removing sof pasuq / paseq as well as
    geresh / gershayim / maqaf matters because those signs sit inside the
    same code point range as the letters, so a verse-final word arrives here
    with its sof pasuq still attached and would otherwise miss the word table.

    Args:
        value: A run of Hebrew-block characters (any string is accepted).

    Returns:
        The remaining characters, in order; ``""`` when nothing is left.
    """
    decomposed = unicodedata.normalize("NFD", value)
    return "".join(
        ch
        for ch in decomposed
        if ch not in _HEBREW_PUNCTUATION and unicodedata.category(ch) != "Mn"
    )


def _fallback_hebrew_transliteration(word: str) -> str:
    """Build a rough Latin-letter reading for a word missing from the word table.

    Each letter is mapped through ``HEBREW_LETTERS`` with three positional
    overrides: a word-final he becomes "ah", and a non-initial yod / vav is
    read as the vowel "i" / "o". An "a" is then inserted wherever two
    neighbouring tokens would otherwise put consonant against consonant.

    Positions are counted over the mappable letters only, so a stray
    character (for example punctuation still attached to the word) cannot
    stop the last letter from being treated as final or the first as initial.

    Args:
        word: Hebrew word, expected to be free of vowel points already.

    Returns:
        The transliteration, or ``word`` unchanged when it contains no letter
        known to ``HEBREW_LETTERS``.
    """
    letters = [ch for ch in word if ch in HEBREW_LETTERS]
    if not letters:
        return word

    last_index = len(letters) - 1
    tokens: list[str] = []
    for index, ch in enumerate(letters):
        if ch == "\u05d4" and index == last_index:  # final he
            token = "ah"
        elif ch == "\u05d9" and index > 0:  # yod after the first letter
            token = "i"
        elif ch == "\u05d5" and index > 0:  # vav after the first letter
            token = "o"
        else:
            token = HEBREW_LETTERS[ch]
        tokens.append(token)

    spoken: list[str] = []
    for index, token in enumerate(tokens):
        spoken.append(token)
        next_token = tokens[index + 1] if index < last_index else ""
        # Break up consonant clusters so the synthesizer gets a pronounceable
        # syllable instead of a letter-by-letter spelling.
        if (
            next_token
            and token[-1:] not in HEBREW_VOWELISH
            and next_token[:1] not in HEBREW_VOWELISH
        ):
            spoken.append("a")
    return "".join(spoken)


def _lookup_hebrew_word(normalized: str) -> str:
    """Transliterate one mark-free Hebrew word: table, prefixed table, fallback."""
    direct = HEBREW_WORD_TRANSLITERATIONS.get(normalized)
    if direct:
        return direct

    if len(normalized) > 1:
        # A leading vav ("and") or he ("the") in front of a known word.
        for prefix_letter, prefix_spoken in (("\u05d5", "ve"), ("\u05d4", "ha")):
            if normalized.startswith(prefix_letter):
                rest = HEBREW_WORD_TRANSLITERATIONS.get(normalized[1:])
                if rest:
                    return f"{prefix_spoken}-{rest}"

    return _fallback_hebrew_transliteration(normalized)


def _transliterate_hebrew_word(match: re.Match[str]) -> str:
    """Replacement callback for ``HEBREW_WORD_RE.sub``.

    The matched run may contain several words joined by maqaf (U+05BE),
    because maqaf lies inside the matched code point range. Each maqaf-separated
    segment is stripped of marks and transliterated on its own, and the
    results are re-joined with "-". Transliterating the merged run instead
    would miss the word table for every joined pair.

    Args:
        match: A match produced by ``HEBREW_WORD_RE``.

    Returns:
        The Latin-letter reading, or the matched text unchanged when nothing
        remains after stripping marks.
    """
    original = match.group(0)
    spoken_parts = []
    for segment in original.split("\u05be"):
        normalized = _strip_hebrew_marks(segment)
        if normalized:
            spoken_parts.append(_lookup_hebrew_word(normalized))

    if not spoken_parts:
        return original
    return "-".join(spoken_parts)


def _prepare_synthesis_input(text: str, language: str, voice: str) -> tuple[str, str]:
    """Decide what text and which voice are actually sent to the synthesizer.

    For a language code starting with "he" whose text contains Hebrew-block
    characters, every Hebrew run is replaced by its transliteration and the
    voice is switched to "en-us". In all other cases the inputs are returned
    untouched.

    Returns:
        ``(text_to_speak, voice_to_use)``.
    """
    wants_hebrew = language.lower().startswith("he")
    if not (wants_hebrew and HEBREW_CHAR_RE.search(text)):
        return text, voice

    transliterated = HEBREW_WORD_RE.sub(_transliterate_hebrew_word, text)
    return transliterated, "en-us"


def _resolve_voice(req: TtsRequest) -> str:
    """Pick the eSpeak-NG voice name for a request.

    An explicit ``req.voice`` wins once surrounding whitespace is removed. A
    missing, empty or whitespace-only voice falls back to the language's
    ``default_voice`` from ``LANGUAGES``, or to the lower-cased language code
    itself when the language is not listed. Checking after stripping keeps a
    blank voice from being passed to espeak-ng as an empty voice name.

    Args:
        req: The incoming request; only ``voice`` and ``language`` are read.

    Returns:
        A non-empty voice name whenever ``req.language`` is non-empty.
    """
    requested = (req.voice or "").strip()
    if requested:
        return requested
    lang = req.language.lower()
    return LANGUAGES.get(lang, {}).get("default_voice", lang)


def _run_espeak(args: list[str], stdin_text: bytes) -> bytes:
    """Run ``espeak-ng`` with ``args``, feeding ``stdin_text`` on stdin.

    The command is executed without a shell, with a 60 second timeout, and
    its arguments are logged at INFO level.

    Args:
        args: Command-line arguments placed after the ``espeak-ng`` program name.
        stdin_text: Bytes written to the process's standard input.

    Returns:
        Everything the process wrote to standard output.

    Raises:
        HTTPException: 504 when the process exceeds the timeout; 500 when it
            cannot be started (for example the binary is not installed) or
            exits with a non-zero status. The non-zero case includes the
            first 512 characters of stderr in the detail.
    """
    cmd = ["espeak-ng", *args]
    LOG.info("espeak-ng %s", shlex.join(args))
    try:
        proc = subprocess.run(
            cmd,
            input=stdin_text,
            capture_output=True,
            timeout=60,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(status_code=504, detail="espeak-ng timed out") from exc
    except OSError as exc:
        # Missing or non-executable binary: report it as a clean HTTP error
        # instead of letting the raw OSError escape the request handler.
        LOG.error("espeak-ng could not be started: %s", exc)
        raise HTTPException(
            status_code=500,
            detail=f"espeak-ng could not be started: {exc}",
        ) from exc

    if proc.returncode != 0:
        stderr_text = proc.stderr.decode("utf-8", errors="replace")[:512]
        raise HTTPException(
            status_code=500,
            detail=f"espeak-ng exit {proc.returncode}: {stderr_text}",
        )
    return proc.stdout


@app.get("/health")
def health():
    """Readiness probe: report "ok" together with the exposed language codes."""
    language_codes = list(LANGUAGES)
    return {"status": "ok", "languages": language_codes}


@app.get("/voices")
def voices():
    """List one catalog entry per language in ``LANGUAGES``."""
    catalog = []
    for code, meta in LANGUAGES.items():
        entry = {
            "name": code,
            "displayName": meta["label"],
            "language": code,
            "isRightToLeft": meta["rtl"],
            "engine": "espeak-ng",
        }
        catalog.append(entry)
    return {"voices": catalog}


@app.post("/tts")
def tts(req: TtsRequest) -> Response:
    """Synthesize ``req.text`` and return the audio as ``audio/wav``.

    Rate, pitch and volume are clamped to 80-450, 0-99 and 0-200 before being
    passed to espeak-ng.

    Raises:
        HTTPException: 400 for blank text; 500 when espeak-ng produces no
            output; plus whatever ``_run_espeak`` raises.
    """
    if req.text.strip() == "":
        raise HTTPException(status_code=400, detail="text is required")

    requested_voice = _resolve_voice(req)
    spoken_text, synth_voice = _prepare_synthesis_input(
        req.text, req.language, requested_voice
    )

    rate = min(max(req.rate, 80), 450)
    pitch = min(max(req.pitch, 0), 99)
    volume = min(max(req.volume, 0), 200)
    espeak_args = ["--stdout", "-v", synth_voice]
    espeak_args += ["-s", str(rate)]
    espeak_args += ["-p", str(pitch)]
    espeak_args += ["-a", str(volume)]

    audio = _run_espeak(espeak_args, spoken_text.encode("utf-8"))
    if audio:
        return Response(content=audio, media_type="audio/wav")
    raise HTTPException(status_code=500, detail="espeak-ng returned empty stdout")


# --------------------------------------------------------------------------
#  /timings — synth + word-level timing from espeak's phoneme/word stream.
# --------------------------------------------------------------------------
#
# espeak-ng's --pho flag emits a phoneme stream:
#
#   _ 5 phon...
#   _ 56 phon...
#   _ 67 phon...
#
# That alone doesn't give word boundaries. Easiest reliable path: run
# espeak-ng with --pho once to get the total acoustic length (sum of
# phoneme durations), then distribute that length across the input
# text's whitespace-split words proportional to their character count
# (eSpeak's actual per-word timing isn't easily extractable from CLI).
# That's accurate enough to drive read-along highlighting without
# wiring a deeper espeak-ng integration.
#
# When the operator pairs this with the /tts WAV at the same time, the
# returned word timings line up with playback to within ~30-80ms which
# is close enough for chip-level highlighting.

# Captures the integer in the second whitespace-separated column of a line
# (the duration column in the phoneme listing sketched above). MULTILINE lets
# "^" anchor at the start of every line of the output.
PHONEME_DURATION_RE = re.compile(r"^\s*\S+\s+(\d+)\s+", re.MULTILINE)


def _estimate_total_ms(req: TtsRequest, voice: str, spoken_text: str) -> int:
    """Estimate how long the synthesized audio for ``spoken_text`` lasts.

    Runs espeak-ng in ``--pho`` mode and sums the duration column of its
    output. When that yields nothing, falls back to a words-per-minute
    estimate based on the whitespace-split word count of ``req.text``.

    The speech rate is clamped to 80-450 exactly as /tts does, so the
    estimate describes the same speed as the audio the client will play, even
    when the request carries an out-of-range rate.

    Args:
        req: Original request; ``rate`` and ``text`` are read.
        voice: Voice name passed to espeak-ng.
        spoken_text: Text actually sent to the synthesizer.

    Returns:
        Estimated duration in milliseconds.
    """
    rate = max(80, min(450, req.rate))
    # NOTE(review): espeak-ng's documented quiet switch is "-q"; confirm that
    # the deployed build also accepts the long "--quiet" spelling used here.
    args = ["--pho", "--quiet", "-v", voice, "-s", str(rate)]
    out = _run_espeak(args, spoken_text.encode("utf-8"))
    listing = out.decode("utf-8", errors="replace")

    total = sum(int(match.group(1)) for match in PHONEME_DURATION_RE.finditer(listing))
    if total == 0:
        # Fallback: rough heuristic at the effective speech rate (words/minute).
        words = max(1, len(req.text.split()))
        total = int(words / rate * 60_000)
    return total


def _spread_word_timings(words: list[str], total_ms: int) -> list[dict]:
    """Split ``total_ms`` across ``words`` in proportion to their lengths.

    Each word boundary is computed from the cumulative weight, not by adding
    up individually rounded shares. That keeps the spans contiguous and
    non-decreasing, never past ``total_ms``, and makes the last span end
    exactly at ``total_ms`` without a separate correction step.
    """
    weights = [max(1, len(word)) for word in words]
    weight_total = sum(weights)

    spans: list[dict] = []
    start = 0
    consumed = 0
    for word, weight in zip(words, weights):
        consumed += weight
        end = round(total_ms * consumed / weight_total)
        spans.append({"text": word, "startMs": start, "endMs": end})
        start = end
    return spans


@app.post("/timings")
def timings(req: TtsRequest):
    """Return estimated per-word start/end times for ``req.text``.

    The total duration comes from ``_estimate_total_ms``; it is divided among
    the whitespace-split tokens of the original (untransliterated) text in
    proportion to their character counts. Every token, including a
    punctuation-only one, counts with a weight of at least 1.

    Returns:
        JSON with ``text``, ``language``, ``voice`` (the voice actually used
        for synthesis), ``words`` (``text`` / ``startMs`` / ``endMs`` per
        token) and ``durationMs``.

    Raises:
        HTTPException: 400 for blank text, plus whatever ``_run_espeak`` raises.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="text is required")
    voice = _resolve_voice(req)
    spoken_text, synth_voice = _prepare_synthesis_input(req.text, req.language, voice)
    total_ms = _estimate_total_ms(req, synth_voice, spoken_text)

    # Non-blank text always splits into at least one token, so there is no
    # empty-words case to handle here.
    out_words = _spread_word_timings(req.text.split(), total_ms)

    return JSONResponse(
        {
            "text": req.text,
            "language": req.language,
            "voice": synth_voice,
            "words": out_words,
            "durationMs": total_ms,
        }
    )