Adds a fourth TTS engine alongside Piper / Kokoro / biblical-tts: a small FastAPI bridge to Microsoft Edge's Read Aloud TTS via the edge-tts Python package. Provides studio-quality Modern Hebrew (he-IL) and Modern Greek (el-GR) narrators for the cluster. modern-tts/Dockerfile + app.py: - Python 3.12 base + edge-tts==7.2.8 (older versions hit 403 from MS). - POST /tts -> MP3 audio (audio/mpeg). - POST /timings -> word-level timings. Edge sometimes omits WordBoundary events for non-English voices; fall back to MP3-frame-walking duration estimate + proportional distribution across whitespace-split words (same approach biblical-tts uses for eSpeak). - GET /voices?language=all|default — filtered to he-/el- by default so the AiStation voice picker isn't overwhelmed by 400+ voices. - GET /health for probes. - Body shape mirrors BiblicalTtsRequest so the .NET client lives in the same FlowerCore.Shared.Speech package. K8s deployment in fc-ttsreader namespace: - ttsreader-modern Deployment + Service on port 10403. - localhost/fc-modern-tts:v1, imagePullPolicy: Never (built on noc1, imported to all 3 RKE2 nodes via ctr). - runAsNonRoot uid 1654 + fsGroup 1654. - dnsPolicy: None to bypass the *.iamworkin.lan template hijack on Microsoft endpoint lookups. - Modest resources (100m/128Mi req, 1000m/512Mi limit) — edge-tts is network-bound, not compute-bound. - Probes against /health. Verified live locally: container handles 'Καλημέρα Ελλάδα Πώς είστε' in 2496ms, returns el-GR-NestorasNeural voice + 4 word timings. Hebrew: 'בְּרֵאשִׁית בָּרָא אֱלֹהִים' returns he-IL-AvriNeural, 2472ms, 3 words. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
239 lines
9.0 KiB
Python
239 lines
9.0 KiB
Python
"""FlowerCore modern-tts — Microsoft Edge Read Aloud bridge for Modern
|
|
Hebrew and Modern Greek (and other Edge-supported languages).
|
|
|
|
Endpoints:
|
|
|
|
  * POST /tts     — body: {"text", "voice"?, "language"?, "rate"?, "volume"?, "pitch"?}
|
|
returns audio/mpeg (Edge returns MP3) which the
|
|
upstream FasterWhisperAlignmentClient + the WPF
|
|
MediaPlayer both handle natively.
|
|
  * POST /timings — same body shape but returns
                    {"text", "voice", "words": [{"text","startMs","endMs"}],
                     "durationMs": ..., "audioBytes": ...}. Word offsets are
                    sourced from Edge's WordBoundary events when Edge emits
                    them — much more accurate than eSpeak's
                    proportional-distribution approach because Edge
                    emits real per-word offsets during synthesis. When Edge
                    sends no word events, the words are instead spread
                    proportionally over the MP3's estimated duration.
|
|
* GET /voices — voice catalog Edge knows about. Filtered to
|
|
Hebrew + Greek by default; ?language=all returns
|
|
everything Edge supports.
|
|
* GET /health — fast readiness check.
|
|
|
|
Pairs with fc-biblical-tts (eSpeak Ancient Greek + Hebrew). The biblical
|
|
engine handles unpointed Hebrew + Erasmian Greek; this engine handles
|
|
narrative Modern Hebrew + Modern Greek for translations the operator
|
|
might be reading alongside the original.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import logging
|
|
from typing import Optional
|
|
|
|
import edge_tts
|
|
from fastapi import FastAPI, HTTPException
|
|
from fastapi.responses import JSONResponse, Response
|
|
from pydantic import BaseModel
|
|
|
|
# Configure root logging with a timestamped format, then take the named
# logger that the request handlers in this module write to.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
LOG = logging.getLogger("modern_tts")

# ASGI application object; the route decorators in this module attach to it.
app = FastAPI(
    title="FlowerCore modern-tts",
    version="1.0.0",
)
|
|
|
|
# Default Edge voice per language tag, so AiStation gets a sensible narrator
# when the operator hasn't explicitly asked for one. Every language is listed
# under both its bare code and its full locale so that either spelling of a
# request's `language` field resolves to the same voice.
DEFAULT_VOICES = {
    "he": "he-IL-AvriNeural",
    "he-IL": "he-IL-AvriNeural",
    "el": "el-GR-NestorasNeural",
    "el-GR": "el-GR-NestorasNeural",
    "en": "en-US-AriaNeural",
    # Previously missing: without it "en-US" fell through to the Hebrew default.
    "en-US": "en-US-AriaNeural",
}
|
|
|
|
|
|
class TtsRequest(BaseModel):
    """JSON body accepted by the POST /tts and POST /timings endpoints."""

    # Text to synthesize; the handlers reject it when it is blank.
    text: str

    # Explicit Edge voice short name; when set it wins over `language`.
    voice: Optional[str] = None

    # Language key looked up in DEFAULT_VOICES when no voice is supplied.
    language: Optional[str] = None

    # Prosody adjustments handed to edge_tts.Communicate as-is.
    rate: str = "+0%"  # Edge accepts +20%, -10%, etc.
    volume: str = "+0%"
    pitch: str = "+0Hz"
|
|
|
|
|
|
def _resolve_voice(req: TtsRequest) -> str:
    """Pick the Edge voice short name to use for a request.

    Resolution order:
      1. An explicit, non-blank ``req.voice`` (surrounding whitespace removed).
         A whitespace-only voice is treated as "not supplied" instead of being
         passed on to Edge as an empty string.
      2. The default voice for ``req.language``: an exact key match first,
         then a case-insensitive match on the full tag ("el-gr", "he_IL"),
         then a match on the primary subtag only ("en-GB" -> "en").
      3. The Hebrew default when nothing else matches.

    Returns:
        The voice short name, e.g. ``"he-IL-AvriNeural"``.
    """
    voice = (req.voice or "").strip()
    if voice:
        return voice

    language = (req.language or "").strip()
    if language:
        # Exact match keeps the behaviour for every tag that already worked.
        if language in DEFAULT_VOICES:
            return DEFAULT_VOICES[language]

        by_folded_key = {key.casefold(): name for key, name in DEFAULT_VOICES.items()}
        full_tag = language.replace("_", "-").casefold()
        primary_subtag = full_tag.partition("-")[0]
        for candidate in (full_tag, primary_subtag):
            if candidate in by_folded_key:
                return by_folded_key[candidate]

    return DEFAULT_VOICES["he"]
|
|
|
|
|
|
@app.get("/health")
def health():
    """Probe endpoint: answers with a constant OK payload and does no other work."""
    payload = {"status": "ok"}
    return payload
|
|
|
|
|
|
@app.get("/voices")
async def voices(language: str = "default"):
    """Return the Edge voice catalog as ``{"voices": [...]}``.

    Args:
        language: ``"all"`` (case-insensitive) returns every voice Edge lists.
            Any other value returns only the voices whose ``ShortName`` starts
            with ``he-`` or ``el-``.

    Raises:
        HTTPException: 502 when the catalog cannot be fetched from Edge, in
            line with how the synthesis endpoints report upstream failures.
    """
    try:
        catalog = await edge_tts.list_voices()
    except Exception as ex:
        # Boundary handler: log the traceback and report an upstream failure
        # rather than letting it surface as an opaque 500.
        LOG.exception("edge-tts voice catalog lookup failed")
        raise HTTPException(status_code=502, detail=f"edge-tts failure: {ex}") from ex

    if language.strip().lower() == "all":
        return {"voices": catalog}

    # Default response: filter to languages relevant to the FlowerCore
    # biblical workflow (Hebrew + Greek) so the AiStation voice picker
    # isn't overwhelmed by 400+ Edge voices.
    keep = ("he-", "el-")
    filtered = [v for v in catalog if (v.get("ShortName") or "").startswith(keep)]
    return {"voices": filtered}
|
|
|
|
|
|
async def _synth_with_subtitles(req: TtsRequest):
    """Synthesize ``req.text`` with Edge and collect the audio plus word events.

    Args:
        req: The request; its voice is resolved with ``_resolve_voice`` and its
            rate/volume/pitch are forwarded to ``edge_tts.Communicate``.

    Returns:
        A ``(voice, audio_bytes, word_events)`` tuple, where ``audio_bytes`` is
        the concatenation of every audio chunk Edge streamed and
        ``word_events`` is a list of ``{"text", "offset", "duration"}`` dicts
        (offset and duration in 100-ns ticks), one per WordBoundary chunk.

    Exceptions raised by edge-tts are not caught here; callers translate them
    into HTTP errors.
    """
    voice = _resolve_voice(req)
    LOG.info("edge-tts synth voice=%s len=%d", voice, len(req.text))

    prosody = {"rate": req.rate, "volume": req.volume, "pitch": req.pitch}
    try:
        # NOTE(review): recent edge-tts releases (7.2.x) expose a `boundary`
        # option that reportedly defaults to sentence-level metadata, in which
        # case no WordBoundary chunk ever arrives unless it is requested
        # explicitly — confirm against the pinned edge-tts version.
        communicate = edge_tts.Communicate(
            req.text,
            voice=voice,
            boundary="WordBoundary",
            **prosody,
        )
    except TypeError:
        # Releases without the `boundary` keyword reject it with TypeError;
        # fall back to the constructor call this module originally used.
        communicate = edge_tts.Communicate(req.text, voice=voice, **prosody)

    audio_buf = io.BytesIO()
    word_events: list[dict] = []
    async for chunk in communicate.stream():
        kind = chunk["type"]
        if kind == "audio":
            audio_buf.write(chunk["data"])
        elif kind == "WordBoundary":
            word_events.append({
                "text": chunk.get("text") or "",
                "offset": chunk.get("offset", 0),      # 100-ns ticks
                "duration": chunk.get("duration", 0),  # 100-ns ticks
            })
    return voice, audio_buf.getvalue(), word_events
|
|
|
|
|
|
def _to_ms(ticks_100ns: int) -> int:
|
|
# Edge emits offsets in 100-nanosecond ticks (.NET TimeSpan style).
|
|
return int(round(ticks_100ns / 10_000))
|
|
|
|
|
|
@app.post("/tts")
async def tts(req: TtsRequest):
    """Synthesize ``req.text`` and return the audio as an ``audio/mpeg`` response.

    The resolved voice name is echoed in the ``X-FlowerCore-Voice`` header.

    Raises:
        HTTPException: 400 when the text is blank or edge-tts rejects a
            request parameter; 502 when Edge fails or returns no audio.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="text is required")

    try:
        voice, audio_bytes, _ = await _synth_with_subtitles(req)
    except edge_tts.exceptions.NoAudioReceived as ex:
        LOG.warning("edge-tts returned no audio: %s", ex)
        raise HTTPException(
            status_code=502,
            detail="edge-tts returned no audio for the supplied voice/text.",
        ) from ex
    except ValueError as ex:
        # NOTE(review): edge-tts appears to signal malformed voice/rate/volume/
        # pitch values with ValueError — that is the caller's mistake, not an
        # upstream outage, so report it as a client error. Confirm against the
        # pinned edge-tts version.
        LOG.warning("edge-tts rejected request parameters: %s", ex)
        raise HTTPException(
            status_code=400,
            detail=f"invalid synthesis parameter: {ex}",
        ) from ex
    except Exception as ex:
        # Boundary handler: keep the traceback in the log, surface a 502.
        LOG.exception("edge-tts synthesis failed")
        raise HTTPException(status_code=502, detail=f"edge-tts failure: {ex}") from ex

    if not audio_bytes:
        raise HTTPException(status_code=502, detail="edge-tts returned an empty audio stream.")

    return Response(
        content=audio_bytes,
        media_type="audio/mpeg",
        headers={"X-FlowerCore-Voice": voice},
    )
|
|
|
|
|
|
def _estimate_duration_ms_from_mp3(audio_bytes: bytes) -> int:
|
|
"""Best-effort duration estimate from raw MP3 bytes by walking frame
|
|
headers. Edge always returns CBR ~24kbps mono so we can infer total ms
|
|
from frame count. If parsing fails, return 0 and let the caller fall
|
|
through to a per-character heuristic."""
|
|
if not audio_bytes:
|
|
return 0
|
|
# MP3 sample rates by version+layer (MPEG1 layer3 / MPEG2 layer3 / MPEG2.5 layer3).
|
|
# We just walk frame headers and count frames; each frame is 1152 samples.
|
|
sample_rates_v1 = [44100, 48000, 32000, 0]
|
|
sample_rates_v2 = [22050, 24000, 16000, 0]
|
|
sample_rates_v25 = [11025, 12000, 8000, 0]
|
|
bitrates_v1_l3 = [0,32000,40000,48000,56000,64000,80000,96000,112000,128000,160000,192000,224000,256000,320000,0]
|
|
bitrates_v2_l3 = [0,8000,16000,24000,32000,40000,48000,56000,64000,80000,96000,112000,128000,144000,160000,0]
|
|
|
|
pos = 0
|
|
total_samples = 0
|
|
sample_rate = 0
|
|
while pos + 4 <= len(audio_bytes):
|
|
b0, b1, b2, b3 = audio_bytes[pos], audio_bytes[pos+1], audio_bytes[pos+2], audio_bytes[pos+3]
|
|
if b0 != 0xFF or (b1 & 0xE0) != 0xE0:
|
|
pos += 1
|
|
continue
|
|
version_bits = (b1 >> 3) & 0x03
|
|
layer_bits = (b1 >> 1) & 0x03
|
|
if layer_bits != 0x01: # layer 3 only
|
|
pos += 1
|
|
continue
|
|
bitrate_index = (b2 >> 4) & 0x0F
|
|
sample_rate_index = (b2 >> 2) & 0x03
|
|
padding = (b2 >> 1) & 0x01
|
|
if version_bits == 0x03: # MPEG1
|
|
sample_rate = sample_rates_v1[sample_rate_index]
|
|
bitrate = bitrates_v1_l3[bitrate_index]
|
|
samples_per_frame = 1152
|
|
elif version_bits == 0x02: # MPEG2
|
|
sample_rate = sample_rates_v2[sample_rate_index]
|
|
bitrate = bitrates_v2_l3[bitrate_index]
|
|
samples_per_frame = 576
|
|
elif version_bits == 0x00: # MPEG2.5
|
|
sample_rate = sample_rates_v25[sample_rate_index]
|
|
bitrate = bitrates_v2_l3[bitrate_index]
|
|
samples_per_frame = 576
|
|
else:
|
|
pos += 1
|
|
continue
|
|
if not (sample_rate and bitrate):
|
|
pos += 1
|
|
continue
|
|
frame_length = int((samples_per_frame * bitrate / 8) / sample_rate) + padding
|
|
if frame_length <= 0:
|
|
pos += 1
|
|
continue
|
|
total_samples += samples_per_frame
|
|
pos += frame_length
|
|
|
|
if sample_rate <= 0:
|
|
return 0
|
|
return int(round(total_samples * 1000 / sample_rate))
|
|
|
|
|
|
def _spread_words_over_duration(tokens: list[str], total_ms: int) -> list[dict]:
    """Give each token a slice of ``total_ms`` proportional to its length.

    Boundaries are computed from the cumulative character count, so rounding
    never accumulates: slices are contiguous, never run backwards, and the
    last one ends exactly at ``total_ms``.
    """
    weights = [max(1, len(token)) for token in tokens]
    weight_total = sum(weights)
    words: list[dict] = []
    consumed = 0
    start = 0
    for token, weight in zip(tokens, weights):
        consumed += weight
        end = int(round(total_ms * consumed / weight_total))
        words.append({"text": token, "startMs": start, "endMs": end})
        start = end
    return words


@app.post("/timings")
async def timings(req: TtsRequest):
    """Synthesize ``req.text`` and return per-word timings as JSON.

    The response has the keys ``text``, ``voice``, ``words`` (a list of
    ``{"text", "startMs", "endMs"}``), ``durationMs`` (the end of the last
    word) and ``audioBytes`` (size of the synthesized MP3).

    Raises:
        HTTPException: 400 when the text is blank or edge-tts rejects a
            request parameter; 502 when Edge fails or returns no audio.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="text is required")

    try:
        voice, audio_bytes, events = await _synth_with_subtitles(req)
    except edge_tts.exceptions.NoAudioReceived as ex:
        LOG.warning("edge-tts returned no audio: %s", ex)
        raise HTTPException(
            status_code=502,
            detail="edge-tts returned no audio for the supplied voice/text.",
        ) from ex
    except ValueError as ex:
        # NOTE(review): edge-tts appears to signal malformed voice/rate/volume/
        # pitch values with ValueError — a client error rather than an upstream
        # outage. Confirm against the pinned edge-tts version.
        LOG.warning("edge-tts rejected request parameters: %s", ex)
        raise HTTPException(
            status_code=400,
            detail=f"invalid synthesis parameter: {ex}",
        ) from ex
    except Exception as ex:
        # Boundary handler: keep the traceback in the log, surface a 502.
        LOG.exception("edge-tts synthesis failed")
        raise HTTPException(status_code=502, detail=f"edge-tts failure: {ex}") from ex

    if not audio_bytes:
        # Without audio there is nothing to time; do not fabricate offsets.
        raise HTTPException(status_code=502, detail="edge-tts returned an empty audio stream.")

    words: list[dict] = []
    for event in events:
        start = _to_ms(event["offset"])
        end = start + _to_ms(event["duration"])
        words.append({"text": event.get("text", ""), "startMs": start, "endMs": end})

    # Edge sometimes omits WordBoundary events for non-English voices
    # (notably he-IL-* and el-GR-*). Fall back to proportional distribution
    # over the input text — same approach the eSpeak biblical-tts uses.
    if not words:
        tokens = req.text.split()  # non-empty: blank text was rejected above
        total_ms = _estimate_duration_ms_from_mp3(audio_bytes)
        if total_ms <= 0:
            # Last-resort fallback: ~600ms per word at average speaking rate.
            total_ms = max(1, len(tokens)) * 600
        words = _spread_words_over_duration(tokens, total_ms)

    duration_ms = words[-1]["endMs"] if words else 0
    return JSONResponse({
        "text": req.text,
        "voice": voice,
        "words": words,
        "durationMs": duration_ms,
        "audioBytes": len(audio_bytes),
    })
|