bluejay-infra/gx10/tts/tts_service.py

#!/usr/bin/env python3
"""GX10 Piper TTS microservice — telephony /tts contract.

POST /tts  {"text": "..."}  -> 16 kHz / 16-bit / mono WAV (canonical 44-byte header)
GET  /health                -> JSON status

The telephony AsteriskProvider strips the 44-byte WAV header and writes the
remainder as a `.sln16` (signed-linear 16 kHz) file that Asterisk transcodes to
any codec. So the response MUST be 16 kHz / 16-bit / mono. The en_US-amy-medium
voice is 22.05 kHz native, so we resample to 16 kHz (a 22.05 kHz stream treated
as 16 kHz plays ~1.38x too fast). This is a drop-in upgrade over edge1's
en_US-amy-low (16 kHz native, lower quality), keeping the exact wire contract.
"""
import io
import logging
import os
import sys
import threading
import wave

import numpy as np
from flask import Flask, Response, jsonify, request

# Runtime configuration; each value can be overridden through the environment.
# TCP port the Flask server listens on (env: TTS_PORT).
API_PORT = int(os.environ.get("TTS_PORT", "8500"))
# Voice name; the model file is looked up as <VOICES_DIR>/<PIPER_VOICE>.onnx.
PIPER_VOICE = os.environ.get("PIPER_VOICE", "en_US-amy-medium")
# Directory containing the Piper .onnx voice models.
VOICES_DIR = os.environ.get("VOICES_DIR", "/voices")
# Sample rate (Hz) of the WAV returned by /tts; synthesized audio is resampled
# to this rate. The telephony contract in the module docstring requires 16000.
TARGET_RATE = int(os.environ.get("TARGET_RATE", "16000"))

# Send INFO-and-above records to stdout with timestamp, level and logger name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    stream=sys.stdout,
)
# Logger used by every function in this module.
log = logging.getLogger("gx10-tts")

# Process-wide state shared by all request handlers.
# Loaded Piper voice; stays None until load_piper() succeeds.
piper_voice_obj = None
# Set to True by load_piper() on success; /tts answers 503 while it is False.
piper_loaded = False
# Held around each synthesis call so the shared voice is used by one caller at a time.
piper_lock = threading.Lock()
# Sample rate of the first chunk from the most recent /tts synthesis;
# None until the first synthesis has produced audio. Reported by /health.
native_rate = None

# Flask application serving the /health and /tts routes.
app = Flask(__name__)


def load_piper():
    """Load the configured Piper voice into the module-level state.

    The model is expected at ``<VOICES_DIR>/<PIPER_VOICE>.onnx``. On success
    the voice is stored in ``piper_voice_obj`` and ``piper_loaded`` becomes
    True.

    This function is fail-soft: if ``piper`` cannot be imported, the model
    file is missing, or loading raises, the problem is logged,
    ``piper_loaded`` is set to False and nothing is raised, so the HTTP
    server can still start and report the state through /health.
    """
    global piper_voice_obj, piper_loaded
    try:
        # Imported lazily so a missing piper package is handled like any
        # other load failure instead of breaking module import.
        from piper import PiperVoice
        model_path = os.path.join(VOICES_DIR, f"{PIPER_VOICE}.onnx")
        if not os.path.isfile(model_path):
            log.error("Piper voice model not found at %s — TTS disabled", model_path)
            piper_loaded = False
            return
        log.info("Loading Piper voice %s from %s", PIPER_VOICE, model_path)
        piper_voice_obj = PiperVoice.load(model_path)
        piper_loaded = True
        log.info("Piper voice loaded")
    except Exception as exc:  # noqa: BLE001 — fail-soft, /health reports it
        # log.exception keeps the traceback; the message alone is rarely
        # enough to diagnose a model/runtime load failure.
        log.exception("Failed to load Piper: %s", exc)
        piper_loaded = False


def synthesize_chunks(text):
    """Synthesize *text* with the shared voice and return all chunks as a list.

    ``piper_lock`` is held until every chunk has been collected, so the
    shared voice object is driven by one caller at a time.
    """
    collected = []
    with piper_lock:
        # Consume the whole synthesis result before releasing the lock.
        collected.extend(piper_voice_obj.synthesize(text))
    return collected


def resample_i16(pcm_i16, src_rate, dst_rate):
    """Resample int16 PCM from *src_rate* to *dst_rate* by linear interpolation.

    The input object is returned unchanged when the two rates are equal or
    the input is empty. Otherwise the result holds
    ``round(len(pcm_i16) * dst_rate / src_rate)`` samples (an empty int16
    array if that is not positive). The first and last output samples line
    up with the first and last input samples; interpolated values are
    rounded and clipped to the int16 range.
    """
    if src_rate == dst_rate:
        return pcm_i16
    if len(pcm_i16) == 0:
        return pcm_i16

    samples = pcm_i16.astype(np.float32)
    n_in = len(samples)
    n_out = int(round(n_in * dst_rate / src_rate))
    if n_out <= 0:
        return np.zeros(0, dtype=np.int16)

    # Fractional input positions at which each output sample is read.
    src_positions = np.arange(n_in)
    dst_positions = np.linspace(0, n_in - 1, n_out)
    interpolated = np.interp(dst_positions, src_positions, samples)

    rounded = np.round(interpolated)
    return np.clip(rounded, -32768, 32767).astype(np.int16)


@app.route("/health", methods=["GET"])
def health():
    """Return a JSON snapshot of the service state.

    ``status`` is always "ok"; whether the voice model is actually usable is
    carried by ``loaded``. ``native_rate`` is null until a synthesis has run.
    """
    snapshot = dict(
        status="ok",
        voice=PIPER_VOICE,
        loaded=piper_loaded,
        target_rate=TARGET_RATE,
        native_rate=native_rate,
    )
    return jsonify(snapshot)


@app.route("/tts", methods=["POST"])
def tts():
    """Synthesize the posted text and return it as a mono 16-bit WAV.

    Request body: a JSON object with a string ``text`` field. The text is
    stripped of surrounding whitespace; it must then be non-empty and at
    most 10000 characters long.

    Responses:
        200: ``audio/wav`` body, mono / 16-bit / ``TARGET_RATE`` Hz.
        400: body is not a JSON object, or ``text`` is missing, not a
             string, empty after stripping, or too long.
        500: synthesis produced no audio, produced PCM that is not 16-bit
             mono, or raised an exception.
        503: the Piper model is not loaded.

    Side effect: stores the sample rate of the first synthesized chunk in
    the module-level ``native_rate`` (reported by /health).
    """
    global native_rate

    if not piper_loaded:
        return jsonify({"error": "Piper TTS model not loaded"}), 503

    data = request.get_json(silent=True)
    # get_json may yield any JSON value (list, string, number, ...). Only an
    # object can carry the "text" field; anything else is a client error,
    # not something to index into.
    if not isinstance(data, dict) or "text" not in data:
        return jsonify({"error": "Missing required field: text"}), 400

    raw_text = data["text"]
    # Reject null / numbers / nested values up front so they yield a 400
    # instead of an unhandled AttributeError on .strip().
    if not isinstance(raw_text, str):
        return jsonify({"error": "Field 'text' must be a string"}), 400

    text = raw_text.strip()
    if not text:
        return jsonify({"error": "Text field is empty"}), 400
    if len(text) > 10000:
        return jsonify({"error": "Text too long (max 10000 characters)"}), 400

    try:
        chunks = synthesize_chunks(text)
        if not chunks:
            return jsonify({"error": "No audio produced"}), 500

        first = chunks[0]
        # Keep the rate in a local for the resample call; the global is only
        # published for /health and may be rewritten by concurrent requests.
        source_rate = first.sample_rate
        native_rate = source_rate

        if first.sample_width != 2 or first.sample_channels != 1:
            return jsonify({
                "error": f"Unexpected PCM format: width={first.sample_width} "
                         f"channels={first.sample_channels} (need 16-bit mono)"
            }), 500

        pcm = np.frombuffer(
            b"".join(c.audio_int16_bytes for c in chunks), dtype=np.int16
        )
        out = resample_i16(pcm, source_rate, TARGET_RATE)

        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(TARGET_RATE)
            wav_file.writeframes(out.tobytes())

        return Response(
            wav_buffer.getvalue(),
            mimetype="audio/wav",
            headers={"Content-Disposition": 'inline; filename="speech.wav"'},
        )
    except Exception as exc:  # noqa: BLE001 — request boundary: log and answer 500
        log.exception("TTS synthesis failed: %s", exc)
        return jsonify({"error": f"Synthesis failed: {exc}"}), 500


# Script entry point: runs only when this file is executed directly.
# NOTE(review): load_piper() is called only here, so importing this module
# from another process (e.g. a WSGI server) does not load the voice — confirm
# this matches how the service is deployed.
if __name__ == "__main__":
    log.info(
        "GX10 TTS starting on port %d (voice=%s -> %d Hz)",
        API_PORT, PIPER_VOICE, TARGET_RATE,
    )
    # Load the voice before serving; on failure the server still starts and
    # /tts answers 503.
    load_piper()
    # Listen on all interfaces; threaded=True serves requests in separate threads.
    app.run(host="0.0.0.0", port=API_PORT, threaded=True)