#!/usr/bin/env python3
"""GX10 Piper TTS microservice — telephony /tts contract.

  POST /tts    {"text": "..."}  -> 16 kHz / 16-bit / mono WAV
                                   (canonical 44-byte header)
  GET  /health                  -> JSON status

The telephony AsteriskProvider strips the 44-byte WAV header and writes the
remainder as a `.sln16` (signed-linear 16 kHz) file that Asterisk transcodes
to any codec. So the response MUST be 16 kHz / 16-bit / mono. The
en_US-amy-medium voice is 22.05 kHz native, so we resample to 16 kHz (a
22.05 kHz stream treated as 16 kHz plays ~1.38x too slow and lower-pitched).
This is a drop-in upgrade over edge1's en_US-amy-low (16 kHz native, lower
quality), keeping the exact wire contract.
"""

import io
import logging
import os
import sys
import threading
import wave

import numpy as np
from flask import Flask, Response, jsonify, request

# Runtime configuration; every value can be overridden via the environment.
API_PORT = int(os.environ.get("TTS_PORT", "8500"))
PIPER_VOICE = os.environ.get("PIPER_VOICE", "en_US-amy-medium")
VOICES_DIR = os.environ.get("VOICES_DIR", "/voices")
TARGET_RATE = int(os.environ.get("TARGET_RATE", "16000"))

# Upper bound on the (stripped) request text accepted by /tts.
MAX_TEXT_CHARS = 10000

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    stream=sys.stdout,
)
log = logging.getLogger("gx10-tts")

# Shared model state. `piper_voice_obj` is only used while holding
# `piper_lock`; `native_rate` is filled in by the first successful /tts call.
piper_voice_obj = None
piper_loaded = False
piper_lock = threading.Lock()
native_rate = None

app = Flask(__name__)


def load_piper() -> None:
    """Load the Piper voice model once at startup (shared, lock-guarded).

    Fail-soft: a missing model file or any load error is logged and leaves
    ``piper_loaded`` False, so /tts answers 503 and /health reports
    ``"loaded": false`` instead of the process crashing.
    """
    global piper_voice_obj, piper_loaded
    try:
        # Imported lazily so a missing/broken piper install is caught by the
        # same fail-soft handler below.
        from piper import PiperVoice

        model_path = os.path.join(VOICES_DIR, f"{PIPER_VOICE}.onnx")
        if not os.path.isfile(model_path):
            log.error(
                "Piper voice model not found at %s — TTS disabled", model_path
            )
            piper_loaded = False
            return
        log.info("Loading Piper voice %s from %s", PIPER_VOICE, model_path)
        piper_voice_obj = PiperVoice.load(model_path)
        piper_loaded = True
        log.info("Piper voice loaded")
    except Exception as exc:  # noqa: BLE001 — fail-soft, /health reports it
        log.error("Failed to load Piper: %s", exc)
        piper_loaded = False


def synthesize_chunks(text: str) -> list:
    """Run Piper synthesis under a lock because the loaded voice is shared.

    The result of ``synthesize`` is fully materialised inside the lock so no
    synthesis work happens after the lock is released.
    """
    with piper_lock:
        return list(piper_voice_obj.synthesize(text))


def resample_i16(pcm_i16: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
    """Linear-interpolation resample of int16 PCM (matches edge1's STT resample).

    Args:
        pcm_i16: 1-D array of int16 samples.
        src_rate: Sample rate of ``pcm_i16`` in Hz.
        dst_rate: Desired output sample rate in Hz.

    Returns:
        int16 samples at ``dst_rate``. The input array is returned unchanged
        when the rates already match or the input is empty.
    """
    if src_rate == dst_rate or len(pcm_i16) == 0:
        return pcm_i16

    audio = pcm_i16.astype(np.float32)
    target_len = int(round(len(audio) * dst_rate / src_rate))
    if target_len <= 0:
        return np.zeros(0, dtype=np.int16)

    # Sample the source at `target_len` evenly spaced fractional positions
    # spanning first..last sample, interpolating linearly between neighbours.
    idx = np.linspace(0, len(audio) - 1, target_len)
    res = np.interp(idx, np.arange(len(audio)), audio)
    # Round and clamp before the cast so values cannot wrap around in int16.
    return np.clip(np.round(res), -32768, 32767).astype(np.int16)


def _encode_wav(pcm_i16: np.ndarray, rate: int) -> bytes:
    """Wrap mono int16 samples in a 16-bit PCM WAV container at ``rate`` Hz."""
    wav_buffer = io.BytesIO()
    with wave.open(wav_buffer, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(rate)
        wav_file.writeframes(pcm_i16.tobytes())
    # Read after the writer is closed so the header sizes are final.
    return wav_buffer.getvalue()


@app.route("/health", methods=["GET"])
def health():
    """Report service status, configured voice, and sample rates as JSON.

    ``native_rate`` stays null until the first successful synthesis.
    """
    return jsonify({
        "status": "ok",
        "voice": PIPER_VOICE,
        "loaded": piper_loaded,
        "target_rate": TARGET_RATE,
        "native_rate": native_rate,
    })


@app.route("/tts", methods=["POST"])
def tts():
    """Text -> 16 kHz/16-bit/mono WAV. Mirrors the edge1 speech-pipeline contract.

    Request body: JSON object with a string ``text`` field.

    Responses:
        200: ``audio/wav`` body at ``TARGET_RATE`` Hz, 16-bit, mono.
        400: body is not a JSON object, ``text`` is missing, not a string,
             empty after stripping, or longer than ``MAX_TEXT_CHARS``.
        500: synthesis failed, produced no audio, or produced PCM that is
             not 16-bit mono.
        503: the Piper model is not loaded.
    """
    global native_rate

    if not piper_loaded:
        return jsonify({"error": "Piper TTS model not loaded"}), 503

    data = request.get_json(silent=True)
    # Any valid JSON value can arrive here (list, string, number, ...); only
    # an object can carry the "text" field.
    if not isinstance(data, dict) or "text" not in data:
        return jsonify({"error": "Missing required field: text"}), 400

    raw_text = data["text"]
    # Reject null / numbers / nested values up front instead of letting
    # `.strip()` raise and surface as an unhandled 500.
    if not isinstance(raw_text, str):
        return jsonify({"error": "Field 'text' must be a string"}), 400

    text = raw_text.strip()
    if not text:
        return jsonify({"error": "Text field is empty"}), 400
    if len(text) > MAX_TEXT_CHARS:
        return jsonify(
            {"error": f"Text too long (max {MAX_TEXT_CHARS} characters)"}
        ), 400

    try:
        chunks = synthesize_chunks(text)
        if not chunks:
            return jsonify({"error": "No audio produced"}), 500

        # The first chunk's format is taken as the format of the whole stream.
        first = chunks[0]
        native_rate = first.sample_rate
        if first.sample_width != 2 or first.sample_channels != 1:
            return jsonify({
                "error": f"Unexpected PCM format: width={first.sample_width} "
                         f"channels={first.sample_channels} (need 16-bit mono)"
            }), 500

        pcm = np.frombuffer(
            b"".join(c.audio_int16_bytes for c in chunks), dtype=np.int16
        )
        out = resample_i16(pcm, native_rate, TARGET_RATE)

        return Response(
            _encode_wav(out, TARGET_RATE),
            mimetype="audio/wav",
            headers={"Content-Disposition": 'inline; filename="speech.wav"'},
        )
    except Exception as exc:  # noqa: BLE001
        # log.exception keeps the traceback alongside the message.
        log.exception("TTS synthesis failed: %s", exc)
        return jsonify({"error": f"Synthesis failed: {exc}"}), 500


if __name__ == "__main__":
    log.info(
        "GX10 TTS starting on port %d (voice=%s -> %d Hz)",
        API_PORT, PIPER_VOICE, TARGET_RATE,
    )
    # NOTE: the model is only loaded on this script entry path; if `app` is
    # imported by another server, load_piper() must be called separately or
    # /tts will keep answering 503.
    load_piper()
    app.run(host="0.0.0.0", port=API_PORT, threaded=True)