gx10/tts: persist Piper /tts source + manifest (telephony TTS port baseline)
Dockerfile (linux/arm64, en_US-amy-medium baked), tts_service.py (16kHz/16-bit/mono WAV, numpy resample 22050->16000), gx10-tts.yaml (CPU NodePort 30850, no GPU request), README (build/import/cutover/verify on the GX10 cluster).
This commit is contained in:
153
gx10/tts/tts_service.py
Normal file
153
gx10/tts/tts_service.py
Normal file
@@ -0,0 +1,153 @@
|
||||
#!/usr/bin/env python3
|
||||
"""GX10 Piper TTS microservice — telephony /tts contract.
|
||||
|
||||
POST /tts {"text": "..."} -> 16 kHz / 16-bit / mono WAV (canonical 44-byte header)
|
||||
GET /health -> JSON status
|
||||
|
||||
The telephony AsteriskProvider strips the 44-byte WAV header and writes the
|
||||
remainder as a `.sln16` (signed-linear 16 kHz) file that Asterisk transcodes to
|
||||
any codec. So the response MUST be 16 kHz / 16-bit / mono. The en_US-amy-medium
|
||||
voice is 22.05 kHz native, so we resample to 16 kHz (a 22.05 kHz stream treated
|
||||
as 16 kHz plays ~1.38x too fast). This is a drop-in upgrade over edge1's
|
||||
en_US-amy-low (16 kHz native, lower quality), keeping the exact wire contract.
|
||||
"""
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
import wave
|
||||
|
||||
import numpy as np
|
||||
from flask import Flask, Response, jsonify, request
|
||||
|
||||
# --- Runtime configuration (each value can be overridden via environment) ---
API_PORT = int(os.getenv("TTS_PORT", "8500"))
PIPER_VOICE = os.getenv("PIPER_VOICE", "en_US-amy-medium")
VOICES_DIR = os.getenv("VOICES_DIR", "/voices")
# Sample rate of the WAV returned by /tts, in Hz.
TARGET_RATE = int(os.getenv("TARGET_RATE", "16000"))

# --- Logging: INFO and above, written to stdout ---
logging.basicConfig(
    stream=sys.stdout,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
log = logging.getLogger("gx10-tts")

# --- Shared Piper state ---
# The loaded voice object; stays None until load_piper() succeeds.
piper_voice_obj = None
# True only after the voice model has been loaded successfully.
piper_loaded = False
# Held while synthesizing, because the single voice object is shared.
piper_lock = threading.Lock()
# Sample rate reported by the voice; unknown (None) until the first synthesis.
native_rate = None
|
||||
|
||||
# Flask application object; the /health and /tts routes are registered on it.
app = Flask(__name__)
|
||||
|
||||
|
||||
def load_piper():
    """Load the Piper voice model once at startup.

    Looks for ``<VOICES_DIR>/<PIPER_VOICE>.onnx``. On success the loaded voice
    is stored in ``piper_voice_obj`` and ``piper_loaded`` becomes True.

    Fail-soft by design: a missing model file or any error while importing or
    loading Piper is logged and leaves ``piper_loaded`` False instead of
    raising, so the process still starts and /health can report the state.
    This function does not take ``piper_lock`` itself; it is expected to run
    before the server starts handling requests.
    """
    global piper_voice_obj, piper_loaded
    piper_loaded = False
    try:
        # Imported here so that a missing or broken piper install is handled
        # by the fail-soft path below instead of crashing at module import.
        from piper import PiperVoice

        model_path = os.path.join(VOICES_DIR, f"{PIPER_VOICE}.onnx")
        if not os.path.isfile(model_path):
            log.error("Piper voice model not found at %s — TTS disabled", model_path)
            return

        log.info("Loading Piper voice %s from %s", PIPER_VOICE, model_path)
        piper_voice_obj = PiperVoice.load(model_path)
        piper_loaded = True
        log.info("Piper voice loaded")
    except Exception as exc:  # noqa: BLE001 — fail-soft, /health reports it
        # log.exception keeps the traceback, which is what tells an import
        # failure apart from a bad model or config file.
        log.exception("Failed to load Piper: %s", exc)
        piper_loaded = False
|
||||
|
||||
|
||||
def synthesize_chunks(text):
    """Synthesize ``text`` with the shared Piper voice and return all chunks.

    The voice object is shared between request threads, so the whole
    synthesis (including draining the result into a list) happens while
    ``piper_lock`` is held.
    """
    with piper_lock:
        audio_chunks = piper_voice_obj.synthesize(text)
        collected = list(audio_chunks)
    return collected
|
||||
|
||||
|
||||
def resample_i16(pcm_i16, src_rate, dst_rate):
    """Resample int16 PCM by linear interpolation (matches edge1's STT resample).

    Args:
        pcm_i16: 1-D numpy array of int16 samples.
        src_rate: Sample rate of ``pcm_i16`` in Hz.
        dst_rate: Desired sample rate in Hz.

    Returns:
        ``pcm_i16`` itself when it is empty or the rates already match;
        otherwise a new int16 array of ``round(len * dst_rate / src_rate)``
        samples (empty if that rounds to zero).
    """
    n_in = len(pcm_i16)
    if n_in == 0 or src_rate == dst_rate:
        return pcm_i16

    samples = pcm_i16.astype(np.float32)
    n_out = int(round(n_in * dst_rate / src_rate))
    if n_out <= 0:
        return np.zeros(0, dtype=np.int16)

    # Output positions span the first to the last input sample inclusive.
    src_positions = np.arange(n_in)
    dst_positions = np.linspace(0, n_in - 1, n_out)
    interpolated = np.interp(dst_positions, src_positions, samples)

    # Round to nearest and clamp to the int16 range before narrowing.
    rounded = np.round(interpolated)
    return np.clip(rounded, -32768, 32767).astype(np.int16)
|
||||
|
||||
|
||||
@app.route("/health", methods=["GET"])
def health():
    """Return the service status as JSON.

    ``status`` is always "ok"; whether the voice model is usable is reported
    separately in ``loaded``. ``native_rate`` is null until a synthesis
    request has recorded the voice's sample rate.
    """
    payload = {
        "status": "ok",
        "voice": PIPER_VOICE,
        "loaded": piper_loaded,
        "target_rate": TARGET_RATE,
        "native_rate": native_rate,
    }
    return jsonify(payload)
|
||||
|
||||
|
||||
@app.route("/tts", methods=["POST"])
def tts():
    """Text -> 16 kHz/16-bit/mono WAV. Mirrors the edge1 speech-pipeline contract.

    Request body: a JSON object ``{"text": "..."}``.

    Returns:
        200 with an ``audio/wav`` body (mono, 16-bit, ``TARGET_RATE`` Hz), or
        a JSON ``{"error": ...}`` with:
          * 503 — the Piper model is not loaded,
          * 400 — body is not a JSON object, ``text`` is missing, not a
            string, blank, or longer than 10000 characters,
          * 500 — synthesis produced nothing, an unexpected PCM format, or
            raised.
    """
    global native_rate

    if not piper_loaded:
        return jsonify({"error": "Piper TTS model not loaded"}), 503

    data = request.get_json(silent=True)
    # The body is untrusted: valid JSON can still be a list, string or number.
    # Without the dict check, ``"text" in data`` could pass for a list and
    # ``data["text"]`` would then raise, giving the client a 500 instead of 400.
    if not isinstance(data, dict) or "text" not in data:
        return jsonify({"error": "Missing required field: text"}), 400

    raw_text = data["text"]
    if not isinstance(raw_text, str):
        # e.g. {"text": 123} or {"text": null}; .strip() would raise on these.
        return jsonify({"error": "Field 'text' must be a string"}), 400

    text = raw_text.strip()
    if not text:
        return jsonify({"error": "Text field is empty"}), 400
    if len(text) > 10000:
        return jsonify({"error": "Text too long (max 10000 characters)"}), 400

    try:
        chunks = synthesize_chunks(text)
        if not chunks:
            return jsonify({"error": "No audio produced"}), 500

        # The first chunk's format is taken as representative of the stream;
        # its rate is also remembered so /health can report it.
        first = chunks[0]
        native_rate = first.sample_rate

        if first.sample_width != 2 or first.sample_channels != 1:
            return jsonify({
                "error": f"Unexpected PCM format: width={first.sample_width} "
                         f"channels={first.sample_channels} (need 16-bit mono)"
            }), 500

        pcm = np.frombuffer(
            b"".join(c.audio_int16_bytes for c in chunks), dtype=np.int16
        )
        # Convert from the voice's native rate to the rate the caller expects;
        # audio labelled with the wrong rate would play at the wrong speed.
        out = resample_i16(pcm, native_rate, TARGET_RATE)

        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(TARGET_RATE)
            wav_file.writeframes(out.tobytes())

        return Response(
            wav_buffer.getvalue(),
            mimetype="audio/wav",
            headers={"Content-Disposition": 'inline; filename="speech.wav"'},
        )
    except Exception as exc:  # noqa: BLE001 — report as a 500, keep serving
        # log.exception records the traceback alongside the message.
        log.exception("TTS synthesis failed: %s", exc)
        return jsonify({"error": f"Synthesis failed: {exc}"}), 500
|
||||
|
||||
|
||||
def main():
    """Log the startup configuration, load the voice, then serve HTTP."""
    log.info(
        "GX10 TTS starting on port %d (voice=%s -> %d Hz)",
        API_PORT, PIPER_VOICE, TARGET_RATE,
    )
    # Load before serving; on failure the service still starts and /tts
    # answers 503 until a model is available.
    load_piper()
    app.run(host="0.0.0.0", port=API_PORT, threaded=True)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user