# GX10 Piper TTS — linux/arm64 (built natively on the GX10 / DGX Spark, aarch64).
# Serves the telephony /tts contract: POST {"text"} -> 16 kHz/16-bit/mono WAV.
# Voice baked into the image so there is no runtime HuggingFace dependency.
# Slim Python 3.12 base; the apt-get and pip steps below install on top of it.
# NOTE(review): the tag is mutable — consider pinning a digest for reproducible builds.
FROM python:3.12-slim

# OS packages. espeak-ng is the phonemizer backend piper-tts calls at synthesis
# time; curl and ca-certificates serve the HTTPS voice download later in this
# file. The apt package lists are removed in the same layer to keep it small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        espeak-ng \
    && rm -rf /var/lib/apt/lists/*

# Python packages for the service; --no-cache-dir keeps pip's download cache
# out of the image layer.
RUN python -m pip install --no-cache-dir \
        flask \
        numpy \
        piper-tts

# Bake the voice model (en_US-amy-medium, 22.05 kHz native) into the image.
# To bake a different voice, override BOTH build args: VOICE_BASE must be the
# directory URL that holds ${PIPER_VOICE}.onnx and ${PIPER_VOICE}.onnx.json.
ARG PIPER_VOICE=en_US-amy-medium
ARG VOICE_BASE=https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/amy/medium
# curl flags: -f exits non-zero on HTTP errors (404/5xx) instead of saving the
# error page as the "model" (which would still pass the non-empty checks below);
# --retry 3 rides out transient network failures during the build.
RUN mkdir -p /voices \
    && curl -fsSL --retry 3 -o "/voices/${PIPER_VOICE}.onnx" "${VOICE_BASE}/${PIPER_VOICE}.onnx" \
    && curl -fsSL --retry 3 -o "/voices/${PIPER_VOICE}.onnx.json" "${VOICE_BASE}/${PIPER_VOICE}.onnx.json" \
    && test -s "/voices/${PIPER_VOICE}.onnx" \
    && test -s "/voices/${PIPER_VOICE}.onnx.json"

# /app is the working directory the container starts in; the service entry
# point is copied into it as /app/tts_service.py.
WORKDIR /app
COPY tts_service.py ./

# Runtime configuration exposed to the container (presumably read by
# tts_service.py — confirm against the service).
# PIPER_VOICE tracks the build arg of the same name so that building with
# --build-arg PIPER_VOICE=... does not leave the runtime pointing at a voice
# that was never baked into /voices; it falls back to the default voice when
# the arg is unset or empty.
# TARGET_RATE=16000 matches the 16 kHz output stated in the file header.
ENV TTS_PORT=8500 \
    PIPER_VOICE=${PIPER_VOICE:-en_US-amy-medium} \
    VOICES_DIR=/voices \
    TARGET_RATE=16000

# Run the service as an unprivileged user instead of root. A fixed numeric
# UID/GID lets runtimes that enforce non-root execution verify it.
# NOTE(review): assumes tts_service.py only reads /app and /voices and does not
# write outside world-writable locations such as /tmp — confirm.
RUN groupadd --system --gid 10001 tts \
    && useradd --system --uid 10001 --gid tts --no-create-home --shell /usr/sbin/nologin tts
USER tts

# EXPOSE is documentation only; 8500 matches the TTS_PORT default and is an
# unprivileged port, so the non-root user can bind it.
EXPOSE 8500
CMD ["python", "tts_service.py"]
