From 13f343f88c3740f6f6f56c8110bdfa6b42584a5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=A1clav=20Volhejn?= <8401624+vvolhejn@users.noreply.github.com>
Date: Wed, 26 Nov 2025 19:19:28 +0100
Subject: [PATCH] Document TTS configuration better (#142)

* Document TTS configuration better

* Improve CFG documentation
---
 README.md                  |  2 ++
 configs/config-tts.toml    | 42 +++++++++++++++++++++++++++++++++++++-
 scripts/tts_rust_server.py |  9 +++++++-
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index aa0e75e..db49417 100644
--- a/README.md
+++ b/README.md
@@ -276,6 +276,8 @@ echo "Hey, how are you?" | python scripts/tts_rust_server.py - -
 # From text file to audio file
 python scripts/tts_rust_server.py text_to_say.txt audio_output.wav
 ```
+
+You can configure the server by modifying `configs/config-tts.toml`. See the comments in that file for the available options.
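Since the patch points users at `configs/config-tts.toml`, a quick way to sanity-check edits to that file is to parse it and print the keys documented below. This is a minimal sketch, not part of the patch or the repo, assuming Python 3.11+ for the standard-library `tomllib` parser and that it is run from the repository root:

```python
# Hypothetical helper (not part of this repo): parse configs/config-tts.toml
# and print the TTS options that this patch documents.
import tomllib  # stdlib TOML parser, Python 3.11+

with open("configs/config-tts.toml", "rb") as f:  # tomllib requires binary mode
    config = tomllib.load(f)

# [modules.tts_py] and [modules.tts_py.py] become nested tables.
tts = config["modules"]["tts_py"]
print("path:", tts["path"])
print("batch_size:", tts["batch_size"])
for key in ("cfg_coef", "n_q", "padding_between", "padding_bonus"):
    print(key, "=", tts["py"].get(key))
```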
diff --git a/configs/config-tts.toml b/configs/config-tts.toml
index 367336f..faec211 100644
--- a/configs/config-tts.toml
+++ b/configs/config-tts.toml
@@ -1,20 +1,60 @@
 static_dir = "./static/"
 log_dir = "$HOME/tmp/tts-logs"
+# Used to identify the server when logging.
 instance_name = "tts"
+# Simple security: require clients to provide an auth token when connecting.
+# It can be passed via the auth_id query parameter, e.g.
+# "localhost:8089/api/tts_streaming?auth_id=public_token",
+# or via the kyutai-api-key HTTP header; see the tts_rust_server.py example.
 authorized_ids = ["public_token"]
 
 [modules.tts_py]
 type = "Py"
+# The path under which the TTS is served. This is relevant because the server
+# can run STT at the same time.
 path = "/api/tts_streaming"
 text_tokenizer_file = "hf://kyutai/tts-1.6b-en_fr/tokenizer_spm_8k_en_fr_audio.model"
-batch_size = 8 # Adjust to your GPU memory capacity
+# The batch size determines how many parallel connections the server can handle.
+# Higher values mean slower inference. Adjust to your GPU memory capacity.
+batch_size = 8
 text_bos_token = 1
 
 [modules.tts_py.py]
 log_folder = "$HOME/tmp/moshi-server-logs"
+# The folder to read voices from. Can be a local directory, or a Hugging Face repo
+# using the "hf-snapshot://" prefix. We use a glob to download only the .safetensors files
+# with voice embeddings, since the repo also contains .wav files we don't need.
 voice_folder = "hf-snapshot://kyutai/tts-voices/**/*.safetensors"
+# This voice will be used if the user doesn't specify one, or selects a non-existent one.
+# That usually means something is wrong, so we set it to a strange voice here to make it
+# clear that something is off.
+# Relative to the voice folder.
 default_voice = "unmute-prod-website/default_voice.wav"
+
+# Classifier-free guidance coefficient (see https://arxiv.org/abs/2207.12598).
+# TL;DR: a higher CFG value makes the model adhere to the voice more closely,
+# but it can affect audio quality and make mistakes more likely,
+# such as inserting words that aren't in the script.
+# Technical details:
+# CFG has the disadvantage of increasing inference time, because you need to run the model
+# twice for each step (once with the voice embedding, once without).
+# The default model, "tts-1.6b-en_fr", is trained with CFG distillation: it learns to mimic
+# CFG with different coefficients during training, without actually using CFG at inference time.
+# There is only a fixed set of CFG coefficients the model was trained with, so using a different
+# value will not work. The recommended value for this model is 2.0.
 cfg_coef = 2.0
+
+# Whether the unconditioned branch of the CFG should still have text conditioning.
+# Typically there is no need to touch this.
 cfg_is_no_text = true
+
+# Number of padding frames to force between words. Values such as 1 make the model
+# articulate a bit better.
 padding_between = 1
+# Number of quantization levels for the residual vector quantizer.
+# Higher means better-sounding audio but longer inference.
+# The maximum is typically 32; reasonable values are 8-32.
 n_q = 24
+# Make the model speak faster or slower by changing how likely it is to sample the padding token.
+# Should be between -2 and 2, with positive values leading to slower speech.
+padding_bonus = 0
\ No newline at end of file
diff --git a/scripts/tts_rust_server.py b/scripts/tts_rust_server.py
index 77ec367..979f649 100644
--- a/scripts/tts_rust_server.py
+++ b/scripts/tts_rust_server.py
@@ -16,7 +16,6 @@ from urllib.parse import urlencode
 
 import msgpack
 import numpy as np
-import sounddevice as sd
 import sphn
 import tqdm
 import websockets
@@ -52,6 +51,10 @@ async def receive_messages(websocket: websockets.ClientConnection, output_queue)
 
 async def output_audio(out: str, output_queue: asyncio.Queue[np.ndarray | None]):
     if out == "-":
+        # This will fail with "OSError: PortAudio library not found" on servers with no
+        # audio output, so only import it if the user requests audio output.
+        import sounddevice as sd
+
         should_exit = False
 
         def audio_callback(outdata, _a, _b, _c):
@@ -157,6 +160,10 @@ async def websocket_client():
         print("Enter text to synthesize (Ctrl+D to end input):")
 
     headers = {"kyutai-api-key": args.api_key}
+    # For clients that don't support the `additional_headers` parameter when connecting
+    # (notably JS libraries like react-use-websocket),
+    # you can also provide the API key in the query string under the "auth_id" key,
+    # i.e. by appending "&auth_id=public_token" to `uri`.
    async with websockets.connect(uri, additional_headers=headers) as websocket:
         print("connected")
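To tie the config and script changes together: the token from `authorized_ids` can travel either in the `kyutai-api-key` header (as `scripts/tts_rust_server.py` does) or in the `auth_id` query parameter (for clients that can't set headers). Below is a minimal sketch of both, not part of this patch, assuming the server listens on `localhost:8089` over plain `ws://` as in the config comments, uses the default `public_token`, and that your `websockets` release supports `additional_headers` (older releases call it `extra_headers`):

```python
# Sketch of the two auth methods documented above; adjust host/token to your setup.
import asyncio

import websockets

API_KEY = "public_token"
BASE_URI = "ws://localhost:8089/api/tts_streaming"


async def connect_with_header():
    # Preferred: pass the token via the kyutai-api-key header,
    # as scripts/tts_rust_server.py does.
    headers = {"kyutai-api-key": API_KEY}
    async with websockets.connect(BASE_URI, additional_headers=headers) as ws:
        print("connected via header")


async def connect_with_query_string():
    # Fallback for clients that can't set headers (e.g. react-use-websocket):
    # put the token in the query string under the "auth_id" key.
    async with websockets.connect(f"{BASE_URI}?auth_id={API_KEY}") as ws:
        print("connected via query string")


asyncio.run(connect_with_header())
asyncio.run(connect_with_query_string())
```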