mirror of
https://github.com/kyutai-labs/delayed-streams-modeling.git
synced 2025-12-22 19:09:57 +00:00
Document TTS configuration better (#142)
* Document TTS configuration better
* Improve CFG documentation
@@ -276,6 +276,8 @@ echo "Hey, how are you?" | python scripts/tts_rust_server.py - -
# From text file to audio file
python scripts/tts_rust_server.py text_to_say.txt audio_output.wav
```

You can configure the server by modifying `configs/config-tts.toml`. See the comments in that file for an overview of the available options.

</details>
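For reference, those options can also be inspected programmatically. A quick, illustrative sketch (not part of the repo) using Python's standard `tomllib` (Python 3.11+); the key paths follow the config shown in the next section:

```python
import tomllib

# Load the TTS server configuration (path relative to the repo root).
with open("configs/config-tts.toml", "rb") as f:
    config = tomllib.load(f)

# A top-level option and a per-module option documented in the file.
print(config["authorized_ids"])
print(config["modules"]["tts_py"]["py"]["cfg_coef"])
```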
<details>
@@ -1,20 +1,60 @@
static_dir = "./static/"
log_dir = "$HOME/tmp/tts-logs"

# Used to identify the server when logging.
instance_name = "tts"

# Simple security: require clients to provide an auth token when connecting.
# It can be set by adding auth_id to the query string, e.g.
# "localhost:8089/api/tts_streaming?auth_id=public_token"
# or by setting the kyutai-api-key HTTP header, see the tts_rust_server.py example.
authorized_ids = ["public_token"]

[modules.tts_py]
type = "Py"
# Under which path should the TTS be available? This is relevant because the server
# can run STT at the same time.
path = "/api/tts_streaming"
text_tokenizer_file = "hf://kyutai/tts-1.6b-en_fr/tokenizer_spm_8k_en_fr_audio.model"
# Batch size determines how many parallel connections the server can handle.
# Higher values mean slower inference. Adjust to your GPU memory capacity.
batch_size = 8
text_bos_token = 1

[modules.tts_py.py]
log_folder = "$HOME/tmp/moshi-server-logs"
# The folder to read voices from. Can be a local directory, or a Hugging Face repo
# using the "hf-snapshot://" prefix. We use a glob to only download the .safetensors files
# with voice embeddings, since the repo also contains .wav files we don't need.
voice_folder = "hf-snapshot://kyutai/tts-voices/**/*.safetensors"
# This voice will be used if the user doesn't specify one, or selects a non-existent one.
# This usually means something is wrong, so here we set it to a strange voice to make it clear
# that something is off.
# Relative to the voice folder.
default_voice = "unmute-prod-website/default_voice.wav"

# Classifier-free guidance coefficient (see https://arxiv.org/abs/2207.12598).
# TL;DR: a higher CFG value makes the model adhere to the voice more closely,
# but it can affect audio quality and make it more likely to make mistakes
# like inserting words that aren't in the script.
# Technical details:
# CFG has the disadvantage of increasing inference time, because you need to run the model
# twice for each step (once with the voice embedding, once without).
# The default model, "tts-1.6b-en_fr", is trained with CFG distillation, which means it learns
# to mimic CFG with different coefficients during training, without actually using CFG at inference time.
# There is only a fixed set of CFG coefficients the model was trained with, so using a different value
# will not work. The recommended value for this model is 2.0.
cfg_coef = 2.0

# Whether the unconditioned branch of the CFG should still have text conditioning or not.
# Typically, no need to touch this.
cfg_is_no_text = true

# Number of padding frames to force between words. Values such as 1 make the model
# articulate a bit better.
padding_between = 1

# Number of quantization levels for the residual vector quantizer.
# Higher means better-sounding audio but longer inference.
# The maximum is typically 32; reasonable values are 8-32.
n_q = 24

# Make the model speak faster or slower by changing how likely it is to sample the padding token.
# Should be between -2 and 2, with positive values leading to slower speech.
padding_bonus = 0
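To make the `cfg_coef` and `padding_bonus` comments above concrete, here is a minimal Python sketch of how these two knobs conventionally enter a sampling step. This illustrates the standard CFG formula from the paper linked above, not moshi-server's actual implementation (with CFG distillation the unconditioned forward pass is skipped entirely); the array names and `pad_token_id` are hypothetical.

```python
import numpy as np

def guided_logits(
    logits_cond: np.ndarray,     # model output with the voice embedding
    logits_uncond: np.ndarray,   # model output without it
    cfg_coef: float = 2.0,       # `cfg_coef` above
    padding_bonus: float = 0.0,  # `padding_bonus` above
    pad_token_id: int = 0,       # hypothetical id of the padding token
) -> np.ndarray:
    # Classifier-free guidance: extrapolate away from the unconditioned
    # prediction; cfg_coef = 1.0 would reproduce the conditioned logits.
    logits = logits_uncond + cfg_coef * (logits_cond - logits_uncond)
    # Bias the padding token's logit: positive values make padding (pauses,
    # hence slower speech) more likely, negative values make speech faster.
    logits[..., pad_token_id] += padding_bonus
    return logits
```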
@@ -16,7 +16,6 @@ from urllib.parse import urlencode
import msgpack
import numpy as np
import sphn
import tqdm
import websockets
@@ -52,6 +51,10 @@ async def receive_messages(websocket: websockets.ClientConnection, output_queue)
async def output_audio(out: str, output_queue: asyncio.Queue[np.ndarray | None]):
    if out == "-":
        # This will fail with "OSError: PortAudio library not found" on servers with no
        # audio output, so only import if the user requests it.
        import sounddevice as sd

        should_exit = False

        def audio_callback(outdata, _a, _b, _c):
@@ -157,6 +160,10 @@ async def websocket_client():
    print("Enter text to synthesize (Ctrl+D to end input):")
    headers = {"kyutai-api-key": args.api_key}

    # For clients that don't support the `additional_headers` parameter when connecting
    # (notably JS libraries like react-use-websocket),
    # you can also provide the API key in the query string with the "auth_id" key,
    # i.e. by adding "&auth_id=public_token" at the end of `uri`.
    async with websockets.connect(uri, additional_headers=headers) as websocket:
        print("connected")
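To complement the header-based auth shown above: a client that can't set custom headers (as the comment notes) could pass the token in the query string instead. A minimal sketch, not part of the actual script, reusing the example host, port, path, and token from `configs/config-tts.toml`:

```python
import asyncio
from urllib.parse import urlencode

import websockets

async def main():
    # "auth_id" must match an entry in `authorized_ids`, and the path
    # matches `path` under [modules.tts_py] in config-tts.toml.
    query = urlencode({"auth_id": "public_token"})
    uri = f"ws://localhost:8089/api/tts_streaming?{query}"
    # No kyutai-api-key header needed; the server reads auth_id instead.
    async with websockets.connect(uri) as websocket:
        print("connected")

asyncio.run(main())
```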