Document TTS configuration better (#142)

* Document TTS configuration better

* Improve CFG documentation
This commit is contained in:
Václav Volhejn
2025-11-26 19:19:28 +01:00
committed by GitHub
parent 8fb54b2b07
commit 13f343f88c
3 changed files with 51 additions and 2 deletions

View File

@@ -276,6 +276,8 @@ echo "Hey, how are you?" | python scripts/tts_rust_server.py - -
# From text file to audio file
python scripts/tts_rust_server.py text_to_say.txt audio_output.wav
```
You can configure the server by modifying `configs/config-tts.toml`. See the comments in that file for the available options.
</details>
<details>

View File

@@ -1,20 +1,60 @@
static_dir = "./static/"
log_dir = "$HOME/tmp/tts-logs"
# Used to identify the server when logging.
instance_name = "tts"
# Simple security: require clients to provide an auth token when connecting.
# It can be provided by adding auth_id to the query string, e.g.
# "localhost:8089/api/tts_streaming?auth_id=public_token"
# or by setting the kyutai-api-key HTTP header, see the tts_rust_server.py example.
authorized_ids = ["public_token"]
[modules.tts_py]
type = "Py"
# Under which path should the TTS be available? This is relevant because the server
# can run STT at the same time.
path = "/api/tts_streaming"
text_tokenizer_file = "hf://kyutai/tts-1.6b-en_fr/tokenizer_spm_8k_en_fr_audio.model"
batch_size = 8 # Adjust to your GPU memory capacity
# Batch size determines how many parallel connections the server can handle.
# Higher values mean slower inference. Adjust to your GPU memory capacity.
batch_size = 8
text_bos_token = 1
[modules.tts_py.py]
log_folder = "$HOME/tmp/moshi-server-logs"
# The folder to read voices from. Can be a local directory, or a Hugging Face repo
# using the "hf-snapshot://" prefix. We use a glob to only download the .safetensors files
# with voice embeddings since the repo also contains .wav files we don't need.
voice_folder = "hf-snapshot://kyutai/tts-voices/**/*.safetensors"
# This voice will be used if the user doesn't specify one, or selects a non-existent one.
# This usually means something is wrong, so here we set it to a strange voice to make it clear
# that something is off.
# Relative to the voice folder.
default_voice = "unmute-prod-website/default_voice.wav"
# Classifier-free guidance coefficient (see https://arxiv.org/abs/2207.12598).
# TLDR: A higher CFG value makes the model adhere to the voice more closely,
# but it can affect audio quality and make it more likely to make mistakes
# like inserting words that aren't in the script.
# Technical details:
# CFG has the disadvantage of increasing inference time, because you need to run the model
# twice for each step (once with the voice embedding, once without).
# The default model, "tts-1.6b-en_fr", is trained with CFG distillation, which means it learns
# to mimic CFG with different coefs during training, without actually using CFG at inference time.
# There is only a fixed set of CFG coefficients the model was trained with, so using a different
# value will not work. The recommended value for this model is 2.0.
cfg_coef = 2.0
# Whether the unconditioned branch of the CFG should still have text conditioning or not.
# Typically, no need to touch this.
cfg_is_no_text = true
# Number of padding frames to force between words. Will make the model articulate
# a bit better with values such as 1.
padding_between = 1
# Number of quantization levels for the residual vector quantizer.
# Higher means better sounding audio but longer inference.
# The maximum is typically 32, reasonable values are 8-32.
n_q = 24
# Make the model speak faster or slower by changing how likely it is to sample the padding token.
# Should be between -2 and 2, with positive values leading to slower speech.
padding_bonus = 0

View File

@@ -16,7 +16,6 @@ from urllib.parse import urlencode
import msgpack
import numpy as np
import sounddevice as sd
import sphn
import tqdm
import websockets
@@ -52,6 +51,10 @@ async def receive_messages(websocket: websockets.ClientConnection, output_queue)
async def output_audio(out: str, output_queue: asyncio.Queue[np.ndarray | None]):
if out == "-":
# This will fail with "OSError: PortAudio library not found" on servers with no
# audio output, so only import if the user requests it.
import sounddevice as sd
should_exit = False
def audio_callback(outdata, _a, _b, _c):
@@ -157,6 +160,10 @@ async def websocket_client():
print("Enter text to synthesize (Ctrl+D to end input):")
headers = {"kyutai-api-key": args.api_key}
# For clients that don't support the `additional_headers` parameter when connecting
# (notably: JS libraries like react-use-websocket),
# you can also provide the API key in the query string with the "auth_id" key,
# i.e. adding "&auth_id=public_token" at the end of `uri`
async with websockets.connect(uri, additional_headers=headers) as websocket:
print("connected")