# Mirror of https://github.com/kyutai-labs/delayed-streams-modeling.git
# Synced 2025-12-22 19:09:57 +00:00
# Directory from which the server serves static files.
static_dir = "./static/"

# NOTE(review): TOML itself performs no variable expansion, so "$HOME" is a
# literal string here — the consuming server is presumably expanding it; verify.
log_dir = "$HOME/tmp/tts-logs"

# Used to identify the server when logging.
instance_name = "tts"

# Simple security: require clients to provide an auth token when connecting.
# It can be set by setting auth_id to the query string, e.g.
# "localhost:8089/api/tts_streaming?auth_id=public_token"
# or by setting the kyutai-api-key HTTP header, see the tts_rust_server.py example.
authorized_ids = ["public_token"]
# Module definition for the Python-backed TTS endpoint.
[modules.tts_py]
type = "Py"

# Under which path should the TTS be available? This is relevant because the server
# can run STT at the same time.
path = "/api/tts_streaming"

text_tokenizer_file = "hf://kyutai/tts-1.6b-en_fr/tokenizer_spm_8k_en_fr_audio.model"

# Batch size determines how many parallel connections can the server handle.
# Higher values mean slower inference. Adjust to your GPU memory capacity.
batch_size = 8
text_bos_token = 1
# Python-side settings for the tts_py module.
[modules.tts_py.py]
# NOTE(review): "$HOME" is a literal string in TOML; the server is presumably
# expanding it — verify against the consumer.
log_folder = "$HOME/tmp/moshi-server-logs"

# The folder to read voices from. Can be a local directory, or a Hugging Face repo
# using the "hf-snapshot://" prefix. We use a glob to only download the .safetensors files
# with voice embeddings since the repo also contains .wav files we don't need.
voice_folder = "hf-snapshot://kyutai/tts-voices/**/*.safetensors"

# This voice will be used if the user doesn't specify one, or selects a non-existent one.
# This usually means something is wrong, so here we set it to a strange voice to make it clear
# that something is off.
# Relative to the voice folder.
default_voice = "unmute-prod-website/default_voice.wav"

# Classifier-free guidance coefficient (see https://arxiv.org/abs/2207.12598).
# TLDR: A higher CFG value makes the model adhere to the voice more closely,
# but it can affect audio quality and make it more likely to make mistakes
# like inserting words that aren't in the script.
# Technical details:
# CFG has the disadvantage of increasing inference time, because you need to run the model
# twice for each step (once with the voice embedding, once without).
# The default model, "tts-1.6b-en_fr", is trained with CFG distillation, which means it learns
# to mimic CFG with different coefs during training, without actually using CFG at inference time.
# There is only a fixed set of CFG coefs the model was trained with, so using a different value
# will not work. The recommended value for this model is 2.0.
cfg_coef = 2.0

# Whether the unconditioned branch of the CFG should still have text conditioning or not.
# Typically, no need to touch this.
cfg_is_no_text = true

# Number of padding frames to force between words. Will make the model articulate
# a bit better with values such as 1.
padding_between = 1

# Number of quantization levels for the residual vector quantizer.
# Higher means better sounding audio but longer inference.
# The maximum is typically 32, reasonable values are 8-32.
n_q = 24

# Make the model speak faster or slower by changing how likely it is to sample the padding token.
# Should be between -2 and 2, with positive values leading to slower speech.
padding_bonus = 0