mirror of
https://github.com/kyutai-labs/delayed-streams-modeling.git
synced 2025-12-22 19:09:57 +00:00
Document TTS configuration better (#142)
* Document TTS configuration better
* Improve CFG documentation
@@ -276,6 +276,8 @@ echo "Hey, how are you?" | python scripts/tts_rust_server.py - -
# From text file to audio file
python scripts/tts_rust_server.py text_to_say.txt audio_output.wav
```

You can configure the server by modifying `configs/config-tts.toml`. See the comments in that file for an overview of the available options.

</details>
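For reference, those options can also be inspected programmatically. A quick, illustrative sketch (not part of the repo) using Python's standard `tomllib` (Python 3.11+); the key paths follow the config shown in the next section:

```python
import tomllib

# Load the TTS server configuration (path relative to the repo root).
with open("configs/config-tts.toml", "rb") as f:
    config = tomllib.load(f)

# A top-level option and a per-module option documented in the file.
print(config["authorized_ids"])
print(config["modules"]["tts_py"]["py"]["cfg_coef"])
```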
<details>
@@ -1,20 +1,60 @@
static_dir = "./static/"
log_dir = "$HOME/tmp/tts-logs"

# Used to identify the server when logging.
instance_name = "tts"

# Simple security: require clients to provide an auth token when connecting.
# It can be set by adding auth_id to the query string, e.g.
# "localhost:8089/api/tts_streaming?auth_id=public_token"
# or by setting the kyutai-api-key HTTP header, see the tts_rust_server.py example.
authorized_ids = ["public_token"]

[modules.tts_py]
type = "Py"
# Under which path should the TTS be available? This is relevant because the server
# can run STT at the same time.
path = "/api/tts_streaming"
text_tokenizer_file = "hf://kyutai/tts-1.6b-en_fr/tokenizer_spm_8k_en_fr_audio.model"
# Batch size determines how many parallel connections the server can handle.
# Higher values mean slower inference. Adjust to your GPU memory capacity.
batch_size = 8
text_bos_token = 1

[modules.tts_py.py]
log_folder = "$HOME/tmp/moshi-server-logs"
# The folder to read voices from. Can be a local directory, or a Hugging Face repo
# using the "hf-snapshot://" prefix. We use a glob to only download the .safetensors files
# with voice embeddings, since the repo also contains .wav files we don't need.
voice_folder = "hf-snapshot://kyutai/tts-voices/**/*.safetensors"
# This voice will be used if the user doesn't specify one, or selects a non-existent one.
# This usually means something is wrong, so here we set it to a strange voice to make it clear
# that something is off.
# Relative to the voice folder.
default_voice = "unmute-prod-website/default_voice.wav"

# Classifier-free guidance coefficient (see https://arxiv.org/abs/2207.12598).
# TL;DR: a higher CFG value makes the model adhere to the voice more closely,
# but it can affect audio quality and make it more likely to make mistakes
# like inserting words that aren't in the script.
# Technical details:
# CFG has the disadvantage of increasing inference time, because you need to run the model
# twice for each step (once with the voice embedding, once without).
# The default model, "tts-1.6b-en_fr", is trained with CFG distillation, which means it learns
# to mimic CFG with different coefficients during training, without actually using CFG at inference time.
# There is only a fixed set of CFG coefficients the model was trained with, so using a different value
# will not work. The recommended value for this model is 2.0.
cfg_coef = 2.0

# Whether the unconditioned branch of the CFG should still have text conditioning or not.
# Typically, no need to touch this.
cfg_is_no_text = true

# Number of padding frames to force between words. Values such as 1 make the model
# articulate a bit better.
padding_between = 1

# Number of quantization levels for the residual vector quantizer.
# Higher means better-sounding audio but longer inference.
# The maximum is typically 32; reasonable values are 8-32.
n_q = 24

# Make the model speak faster or slower by changing how likely it is to sample the padding token.
# Should be between -2 and 2, with positive values leading to slower speech.
padding_bonus = 0
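To make the `cfg_coef` and `padding_bonus` comments above concrete, here is a minimal Python sketch of how these two knobs conventionally enter a sampling step. This illustrates the standard CFG formula from the paper linked above, not moshi-server's actual implementation (with CFG distillation the unconditioned forward pass is skipped entirely); the array names and `pad_token_id` are hypothetical.

```python
import numpy as np

def guided_logits(
    logits_cond: np.ndarray,     # model output with the voice embedding
    logits_uncond: np.ndarray,   # model output without it
    cfg_coef: float = 2.0,       # `cfg_coef` above
    padding_bonus: float = 0.0,  # `padding_bonus` above
    pad_token_id: int = 0,       # hypothetical id of the padding token
) -> np.ndarray:
    # Classifier-free guidance: extrapolate away from the unconditioned
    # prediction; cfg_coef = 1.0 would reproduce the conditioned logits.
    logits = logits_uncond + cfg_coef * (logits_cond - logits_uncond)
    # Bias the padding token's logit: positive values make padding (pauses,
    # hence slower speech) more likely, negative values make speech faster.
    logits[..., pad_token_id] += padding_bonus
    return logits
```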
@@ -16,7 +16,6 @@ from urllib.parse import urlencode
import msgpack
import numpy as np
import sphn
import tqdm
import websockets
@@ -52,6 +51,10 @@ async def receive_messages(websocket: websockets.ClientConnection, output_queue)
async def output_audio(out: str, output_queue: asyncio.Queue[np.ndarray | None]):
    if out == "-":
        # This will fail with "OSError: PortAudio library not found" on servers with no
        # audio output, so only import if the user requests it.
        import sounddevice as sd

        should_exit = False

        def audio_callback(outdata, _a, _b, _c):
@@ -157,6 +160,10 @@ async def websocket_client():
    print("Enter text to synthesize (Ctrl+D to end input):")
    headers = {"kyutai-api-key": args.api_key}

    # For clients that don't support the `additional_headers` parameter when connecting
    # (notably JS libraries like react-use-websocket),
    # you can also provide the API key in the query string with the "auth_id" key,
    # i.e. by adding "&auth_id=public_token" at the end of `uri`.
    async with websockets.connect(uri, additional_headers=headers) as websocket:
        print("connected")
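To complement the header-based auth shown above: a client that can't set custom headers (as the comment notes) could pass the token in the query string instead. A minimal sketch, not part of the actual script, reusing the example host, port, path, and token from `configs/config-tts.toml`:

```python
import asyncio
from urllib.parse import urlencode

import websockets

async def main():
    # "auth_id" must match an entry in `authorized_ids`, and the path
    # matches `path` under [modules.tts_py] in config-tts.toml.
    query = urlencode({"auth_id": "public_token"})
    uri = f"ws://localhost:8089/api/tts_streaming?{query}"
    # No kyutai-api-key header needed; the server reads auth_id instead.
    async with websockets.connect(uri) as websocket:
        print("connected")

asyncio.run(main())
```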