From 51fc312d7b3f690fb1254f23878d81231d5a5f19 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=A1clav=20Volhejn?= <vaclav@kyutai.org>
Date: Thu, 20 Nov 2025 10:56:59 +0100
Subject: [PATCH] Improve CFG documentation

---
 configs/config-tts.toml | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/configs/config-tts.toml b/configs/config-tts.toml
index 0f04192..faec211 100644
--- a/configs/config-tts.toml
+++ b/configs/config-tts.toml
@@ -30,16 +30,26 @@ voice_folder = "hf-snapshot://kyutai/tts-voices/**/*.safetensors"
 # that something is off.
 # Relative to the voice folder.
 default_voice = "unmute-prod-website/default_voice.wav"
-# Classifier-free guidance coefficient. 
-# The default model, "tts-1.6b-en_fr", is trained with CFG distillation, which means it learns
-# to mimic CFG behavior during training, and this parameter is ignored.
-# For other models, a higher CFG value makes them adhere to the voice more
-# closely, but makes them more likely to make mistakes like inserting words that
-# aren't in the script.
-# See: https://arxiv.org/abs/2207.12598
 
+# Classifier-free guidance coefficient (see https://arxiv.org/abs/2207.12598).
+# TLDR: A higher CFG value makes the model adhere to the voice more closely,
+# but it can affect audio quality and make the model more likely to make mistakes
+# like inserting words that aren't in the script.
+# Technical details:
+# CFG has the disadvantage of increasing inference time, because you need to run the model
+# twice for each step (once with the voice embedding, once without).
+# The default model, "tts-1.6b-en_fr", is trained with CFG distillation, which means it learns
+# to mimic CFG with different coefs during training, without actually using CFG at inference time.
+# There is only a fixed set of CFG coefs the model was trained with, so using a different value
+# will not work. The recommended value for this model is 2.0.
 cfg_coef = 2.0
+
+# Whether the unconditioned branch of the CFG should still have text conditioning or not.
+# Typically, no need to touch this.
 cfg_is_no_text = true
+
+# Number of padding frames to force between words. Will make the model articulate
+# a bit better with values such as 1.
 padding_between = 1
 # Number of quantization levels for the residual vector quantizer.
 # Higher means better sounding audio but longer inference.