Fix usage examples and a few small things (#24)

2026-01-04 16:32:55 +00:00 · 2025-07-02 08:58:45 +02:00
parent 4985940aad
commit 395eaeae95
4 changed files with 17 additions and 30 deletions
--- a/scripts/evaluate_on_dataset.py
+++ b/scripts/evaluate_on_dataset.py
@@ -14,14 +14,6 @@
 Example implementation of the streaming STT example. Here we group
 test utterances in batches (pre- and post-padded with silence) and
 then feed these batches into the streaming STT model frame-by-frame.
-
-Example command:
-```
-uv run scripts/streaming_stt.py \
-    --dataset meanwhile \
-    --hf-repo  kyutai/stt-2.6b-en
-```
-
 """

 # The outputs I get on my H100 using this code with the 2.6B model,
@@ -365,7 +357,7 @@ if __name__ == "__main__":
    )

    parser.add_argument(
-        "--hf-repo", type=str, help="HF repo to load the STT model from. "
+        "--hf-repo", type=str, help="HF repo to load the STT model from."
    )
    parser.add_argument("--tokenizer", type=str, help="Path to a local tokenizer file.")
    parser.add_argument(
--- a/scripts/transcribe_from_file_via_pytorch.py
+++ b/scripts/transcribe_from_file_via_pytorch.py
@@ -10,13 +10,6 @@

 """An example script that illustrates how one can get per-word timestamps from
 Kyutai STT models.
-
-Usage:
-```
-uv run scripts/streaming_stt_timestamps.py \
-    --hf-repo kyutai/stt-2.6b-en \
-    --file bria.mp3
-```
 """

 import argparse
@@ -185,6 +178,8 @@ def main(args):
            if text_tokens is not None:
                text_tokens_accum.append(text_tokens)

+            print(tokenizer.decode(text_tokens.numpy().tolist()))
+
    utterance_tokens = torch.concat(text_tokens_accum, dim=-1)
    timed_text = tokens_to_timestamped_text(
        utterance_tokens,
@@ -201,11 +196,7 @@ def main(args):

 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Example streaming STT w/ timestamps.")
-    parser.add_argument(
-        "--file",
-        required=True,
-        help="File to transcribe.",
-    )
+    parser.add_argument("in_file", help="The file to transcribe.")

    parser.add_argument(
        "--hf-repo", type=str, help="HF repo to load the STT model from. "
--- a/scripts/transcribe_from_mic_via_mlx.py
+++ b/scripts/transcribe_from_mic_via_mlx.py
@@ -70,7 +70,7 @@ if __name__ == "__main__":
    def audio_callback(indata, _frames, _time, _status):
        block_queue.put(indata.copy())

-    print("start recording the user input")
+    print("recording audio from microphone, speak to get your words transcribed")
    with sd.InputStream(
        channels=1,
        dtype="float32",