mirror of
https://github.com/kyutai-labs/delayed-streams-modeling.git
synced 2026-01-04 16:32:55 +00:00
Fix usage examples and a few small things (#24)
This commit is contained in:
@@ -14,14 +14,6 @@
|
||||
Example implementation of the streaming STT example. Here we group
|
||||
test utterances in batches (pre- and post-padded with silence) and
|
||||
then feed these batches into the streaming STT model frame-by-frame.
|
||||
|
||||
Example command:
|
||||
```
|
||||
uv run scripts/streaming_stt.py \
|
||||
--dataset meanwhile \
|
||||
--hf-repo kyutai/stt-2.6b-en
|
||||
```
|
||||
|
||||
"""
|
||||
|
||||
# The outputs I get on my H100 using this code with the 2.6B model,
|
||||
@@ -365,7 +357,7 @@ if __name__ == "__main__":
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--hf-repo", type=str, help="HF repo to load the STT model from. "
|
||||
"--hf-repo", type=str, help="HF repo to load the STT model from."
|
||||
)
|
||||
parser.add_argument("--tokenizer", type=str, help="Path to a local tokenizer file.")
|
||||
parser.add_argument(
|
||||
|
||||
@@ -10,13 +10,6 @@
|
||||
|
||||
"""An example script that illustrates how one can get per-word timestamps from
|
||||
Kyutai STT models.
|
||||
|
||||
Usage:
|
||||
```
|
||||
uv run scripts/streaming_stt_timestamps.py \
|
||||
--hf-repo kyutai/stt-2.6b-en \
|
||||
--file bria.mp3
|
||||
```
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -185,6 +178,8 @@ def main(args):
|
||||
if text_tokens is not None:
|
||||
text_tokens_accum.append(text_tokens)
|
||||
|
||||
print(tokenizer.decode(text_tokens.numpy().tolist()))
|
||||
|
||||
utterance_tokens = torch.concat(text_tokens_accum, dim=-1)
|
||||
timed_text = tokens_to_timestamped_text(
|
||||
utterance_tokens,
|
||||
@@ -201,11 +196,7 @@ def main(args):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Example streaming STT w/ timestamps.")
|
||||
parser.add_argument(
|
||||
"--file",
|
||||
required=True,
|
||||
help="File to transcribe.",
|
||||
)
|
||||
parser.add_argument("in_file", help="The file to transcribe.")
|
||||
|
||||
parser.add_argument(
|
||||
"--hf-repo", type=str, help="HF repo to load the STT model from. "
|
||||
|
||||
@@ -70,7 +70,7 @@ if __name__ == "__main__":
|
||||
def audio_callback(indata, _frames, _time, _status):
|
||||
block_queue.put(indata.copy())
|
||||
|
||||
print("start recording the user input")
|
||||
print("recording audio from microphone, speak to get your words transcribed")
|
||||
with sd.InputStream(
|
||||
channels=1,
|
||||
dtype="float32",
|
||||
|
||||
Reference in New Issue
Block a user