add easy_synthesize

2026-05-16 13:50:40 +00:00 · 2024-09-13 08:46:03 +00:00
parent 5f52896173
commit 94c30aed9b
2 changed files with 43 additions and 6 deletions
--- a/sbv2_core/src/main.rs
+++ b/sbv2_core/src/main.rs
@@ -24,7 +24,8 @@ fn main() -> anyhow::Result<()> {
        tones.clone(),
        lang_ids.clone(),
        style_vector.clone(),
-        tts::SynthesizeOptions::default(),
+        0.0,
+        0.5,
    )?;
    std::fs::write("output.wav", data)?;
    let now = Instant::now();
@@ -44,7 +45,8 @@ fn main() -> anyhow::Result<()> {
            tones.clone(),
            lang_ids.clone(),
            style_vector.clone(),
-            tts::SynthesizeOptions::default(),
+            0.0,
+            1.0,
        )?;
    }
    println!(
--- a/sbv2_core/src/tts.rs
+++ b/sbv2_core/src/tts.rs
@@ -1,6 +1,7 @@
 use crate::error::{Error, Result};
 use crate::{bert, jtalk, model, nlp, norm, style, tokenizer, utils};
-use ndarray::{concatenate, s, Array, Array1, Array2, Axis};
+use ndarray::{concatenate, s, Array, Array1, Array2, Array3, Axis};
+use hound::{SampleFormat, WavSpec, WavWriter};
 use ort::Session;
 use std::io::{Cursor, Read};
 use tar::Archive;
@@ -197,6 +198,37 @@ impl TTSModelHolder {
        style::get_style_vector(&self.find_model(ident)?.style_vectors, style_id, weight)
    }

+    pub fn easy_synthesize<I: Into<TTSIdent>>(
+        &self,
+        ident: I,
+        text: &str,
+        style_id: i32,
+        options: SynthesizeOptions,
+    ) -> Result<()> {
+        let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?;
+        let style_vector = self.get_style_vector(ident, style_id, options.style_weight)?;
+        Ok(())
+    }
+
+    fn array_to_vec(audio_array: Array3<f32>) -> Result<Vec<u8>> {
+        let spec = WavSpec {
+            channels: 1,
+            sample_rate: 44100,
+            bits_per_sample: 32,
+            sample_format: SampleFormat::Float,
+        };
+        let mut cursor = Cursor::new(Vec::new());
+        let mut writer = WavWriter::new(&mut cursor, spec)?;
+        for i in 0..audio_array.shape()[0] {
+            let output = audio_array.slice(s![i, 0, ..]).to_vec();
+            for sample in output {
+                writer.write_sample(sample)?;
+            }
+        }
+        writer.finalize()?;
+        Ok(cursor.into_inner())
+    }
+
    #[allow(clippy::too_many_arguments)]
    pub fn synthesize<I: Into<TTSIdent>>(
        &self,
@@ -206,7 +238,8 @@ impl TTSModelHolder {
        tones: Array1<i64>,
        lang_ids: Array1<i64>,
        style_vector: Array1<f32>,
-        options: SynthesizeOptions,
+        sdp_ratio: f32,
+        length_scale: f32,
    ) -> Result<Vec<u8>> {
        let buffer = model::synthesize(
            &self.find_model(ident)?.vits2,
@@ -215,8 +248,8 @@ impl TTSModelHolder {
            tones,
            lang_ids,
            style_vector,
-            options.sdp_ratio,
-            options.length_scale,
+            sdp_ratio,
+            length_scale,
        )?;
        Ok(buffer)
    }
@@ -225,6 +258,7 @@ impl TTSModelHolder {
 pub struct SynthesizeOptions {
    sdp_ratio: f32,
    length_scale: f32,
+    style_weight: f32,
    split_sentences: bool,
 }

@@ -233,6 +267,7 @@ impl Default for SynthesizeOptions {
        SynthesizeOptions {
            sdp_ratio: 0.0,
            length_scale: 1.0,
+            style_weight: 1.0,
            split_sentences: true,
        }
    }