とりあえず完成？

2025-12-27 17:42:57 +00:00 · 2024-09-13 09:05:43 +00:00
parent 94c30aed9b
commit 703e1a99c1
2 changed files with 55 additions and 30 deletions
--- a/sbv2_core/src/model.rs
+++ b/sbv2_core/src/model.rs
@@ -1,8 +1,6 @@
 use crate::error::Result;
-use hound::{SampleFormat, WavSpec, WavWriter};
-use ndarray::{array, s, Array1, Array2, Axis};
+use ndarray::{array, Array1, Array2, Array3, Axis};
 use ort::{GraphOptimizationLevel, Session};
-use std::io::Cursor;

 #[allow(clippy::vec_init_then_push, unused_variables)]
 pub fn load_model<P: AsRef<[u8]>>(model_file: P, bert: bool) -> Result<Session> {
@@ -59,7 +57,7 @@ pub fn synthesize(
    style_vector: Array1<f32>,
    sdp_ratio: f32,
    length_scale: f32,
-) -> Result<Vec<u8>> {
+) -> Result<Array3<f32>> {
    let bert = bert_ori.insert_axis(Axis(0));
    let x_tst_lengths: Array1<i64> = array![x_tst.shape()[0] as i64];
    let x_tst = x_tst.insert_axis(Axis(0));
@@ -84,24 +82,8 @@ pub fn synthesize(
        .try_extract_tensor::<f32>()?
        .to_owned();

-    let buffer = {
-        let spec = WavSpec {
-            channels: 1,
-            sample_rate: 44100,
-            bits_per_sample: 32,
-            sample_format: SampleFormat::Float,
-        };
-        let mut cursor = Cursor::new(Vec::new());
-        let mut writer = WavWriter::new(&mut cursor, spec)?;
-        for i in 0..audio_array.shape()[0] {
-            let output = audio_array.slice(s![i, 0, ..]).to_vec();
-            for sample in output {
-                writer.write_sample(sample)?;
-            }
-        }
-        writer.finalize()?;
-        cursor.into_inner()
-    };
-
-    Ok(buffer)
+    Ok(Array3::from_shape_vec(
+        (audio_array.shape()[0], audio_array.shape()[1], audio_array.shape()[2]),
+        audio_array.into_raw_vec_and_offset().0,
+    )?)
 }
--- a/sbv2_core/src/tts.rs
+++ b/sbv2_core/src/tts.rs
@@ -198,16 +198,59 @@ impl TTSModelHolder {
        style::get_style_vector(&self.find_model(ident)?.style_vectors, style_id, weight)
    }

-    pub fn easy_synthesize<I: Into<TTSIdent>>(
+    pub fn easy_synthesize<I: Into<TTSIdent> + Copy>(
        &self,
        ident: I,
        text: &str,
        style_id: i32,
        options: SynthesizeOptions,
-    ) -> Result<()> {
-        let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?;
+    ) -> Result<Vec<u8>> {
        let style_vector = self.get_style_vector(ident, style_id, options.style_weight)?;
-        Ok(())
+        let audio_array = if options.split_sentences { // one synthesis pass per non-empty line
+            let texts: Vec<&str> = text.split("\n").collect();
+            let mut audios = vec![];
+            for (i, t) in texts.iter().enumerate() {
+                if t.is_empty() {
+                    continue;
+                }
+                let (bert_ori, phones, tones, lang_ids) = self.parse_text(t)?;
+                let audio = model::synthesize(
+                    &self.find_model(ident)?.vits2,
+                    bert_ori.to_owned(),
+                    phones,
+                    tones,
+                    lang_ids,
+                    style_vector.clone(),
+                    options.sdp_ratio,
+                    options.length_scale,
+                )?;
+                audios.push(audio);
+                if i != texts.len() - 1 {
+                    // 0.5 s of silence (22050 samples at 44100 Hz); assumes audio is (batch, 1, samples)
+                    audios.push(Array3::zeros((1, 1, 22050)));
+                }
+            }
+            concatenate(
+                Axis(2), // join end-to-end along the sample axis; the other axes must match
+                &audios
+                    .iter()
+                    .map(|x| x.view())
+                    .collect::<Vec<_>>()
+            )?
+        } else {
+            let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?;
+            model::synthesize(
+                &self.find_model(ident)?.vits2,
+                bert_ori.to_owned(),
+                phones,
+                tones,
+                lang_ids,
+                style_vector,
+                options.sdp_ratio,
+                options.length_scale,
+            )?
+        };
+        Ok(Self::array_to_vec(audio_array)?)
    }

    fn array_to_vec(audio_array: Array3<f32>) -> Result<Vec<u8>> {
@@ -241,7 +284,7 @@ impl TTSModelHolder {
        sdp_ratio: f32,
        length_scale: f32,
    ) -> Result<Vec<u8>> {
-        let buffer = model::synthesize(
+        let audio_array = model::synthesize(
            &self.find_model(ident)?.vits2,
            bert_ori.to_owned(),
            phones,
@@ -251,7 +294,7 @@ impl TTSModelHolder {
            sdp_ratio,
            length_scale,
        )?;
-        Ok(buffer)
+        Ok(Self::array_to_vec(audio_array)?)
    }
 }