From 703e1a99c1a2eeae9da781137ed35d37576e759d Mon Sep 17 00:00:00 2001
From: tuna2134 <masato@priv.tuna2134.dev>
Date: Fri, 13 Sep 2024 09:05:43 +0000
Subject: [PATCH] =?UTF-8?q?=E3=81=A8=E3=82=8A=E3=81=82=E3=81=88=E3=81=9A?=
 =?UTF-8?q?=E5=AE=8C=E6=88=90=EF=BC=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 sbv2_core/src/model.rs | 30 +++++------------------
 sbv2_core/src/tts.rs   | 55 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 55 insertions(+), 30 deletions(-)
diff --git a/sbv2_core/src/model.rs b/sbv2_core/src/model.rs
index 2b975ca..173ab23 100644
--- a/sbv2_core/src/model.rs
+++ b/sbv2_core/src/model.rs
@@ -1,8 +1,6 @@
 use crate::error::Result;
-use hound::{SampleFormat, WavSpec, WavWriter};
-use ndarray::{array, s, Array1, Array2, Axis};
+use ndarray::{array, Array1, Array2, Array3, Axis};
 use ort::{GraphOptimizationLevel, Session};
-use std::io::Cursor;
 
 #[allow(clippy::vec_init_then_push, unused_variables)]
 pub fn load_model<P: AsRef<[u8]>>(model_file: P, bert: bool) -> Result<Session> {
@@ -59,7 +57,7 @@ pub fn synthesize(
     style_vector: Array1<f32>,
     sdp_ratio: f32,
     length_scale: f32,
-) -> Result<Vec<u8>> {
+) -> Result<Array3<f32>> {
     let bert = bert_ori.insert_axis(Axis(0));
     let x_tst_lengths: Array1<i64> = array![x_tst.shape()[0] as i64];
     let x_tst = x_tst.insert_axis(Axis(0));
@@ -84,24 +82,8 @@ pub fn synthesize(
         .try_extract_tensor::<f32>()?
         .to_owned();
 
-    let buffer = {
-        let spec = WavSpec {
-            channels: 1,
-            sample_rate: 44100,
-            bits_per_sample: 32,
-            sample_format: SampleFormat::Float,
-        };
-        let mut cursor = Cursor::new(Vec::new());
-        let mut writer = WavWriter::new(&mut cursor, spec)?;
-        for i in 0..audio_array.shape()[0] {
-            let output = audio_array.slice(s![i, 0, ..]).to_vec();
-            for sample in output {
-                writer.write_sample(sample)?;
-            }
-        }
-        writer.finalize()?;
-        cursor.into_inner()
-    };
-
-    Ok(buffer)
+    Ok(Array3::from_shape_vec(
+        (audio_array.shape()[0], audio_array.shape()[1], audio_array.shape()[2]),
+        audio_array.into_raw_vec_and_offset().0,
+    )?)
 }
diff --git a/sbv2_core/src/tts.rs b/sbv2_core/src/tts.rs
index 26a4375..b031e6f 100644
--- a/sbv2_core/src/tts.rs
+++ b/sbv2_core/src/tts.rs
@@ -198,16 +198,59 @@ impl TTSModelHolder {
         style::get_style_vector(&self.find_model(ident)?.style_vectors, style_id, weight)
     }
 
-    pub fn easy_synthesize<I: Into<TTSIdent>>(
+    /// Synthesizes `text` with the model registered under `ident` and returns the
+    /// bytes produced by `Self::array_to_vec`. When `options.split_sentences` is set,
+    /// each non-empty line is synthesized separately and the clips are joined along
+    /// axis 2 with 0.5 s of silence between consecutive lines.
+    pub fn easy_synthesize<I: Into<TTSIdent> + Copy>(
         &self,
         ident: I,
         text: &str,
         style_id: i32,
         options: SynthesizeOptions,
-    ) -> Result<()> {
-        let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?;
+    ) -> Result<Vec<u8>> {
         let style_vector = self.get_style_vector(ident, style_id, options.style_weight)?;
-        Ok(())
+        let audio_array = if options.split_sentences {
+            // Skip blank lines up front so pauses are inserted only between spoken lines.
+            let texts: Vec<&str> = text.split('\n').filter(|t| !t.is_empty()).collect();
+            let mut audios = vec![];
+            for (i, t) in texts.iter().enumerate() {
+                let (bert_ori, phones, tones, lang_ids) = self.parse_text(t)?;
+                let audio = model::synthesize(
+                    &self.find_model(ident)?.vits2,
+                    bert_ori.to_owned(),
+                    phones,
+                    tones,
+                    lang_ids,
+                    style_vector.clone(),
+                    options.sdp_ratio,
+                    options.length_scale,
+                )?;
+                audios.push(audio);
+                if i != texts.len() - 1 {
+                    // 0.5 s of silence at 44.1 kHz (22050 samples); samples assumed on axis 2.
+                    audios.push(Array3::zeros((1, 1, 22050)));
+                }
+            }
+            // Clip lengths differ, so join on the sample axis; axes 0 and 1 must match.
+            concatenate(
+                Axis(2),
+                &audios.iter().map(|x| x.view()).collect::<Vec<_>>(),
+            )?
+        } else {
+            let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?;
+            model::synthesize(
+                &self.find_model(ident)?.vits2,
+                bert_ori.to_owned(),
+                phones,
+                tones,
+                lang_ids,
+                style_vector,
+                options.sdp_ratio,
+                options.length_scale,
+            )?
+        };
+        Self::array_to_vec(audio_array)
     }
 
     fn array_to_vec(audio_array: Array3<f32>) -> Result<Vec<u8>> {
@@ -241,7 +284,7 @@ impl TTSModelHolder {
         sdp_ratio: f32,
         length_scale: f32,
     ) -> Result<Vec<u8>> {
-        let buffer = model::synthesize(
+        let audio_array = model::synthesize(
             &self.find_model(ident)?.vits2,
             bert_ori.to_owned(),
             phones,
@@ -251,7 +294,7 @@ impl TTSModelHolder {
             sdp_ratio,
             length_scale,
         )?;
-        Ok(buffer)
+        Ok(Self::array_to_vec(audio_array)?)
     }
 }