From 703e1a99c1a2eeae9da781137ed35d37576e759d Mon Sep 17 00:00:00 2001 From: tuna2134 Date: Fri, 13 Sep 2024 09:05:43 +0000 Subject: [PATCH] =?UTF-8?q?=E3=81=A8=E3=82=8A=E3=81=82=E3=81=88=E3=81=9A?= =?UTF-8?q?=E5=AE=8C=E6=88=90=EF=BC=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sbv2_core/src/model.rs | 30 +++++------------------ sbv2_core/src/tts.rs | 55 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 30 deletions(-) diff --git a/sbv2_core/src/model.rs b/sbv2_core/src/model.rs index 2b975ca..173ab23 100644 --- a/sbv2_core/src/model.rs +++ b/sbv2_core/src/model.rs @@ -1,8 +1,6 @@ use crate::error::Result; -use hound::{SampleFormat, WavSpec, WavWriter}; -use ndarray::{array, s, Array1, Array2, Axis}; +use ndarray::{array, Array1, Array2, Array3, Axis}; use ort::{GraphOptimizationLevel, Session}; -use std::io::Cursor; #[allow(clippy::vec_init_then_push, unused_variables)] pub fn load_model>(model_file: P, bert: bool) -> Result { @@ -59,7 +57,7 @@ pub fn synthesize( style_vector: Array1, sdp_ratio: f32, length_scale: f32, -) -> Result> { +) -> Result> { let bert = bert_ori.insert_axis(Axis(0)); let x_tst_lengths: Array1 = array![x_tst.shape()[0] as i64]; let x_tst = x_tst.insert_axis(Axis(0)); @@ -84,24 +82,8 @@ pub fn synthesize( .try_extract_tensor::()? .to_owned(); - let buffer = { - let spec = WavSpec { - channels: 1, - sample_rate: 44100, - bits_per_sample: 32, - sample_format: SampleFormat::Float, - }; - let mut cursor = Cursor::new(Vec::new()); - let mut writer = WavWriter::new(&mut cursor, spec)?; - for i in 0..audio_array.shape()[0] { - let output = audio_array.slice(s![i, 0, ..]).to_vec(); - for sample in output { - writer.write_sample(sample)?; - } - } - writer.finalize()?; - cursor.into_inner() - }; - - Ok(buffer) + Ok(Array3::from_shape_vec( + (audio_array.shape()[0], audio_array.shape()[1], audio_array.shape()[2]), + audio_array.into_raw_vec_and_offset().0, + )?) } diff --git a/sbv2_core/src/tts.rs b/sbv2_core/src/tts.rs index 26a4375..b031e6f 100644 --- a/sbv2_core/src/tts.rs +++ b/sbv2_core/src/tts.rs @@ -198,16 +198,59 @@ impl TTSModelHolder { style::get_style_vector(&self.find_model(ident)?.style_vectors, style_id, weight) } - pub fn easy_synthesize>( + pub fn easy_synthesize + Copy>( &self, ident: I, text: &str, style_id: i32, options: SynthesizeOptions, - ) -> Result<()> { - let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?; + ) -> Result> { let style_vector = self.get_style_vector(ident, style_id, options.style_weight)?; - Ok(()) + let audio_array = if options.split_sentences { + let texts: Vec<&str> = text.split("\n").collect(); + let mut audios = vec![]; + for (i, t) in texts.iter().enumerate() { + if t.is_empty() { + continue; + } + let (bert_ori, phones, tones, lang_ids) = self.parse_text(t)?; + let audio = model::synthesize( + &self.find_model(ident)?.vits2, + bert_ori.to_owned(), + phones, + tones, + lang_ids, + style_vector.clone(), + options.sdp_ratio, + options.length_scale, + )?; + audios.push(audio); + if i != texts.len() - 1 { + // 44100 * 0.5s 無音区間 + audios.push(Array3::zeros((1, 22050, 1))); + } + } + concatenate( + Axis(0), + &audios + .iter() + .map(|x| x.view()) + .collect::>() + )? + } else { + let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?; + model::synthesize( + &self.find_model(ident)?.vits2, + bert_ori.to_owned(), + phones, + tones, + lang_ids, + style_vector, + options.sdp_ratio, + options.length_scale, + )? + }; + Ok(Self::array_to_vec(audio_array)?) } fn array_to_vec(audio_array: Array3) -> Result> { @@ -241,7 +284,7 @@ impl TTSModelHolder { sdp_ratio: f32, length_scale: f32, ) -> Result> { - let buffer = model::synthesize( + let audio_array = model::synthesize( &self.find_model(ident)?.vits2, bert_ori.to_owned(), phones, @@ -251,7 +294,7 @@ impl TTSModelHolder { sdp_ratio, length_scale, )?; - Ok(buffer) + Ok(Self::array_to_vec(audio_array)?) } }