diff --git a/sbv2_core/src/main.rs b/sbv2_core/src/main.rs index b84e44a..925e735 100644 --- a/sbv2_core/src/main.rs +++ b/sbv2_core/src/main.rs @@ -24,7 +24,8 @@ fn main() -> anyhow::Result<()> { tones.clone(), lang_ids.clone(), style_vector.clone(), - tts::SynthesizeOptions::default(), + 0.0, + 0.5, )?; std::fs::write("output.wav", data)?; let now = Instant::now(); @@ -44,7 +45,8 @@ fn main() -> anyhow::Result<()> { tones.clone(), lang_ids.clone(), style_vector.clone(), - tts::SynthesizeOptions::default(), + 0.0, + 1.0, )?; } println!( diff --git a/sbv2_core/src/tts.rs b/sbv2_core/src/tts.rs index 9ff3197..26a4375 100644 --- a/sbv2_core/src/tts.rs +++ b/sbv2_core/src/tts.rs @@ -1,6 +1,7 @@ use crate::error::{Error, Result}; use crate::{bert, jtalk, model, nlp, norm, style, tokenizer, utils}; -use ndarray::{concatenate, s, Array, Array1, Array2, Axis}; +use ndarray::{concatenate, s, Array, Array1, Array2, Array3, Axis}; +use hound::{SampleFormat, WavSpec, WavWriter}; use ort::Session; use std::io::{Cursor, Read}; use tar::Archive; @@ -197,6 +198,37 @@ impl TTSModelHolder { style::get_style_vector(&self.find_model(ident)?.style_vectors, style_id, weight) } + pub fn easy_synthesize>( + &self, + ident: I, + text: &str, + style_id: i32, + options: SynthesizeOptions, + ) -> Result<()> { + let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?; + let style_vector = self.get_style_vector(ident, style_id, options.style_weight)?; + Ok(()) + } + + fn array_to_vec(audio_array: Array3) -> Result> { + let spec = WavSpec { + channels: 1, + sample_rate: 44100, + bits_per_sample: 32, + sample_format: SampleFormat::Float, + }; + let mut cursor = Cursor::new(Vec::new()); + let mut writer = WavWriter::new(&mut cursor, spec)?; + for i in 0..audio_array.shape()[0] { + let output = audio_array.slice(s![i, 0, ..]).to_vec(); + for sample in output { + writer.write_sample(sample)?; + } + } + writer.finalize()?; + Ok(cursor.into_inner()) + } + #[allow(clippy::too_many_arguments)] pub fn synthesize>( &self, @@ -206,7 +238,8 @@ impl TTSModelHolder { tones: Array1, lang_ids: Array1, style_vector: Array1, - options: SynthesizeOptions, + sdp_ratio: f32, + length_scale: f32, ) -> Result> { let buffer = model::synthesize( &self.find_model(ident)?.vits2, @@ -215,8 +248,8 @@ impl TTSModelHolder { tones, lang_ids, style_vector, - options.sdp_ratio, - options.length_scale, + sdp_ratio, + length_scale, )?; Ok(buffer) } @@ -225,6 +258,7 @@ impl TTSModelHolder { pub struct SynthesizeOptions { sdp_ratio: f32, length_scale: f32, + style_weight: f32, split_sentences: bool, } @@ -233,6 +267,7 @@ impl Default for SynthesizeOptions { SynthesizeOptions { sdp_ratio: 0.0, length_scale: 1.0, + style_weight: 1.0, split_sentences: true, } }