diff --git a/sbv2_api/src/main.rs b/sbv2_api/src/main.rs index 6a636d2..6844c21 100644 --- a/sbv2_api/src/main.rs +++ b/sbv2_api/src/main.rs @@ -44,6 +44,10 @@ fn style_id_default() -> i32 { 0 } +fn speaker_id_default() -> i64 { + 0 +} + #[derive(Deserialize, ToSchema)] struct SynthesizeRequest { text: String, @@ -54,6 +58,8 @@ struct SynthesizeRequest { length_scale: f32, #[serde(default = "style_id_default")] style_id: i32, + #[serde(default = "speaker_id_default")] + speaker_id: i64, } #[utoipa::path( @@ -71,7 +77,8 @@ async fn synthesize( ident, sdp_ratio, length_scale, - style_id + style_id, + speaker_id, }): Json, ) -> AppResult { log::debug!("processing request: text={text}, ident={ident}, sdp_ratio={sdp_ratio}, length_scale={length_scale}"); @@ -81,6 +88,7 @@ async fn synthesize( &ident, &text, style_id, + speaker_id, SynthesizeOptions { sdp_ratio, length_scale, diff --git a/sbv2_bindings/src/sbv2.rs b/sbv2_bindings/src/sbv2.rs index 0ecc414..c109869 100644 --- a/sbv2_bindings/src/sbv2.rs +++ b/sbv2_bindings/src/sbv2.rs @@ -142,6 +142,7 @@ impl TTSModel { text: String, ident: String, style_id: i32, + speaker_id: i64, sdp_ratio: f32, length_scale: f32, ) -> anyhow::Result> { @@ -149,6 +150,7 @@ impl TTSModel { ident.as_str(), &text, style_id, + speaker_id, SynthesizeOptions { sdp_ratio, length_scale, diff --git a/sbv2_core/src/main.rs b/sbv2_core/src/main.rs index 3a4cbdd..8665b3d 100644 --- a/sbv2_core/src/main.rs +++ b/sbv2_core/src/main.rs @@ -17,7 +17,7 @@ fn main_inner() -> anyhow::Result<()> { )?; tts_holder.load_sbv2file(ident, fs::read(env::var("MODEL_PATH")?)?)?; - let audio = tts_holder.easy_synthesize(ident, &text, 0, tts::SynthesizeOptions::default())?; + let audio = tts_holder.easy_synthesize(ident, &text, 0, 0, tts::SynthesizeOptions::default())?; fs::write("output.wav", audio)?; Ok(()) diff --git a/sbv2_core/src/model.rs b/sbv2_core/src/model.rs index 9f2a221..a851909 100644 --- a/sbv2_core/src/model.rs +++ b/sbv2_core/src/model.rs @@ -52,6 +52,7 @@ pub fn synthesize( session: &Session, bert_ori: Array2, x_tst: Array1, + sid: Array1, tones: Array1, lang_ids: Array1, style_vector: Array1, @@ -67,7 +68,7 @@ pub fn synthesize( let outputs = session.run(ort::inputs! { "x_tst" => x_tst, "x_tst_lengths" => x_tst_lengths, - "sid" => array![0_i64], + "sid" => sid, "tones" => tones, "language" => lang_ids, "bert" => bert, diff --git a/sbv2_core/src/tts.rs b/sbv2_core/src/tts.rs index d248e31..dedb124 100644 --- a/sbv2_core/src/tts.rs +++ b/sbv2_core/src/tts.rs @@ -229,6 +229,7 @@ impl TTSModelHolder { ident: I, text: &str, style_id: i32, + speaker_id: i64, options: SynthesizeOptions, ) -> Result> { self.find_and_load_model(ident)?; @@ -250,6 +251,7 @@ impl TTSModelHolder { &vits2, bert_ori.to_owned(), phones, + Array1::from_vec(vec![speaker_id]), tones, lang_ids, style_vector.clone(), @@ -271,6 +273,7 @@ impl TTSModelHolder { &vits2, bert_ori.to_owned(), phones, + Array1::from_vec(vec![speaker_id]), tones, lang_ids, style_vector, @@ -280,41 +283,6 @@ impl TTSModelHolder { }; tts_util::array_to_vec(audio_array) } - - /// Synthesize text to audio - /// - /// # Note - /// This function is for low-level usage, use `easy_synthesize` for high-level usage. - #[allow(clippy::too_many_arguments)] - pub fn synthesize + Copy>( - &mut self, - ident: I, - bert_ori: Array2, - phones: Array1, - tones: Array1, - lang_ids: Array1, - style_vector: Array1, - sdp_ratio: f32, - length_scale: f32, - ) -> Result> { - self.find_and_load_model(ident)?; - let vits2 = &self - .find_model(ident)? - .vits2 - .as_ref() - .ok_or(Error::ModelNotFoundError(ident.into().to_string()))?; - let audio_array = model::synthesize( - &vits2, - bert_ori.to_owned(), - phones, - tones, - lang_ids, - style_vector, - sdp_ratio, - length_scale, - )?; - tts_util::array_to_vec(audio_array) - } } /// Synthesize options