Add speaker ID support to synthesis

2025-12-26 00:59:58 +00:00 · 2024-11-13 11:39:05 +00:00
parent 8598167114
commit b76738f467
5 changed files with 17 additions and 38 deletions
--- a/sbv2_api/src/main.rs
+++ b/sbv2_api/src/main.rs
@@ -44,6 +44,10 @@ fn style_id_default() -> i32 {
    0
 }

+fn speaker_id_default() -> i64 {
+    0
+}
+
 #[derive(Deserialize, ToSchema)]
 struct SynthesizeRequest {
    text: String,
@@ -54,6 +58,8 @@ struct SynthesizeRequest {
    length_scale: f32,
    #[serde(default = "style_id_default")]
    style_id: i32,
+    #[serde(default = "speaker_id_default")]
+    speaker_id: i64,
 }

 #[utoipa::path(
@@ -71,7 +77,8 @@ async fn synthesize(
        ident,
        sdp_ratio,
        length_scale,
-        style_id
+        style_id,
+        speaker_id,
    }): Json<SynthesizeRequest>,
 ) -> AppResult<impl IntoResponse> {
    log::debug!("processing request: text={text}, ident={ident}, sdp_ratio={sdp_ratio}, length_scale={length_scale}");
@@ -81,6 +88,7 @@ async fn synthesize(
            &ident,
            &text,
            style_id,
+            speaker_id,
            SynthesizeOptions {
                sdp_ratio,
                length_scale,
--- a/sbv2_bindings/src/sbv2.rs
+++ b/sbv2_bindings/src/sbv2.rs
@@ -142,6 +142,7 @@ impl TTSModel {
        text: String,
        ident: String,
        style_id: i32,
+        speaker_id: i64,
        sdp_ratio: f32,
        length_scale: f32,
    ) -> anyhow::Result<Bound<PyBytes>> {
@@ -149,6 +150,7 @@ impl TTSModel {
            ident.as_str(),
            &text,
            style_id,
+            speaker_id,
            SynthesizeOptions {
                sdp_ratio,
                length_scale,
--- a/sbv2_core/src/main.rs
+++ b/sbv2_core/src/main.rs
@@ -17,7 +17,7 @@ fn main_inner() -> anyhow::Result<()> {
    )?;
    tts_holder.load_sbv2file(ident, fs::read(env::var("MODEL_PATH")?)?)?;

-    let audio = tts_holder.easy_synthesize(ident, &text, 0, tts::SynthesizeOptions::default())?;
+    let audio = tts_holder.easy_synthesize(ident, &text, 0, 0, tts::SynthesizeOptions::default())?;
    fs::write("output.wav", audio)?;

    Ok(())
--- a/sbv2_core/src/model.rs
+++ b/sbv2_core/src/model.rs
@@ -52,6 +52,7 @@ pub fn synthesize(
    session: &Session,
    bert_ori: Array2<f32>,
    x_tst: Array1<i64>,
+    sid: Array1<i64>,
    tones: Array1<i64>,
    lang_ids: Array1<i64>,
    style_vector: Array1<f32>,
@@ -67,7 +68,7 @@ pub fn synthesize(
    let outputs = session.run(ort::inputs! {
        "x_tst" => x_tst,
        "x_tst_lengths" => x_tst_lengths,
-        "sid" => array![0_i64],
+        "sid" => sid,
        "tones" => tones,
        "language" => lang_ids,
        "bert" => bert,
--- a/sbv2_core/src/tts.rs
+++ b/sbv2_core/src/tts.rs
@@ -229,6 +229,7 @@ impl TTSModelHolder {
        ident: I,
        text: &str,
        style_id: i32,
+        speaker_id: i64,
        options: SynthesizeOptions,
    ) -> Result<Vec<u8>> {
        self.find_and_load_model(ident)?;
@@ -250,6 +251,7 @@ impl TTSModelHolder {
                    &vits2,
                    bert_ori.to_owned(),
                    phones,
+                    Array1::from_vec(vec![speaker_id]),
                    tones,
                    lang_ids,
                    style_vector.clone(),
@@ -271,6 +273,7 @@ impl TTSModelHolder {
                &vits2,
                bert_ori.to_owned(),
                phones,
+                Array1::from_vec(vec![speaker_id]),
                tones,
                lang_ids,
                style_vector,
@@ -280,41 +283,6 @@ impl TTSModelHolder {
        };
        tts_util::array_to_vec(audio_array)
    }
-
-    /// Synthesize text to audio
-    ///
-    /// # Note
-    /// This function is for low-level usage, use `easy_synthesize` for high-level usage.
-    #[allow(clippy::too_many_arguments)]
-    pub fn synthesize<I: Into<TTSIdent> + Copy>(
-        &mut self,
-        ident: I,
-        bert_ori: Array2<f32>,
-        phones: Array1<i64>,
-        tones: Array1<i64>,
-        lang_ids: Array1<i64>,
-        style_vector: Array1<f32>,
-        sdp_ratio: f32,
-        length_scale: f32,
-    ) -> Result<Vec<u8>> {
-        self.find_and_load_model(ident)?;
-        let vits2 = &self
-            .find_model(ident)?
-            .vits2
-            .as_ref()
-            .ok_or(Error::ModelNotFoundError(ident.into().to_string()))?;
-        let audio_array = model::synthesize(
-            &vits2,
-            bert_ori.to_owned(),
-            phones,
-            tones,
-            lang_ids,
-            style_vector,
-            sdp_ratio,
-            length_scale,
-        )?;
-        tts_util::array_to_vec(audio_array)
-    }
 }

 /// Synthesize options