diff --git a/sbv2_api/src/main.rs b/sbv2_api/src/main.rs
index 6a636d2..6844c21 100644
--- a/sbv2_api/src/main.rs
+++ b/sbv2_api/src/main.rs
@@ -44,6 +44,10 @@ fn style_id_default() -> i32 {
     0
 }
 
+fn speaker_id_default() -> i64 {
+    0
+}
+
 #[derive(Deserialize, ToSchema)]
 struct SynthesizeRequest {
     text: String,
@@ -54,6 +58,8 @@ struct SynthesizeRequest {
     length_scale: f32,
     #[serde(default = "style_id_default")]
     style_id: i32,
+    #[serde(default = "speaker_id_default")]
+    speaker_id: i64,
 }
 
 #[utoipa::path(
@@ -71,7 +77,8 @@ async fn synthesize(
         ident,
         sdp_ratio,
         length_scale,
-        style_id
+        style_id,
+        speaker_id,
     }): Json<SynthesizeRequest>,
 ) -> AppResult<impl IntoResponse> {
     log::debug!("processing request: text={text}, ident={ident}, sdp_ratio={sdp_ratio}, length_scale={length_scale}");
@@ -81,6 +88,7 @@ async fn synthesize(
             &ident,
             &text,
             style_id,
+            speaker_id,
             SynthesizeOptions {
                 sdp_ratio,
                 length_scale,
diff --git a/sbv2_bindings/src/sbv2.rs b/sbv2_bindings/src/sbv2.rs
index 0ecc414..c109869 100644
--- a/sbv2_bindings/src/sbv2.rs
+++ b/sbv2_bindings/src/sbv2.rs
@@ -142,6 +142,7 @@ impl TTSModel {
         text: String,
         ident: String,
         style_id: i32,
+        speaker_id: i64,
         sdp_ratio: f32,
         length_scale: f32,
     ) -> anyhow::Result<Bound<PyBytes>> {
@@ -149,6 +150,7 @@ impl TTSModel {
             ident.as_str(),
             &text,
             style_id,
+            speaker_id,
             SynthesizeOptions {
                 sdp_ratio,
                 length_scale,
diff --git a/sbv2_core/src/main.rs b/sbv2_core/src/main.rs
index 3a4cbdd..8665b3d 100644
--- a/sbv2_core/src/main.rs
+++ b/sbv2_core/src/main.rs
@@ -17,7 +17,7 @@ fn main_inner() -> anyhow::Result<()> {
     )?;
     tts_holder.load_sbv2file(ident, fs::read(env::var("MODEL_PATH")?)?)?;
 
-    let audio = tts_holder.easy_synthesize(ident, &text, 0, tts::SynthesizeOptions::default())?;
+    let audio = tts_holder.easy_synthesize(ident, &text, 0, 0, tts::SynthesizeOptions::default())?;
     fs::write("output.wav", audio)?;
 
     Ok(())
diff --git a/sbv2_core/src/model.rs b/sbv2_core/src/model.rs
index 9f2a221..a851909 100644
--- a/sbv2_core/src/model.rs
+++ b/sbv2_core/src/model.rs
@@ -52,6 +52,7 @@ pub fn synthesize(
     session: &Session,
     bert_ori: Array2<f32>,
     x_tst: Array1<i64>,
+    sid: Array1<i64>,
     tones: Array1<i64>,
     lang_ids: Array1<i64>,
     style_vector: Array1<f32>,
@@ -67,7 +68,7 @@ pub fn synthesize(
     let outputs = session.run(ort::inputs! {
         "x_tst" => x_tst,
         "x_tst_lengths" => x_tst_lengths,
-        "sid" => array![0_i64],
+        "sid" => sid,
         "tones" => tones,
         "language" => lang_ids,
         "bert" => bert,
diff --git a/sbv2_core/src/tts.rs b/sbv2_core/src/tts.rs
index d248e31..dedb124 100644
--- a/sbv2_core/src/tts.rs
+++ b/sbv2_core/src/tts.rs
@@ -229,6 +229,7 @@ impl TTSModelHolder {
         ident: I,
         text: &str,
         style_id: i32,
+        speaker_id: i64,
         options: SynthesizeOptions,
     ) -> Result<Vec<u8>> {
         self.find_and_load_model(ident)?;
@@ -250,6 +251,7 @@ impl TTSModelHolder {
                     &vits2,
                     bert_ori.to_owned(),
                     phones,
+                    Array1::from_vec(vec![speaker_id]),
                     tones,
                     lang_ids,
                     style_vector.clone(),
@@ -271,6 +273,7 @@ impl TTSModelHolder {
                 &vits2,
                 bert_ori.to_owned(),
                 phones,
+                Array1::from_vec(vec![speaker_id]),
                 tones,
                 lang_ids,
                 style_vector,
@@ -280,41 +283,6 @@ impl TTSModelHolder {
         };
         tts_util::array_to_vec(audio_array)
     }
-
-    /// Synthesize text to audio
-    ///
-    /// # Note
-    /// This function is for low-level usage, use `easy_synthesize` for high-level usage.
-    #[allow(clippy::too_many_arguments)]
-    pub fn synthesize<I: Into<TTSIdent> + Copy>(
-        &mut self,
-        ident: I,
-        bert_ori: Array2<f32>,
-        phones: Array1<i64>,
-        tones: Array1<i64>,
-        lang_ids: Array1<i64>,
-        style_vector: Array1<f32>,
-        sdp_ratio: f32,
-        length_scale: f32,
-    ) -> Result<Vec<u8>> {
-        self.find_and_load_model(ident)?;
-        let vits2 = &self
-            .find_model(ident)?
-            .vits2
-            .as_ref()
-            .ok_or(Error::ModelNotFoundError(ident.into().to_string()))?;
-        let audio_array = model::synthesize(
-            &vits2,
-            bert_ori.to_owned(),
-            phones,
-            tones,
-            lang_ids,
-            style_vector,
-            sdp_ratio,
-            length_scale,
-        )?;
-        tts_util::array_to_vec(audio_array)
-    }
 }
 
 /// Synthesize options