add speaker id code

This commit is contained in:
tuna2134
2024-11-13 11:39:05 +00:00
parent 8598167114
commit b76738f467
5 changed files with 17 additions and 38 deletions

View File

@@ -44,6 +44,10 @@ fn style_id_default() -> i32 {
0
}
/// Fallback value for the `speaker_id` request field.
///
/// Referenced by name from `#[serde(default = "speaker_id_default")]`, so it is
/// used when a request body omits `speaker_id`.
fn speaker_id_default() -> i64 {
    const FALLBACK_SPEAKER_ID: i64 = 0;
    FALLBACK_SPEAKER_ID
}
#[derive(Deserialize, ToSchema)]
struct SynthesizeRequest {
text: String,
@@ -54,6 +58,8 @@ struct SynthesizeRequest {
length_scale: f32,
#[serde(default = "style_id_default")]
style_id: i32,
#[serde(default = "speaker_id_default")]
speaker_id: i64,
}
#[utoipa::path(
@@ -71,7 +77,8 @@ async fn synthesize(
ident,
sdp_ratio,
length_scale,
style_id
style_id,
speaker_id,
}): Json<SynthesizeRequest>,
) -> AppResult<impl IntoResponse> {
log::debug!("processing request: text={text}, ident={ident}, sdp_ratio={sdp_ratio}, length_scale={length_scale}");
@@ -81,6 +88,7 @@ async fn synthesize(
&ident,
&text,
style_id,
speaker_id,
SynthesizeOptions {
sdp_ratio,
length_scale,

View File

@@ -142,6 +142,7 @@ impl TTSModel {
text: String,
ident: String,
style_id: i32,
speaker_id: i64,
sdp_ratio: f32,
length_scale: f32,
) -> anyhow::Result<Bound<PyBytes>> {
@@ -149,6 +150,7 @@ impl TTSModel {
ident.as_str(),
&text,
style_id,
speaker_id,
SynthesizeOptions {
sdp_ratio,
length_scale,

View File

@@ -17,7 +17,7 @@ fn main_inner() -> anyhow::Result<()> {
)?;
tts_holder.load_sbv2file(ident, fs::read(env::var("MODEL_PATH")?)?)?;
let audio = tts_holder.easy_synthesize(ident, &text, 0, tts::SynthesizeOptions::default())?;
let audio = tts_holder.easy_synthesize(ident, &text, 0, 0, tts::SynthesizeOptions::default())?;
fs::write("output.wav", audio)?;
Ok(())

View File

@@ -52,6 +52,7 @@ pub fn synthesize(
session: &Session,
bert_ori: Array2<f32>,
x_tst: Array1<i64>,
sid: Array1<i64>,
tones: Array1<i64>,
lang_ids: Array1<i64>,
style_vector: Array1<f32>,
@@ -67,7 +68,7 @@ pub fn synthesize(
let outputs = session.run(ort::inputs! {
"x_tst" => x_tst,
"x_tst_lengths" => x_tst_lengths,
"sid" => array![0_i64],
"sid" => sid,
"tones" => tones,
"language" => lang_ids,
"bert" => bert,

View File

@@ -229,6 +229,7 @@ impl TTSModelHolder {
ident: I,
text: &str,
style_id: i32,
speaker_id: i64,
options: SynthesizeOptions,
) -> Result<Vec<u8>> {
self.find_and_load_model(ident)?;
@@ -250,6 +251,7 @@ impl TTSModelHolder {
&vits2,
bert_ori.to_owned(),
phones,
Array1::from_vec(vec![speaker_id]),
tones,
lang_ids,
style_vector.clone(),
@@ -271,6 +273,7 @@ impl TTSModelHolder {
&vits2,
bert_ori.to_owned(),
phones,
Array1::from_vec(vec![speaker_id]),
tones,
lang_ids,
style_vector,
@@ -280,41 +283,6 @@ impl TTSModelHolder {
};
tts_util::array_to_vec(audio_array)
}
/// Synthesize text to audio
///
/// # Note
/// This function is for low-level usage, use `easy_synthesize` for high-level usage.
#[allow(clippy::too_many_arguments)]
pub fn synthesize<I: Into<TTSIdent> + Copy>(
&mut self,
ident: I,
bert_ori: Array2<f32>,
phones: Array1<i64>,
tones: Array1<i64>,
lang_ids: Array1<i64>,
style_vector: Array1<f32>,
sdp_ratio: f32,
length_scale: f32,
) -> Result<Vec<u8>> {
self.find_and_load_model(ident)?;
let vits2 = &self
.find_model(ident)?
.vits2
.as_ref()
.ok_or(Error::ModelNotFoundError(ident.into().to_string()))?;
let audio_array = model::synthesize(
&vits2,
bert_ori.to_owned(),
phones,
tones,
lang_ids,
style_vector,
sdp_ratio,
length_scale,
)?;
tts_util::array_to_vec(audio_array)
}
}
/// Synthesize options