From a5d783bd65f3f5eb51bc6d20a143054b710de076 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Mon, 31 Mar 2025 23:35:39 +0900 Subject: [PATCH] fix: bug --- crates/sbv2_core/src/tts_util.rs | 23 +++++++++++++++++++++++ crates/sbv2_editor/src/main.rs | 14 +++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/crates/sbv2_core/src/tts_util.rs b/crates/sbv2_core/src/tts_util.rs index 3fc4f77..97b69c6 100644 --- a/crates/sbv2_core/src/tts_util.rs +++ b/crates/sbv2_core/src/tts_util.rs @@ -2,6 +2,8 @@ use std::io::Cursor; use crate::error::Result; use crate::jtalk::JTalkProcess; +use crate::mora::MORA_KATA_TO_MORA_PHONEMES; +use crate::norm::PUNCTUATIONS; use crate::{jtalk, nlp, norm, tokenizer, utils}; use hound::{SampleFormat, WavSpec, WavWriter}; use ndarray::{concatenate, s, Array, Array1, Array2, Array3, Axis}; @@ -111,6 +113,7 @@ pub fn parse_text_blocking( if let Some(given_tones) = given_tones { tones = given_tones; } + println!("tones: {:?}", tones); let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones); let phones = utils::intersperse(&phones, 0); @@ -188,3 +191,23 @@ pub fn array_to_vec(audio_array: Array3) -> Result> { writer.finalize()?; Ok(cursor.into_inner()) } + +pub fn kata_tone2phone_tone(kata_tone: Vec<(String, i32)>) -> Vec<(String, i32)> { + let mut results = vec![("_".to_string(), 0)]; + for (mora, tone) in kata_tone { + if PUNCTUATIONS.contains(&mora.as_str()) { + results.push((mora, 0)); + continue; + } else { + let (consonant, vowel) = MORA_KATA_TO_MORA_PHONEMES.get(&mora).unwrap(); + if let Some(consonant) = consonant { + results.push((consonant.to_string(), tone)); + results.push((vowel.to_string(), tone)); + } else { + results.push((vowel.to_string(), tone)); + } + } + } + results.push(("_".to_string(), 0)); + results +} \ No newline at end of file diff --git a/crates/sbv2_editor/src/main.rs b/crates/sbv2_editor/src/main.rs index 88d79d4..a3b6995 100644 --- a/crates/sbv2_editor/src/main.rs +++ b/crates/sbv2_editor/src/main.rs @@ -6,6 +6,7 @@ use axum::{ Json, Router, http::header::CONTENT_TYPE, }; +use sbv2_core::tts_util::kata_tone2phone_tone; use sbv2_core::{jtalk::JTalk, tts::{TTSModelHolder, SynthesizeOptions}, tts_util::preprocess_parse_text}; use serde::{Deserialize, Serialize}; use tokio::{fs, net::TcpListener, sync::Mutex}; @@ -68,9 +69,16 @@ async fn synthesis( State(state): State, Json(request): Json, ) -> AppResult { - let mut tones: Vec = request.audio_query.iter().map(|query| query.tone).collect(); - tones.insert(0, 0); - tones.push(0); + let phone_tone = request + .audio_query + .iter() + .map(|query| (query.kana.clone(), query.tone)) + .collect::>(); + let phone_tone = kata_tone2phone_tone(phone_tone); + let tones = phone_tone + .iter() + .map(|(_, tone)| *tone) + .collect::>(); let buffer = { let mut tts_model = state.tts_model.lock().await; tts_model.easy_synthesize_neo(