From fc944b9d33c0fa58f95692cf2e89e44572b751ae Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Wed, 26 Mar 2025 15:14:22 +0900 Subject: [PATCH 01/26] split the code for support voicevox --- Cargo.lock | 9 +++++++++ Cargo.toml | 2 +- crates/sbv2_core/src/tts_util.rs | 19 ++++++++++++++----- crates/sbv2_voicevox/Cargo.toml | 14 ++++++++++++++ crates/sbv2_voicevox/README.md | 2 ++ crates/sbv2_voicevox/src/main.rs | 5 +++++ 6 files changed, 45 insertions(+), 6 deletions(-) create mode 100644 crates/sbv2_voicevox/Cargo.toml create mode 100644 crates/sbv2_voicevox/README.md create mode 100644 crates/sbv2_voicevox/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 3245a45..1d69f54 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2312,6 +2312,15 @@ dependencies = [ "zstd", ] +[[package]] +name = "sbv2_voicevox" +version = "0.2.0-alpha6" +dependencies = [ + "anyhow", + "axum", + "sbv2_core", +] + [[package]] name = "sbv2_wasm" version = "0.2.0-alpha6" diff --git a/Cargo.toml b/Cargo.toml index 240b6e0..fcc17c0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [workspace] resolver = "2" -members = ["./crates/sbv2_api", "./crates/sbv2_core", "./crates/sbv2_bindings", "./crates/sbv2_wasm"] +members = ["./crates/sbv2_api", "./crates/sbv2_core", "./crates/sbv2_bindings", "./crates/sbv2_wasm", "crates/sbv2_voicevox"] [workspace.package] version = "0.2.0-alpha6" diff --git a/crates/sbv2_core/src/tts_util.rs b/crates/sbv2_core/src/tts_util.rs index 24b059a..8cab20d 100644 --- a/crates/sbv2_core/src/tts_util.rs +++ b/crates/sbv2_core/src/tts_util.rs @@ -5,6 +5,19 @@ use crate::{jtalk, nlp, norm, tokenizer, utils}; use hound::{SampleFormat, WavSpec, WavWriter}; use ndarray::{concatenate, s, Array, Array1, Array2, Array3, Axis}; use tokenizers::Tokenizer; + +pub fn preprocess_parse_text( + text: &str, + jtalk: &jtalk::JTalk, +) -> Result<(Vec, Vec, Vec)> { + let text = jtalk.num2word(text)?; + let normalized_text = norm::normalize_text(&text); + + let process = jtalk.process_text(&normalized_text)?; + let result = process.g2p()?; + Ok(result) +} + /// Parse text and return the input for synthesize /// /// # Note @@ -21,11 +34,7 @@ pub async fn parse_text( Box>>>, >, ) -> Result<(Array2, Array1, Array1, Array1)> { - let text = jtalk.num2word(text)?; - let normalized_text = norm::normalize_text(&text); - - let process = jtalk.process_text(&normalized_text)?; - let (phones, tones, mut word2ph) = process.g2p()?; + let (phones, tones, mut word2ph) = preprocess_parse_text(text, jtalk)?; let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones); let phones = utils::intersperse(&phones, 0); diff --git a/crates/sbv2_voicevox/Cargo.toml b/crates/sbv2_voicevox/Cargo.toml new file mode 100644 index 0000000..4bd10aa --- /dev/null +++ b/crates/sbv2_voicevox/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "sbv2_voicevox" +version.workspace = true +edition.workspace = true +description.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +documentation.workspace = true + +[dependencies] +anyhow.workspace = true +axum = "0.8.1" +sbv2_core = { version = "0.2.0-alpha6", path = "../sbv2_core" } diff --git a/crates/sbv2_voicevox/README.md b/crates/sbv2_voicevox/README.md new file mode 100644 index 0000000..d9b4b1f --- /dev/null +++ b/crates/sbv2_voicevox/README.md @@ -0,0 +1,2 @@ +# sbv2-voicevox +sbv2-apiをvoicevox化します。 \ No newline at end of file diff --git a/crates/sbv2_voicevox/src/main.rs b/crates/sbv2_voicevox/src/main.rs new file mode 100644 index 0000000..8d6f05f --- /dev/null +++ b/crates/sbv2_voicevox/src/main.rs @@ -0,0 +1,5 @@ + +async fn main() -> anyhow::Result<()> { + println!("Hello, world!"); + Ok(()) +} From f4de3e15ae238a68be603235e2ce9f713ea41397 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Wed, 26 Mar 2025 16:14:29 +0900 Subject: [PATCH 02/26] initial commit: voicevox --- Cargo.lock | 5 +++-- crates/sbv2_core/src/tts_util.rs | 10 ++++++---- crates/sbv2_voicevox/Cargo.toml | 1 + crates/sbv2_voicevox/src/main.rs | 7 ++++++- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1d69f54..97f0509 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2319,6 +2319,7 @@ dependencies = [ "anyhow", "axum", "sbv2_core", + "tokio", ] [[package]] @@ -2741,9 +2742,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.43.0" +version = "1.44.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" +checksum = "f382da615b842244d4b8738c82ed1275e6c5dd90c459a30941cd07080b06c91a" dependencies = [ "backtrace", "bytes", diff --git a/crates/sbv2_core/src/tts_util.rs b/crates/sbv2_core/src/tts_util.rs index 8cab20d..1334128 100644 --- a/crates/sbv2_core/src/tts_util.rs +++ b/crates/sbv2_core/src/tts_util.rs @@ -1,6 +1,7 @@ use std::io::Cursor; use crate::error::Result; +use crate::jtalk::JTalkProcess; use crate::{jtalk, nlp, norm, tokenizer, utils}; use hound::{SampleFormat, WavSpec, WavWriter}; use ndarray::{concatenate, s, Array, Array1, Array2, Array3, Axis}; @@ -9,13 +10,13 @@ use tokenizers::Tokenizer; pub fn preprocess_parse_text( text: &str, jtalk: &jtalk::JTalk, -) -> Result<(Vec, Vec, Vec)> { +) -> Result<(Vec, Vec, Vec, String, JTalkProcess)> { let text = jtalk.num2word(text)?; let normalized_text = norm::normalize_text(&text); let process = jtalk.process_text(&normalized_text)?; - let result = process.g2p()?; - Ok(result) + let (phones, tones, word2ph) = process.g2p()?; + Ok((phones, tones, word2ph, normalized_text, process)) } /// Parse text and return the input for synthesize @@ -34,7 +35,8 @@ pub async fn parse_text( Box>>>, >, ) -> Result<(Array2, Array1, Array1, Array1)> { - let (phones, tones, mut word2ph) = preprocess_parse_text(text, jtalk)?; + let (phones, tones, mut word2ph, normalized_text, process) = + preprocess_parse_text(text, jtalk)?; let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones); let phones = utils::intersperse(&phones, 0); diff --git a/crates/sbv2_voicevox/Cargo.toml b/crates/sbv2_voicevox/Cargo.toml index 4bd10aa..14c7b92 100644 --- a/crates/sbv2_voicevox/Cargo.toml +++ b/crates/sbv2_voicevox/Cargo.toml @@ -12,3 +12,4 @@ documentation.workspace = true anyhow.workspace = true axum = "0.8.1" sbv2_core = { version = "0.2.0-alpha6", path = "../sbv2_core" } +tokio = { version = "1.44.1", features = ["full"] } diff --git a/crates/sbv2_voicevox/src/main.rs b/crates/sbv2_voicevox/src/main.rs index 8d6f05f..911d91b 100644 --- a/crates/sbv2_voicevox/src/main.rs +++ b/crates/sbv2_voicevox/src/main.rs @@ -1,5 +1,10 @@ +use axum::{routing::get, Router}; +use tokio::net::TcpListener; +#[tokio::main] async fn main() -> anyhow::Result<()> { - println!("Hello, world!"); + let app = Router::new().route("/", get(|| async { "Hello, world!" })); + let listener = TcpListener::bind("0.0.0.0:8080").await?; + axum::serve(listener, app).await?; Ok(()) } From b8f0477318aa9eb08846d291da11312a4d886935 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Wed, 26 Mar 2025 16:30:31 +0900 Subject: [PATCH 03/26] feat: audio query request --- Cargo.lock | 9 +- crates/sbv2_voicevox/Cargo.toml | 1 + crates/sbv2_voicevox/query2.json | 226 ++++++++++++++++++++++++++++++ crates/sbv2_voicevox/src/error.rs | 27 ++++ crates/sbv2_voicevox/src/main.rs | 20 ++- 5 files changed, 278 insertions(+), 5 deletions(-) create mode 100644 crates/sbv2_voicevox/query2.json create mode 100644 crates/sbv2_voicevox/src/error.rs diff --git a/Cargo.lock b/Cargo.lock index 97f0509..cba845e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2319,6 +2319,7 @@ dependencies = [ "anyhow", "axum", "sbv2_core", + "serde", "tokio", ] @@ -2374,18 +2375,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.218" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.218" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", diff --git a/crates/sbv2_voicevox/Cargo.toml b/crates/sbv2_voicevox/Cargo.toml index 14c7b92..6403e89 100644 --- a/crates/sbv2_voicevox/Cargo.toml +++ b/crates/sbv2_voicevox/Cargo.toml @@ -12,4 +12,5 @@ documentation.workspace = true anyhow.workspace = true axum = "0.8.1" sbv2_core = { version = "0.2.0-alpha6", path = "../sbv2_core" } +serde = { version = "1.0.219", features = ["derive"] } tokio = { version = "1.44.1", features = ["full"] } diff --git a/crates/sbv2_voicevox/query2.json b/crates/sbv2_voicevox/query2.json new file mode 100644 index 0000000..6cffac4 --- /dev/null +++ b/crates/sbv2_voicevox/query2.json @@ -0,0 +1,226 @@ +{ + "accent_phrases": [ + { + "moras": [ + { + "text": "コ", + "consonant": "k", + "consonant_length": 0.10002632439136505, + "vowel": "o", + "vowel_length": 0.15740256011486053, + "pitch": 5.749961853027344 + }, + { + "text": "ン", + "consonant": null, + "consonant_length": null, + "vowel": "N", + "vowel_length": 0.08265873789787292, + "pitch": 5.89122200012207 + }, + { + "text": "ニ", + "consonant": "n", + "consonant_length": 0.03657080978155136, + "vowel": "i", + "vowel_length": 0.1175866425037384, + "pitch": 5.969866752624512 + }, + { + "text": "チ", + "consonant": "ch", + "consonant_length": 0.09005842357873917, + "vowel": "i", + "vowel_length": 0.08666137605905533, + "pitch": 5.958892822265625 + }, + { + "text": "ワ", + "consonant": "w", + "consonant_length": 0.07833231985569, + "vowel": "a", + "vowel_length": 0.21250136196613312, + "pitch": 5.949411392211914 + } + ], + "accent": 5, + "pause_mora": { + "text": "、", + "consonant": null, + "consonant_length": null, + "vowel": "pau", + "vowel_length": 0.4723339378833771, + "pitch": 0.0 + }, + "is_interrogative": false + }, + { + "moras": [ + { + "text": "オ", + "consonant": null, + "consonant_length": null, + "vowel": "o", + "vowel_length": 0.22004225850105286, + "pitch": 5.6870927810668945 + }, + { + "text": "ン", + "consonant": null, + "consonant_length": null, + "vowel": "N", + "vowel_length": 0.09161105751991272, + "pitch": 5.93472957611084 + }, + { + "text": "セ", + "consonant": "s", + "consonant_length": 0.08924821764230728, + "vowel": "e", + "vowel_length": 0.14142127335071564, + "pitch": 6.121850490570068 + }, + { + "text": "エ", + "consonant": null, + "consonant_length": null, + "vowel": "e", + "vowel_length": 0.10636933892965317, + "pitch": 6.157896041870117 + }, + { + "text": "ゴ", + "consonant": "g", + "consonant_length": 0.07600915431976318, + "vowel": "o", + "vowel_length": 0.09598273783922195, + "pitch": 6.188933849334717 + }, + { + "text": "オ", + "consonant": null, + "consonant_length": null, + "vowel": "o", + "vowel_length": 0.1079121008515358, + "pitch": 6.235202789306641 + }, + { + "text": "セ", + "consonant": "s", + "consonant_length": 0.09591838717460632, + "vowel": "e", + "vowel_length": 0.10286372154951096, + "pitch": 6.153214454650879 + }, + { + "text": "エ", + "consonant": null, + "consonant_length": null, + "vowel": "e", + "vowel_length": 0.08992656320333481, + "pitch": 6.02571439743042 + }, + { + "text": "ノ", + "consonant": "n", + "consonant_length": 0.05660202354192734, + "vowel": "o", + "vowel_length": 0.09676017612218857, + "pitch": 5.711844444274902 + } + ], + "accent": 5, + "pause_mora": null, + "is_interrogative": false + }, + { + "moras": [ + { + "text": "セ", + "consonant": "s", + "consonant_length": 0.07805486768484116, + "vowel": "e", + "vowel_length": 0.09617523103952408, + "pitch": 5.774399280548096 + }, + { + "text": "カ", + "consonant": "k", + "consonant_length": 0.06712044775485992, + "vowel": "a", + "vowel_length": 0.148829385638237, + "pitch": 6.063965797424316 + }, + { + "text": "イ", + "consonant": null, + "consonant_length": null, + "vowel": "i", + "vowel_length": 0.11061104387044907, + "pitch": 6.040698051452637 + }, + { + "text": "エ", + "consonant": null, + "consonant_length": null, + "vowel": "e", + "vowel_length": 0.13046696782112122, + "pitch": 5.806027889251709 + } + ], + "accent": 1, + "pause_mora": null, + "is_interrogative": false + }, + { + "moras": [ + { + "text": "ヨ", + "consonant": "y", + "consonant_length": 0.07194744795560837, + "vowel": "o", + "vowel_length": 0.08622600883245468, + "pitch": 5.694094657897949 + }, + { + "text": "オ", + "consonant": null, + "consonant_length": null, + "vowel": "o", + "vowel_length": 0.10635452717542648, + "pitch": 5.787222385406494 + }, + { + "text": "コ", + "consonant": "k", + "consonant_length": 0.07077334076166153, + "vowel": "o", + "vowel_length": 0.09248624742031097, + "pitch": 5.793357849121094 + }, + { + "text": "ソ", + "consonant": "s", + "consonant_length": 0.08705667406320572, + "vowel": "o", + "vowel_length": 0.2238258570432663, + "pitch": 5.643765449523926 + } + ], + "accent": 1, + "pause_mora": null, + "is_interrogative": false + } + ], + "speedScale": 1.0, + "pitchScale": 0.0, + "intonationScale": 1.0, + "volumeScale": 1.0, + "prePhonemeLength": 0.1, + "postPhonemeLength": 0.1, + "pauseLength": null, + "pauseLengthScale": 1.0, + "outputSamplingRate": 24000, + "outputStereo": false, + "kana": "コンニチワ'、オンセエゴ'オセエノ/セ'カイエ/ヨ'オコソ" +} \ No newline at end of file diff --git a/crates/sbv2_voicevox/src/error.rs b/crates/sbv2_voicevox/src/error.rs new file mode 100644 index 0000000..d3cf600 --- /dev/null +++ b/crates/sbv2_voicevox/src/error.rs @@ -0,0 +1,27 @@ +use axum::{ + http::StatusCode, + response::{IntoResponse, Response}, +}; + +pub type AppResult = std::result::Result; + +pub struct AppError(anyhow::Error); + +impl IntoResponse for AppError { + fn into_response(self) -> Response { + ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Something went wrong: {}", self.0), + ) + .into_response() + } +} + +impl From for AppError +where + E: Into, +{ + fn from(err: E) -> Self { + Self(err.into()) + } +} diff --git a/crates/sbv2_voicevox/src/main.rs b/crates/sbv2_voicevox/src/main.rs index 911d91b..f1a36d2 100644 --- a/crates/sbv2_voicevox/src/main.rs +++ b/crates/sbv2_voicevox/src/main.rs @@ -1,5 +1,23 @@ -use axum::{routing::get, Router}; +use axum::{routing::get, Router, extract::Query}; +use sbv2_core::{jtalk::JTalk, tts_util::preprocess_parse_text}; use tokio::net::TcpListener; +use serde::Deserialize; + +use error::AppResult; + +mod error; + +#[derive(Deserialize)] +struct RequestCreateAudioQuery { + text: String, +} + +async fn create_audio_query( + Query(request): Query, +) -> AppResult<()> { + let (phones, tones, mut word2ph, normalized_text, process) = preprocess_parse_text(&request.text, &JTalk::new()?)?; + Ok(()) +} #[tokio::main] async fn main() -> anyhow::Result<()> { From 22ed55739539bde56e9067858c21563530e13c2a Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Thu, 27 Mar 2025 09:59:08 +0900 Subject: [PATCH 04/26] oh --- crates/sbv2_voicevox/src/main.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/sbv2_voicevox/src/main.rs b/crates/sbv2_voicevox/src/main.rs index f1a36d2..3c4e757 100644 --- a/crates/sbv2_voicevox/src/main.rs +++ b/crates/sbv2_voicevox/src/main.rs @@ -16,6 +16,7 @@ async fn create_audio_query( Query(request): Query, ) -> AppResult<()> { let (phones, tones, mut word2ph, normalized_text, process) = preprocess_parse_text(&request.text, &JTalk::new()?)?; + println!("{:?}", phones); Ok(()) } From e915e2bc846eb9819d06e32e499869428e62ccca Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Thu, 27 Mar 2025 13:17:20 +0900 Subject: [PATCH 05/26] feat: phone_tone_to_kana --- crates/sbv2_core/src/jtalk.rs | 26 +++++++++++++++++++++++++- crates/sbv2_core/src/mora.rs | 23 +++++++++++++++++++++++ crates/sbv2_voicevox/src/main.rs | 11 +++++------ 3 files changed, 53 insertions(+), 7 deletions(-) diff --git a/crates/sbv2_core/src/jtalk.rs b/crates/sbv2_core/src/jtalk.rs index 8359784..594c14d 100644 --- a/crates/sbv2_core/src/jtalk.rs +++ b/crates/sbv2_core/src/jtalk.rs @@ -1,5 +1,5 @@ use crate::error::{Error, Result}; -use crate::mora::{MORA_KATA_TO_MORA_PHONEMES, VOWELS}; +use crate::mora::{CONSONANTS, MORA_KATA_TO_MORA_PHONEMES, MORA_PHONEMES_TO_MORA_KATA, VOWELS}; use crate::norm::{replace_punctuation, PUNCTUATIONS}; use jpreprocess::{kind, DefaultTokenizer, JPreprocess, SystemDictionaryConfig, UserDictionary}; use once_cell::sync::Lazy; @@ -76,6 +76,30 @@ static MORA_PATTERN: Lazy> = Lazy::new(|| { }); static LONG_PATTERN: Lazy = Lazy::new(|| Regex::new(r"(\w)(ー*)").unwrap()); +fn phone_tone_to_kana(phones: Vec, tones: Vec) { + let mut results = Vec::new(); + let mut current_mora = String::new(); + for ((phone, next_phone), (tone, next_tone)) in phones + .iter() + .zip(phones.iter().skip(1)) + .zip(tones.iter().zip(tones.iter().skip(1))) + { + if PUNCTUATIONS.contains(&phone.clone().as_str()) { + results.push((phone, tone)); + continue; + } + if CONSONANTS.contains(&phone.clone()) { + assert_eq!(current_mora, ""); + assert_eq!(tone, next_tone); + current_mora = phone.to_string() + } else { + current_mora += phone; + results.push((MORA_PHONEMES_TO_MORA_KATA.get(¤t_mora).unwrap(), tone)); + current_mora = String::new(); + } + } +} + pub struct JTalkProcess { jpreprocess: Arc, parsed: Vec, diff --git a/crates/sbv2_core/src/mora.rs b/crates/sbv2_core/src/mora.rs index de7f54f..4becd67 100644 --- a/crates/sbv2_core/src/mora.rs +++ b/crates/sbv2_core/src/mora.rs @@ -25,6 +25,21 @@ static MORA_LIST_ADDITIONAL: Lazy> = Lazy::new(|| { data.additional }); +pub static MORA_PHONEMES_TO_MORA_KATA: Lazy> = Lazy::new(|| { + let mut map = HashMap::new(); + for mora in MORA_LIST_MINIMUM.iter() { + map.insert( + format!( + "{}{}", + mora.consonant.clone().unwrap_or("".to_string()), + mora.vowel + ), + mora.mora.clone(), + ); + } + map +}); + pub static MORA_KATA_TO_MORA_PHONEMES: Lazy, String)>> = Lazy::new(|| { let mut map = HashMap::new(); @@ -37,4 +52,12 @@ pub static MORA_KATA_TO_MORA_PHONEMES: Lazy, Str map }); +pub static CONSONANTS: Lazy> = Lazy::new(|| { + let consonants = MORA_KATA_TO_MORA_PHONEMES + .values() + .filter_map(|(consonant, _)| consonant.clone()) + .collect::>(); + consonants +}); + pub const VOWELS: [&str; 6] = ["a", "i", "u", "e", "o", "N"]; diff --git a/crates/sbv2_voicevox/src/main.rs b/crates/sbv2_voicevox/src/main.rs index 3c4e757..e552d6c 100644 --- a/crates/sbv2_voicevox/src/main.rs +++ b/crates/sbv2_voicevox/src/main.rs @@ -1,7 +1,7 @@ -use axum::{routing::get, Router, extract::Query}; +use axum::{extract::Query, routing::get, Router}; use sbv2_core::{jtalk::JTalk, tts_util::preprocess_parse_text}; -use tokio::net::TcpListener; use serde::Deserialize; +use tokio::net::TcpListener; use error::AppResult; @@ -12,10 +12,9 @@ struct RequestCreateAudioQuery { text: String, } -async fn create_audio_query( - Query(request): Query, -) -> AppResult<()> { - let (phones, tones, mut word2ph, normalized_text, process) = preprocess_parse_text(&request.text, &JTalk::new()?)?; +async fn create_audio_query(Query(request): Query) -> AppResult<()> { + let (phones, tones, _, normalized_text, process) = + preprocess_parse_text(&request.text, &JTalk::new()?)?; println!("{:?}", phones); Ok(()) } From 2687af1a9ba1f4bf26044331f0411268c004912a Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Thu, 27 Mar 2025 13:18:22 +0900 Subject: [PATCH 06/26] clippy --- crates/sbv2_core/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/sbv2_core/src/main.rs b/crates/sbv2_core/src/main.rs index 07d9843..36ea49b 100644 --- a/crates/sbv2_core/src/main.rs +++ b/crates/sbv2_core/src/main.rs @@ -31,7 +31,7 @@ fn main_inner() -> anyhow::Result<()> { } let audio = - tts_holder.easy_synthesize(ident, &text, 0, 0, tts::SynthesizeOptions::default())?; + tts_holder.easy_synthesize(ident, text, 0, 0, tts::SynthesizeOptions::default())?; fs::write("output.wav", audio)?; Ok(()) From e8dbf956e14c63c18e8e9988d86a14ad96e5837a Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Thu, 27 Mar 2025 13:21:07 +0900 Subject: [PATCH 07/26] fix: forget to give return --- crates/sbv2_core/src/jtalk.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/sbv2_core/src/jtalk.rs b/crates/sbv2_core/src/jtalk.rs index 594c14d..705e066 100644 --- a/crates/sbv2_core/src/jtalk.rs +++ b/crates/sbv2_core/src/jtalk.rs @@ -76,7 +76,7 @@ static MORA_PATTERN: Lazy> = Lazy::new(|| { }); static LONG_PATTERN: Lazy = Lazy::new(|| Regex::new(r"(\w)(ー*)").unwrap()); -fn phone_tone_to_kana(phones: Vec, tones: Vec) { +fn phone_tone_to_kana(phones: Vec, tones: Vec) -> Vec<(String, i32)> { let mut results = Vec::new(); let mut current_mora = String::new(); for ((phone, next_phone), (tone, next_tone)) in phones @@ -98,6 +98,7 @@ fn phone_tone_to_kana(phones: Vec, tones: Vec) { current_mora = String::new(); } } + results } pub struct JTalkProcess { From 07637f587d7536629b2d9773686a4632f17aa1e5 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Thu, 27 Mar 2025 13:23:53 +0900 Subject: [PATCH 08/26] fix: type --- crates/sbv2_core/src/jtalk.rs | 12 +++++++++--- crates/sbv2_core/src/main.rs | 3 +-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/crates/sbv2_core/src/jtalk.rs b/crates/sbv2_core/src/jtalk.rs index 705e066..7ef7232 100644 --- a/crates/sbv2_core/src/jtalk.rs +++ b/crates/sbv2_core/src/jtalk.rs @@ -79,13 +79,13 @@ static LONG_PATTERN: Lazy = Lazy::new(|| Regex::new(r"(\w)(ー*)").unwrap fn phone_tone_to_kana(phones: Vec, tones: Vec) -> Vec<(String, i32)> { let mut results = Vec::new(); let mut current_mora = String::new(); - for ((phone, next_phone), (tone, next_tone)) in phones + for ((phone, next_phone), (&tone, &next_tone)) in phones .iter() .zip(phones.iter().skip(1)) .zip(tones.iter().zip(tones.iter().skip(1))) { if PUNCTUATIONS.contains(&phone.clone().as_str()) { - results.push((phone, tone)); + results.push((phone.to_string(), tone)); continue; } if CONSONANTS.contains(&phone.clone()) { @@ -94,7 +94,13 @@ fn phone_tone_to_kana(phones: Vec, tones: Vec) -> Vec<(String, i32) current_mora = phone.to_string() } else { current_mora += phone; - results.push((MORA_PHONEMES_TO_MORA_KATA.get(¤t_mora).unwrap(), tone)); + results.push(( + MORA_PHONEMES_TO_MORA_KATA + .get(¤t_mora) + .unwrap() + .to_string(), + tone, + )); current_mora = String::new(); } } diff --git a/crates/sbv2_core/src/main.rs b/crates/sbv2_core/src/main.rs index 36ea49b..d635a4f 100644 --- a/crates/sbv2_core/src/main.rs +++ b/crates/sbv2_core/src/main.rs @@ -30,8 +30,7 @@ fn main_inner() -> anyhow::Result<()> { } } - let audio = - tts_holder.easy_synthesize(ident, text, 0, 0, tts::SynthesizeOptions::default())?; + let audio = tts_holder.easy_synthesize(ident, text, 0, 0, tts::SynthesizeOptions::default())?; fs::write("output.wav", audio)?; Ok(()) From dd5c536f39d149d71b1d1fd29188022e70e4c749 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Thu, 27 Mar 2025 13:53:27 +0900 Subject: [PATCH 09/26] feat: g2kana_tone --- crates/sbv2_core/src/jtalk.rs | 5 +++++ crates/sbv2_core/src/tts_util.rs | 9 ++++----- crates/sbv2_voicevox/src/main.rs | 5 ++--- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/crates/sbv2_core/src/jtalk.rs b/crates/sbv2_core/src/jtalk.rs index 7ef7232..c242f65 100644 --- a/crates/sbv2_core/src/jtalk.rs +++ b/crates/sbv2_core/src/jtalk.rs @@ -196,6 +196,11 @@ impl JTalkProcess { Ok((phones, tones, new_word2ph)) } + pub fn g2kana_tone(&self) -> Result> { + let (phones, tones, _) = self.g2p()?; + Ok(phone_tone_to_kana(phones, tones)) + } + fn distribute_phone(n_phone: i32, n_word: i32) -> Vec { let mut phones_per_word = vec![0; n_word as usize]; for _ in 0..n_phone { diff --git a/crates/sbv2_core/src/tts_util.rs b/crates/sbv2_core/src/tts_util.rs index 1334128..58cfbeb 100644 --- a/crates/sbv2_core/src/tts_util.rs +++ b/crates/sbv2_core/src/tts_util.rs @@ -10,13 +10,12 @@ use tokenizers::Tokenizer; pub fn preprocess_parse_text( text: &str, jtalk: &jtalk::JTalk, -) -> Result<(Vec, Vec, Vec, String, JTalkProcess)> { +) -> Result<(String, JTalkProcess)> { let text = jtalk.num2word(text)?; let normalized_text = norm::normalize_text(&text); let process = jtalk.process_text(&normalized_text)?; - let (phones, tones, word2ph) = process.g2p()?; - Ok((phones, tones, word2ph, normalized_text, process)) + Ok((normalized_text, process)) } /// Parse text and return the input for synthesize @@ -35,8 +34,8 @@ pub async fn parse_text( Box>>>, >, ) -> Result<(Array2, Array1, Array1, Array1)> { - let (phones, tones, mut word2ph, normalized_text, process) = - preprocess_parse_text(text, jtalk)?; + let (normalized_text, process) = preprocess_parse_text(text, jtalk)?; + let (phones, tones, mut word2ph) = process.g2p()?; let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones); let phones = utils::intersperse(&phones, 0); diff --git a/crates/sbv2_voicevox/src/main.rs b/crates/sbv2_voicevox/src/main.rs index e552d6c..2e21525 100644 --- a/crates/sbv2_voicevox/src/main.rs +++ b/crates/sbv2_voicevox/src/main.rs @@ -13,9 +13,8 @@ struct RequestCreateAudioQuery { } async fn create_audio_query(Query(request): Query) -> AppResult<()> { - let (phones, tones, _, normalized_text, process) = - preprocess_parse_text(&request.text, &JTalk::new()?)?; - println!("{:?}", phones); + let (normalized_text, process) = preprocess_parse_text(&request.text, &JTalk::new()?)?; + let kana_tone_list = process.g2kana_tone()?; Ok(()) } From acf94a12832660e20c7306c8052dbcda649f6564 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Thu, 27 Mar 2025 13:53:52 +0900 Subject: [PATCH 10/26] format --- crates/sbv2_core/src/tts_util.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/sbv2_core/src/tts_util.rs b/crates/sbv2_core/src/tts_util.rs index 58cfbeb..17eb9d7 100644 --- a/crates/sbv2_core/src/tts_util.rs +++ b/crates/sbv2_core/src/tts_util.rs @@ -7,10 +7,7 @@ use hound::{SampleFormat, WavSpec, WavWriter}; use ndarray::{concatenate, s, Array, Array1, Array2, Array3, Axis}; use tokenizers::Tokenizer; -pub fn preprocess_parse_text( - text: &str, - jtalk: &jtalk::JTalk, -) -> Result<(String, JTalkProcess)> { +pub fn preprocess_parse_text(text: &str, jtalk: &jtalk::JTalk) -> Result<(String, JTalkProcess)> { let text = jtalk.num2word(text)?; let normalized_text = norm::normalize_text(&text); From 472d1c600f31a4747858d13808fce1c1df4fe616 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Thu, 27 Mar 2025 13:59:00 +0900 Subject: [PATCH 11/26] fix: add route --- crates/sbv2_voicevox/src/main.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/sbv2_voicevox/src/main.rs b/crates/sbv2_voicevox/src/main.rs index 2e21525..46b2e75 100644 --- a/crates/sbv2_voicevox/src/main.rs +++ b/crates/sbv2_voicevox/src/main.rs @@ -15,12 +15,15 @@ struct RequestCreateAudioQuery { async fn create_audio_query(Query(request): Query) -> AppResult<()> { let (normalized_text, process) = preprocess_parse_text(&request.text, &JTalk::new()?)?; let kana_tone_list = process.g2kana_tone()?; + println!("{:?}", kana_tone_list); Ok(()) } #[tokio::main] async fn main() -> anyhow::Result<()> { - let app = Router::new().route("/", get(|| async { "Hello, world!" })); + let app = Router::new() + .route("/", get(|| async { "Hello, world!" })) + .route("/audio_query", get(create_audio_query)); let listener = TcpListener::bind("0.0.0.0:8080").await?; axum::serve(listener, app).await?; Ok(()) From a67df43fc709aea777609ee6e8fc7dcfcc197e0f Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Thu, 27 Mar 2025 14:42:43 +0900 Subject: [PATCH 12/26] fix --- crates/sbv2_core/src/jtalk.rs | 20 +++++++++---------- crates/sbv2_core/src/mora.rs | 33 ++++++++++++++++++-------------- crates/sbv2_voicevox/src/main.rs | 4 ++-- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/crates/sbv2_core/src/jtalk.rs b/crates/sbv2_core/src/jtalk.rs index c242f65..75588ec 100644 --- a/crates/sbv2_core/src/jtalk.rs +++ b/crates/sbv2_core/src/jtalk.rs @@ -76,7 +76,12 @@ static MORA_PATTERN: Lazy> = Lazy::new(|| { }); static LONG_PATTERN: Lazy = Lazy::new(|| Regex::new(r"(\w)(ー*)").unwrap()); -fn phone_tone_to_kana(phones: Vec, tones: Vec) -> Vec<(String, i32)> { +fn phone_tone_to_kana( + phones: Vec, + tones: Vec, +) -> Vec<(String, Option, String, i32)> { + let phones = &phones[1..]; + let tones = &tones[1..]; let mut results = Vec::new(); let mut current_mora = String::new(); for ((phone, next_phone), (&tone, &next_tone)) in phones @@ -85,7 +90,7 @@ fn phone_tone_to_kana(phones: Vec, tones: Vec) -> Vec<(String, i32) .zip(tones.iter().zip(tones.iter().skip(1))) { if PUNCTUATIONS.contains(&phone.clone().as_str()) { - results.push((phone.to_string(), tone)); + results.push((phone.to_string(), None, "pau".to_string(), tone)); continue; } if CONSONANTS.contains(&phone.clone()) { @@ -94,13 +99,8 @@ fn phone_tone_to_kana(phones: Vec, tones: Vec) -> Vec<(String, i32) current_mora = phone.to_string() } else { current_mora += phone; - results.push(( - MORA_PHONEMES_TO_MORA_KATA - .get(¤t_mora) - .unwrap() - .to_string(), - tone, - )); + let (kana, consonant, vowel) = MORA_PHONEMES_TO_MORA_KATA.get(¤t_mora).unwrap(); + results.push((kana.to_string(), consonant.clone(), vowel.to_string(), tone)); current_mora = String::new(); } } @@ -196,7 +196,7 @@ impl JTalkProcess { Ok((phones, tones, new_word2ph)) } - pub fn g2kana_tone(&self) -> Result> { + pub fn g2kana_tone(&self) -> Result, String, i32)>> { let (phones, tones, _) = self.g2p()?; Ok(phone_tone_to_kana(phones, tones)) } diff --git a/crates/sbv2_core/src/mora.rs b/crates/sbv2_core/src/mora.rs index 4becd67..1f63062 100644 --- a/crates/sbv2_core/src/mora.rs +++ b/crates/sbv2_core/src/mora.rs @@ -25,20 +25,25 @@ static MORA_LIST_ADDITIONAL: Lazy> = Lazy::new(|| { data.additional }); -pub static MORA_PHONEMES_TO_MORA_KATA: Lazy> = Lazy::new(|| { - let mut map = HashMap::new(); - for mora in MORA_LIST_MINIMUM.iter() { - map.insert( - format!( - "{}{}", - mora.consonant.clone().unwrap_or("".to_string()), - mora.vowel - ), - mora.mora.clone(), - ); - } - map -}); +pub static MORA_PHONEMES_TO_MORA_KATA: Lazy, String)>> = + Lazy::new(|| { + let mut map = HashMap::new(); + for mora in MORA_LIST_MINIMUM.iter() { + map.insert( + format!( + "{}{}", + mora.consonant.clone().unwrap_or("".to_string()), + mora.vowel + ), + ( + mora.mora.clone(), + mora.consonant.clone(), + mora.vowel.clone(), + ), + ); + } + map + }); pub static MORA_KATA_TO_MORA_PHONEMES: Lazy, String)>> = Lazy::new(|| { diff --git a/crates/sbv2_voicevox/src/main.rs b/crates/sbv2_voicevox/src/main.rs index 46b2e75..a68f1c6 100644 --- a/crates/sbv2_voicevox/src/main.rs +++ b/crates/sbv2_voicevox/src/main.rs @@ -12,11 +12,11 @@ struct RequestCreateAudioQuery { text: String, } -async fn create_audio_query(Query(request): Query) -> AppResult<()> { +async fn create_audio_query(Query(request): Query) -> AppResult { let (normalized_text, process) = preprocess_parse_text(&request.text, &JTalk::new()?)?; let kana_tone_list = process.g2kana_tone()?; println!("{:?}", kana_tone_list); - Ok(()) + Ok(normalized_text) } #[tokio::main] From 70e16f95adab72cc2045d802901dfe8650cead85 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Fri, 28 Mar 2025 20:06:00 +0900 Subject: [PATCH 13/26] =?UTF-8?q?fix:=20voicevox=E5=8C=96=E3=81=AF?= =?UTF-8?q?=E9=9B=A3=E3=81=97=E3=81=84=E3=81=AE=E3=81=A7=E3=80=81=E7=8B=AC?= =?UTF-8?q?=E8=87=AA=E3=81=AE=E3=82=A8=E3=83=87=E3=82=A3=E3=82=BF=E3=83=BC?= =?UTF-8?q?=E9=96=8B=E7=99=BA=E3=82=92=E3=81=99=E3=82=8B=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.toml | 2 +- crates/sbv2_core/src/jtalk.rs | 10 +++++----- crates/sbv2_core/src/mora.rs | 8 ++------ .../{sbv2_voicevox => sbv2_editor}/Cargo.toml | 0 .../{sbv2_voicevox => sbv2_editor}/README.md | 0 .../query2.json | 0 .../src/error.rs | 0 .../src/main.rs | 19 +++++++++++++++---- 8 files changed, 23 insertions(+), 16 deletions(-) rename crates/{sbv2_voicevox => sbv2_editor}/Cargo.toml (100%) rename crates/{sbv2_voicevox => sbv2_editor}/README.md (100%) rename crates/{sbv2_voicevox => sbv2_editor}/query2.json (100%) rename crates/{sbv2_voicevox => sbv2_editor}/src/error.rs (100%) rename crates/{sbv2_voicevox => sbv2_editor}/src/main.rs (63%) diff --git a/Cargo.toml b/Cargo.toml index fcc17c0..fe3e0ab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [workspace] resolver = "2" -members = ["./crates/sbv2_api", "./crates/sbv2_core", "./crates/sbv2_bindings", "./crates/sbv2_wasm", "crates/sbv2_voicevox"] +members = ["./crates/sbv2_api", "./crates/sbv2_core", "./crates/sbv2_bindings", "./crates/sbv2_wasm", "crates/sbv2_editor"] [workspace.package] version = "0.2.0-alpha6" diff --git a/crates/sbv2_core/src/jtalk.rs b/crates/sbv2_core/src/jtalk.rs index 75588ec..f73c159 100644 --- a/crates/sbv2_core/src/jtalk.rs +++ b/crates/sbv2_core/src/jtalk.rs @@ -79,7 +79,7 @@ static LONG_PATTERN: Lazy = Lazy::new(|| Regex::new(r"(\w)(ー*)").unwrap fn phone_tone_to_kana( phones: Vec, tones: Vec, -) -> Vec<(String, Option, String, i32)> { +) -> Vec<(String, i32)> { let phones = &phones[1..]; let tones = &tones[1..]; let mut results = Vec::new(); @@ -90,7 +90,7 @@ fn phone_tone_to_kana( .zip(tones.iter().zip(tones.iter().skip(1))) { if PUNCTUATIONS.contains(&phone.clone().as_str()) { - results.push((phone.to_string(), None, "pau".to_string(), tone)); + results.push((phone.to_string(), tone)); continue; } if CONSONANTS.contains(&phone.clone()) { @@ -99,8 +99,8 @@ fn phone_tone_to_kana( current_mora = phone.to_string() } else { current_mora += phone; - let (kana, consonant, vowel) = MORA_PHONEMES_TO_MORA_KATA.get(¤t_mora).unwrap(); - results.push((kana.to_string(), consonant.clone(), vowel.to_string(), tone)); + let kana = MORA_PHONEMES_TO_MORA_KATA.get(¤t_mora).unwrap(); + results.push((kana.to_string(), tone)); current_mora = String::new(); } } @@ -196,7 +196,7 @@ impl JTalkProcess { Ok((phones, tones, new_word2ph)) } - pub fn g2kana_tone(&self) -> Result, String, i32)>> { + pub fn g2kana_tone(&self) -> Result> { let (phones, tones, _) = self.g2p()?; Ok(phone_tone_to_kana(phones, tones)) } diff --git a/crates/sbv2_core/src/mora.rs b/crates/sbv2_core/src/mora.rs index 1f63062..dfe1ea8 100644 --- a/crates/sbv2_core/src/mora.rs +++ b/crates/sbv2_core/src/mora.rs @@ -25,7 +25,7 @@ static MORA_LIST_ADDITIONAL: Lazy> = Lazy::new(|| { data.additional }); -pub static MORA_PHONEMES_TO_MORA_KATA: Lazy, String)>> = +pub static MORA_PHONEMES_TO_MORA_KATA: Lazy> = Lazy::new(|| { let mut map = HashMap::new(); for mora in MORA_LIST_MINIMUM.iter() { @@ -35,11 +35,7 @@ pub static MORA_PHONEMES_TO_MORA_KATA: Lazy) -> AppResult { +#[derive(Deserialize)] +struct ResponseCreateAudioQuery { + kana: String, + tone: i32, +} + +async fn create_audio_query(Query(request): Query) -> AppResult>> { let (normalized_text, process) = preprocess_parse_text(&request.text, &JTalk::new()?)?; let kana_tone_list = process.g2kana_tone()?; - println!("{:?}", kana_tone_list); - Ok(normalized_text) + let response = kana_tone_list.iter().map(|(kana, tone)| { + ResponseCreateAudioQuery { + kana: kana.clone(), + tone: *tone, + } + }).collect::>(); + Ok(Json(response)) } #[tokio::main] From 3785faf81edc082172271b8b88c3f199763dd1fa Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Fri, 28 Mar 2025 20:08:07 +0900 Subject: [PATCH 14/26] fix --- crates/sbv2_core/src/jtalk.rs | 5 +---- crates/sbv2_core/src/mora.rs | 29 ++++++++++++++--------------- crates/sbv2_editor/src/main.rs | 21 ++++++++++++--------- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/crates/sbv2_core/src/jtalk.rs b/crates/sbv2_core/src/jtalk.rs index f73c159..205be8e 100644 --- a/crates/sbv2_core/src/jtalk.rs +++ b/crates/sbv2_core/src/jtalk.rs @@ -76,10 +76,7 @@ static MORA_PATTERN: Lazy> = Lazy::new(|| { }); static LONG_PATTERN: Lazy = Lazy::new(|| Regex::new(r"(\w)(ー*)").unwrap()); -fn phone_tone_to_kana( - phones: Vec, - tones: Vec, -) -> Vec<(String, i32)> { +fn phone_tone_to_kana(phones: Vec, tones: Vec) -> Vec<(String, i32)> { let phones = &phones[1..]; let tones = &tones[1..]; let mut results = Vec::new(); diff --git a/crates/sbv2_core/src/mora.rs b/crates/sbv2_core/src/mora.rs index dfe1ea8..4becd67 100644 --- a/crates/sbv2_core/src/mora.rs +++ b/crates/sbv2_core/src/mora.rs @@ -25,21 +25,20 @@ static MORA_LIST_ADDITIONAL: Lazy> = Lazy::new(|| { data.additional }); -pub static MORA_PHONEMES_TO_MORA_KATA: Lazy> = - Lazy::new(|| { - let mut map = HashMap::new(); - for mora in MORA_LIST_MINIMUM.iter() { - map.insert( - format!( - "{}{}", - mora.consonant.clone().unwrap_or("".to_string()), - mora.vowel - ), - mora.mora.clone(), - ); - } - map - }); +pub static MORA_PHONEMES_TO_MORA_KATA: Lazy> = Lazy::new(|| { + let mut map = HashMap::new(); + for mora in MORA_LIST_MINIMUM.iter() { + map.insert( + format!( + "{}{}", + mora.consonant.clone().unwrap_or("".to_string()), + mora.vowel + ), + mora.mora.clone(), + ); + } + map +}); pub static MORA_KATA_TO_MORA_PHONEMES: Lazy, String)>> = Lazy::new(|| { diff --git a/crates/sbv2_editor/src/main.rs b/crates/sbv2_editor/src/main.rs index 7cc13f5..ac77598 100644 --- a/crates/sbv2_editor/src/main.rs +++ b/crates/sbv2_editor/src/main.rs @@ -1,6 +1,6 @@ -use axum::{extract::Query, routing::get, Router, Json}; +use axum::{extract::Query, response::IntoResponse, routing::get, Json, Router}; use sbv2_core::{jtalk::JTalk, tts_util::preprocess_parse_text}; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use tokio::net::TcpListener; use error::AppResult; @@ -12,21 +12,24 @@ struct RequestCreateAudioQuery { text: String, } -#[derive(Deserialize)] +#[derive(Serialize)] struct ResponseCreateAudioQuery { kana: String, tone: i32, } -async fn create_audio_query(Query(request): Query) -> AppResult>> { - let (normalized_text, process) = preprocess_parse_text(&request.text, &JTalk::new()?)?; +async fn create_audio_query( + Query(request): Query, +) -> AppResult { + let (_, process) = preprocess_parse_text(&request.text, &JTalk::new()?)?; let kana_tone_list = process.g2kana_tone()?; - let response = kana_tone_list.iter().map(|(kana, tone)| { - ResponseCreateAudioQuery { + let response = kana_tone_list + .iter() + .map(|(kana, tone)| ResponseCreateAudioQuery { kana: kana.clone(), tone: *tone, - } - }).collect::>(); + }) + .collect::>(); Ok(Json(response)) } From 01f2aaa4069a25e4c00cad94f2aeedd8b6beafdb Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Fri, 28 Mar 2025 20:14:51 +0900 Subject: [PATCH 15/26] no voicevox --- Cargo.lock | 2 +- crates/sbv2_editor/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cba845e..cf397ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2313,7 +2313,7 @@ dependencies = [ ] [[package]] -name = "sbv2_voicevox" +name = "sbv2_editor" version = "0.2.0-alpha6" dependencies = [ "anyhow", diff --git a/crates/sbv2_editor/Cargo.toml b/crates/sbv2_editor/Cargo.toml index 6403e89..734109d 100644 --- a/crates/sbv2_editor/Cargo.toml +++ b/crates/sbv2_editor/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "sbv2_voicevox" +name = "sbv2_editor" version.workspace = true edition.workspace = true description.workspace = true From 00e95cd77c817a391e346d80a69d0fecf5e19fa2 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Sat, 29 Mar 2025 10:50:30 +0900 Subject: [PATCH 16/26] feat: synthesis --- Cargo.lock | 5 +- crates/sbv2_core/src/tts.rs | 89 ++++++++++++++++++ crates/sbv2_core/src/tts_util.rs | 7 +- crates/sbv2_editor/Cargo.toml | 1 + crates/sbv2_editor/src/main.rs | 154 +++++++++++++++++++++++++++++-- 5 files changed, 242 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cf397ea..839c8a5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1425,9 +1425,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.26" +version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] name = "macro_rules_attribute" @@ -2318,6 +2318,7 @@ version = "0.2.0-alpha6" dependencies = [ "anyhow", "axum", + "log", "sbv2_core", "serde", "tokio", diff --git a/crates/sbv2_core/src/tts.rs b/crates/sbv2_core/src/tts.rs index 0c45da5..4ec9fae 100644 --- a/crates/sbv2_core/src/tts.rs +++ b/crates/sbv2_core/src/tts.rs @@ -205,6 +205,23 @@ impl TTSModelHolder { ) -> Result<(Array2, Array1, Array1, Array1)> { crate::tts_util::parse_text_blocking( text, + None, + &self.jtalk, + &self.tokenizer, + |token_ids, attention_masks| { + crate::bert::predict(&mut self.bert, token_ids, attention_masks) + }, + ) + } + + pub fn parse_text_neo( + &mut self, + text: String, + given_tones: Option>, + ) -> Result<(Array2, Array1, Array1, Array1)> { + crate::tts_util::parse_text_blocking( + &text, + given_tones, &self.jtalk, &self.tokenizer, |token_ids, attention_masks| { @@ -347,6 +364,78 @@ impl TTSModelHolder { }; tts_util::array_to_vec(audio_array) } + + pub fn easy_synthesize_neo + Copy>( + &mut self, + ident: I, + text: &str, + given_tones: Option>, + style_id: i32, + speaker_id: i64, + options: SynthesizeOptions, + ) -> Result> { + self.find_and_load_model(ident)?; + let style_vector = self.get_style_vector(ident, style_id, options.style_weight)?; + let audio_array = if options.split_sentences { + let texts: Vec<&str> = text.split('\n').collect(); + let mut audios = vec![]; + for (i, t) in texts.iter().enumerate() { + if t.is_empty() { + continue; + } + let (bert_ori, phones, tones, lang_ids) = self.parse_text_neo(t, given_tones)?; + + let vits2 = self + .find_model(ident)? + .vits2 + .as_mut() + .ok_or(Error::ModelNotFoundError(ident.into().to_string()))?; + let audio = model::synthesize( + vits2, + bert_ori.to_owned(), + phones, + Array1::from_vec(vec![speaker_id]), + tones, + lang_ids, + style_vector.clone(), + options.sdp_ratio, + options.length_scale, + 0.677, + 0.8, + )?; + audios.push(audio.clone()); + if i != texts.len() - 1 { + audios.push(Array3::zeros((1, 1, 22050))); + } + } + concatenate( + Axis(2), + &audios.iter().map(|x| x.view()).collect::>(), + )? + } else { + let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?; + + let vits2 = self + .find_model(ident)? + .vits2 + .as_mut() + .ok_or(Error::ModelNotFoundError(ident.into().to_string()))?; + model::synthesize( + vits2, + bert_ori.to_owned(), + phones, + Array1::from_vec(vec![speaker_id]), + tones, + lang_ids, + style_vector, + options.sdp_ratio, + options.length_scale, + 0.677, + 0.8, + )? + }; + tts_util::array_to_vec(audio_array) + } } /// Synthesize options diff --git a/crates/sbv2_core/src/tts_util.rs b/crates/sbv2_core/src/tts_util.rs index 17eb9d7..3fc4f77 100644 --- a/crates/sbv2_core/src/tts_util.rs +++ b/crates/sbv2_core/src/tts_util.rs @@ -34,7 +34,6 @@ pub async fn parse_text( let (normalized_text, process) = preprocess_parse_text(text, jtalk)?; let (phones, tones, mut word2ph) = process.g2p()?; let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones); - let phones = utils::intersperse(&phones, 0); let tones = utils::intersperse(&tones, 0); let lang_ids = utils::intersperse(&lang_ids, 0); @@ -99,6 +98,7 @@ pub async fn parse_text( #[allow(clippy::type_complexity)] pub fn parse_text_blocking( text: &str, + given_tones: Option>, jtalk: &jtalk::JTalk, tokenizer: &Tokenizer, bert_predict: impl FnOnce(Vec, Vec) -> Result>, @@ -107,7 +107,10 @@ pub fn parse_text_blocking( let normalized_text = norm::normalize_text(&text); let process = jtalk.process_text(&normalized_text)?; - let (phones, tones, mut word2ph) = process.g2p()?; + let (phones, mut tones, mut word2ph) = process.g2p()?; + if let Some(given_tones) = given_tones { + tones = given_tones; + } let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones); let phones = utils::intersperse(&phones, 0); diff --git a/crates/sbv2_editor/Cargo.toml b/crates/sbv2_editor/Cargo.toml index 734109d..3d7ffab 100644 --- a/crates/sbv2_editor/Cargo.toml +++ b/crates/sbv2_editor/Cargo.toml @@ -11,6 +11,7 @@ documentation.workspace = true [dependencies] anyhow.workspace = true axum = "0.8.1" +log = "0.4.27" sbv2_core = { version = "0.2.0-alpha6", path = "../sbv2_core" } serde = { version = "1.0.219", features = ["derive"] } tokio = { version = "1.44.1", features = ["full"] } diff --git a/crates/sbv2_editor/src/main.rs b/crates/sbv2_editor/src/main.rs index ac77598..2268210 100644 --- a/crates/sbv2_editor/src/main.rs +++ b/crates/sbv2_editor/src/main.rs @@ -1,7 +1,11 @@ -use axum::{extract::Query, response::IntoResponse, routing::get, Json, Router}; -use sbv2_core::{jtalk::JTalk, tts_util::preprocess_parse_text}; +use axum::extract::State; +use axum::{extract::Query, response::IntoResponse, routing::{get, post}, Json, Router}; +use sbv2_core::{jtalk::JTalk, tts::TTSModelHolder, tts_util::preprocess_parse_text}; use serde::{Deserialize, Serialize}; -use tokio::net::TcpListener; +use tokio::{fs, net::TcpListener, sync::Mutex}; + +use std::env; +use std::sync::Arc; use error::AppResult; @@ -12,32 +16,162 @@ struct RequestCreateAudioQuery { text: String, } -#[derive(Serialize)] -struct ResponseCreateAudioQuery { +#[derive(Serialize, Deserialize)] +struct AudioQuery { kana: String, tone: i32, } +#[derive(Serialize)] +struct ResponseCreateAudioQuery { + audio_query: Vec, + text: String, +} + async fn create_audio_query( Query(request): Query, ) -> AppResult { - let (_, process) = preprocess_parse_text(&request.text, &JTalk::new()?)?; + let (text, process) = preprocess_parse_text(&request.text, &JTalk::new()?)?; let kana_tone_list = process.g2kana_tone()?; - let response = kana_tone_list + let audio_query = kana_tone_list .iter() - .map(|(kana, tone)| ResponseCreateAudioQuery { + .map(|(kana, tone)| AudioQuery { kana: kana.clone(), tone: *tone, }) .collect::>(); - Ok(Json(response)) + Ok(Json(ResponseCreateAudioQuery { audio_query, text })) +} + +#[derive(Deserialize)] +pub struct RequestSynthesis { + text: String, + speaker_id: i32, + sdp_ratio: f32, + length_scale: f32, + style_id: i32, + audio_query: Vec, +} + +async fn synthesis( + State(state): State, + Json(request): Json, +) -> AppResult { + let mut tones: Vec = request.audio_query.iter().map(|query| query.tone).collect(); + tones.insert(0, 0); + tones.push(0); + let buffer = { + let mut tts_model = state.tts_model.lock().await; + tts_model.easy_synthesize_neo( + &ident, + &text, + Some(tones), + request.style_id, + request.speaker_id, + SynthesizeOptions { + sdp_ratio: request.sdp_ratio, + length_scale: request.length_scale, + ..Default::default() + }, + )? + }; + Ok(([(CONTENT_TYPE, "audio/wav")], buffer)) +} + +#[derive(Clone)] +struct AppState { + tts_model: Arc>, +} + +impl AppState { + pub async fn new() -> anyhow::Result { + let mut tts_model = TTSModelHolder::new( + &fs::read(env::var("BERT_MODEL_PATH")?).await?, + &fs::read(env::var("TOKENIZER_PATH")?).await?, + env::var("HOLDER_MAX_LOADED_MODElS") + .ok() + .and_then(|x| x.parse().ok()), + )?; + let models = env::var("MODELS_PATH").unwrap_or("models".to_string()); + let mut f = fs::read_dir(&models).await?; + let mut entries = vec![]; + while let Ok(Some(e)) = f.next_entry().await { + let name = e.file_name().to_string_lossy().to_string(); + if name.ends_with(".onnx") && name.starts_with("model_") { + let name_len = name.len(); + let name = name.chars(); + entries.push( + name.collect::>()[6..name_len - 5] + .iter() + .collect::(), + ); + } else if name.ends_with(".sbv2") { + let entry = &name[..name.len() - 5]; + log::info!("Try loading: {entry}"); + let sbv2_bytes = match fs::read(format!("{models}/{entry}.sbv2")).await { + Ok(b) => b, + Err(e) => { + log::warn!("Error loading sbv2_bytes from file {entry}: {e}"); + continue; + } + }; + if let Err(e) = tts_model.load_sbv2file(entry, sbv2_bytes) { + log::warn!("Error loading {entry}: {e}"); + }; + log::info!("Loaded: {entry}"); + } else if name.ends_with(".aivmx") { + let entry = &name[..name.len() - 6]; + log::info!("Try loading: {entry}"); + let aivmx_bytes = match fs::read(format!("{models}/{entry}.aivmx")).await { + Ok(b) => b, + Err(e) => { + log::warn!("Error loading aivmx bytes from file {entry}: {e}"); + continue; + } + }; + if let Err(e) = tts_model.load_aivmx(entry, aivmx_bytes) { + log::error!("Error loading {entry}: {e}"); + } + log::info!("Loaded: {entry}"); + } + } + for entry in entries { + log::info!("Try loading: {entry}"); + let style_vectors_bytes = + match fs::read(format!("{models}/style_vectors_{entry}.json")).await { + Ok(b) => b, + Err(e) => { + log::warn!("Error loading style_vectors_bytes from file {entry}: {e}"); + continue; + } + }; + let vits2_bytes = match fs::read(format!("{models}/model_{entry}.onnx")).await { + Ok(b) => b, + Err(e) => { + log::warn!("Error loading vits2_bytes from file {entry}: {e}"); + continue; + } + }; + if let Err(e) = tts_model.load(&entry, style_vectors_bytes, vits2_bytes) { + log::warn!("Error loading {entry}: {e}"); + }; + log::info!("Loaded: {entry}"); + } + Ok(Self { + tts_model: Arc::new(Mutex::new(tts_model)), + }) + } } #[tokio::main] async fn main() -> anyhow::Result<()> { + dotenvy::dotenv_override().ok(); + env_logger::init(); let app = Router::new() .route("/", get(|| async { "Hello, world!" })) - .route("/audio_query", get(create_audio_query)); + .route("/audio_query", get(create_audio_query)) + .route("/synthesis", post(synthesis)) + .with_state(AppState::new().await?); let listener = TcpListener::bind("0.0.0.0:8080").await?; axum::serve(listener, app).await?; Ok(()) From 6e01103c5def1fe374b4a476954bcf743fd622dd Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Sat, 29 Mar 2025 10:50:40 +0900 Subject: [PATCH 17/26] format --- crates/sbv2_editor/src/main.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crates/sbv2_editor/src/main.rs b/crates/sbv2_editor/src/main.rs index 2268210..9c28534 100644 --- a/crates/sbv2_editor/src/main.rs +++ b/crates/sbv2_editor/src/main.rs @@ -1,5 +1,10 @@ use axum::extract::State; -use axum::{extract::Query, response::IntoResponse, routing::{get, post}, Json, Router}; +use axum::{ + extract::Query, + response::IntoResponse, + routing::{get, post}, + Json, Router, +}; use sbv2_core::{jtalk::JTalk, tts::TTSModelHolder, tts_util::preprocess_parse_text}; use serde::{Deserialize, Serialize}; use tokio::{fs, net::TcpListener, sync::Mutex}; From 64fc74eee6945aa7b042d5f871711d384a2649e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?tuna2134=40=E3=82=B3=E3=83=9E=E3=83=AA=E3=83=B3=E8=A6=AA?= =?UTF-8?q?=E8=A1=9B=E9=9A=8A?= Date: Sat, 29 Mar 2025 10:58:24 +0900 Subject: [PATCH 18/26] fix: bug --- crates/sbv2_core/src/tts.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/sbv2_core/src/tts.rs b/crates/sbv2_core/src/tts.rs index 4ec9fae..56b3bf5 100644 --- a/crates/sbv2_core/src/tts.rs +++ b/crates/sbv2_core/src/tts.rs @@ -383,7 +383,7 @@ impl TTSModelHolder { if t.is_empty() { continue; } - let (bert_ori, phones, tones, lang_ids) = self.parse_text_neo(t, given_tones)?; + let (bert_ori, phones, tones, lang_ids) = self.parse_text_neo(t.to_string(), given_tones)?; let vits2 = self .find_model(ident)? From 48aef6cef4c165a87e2d6684c3f77cd481226785 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?tuna2134=40=E3=82=B3=E3=83=9E=E3=83=AA=E3=83=B3=E8=A6=AA?= =?UTF-8?q?=E8=A1=9B=E9=9A=8A?= Date: Sat, 29 Mar 2025 11:02:23 +0900 Subject: [PATCH 19/26] =?UTF-8?q?tts.rs=20=E3=82=92=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/sbv2_core/src/tts.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/sbv2_core/src/tts.rs b/crates/sbv2_core/src/tts.rs index 56b3bf5..b8c3966 100644 --- a/crates/sbv2_core/src/tts.rs +++ b/crates/sbv2_core/src/tts.rs @@ -383,7 +383,7 @@ impl TTSModelHolder { if t.is_empty() { continue; } - let (bert_ori, phones, tones, lang_ids) = self.parse_text_neo(t.to_string(), given_tones)?; + let (bert_ori, phones, tones, lang_ids) = self.parse_text_neo(t.to_string(), given_tones.clone())?; let vits2 = self .find_model(ident)? From 5abfe732e44d10716500907122e8ec645f72ace2 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Mon, 31 Mar 2025 22:45:55 +0900 Subject: [PATCH 20/26] fix bug --- Cargo.lock | 2 ++ crates/sbv2_editor/Cargo.toml | 4 +++- crates/sbv2_editor/src/main.rs | 10 ++++++---- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 839c8a5..6a64f06 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2318,6 +2318,8 @@ version = "0.2.0-alpha6" dependencies = [ "anyhow", "axum", + "dotenvy", + "env_logger", "log", "sbv2_core", "serde", diff --git a/crates/sbv2_editor/Cargo.toml b/crates/sbv2_editor/Cargo.toml index 3d7ffab..3dd2501 100644 --- a/crates/sbv2_editor/Cargo.toml +++ b/crates/sbv2_editor/Cargo.toml @@ -11,7 +11,9 @@ documentation.workspace = true [dependencies] anyhow.workspace = true axum = "0.8.1" +dotenvy.workspace = true +env_logger.workspace = true log = "0.4.27" -sbv2_core = { version = "0.2.0-alpha6", path = "../sbv2_core" } +sbv2_core = { version = "0.2.0-alpha6", path = "../sbv2_core", features = ["aivmx"] } serde = { version = "1.0.219", features = ["derive"] } tokio = { version = "1.44.1", features = ["full"] } diff --git a/crates/sbv2_editor/src/main.rs b/crates/sbv2_editor/src/main.rs index 9c28534..3cf4859 100644 --- a/crates/sbv2_editor/src/main.rs +++ b/crates/sbv2_editor/src/main.rs @@ -4,8 +4,9 @@ use axum::{ response::IntoResponse, routing::{get, post}, Json, Router, + http::header::CONTENT_TYPE, }; -use sbv2_core::{jtalk::JTalk, tts::TTSModelHolder, tts_util::preprocess_parse_text}; +use sbv2_core::{jtalk::JTalk, tts::{TTSModelHolder, SynthesizeOptions}, tts_util::preprocess_parse_text}; use serde::{Deserialize, Serialize}; use tokio::{fs, net::TcpListener, sync::Mutex}; @@ -51,11 +52,12 @@ async fn create_audio_query( #[derive(Deserialize)] pub struct RequestSynthesis { text: String, - speaker_id: i32, + speaker_id: i64, sdp_ratio: f32, length_scale: f32, style_id: i32, audio_query: Vec, + ident: String, } async fn synthesis( @@ -68,8 +70,8 @@ async fn synthesis( let buffer = { let mut tts_model = state.tts_model.lock().await; tts_model.easy_synthesize_neo( - &ident, - &text, + &request.ident, + &request.text, Some(tones), request.style_id, request.speaker_id, From 53d7daf11a96b7158c39a83490244bad3c4862b4 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Mon, 31 Mar 2025 23:03:30 +0900 Subject: [PATCH 21/26] fix --- crates/sbv2_core/src/tts.rs | 2 +- crates/sbv2_editor/src/main.rs | 6 +++++- test.py | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 test.py diff --git a/crates/sbv2_core/src/tts.rs b/crates/sbv2_core/src/tts.rs index b8c3966..be7c9fd 100644 --- a/crates/sbv2_core/src/tts.rs +++ b/crates/sbv2_core/src/tts.rs @@ -41,7 +41,7 @@ pub struct TTSModelHolder { tokenizer: Tokenizer, bert: Session, models: Vec, - jtalk: jtalk::JTalk, + pub jtalk: jtalk::JTalk, max_loaded_models: Option, } diff --git a/crates/sbv2_editor/src/main.rs b/crates/sbv2_editor/src/main.rs index 3cf4859..cc4bf16 100644 --- a/crates/sbv2_editor/src/main.rs +++ b/crates/sbv2_editor/src/main.rs @@ -35,9 +35,13 @@ struct ResponseCreateAudioQuery { } async fn create_audio_query( + State(state): State, Query(request): Query, ) -> AppResult { - let (text, process) = preprocess_parse_text(&request.text, &JTalk::new()?)?; + let (text, process) = { + let mut tts_model = state.tts_model.lock().await; + preprocess_parse_text(&request.text, &tts_model.jtalk)? + }; let kana_tone_list = process.g2kana_tone()?; let audio_query = kana_tone_list .iter() diff --git a/test.py b/test.py new file mode 100644 index 0000000..371abb5 --- /dev/null +++ b/test.py @@ -0,0 +1,19 @@ +import requests + + +data = (requests.get("http://localhost:8080/audio_query", params={ + "text": "こんにちは", +})).json() +print(data) + +data = (requests.post("http://localhost:8080/synthesis", json={ + "text": "こんにちは", + "ident": "tsukuyomi", + "speaker_id": 0, + "style_id": 0, + "sdp_ratio": 0.5, + "length_scale": 0.5, + "audio_query": data["audio_query"], +})).content +with open("test.wav", "wb") as f: + f.write(data) \ No newline at end of file From 633dfc305e92e5d32448ffd0ac7b631017d45aa1 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Mon, 31 Mar 2025 23:04:23 +0900 Subject: [PATCH 22/26] delete mut --- crates/sbv2_editor/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/sbv2_editor/src/main.rs b/crates/sbv2_editor/src/main.rs index cc4bf16..88d79d4 100644 --- a/crates/sbv2_editor/src/main.rs +++ b/crates/sbv2_editor/src/main.rs @@ -39,7 +39,7 @@ async fn create_audio_query( Query(request): Query, ) -> AppResult { let (text, process) = { - let mut tts_model = state.tts_model.lock().await; + let tts_model = state.tts_model.lock().await; preprocess_parse_text(&request.text, &tts_model.jtalk)? }; let kana_tone_list = process.g2kana_tone()?; From a5d783bd65f3f5eb51bc6d20a143054b710de076 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Mon, 31 Mar 2025 23:35:39 +0900 Subject: [PATCH 23/26] fix: bug --- crates/sbv2_core/src/tts_util.rs | 23 +++++++++++++++++++++++ crates/sbv2_editor/src/main.rs | 14 +++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/crates/sbv2_core/src/tts_util.rs b/crates/sbv2_core/src/tts_util.rs index 3fc4f77..97b69c6 100644 --- a/crates/sbv2_core/src/tts_util.rs +++ b/crates/sbv2_core/src/tts_util.rs @@ -2,6 +2,8 @@ use std::io::Cursor; use crate::error::Result; use crate::jtalk::JTalkProcess; +use crate::mora::MORA_KATA_TO_MORA_PHONEMES; +use crate::norm::PUNCTUATIONS; use crate::{jtalk, nlp, norm, tokenizer, utils}; use hound::{SampleFormat, WavSpec, WavWriter}; use ndarray::{concatenate, s, Array, Array1, Array2, Array3, Axis}; @@ -111,6 +113,7 @@ pub fn parse_text_blocking( if let Some(given_tones) = given_tones { tones = given_tones; } + println!("tones: {:?}", tones); let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones); let phones = utils::intersperse(&phones, 0); @@ -188,3 +191,23 @@ pub fn array_to_vec(audio_array: Array3) -> Result> { writer.finalize()?; Ok(cursor.into_inner()) } + +pub fn kata_tone2phone_tone(kata_tone: Vec<(String, i32)>) -> Vec<(String, i32)> { + let mut results = vec![("_".to_string(), 0)]; + for (mora, tone) in kata_tone { + if PUNCTUATIONS.contains(&mora.as_str()) { + results.push((mora, 0)); + continue; + } else { + let (consonant, vowel) = MORA_KATA_TO_MORA_PHONEMES.get(&mora).unwrap(); + if let Some(consonant) = consonant { + results.push((consonant.to_string(), tone)); + results.push((vowel.to_string(), tone)); + } else { + results.push((vowel.to_string(), tone)); + } + } + } + results.push(("_".to_string(), 0)); + results +} \ No newline at end of file diff --git a/crates/sbv2_editor/src/main.rs b/crates/sbv2_editor/src/main.rs index 88d79d4..a3b6995 100644 --- a/crates/sbv2_editor/src/main.rs +++ b/crates/sbv2_editor/src/main.rs @@ -6,6 +6,7 @@ use axum::{ Json, Router, http::header::CONTENT_TYPE, }; +use sbv2_core::tts_util::kata_tone2phone_tone; use sbv2_core::{jtalk::JTalk, tts::{TTSModelHolder, SynthesizeOptions}, tts_util::preprocess_parse_text}; use serde::{Deserialize, Serialize}; use tokio::{fs, net::TcpListener, sync::Mutex}; @@ -68,9 +69,16 @@ async fn synthesis( State(state): State, Json(request): Json, ) -> AppResult { - let mut tones: Vec = request.audio_query.iter().map(|query| query.tone).collect(); - tones.insert(0, 0); - tones.push(0); + let phone_tone = request + .audio_query + .iter() + .map(|query| (query.kana.clone(), query.tone)) + .collect::>(); + let phone_tone = kata_tone2phone_tone(phone_tone); + let tones = phone_tone + .iter() + .map(|(_, tone)| *tone) + .collect::>(); let buffer = { let mut tts_model = state.tts_model.lock().await; tts_model.easy_synthesize_neo( From 70c2341afd92ada354290fcfae76ee79ff5a7925 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Mon, 31 Mar 2025 23:35:51 +0900 Subject: [PATCH 24/26] format --- crates/sbv2_core/src/tts.rs | 3 ++- crates/sbv2_core/src/tts_util.rs | 2 +- crates/sbv2_editor/src/main.rs | 13 +++++++------ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/crates/sbv2_core/src/tts.rs b/crates/sbv2_core/src/tts.rs index be7c9fd..83d8c1a 100644 --- a/crates/sbv2_core/src/tts.rs +++ b/crates/sbv2_core/src/tts.rs @@ -383,7 +383,8 @@ impl TTSModelHolder { if t.is_empty() { continue; } - let (bert_ori, phones, tones, lang_ids) = self.parse_text_neo(t.to_string(), given_tones.clone())?; + let (bert_ori, phones, tones, lang_ids) = + self.parse_text_neo(t.to_string(), given_tones.clone())?; let vits2 = self .find_model(ident)? diff --git a/crates/sbv2_core/src/tts_util.rs b/crates/sbv2_core/src/tts_util.rs index 97b69c6..a5569f6 100644 --- a/crates/sbv2_core/src/tts_util.rs +++ b/crates/sbv2_core/src/tts_util.rs @@ -210,4 +210,4 @@ pub fn kata_tone2phone_tone(kata_tone: Vec<(String, i32)>) -> Vec<(String, i32)> } results.push(("_".to_string(), 0)); results -} \ No newline at end of file +} diff --git a/crates/sbv2_editor/src/main.rs b/crates/sbv2_editor/src/main.rs index a3b6995..5e79ffb 100644 --- a/crates/sbv2_editor/src/main.rs +++ b/crates/sbv2_editor/src/main.rs @@ -1,13 +1,17 @@ use axum::extract::State; use axum::{ extract::Query, + http::header::CONTENT_TYPE, response::IntoResponse, routing::{get, post}, Json, Router, - http::header::CONTENT_TYPE, }; use sbv2_core::tts_util::kata_tone2phone_tone; -use sbv2_core::{jtalk::JTalk, tts::{TTSModelHolder, SynthesizeOptions}, tts_util::preprocess_parse_text}; +use sbv2_core::{ + jtalk::JTalk, + tts::{SynthesizeOptions, TTSModelHolder}, + tts_util::preprocess_parse_text, +}; use serde::{Deserialize, Serialize}; use tokio::{fs, net::TcpListener, sync::Mutex}; @@ -75,10 +79,7 @@ async fn synthesis( .map(|query| (query.kana.clone(), query.tone)) .collect::>(); let phone_tone = kata_tone2phone_tone(phone_tone); - let tones = phone_tone - .iter() - .map(|(_, tone)| *tone) - .collect::>(); + let tones = phone_tone.iter().map(|(_, tone)| *tone).collect::>(); let buffer = { let mut tts_model = state.tts_model.lock().await; tts_model.easy_synthesize_neo( From 01541ff3810438f03f4b6686680ce90f58cd113a Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Mon, 31 Mar 2025 23:36:10 +0900 Subject: [PATCH 25/26] delete unimport --- crates/sbv2_editor/src/main.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/sbv2_editor/src/main.rs b/crates/sbv2_editor/src/main.rs index 5e79ffb..fb596d5 100644 --- a/crates/sbv2_editor/src/main.rs +++ b/crates/sbv2_editor/src/main.rs @@ -8,7 +8,6 @@ use axum::{ }; use sbv2_core::tts_util::kata_tone2phone_tone; use sbv2_core::{ - jtalk::JTalk, tts::{SynthesizeOptions, TTSModelHolder}, tts_util::preprocess_parse_text, }; From 103eb51ca86e9a3ce5071eaa9e29ad9cc1490434 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Mon, 31 Mar 2025 23:39:44 +0900 Subject: [PATCH 26/26] delete --- crates/sbv2_core/src/tts_util.rs | 1 - test.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/sbv2_core/src/tts_util.rs b/crates/sbv2_core/src/tts_util.rs index a5569f6..37f89ae 100644 --- a/crates/sbv2_core/src/tts_util.rs +++ b/crates/sbv2_core/src/tts_util.rs @@ -113,7 +113,6 @@ pub fn parse_text_blocking( if let Some(given_tones) = given_tones { tones = given_tones; } - println!("tones: {:?}", tones); let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones); let phones = utils::intersperse(&phones, 0); diff --git a/test.py b/test.py index 371abb5..4b97abf 100644 --- a/test.py +++ b/test.py @@ -2,12 +2,12 @@ import requests data = (requests.get("http://localhost:8080/audio_query", params={ - "text": "こんにちは", + "text": "こんにちは、今日はいい天気ですね。", })).json() print(data) data = (requests.post("http://localhost:8080/synthesis", json={ - "text": "こんにちは", + "text": data["text"], "ident": "tsukuyomi", "speaker_id": 0, "style_id": 0,