From b8f0477318aa9eb08846d291da11312a4d886935 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Wed, 26 Mar 2025 16:30:31 +0900 Subject: [PATCH] feat: audio query request --- Cargo.lock | 9 +- crates/sbv2_voicevox/Cargo.toml | 1 + crates/sbv2_voicevox/query2.json | 226 ++++++++++++++++++++++++++++++ crates/sbv2_voicevox/src/error.rs | 27 ++++ crates/sbv2_voicevox/src/main.rs | 20 ++- 5 files changed, 278 insertions(+), 5 deletions(-) create mode 100644 crates/sbv2_voicevox/query2.json create mode 100644 crates/sbv2_voicevox/src/error.rs diff --git a/Cargo.lock b/Cargo.lock index 97f0509..cba845e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2319,6 +2319,7 @@ dependencies = [ "anyhow", "axum", "sbv2_core", + "serde", "tokio", ] @@ -2374,18 +2375,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.218" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.218" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", diff --git a/crates/sbv2_voicevox/Cargo.toml b/crates/sbv2_voicevox/Cargo.toml index 14c7b92..6403e89 100644 --- a/crates/sbv2_voicevox/Cargo.toml +++ b/crates/sbv2_voicevox/Cargo.toml @@ -12,4 +12,5 @@ documentation.workspace = true anyhow.workspace = true axum = "0.8.1" sbv2_core = { version = "0.2.0-alpha6", path = "../sbv2_core" } +serde = { version = "1.0.219", features = ["derive"] } tokio = { version = "1.44.1", features = ["full"] } diff --git a/crates/sbv2_voicevox/query2.json b/crates/sbv2_voicevox/query2.json new file mode 100644 index 0000000..6cffac4 --- /dev/null +++ b/crates/sbv2_voicevox/query2.json @@ -0,0 +1,226 @@ +{ + "accent_phrases": [ + { + "moras": [ + { + "text": "コ", + "consonant": "k", + "consonant_length": 0.10002632439136505, + "vowel": "o", + "vowel_length": 0.15740256011486053, + "pitch": 5.749961853027344 + }, + { + "text": "ン", + "consonant": null, + "consonant_length": null, + "vowel": "N", + "vowel_length": 0.08265873789787292, + "pitch": 5.89122200012207 + }, + { + "text": "ニ", + "consonant": "n", + "consonant_length": 0.03657080978155136, + "vowel": "i", + "vowel_length": 0.1175866425037384, + "pitch": 5.969866752624512 + }, + { + "text": "チ", + "consonant": "ch", + "consonant_length": 0.09005842357873917, + "vowel": "i", + "vowel_length": 0.08666137605905533, + "pitch": 5.958892822265625 + }, + { + "text": "ワ", + "consonant": "w", + "consonant_length": 0.07833231985569, + "vowel": "a", + "vowel_length": 0.21250136196613312, + "pitch": 5.949411392211914 + } + ], + "accent": 5, + "pause_mora": { + "text": "、", + "consonant": null, + "consonant_length": null, + "vowel": "pau", + "vowel_length": 0.4723339378833771, + "pitch": 0.0 + }, + "is_interrogative": false + }, + { + "moras": [ + { + "text": "オ", + "consonant": null, + "consonant_length": null, + "vowel": "o", + "vowel_length": 0.22004225850105286, + "pitch": 5.6870927810668945 + }, + { + "text": "ン", + "consonant": null, + "consonant_length": null, + "vowel": "N", + "vowel_length": 0.09161105751991272, + "pitch": 5.93472957611084 + }, + { + "text": "セ", + "consonant": "s", + "consonant_length": 0.08924821764230728, + "vowel": "e", + "vowel_length": 0.14142127335071564, + "pitch": 6.121850490570068 + }, + { + "text": "エ", + "consonant": null, + "consonant_length": null, + "vowel": "e", + "vowel_length": 0.10636933892965317, + "pitch": 6.157896041870117 + }, + { + "text": "ゴ", + "consonant": "g", + "consonant_length": 0.07600915431976318, + "vowel": "o", + "vowel_length": 0.09598273783922195, + "pitch": 6.188933849334717 + }, + { + "text": "オ", + "consonant": null, + "consonant_length": null, + "vowel": "o", + "vowel_length": 0.1079121008515358, + "pitch": 6.235202789306641 + }, + { + "text": "セ", + "consonant": "s", + "consonant_length": 0.09591838717460632, + "vowel": "e", + "vowel_length": 0.10286372154951096, + "pitch": 6.153214454650879 + }, + { + "text": "エ", + "consonant": null, + "consonant_length": null, + "vowel": "e", + "vowel_length": 0.08992656320333481, + "pitch": 6.02571439743042 + }, + { + "text": "ノ", + "consonant": "n", + "consonant_length": 0.05660202354192734, + "vowel": "o", + "vowel_length": 0.09676017612218857, + "pitch": 5.711844444274902 + } + ], + "accent": 5, + "pause_mora": null, + "is_interrogative": false + }, + { + "moras": [ + { + "text": "セ", + "consonant": "s", + "consonant_length": 0.07805486768484116, + "vowel": "e", + "vowel_length": 0.09617523103952408, + "pitch": 5.774399280548096 + }, + { + "text": "カ", + "consonant": "k", + "consonant_length": 0.06712044775485992, + "vowel": "a", + "vowel_length": 0.148829385638237, + "pitch": 6.063965797424316 + }, + { + "text": "イ", + "consonant": null, + "consonant_length": null, + "vowel": "i", + "vowel_length": 0.11061104387044907, + "pitch": 6.040698051452637 + }, + { + "text": "エ", + "consonant": null, + "consonant_length": null, + "vowel": "e", + "vowel_length": 0.13046696782112122, + "pitch": 5.806027889251709 + } + ], + "accent": 1, + "pause_mora": null, + "is_interrogative": false + }, + { + "moras": [ + { + "text": "ヨ", + "consonant": "y", + "consonant_length": 0.07194744795560837, + "vowel": "o", + "vowel_length": 0.08622600883245468, + "pitch": 5.694094657897949 + }, + { + "text": "オ", + "consonant": null, + "consonant_length": null, + "vowel": "o", + "vowel_length": 0.10635452717542648, + "pitch": 5.787222385406494 + }, + { + "text": "コ", + "consonant": "k", + "consonant_length": 0.07077334076166153, + "vowel": "o", + "vowel_length": 0.09248624742031097, + "pitch": 5.793357849121094 + }, + { + "text": "ソ", + "consonant": "s", + "consonant_length": 0.08705667406320572, + "vowel": "o", + "vowel_length": 0.2238258570432663, + "pitch": 5.643765449523926 + } + ], + "accent": 1, + "pause_mora": null, + "is_interrogative": false + } + ], + "speedScale": 1.0, + "pitchScale": 0.0, + "intonationScale": 1.0, + "volumeScale": 1.0, + "prePhonemeLength": 0.1, + "postPhonemeLength": 0.1, + "pauseLength": null, + "pauseLengthScale": 1.0, + "outputSamplingRate": 24000, + "outputStereo": false, + "kana": "コンニチワ'、オンセエゴ'オセエノ/セ'カイエ/ヨ'オコソ" +} \ No newline at end of file diff --git a/crates/sbv2_voicevox/src/error.rs b/crates/sbv2_voicevox/src/error.rs new file mode 100644 index 0000000..d3cf600 --- /dev/null +++ b/crates/sbv2_voicevox/src/error.rs @@ -0,0 +1,27 @@ +use axum::{ + http::StatusCode, + response::{IntoResponse, Response}, +}; + +pub type AppResult = std::result::Result; + +pub struct AppError(anyhow::Error); + +impl IntoResponse for AppError { + fn into_response(self) -> Response { + ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Something went wrong: {}", self.0), + ) + .into_response() + } +} + +impl From for AppError +where + E: Into, +{ + fn from(err: E) -> Self { + Self(err.into()) + } +} diff --git a/crates/sbv2_voicevox/src/main.rs b/crates/sbv2_voicevox/src/main.rs index 911d91b..f1a36d2 100644 --- a/crates/sbv2_voicevox/src/main.rs +++ b/crates/sbv2_voicevox/src/main.rs @@ -1,5 +1,23 @@ -use axum::{routing::get, Router}; +use axum::{routing::get, Router, extract::Query}; +use sbv2_core::{jtalk::JTalk, tts_util::preprocess_parse_text}; use tokio::net::TcpListener; +use serde::Deserialize; + +use error::AppResult; + +mod error; + +#[derive(Deserialize)] +struct RequestCreateAudioQuery { + text: String, +} + +async fn create_audio_query( + Query(request): Query, +) -> AppResult<()> { + let (phones, tones, mut word2ph, normalized_text, process) = preprocess_parse_text(&request.text, &JTalk::new()?)?; + Ok(()) +} #[tokio::main] async fn main() -> anyhow::Result<()> {