From f4de3e15ae238a68be603235e2ce9f713ea41397 Mon Sep 17 00:00:00 2001 From: Masato Kikuchi Date: Wed, 26 Mar 2025 16:14:29 +0900 Subject: [PATCH] initial commit: voicevox --- Cargo.lock | 5 +++-- crates/sbv2_core/src/tts_util.rs | 10 ++++++---- crates/sbv2_voicevox/Cargo.toml | 1 + crates/sbv2_voicevox/src/main.rs | 7 ++++++- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1d69f54..97f0509 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2319,6 +2319,7 @@ dependencies = [ "anyhow", "axum", "sbv2_core", + "tokio", ] [[package]] @@ -2741,9 +2742,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.43.0" +version = "1.44.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" +checksum = "f382da615b842244d4b8738c82ed1275e6c5dd90c459a30941cd07080b06c91a" dependencies = [ "backtrace", "bytes", diff --git a/crates/sbv2_core/src/tts_util.rs b/crates/sbv2_core/src/tts_util.rs index 8cab20d..1334128 100644 --- a/crates/sbv2_core/src/tts_util.rs +++ b/crates/sbv2_core/src/tts_util.rs @@ -1,6 +1,7 @@ use std::io::Cursor; use crate::error::Result; +use crate::jtalk::JTalkProcess; use crate::{jtalk, nlp, norm, tokenizer, utils}; use hound::{SampleFormat, WavSpec, WavWriter}; use ndarray::{concatenate, s, Array, Array1, Array2, Array3, Axis}; @@ -9,13 +10,13 @@ use tokenizers::Tokenizer; pub fn preprocess_parse_text( text: &str, jtalk: &jtalk::JTalk, -) -> Result<(Vec, Vec, Vec)> { +) -> Result<(Vec, Vec, Vec, String, JTalkProcess)> { let text = jtalk.num2word(text)?; let normalized_text = norm::normalize_text(&text); let process = jtalk.process_text(&normalized_text)?; - let result = process.g2p()?; - Ok(result) + let (phones, tones, word2ph) = process.g2p()?; + Ok((phones, tones, word2ph, normalized_text, process)) } /// Parse text and return the input for synthesize @@ -34,7 +35,8 @@ pub async fn parse_text( Box>>>, >, ) -> Result<(Array2, Array1, Array1, Array1)> { - let (phones, tones, mut word2ph) = preprocess_parse_text(text, jtalk)?; + let (phones, tones, mut word2ph, normalized_text, process) = + preprocess_parse_text(text, jtalk)?; let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones); let phones = utils::intersperse(&phones, 0); diff --git a/crates/sbv2_voicevox/Cargo.toml b/crates/sbv2_voicevox/Cargo.toml index 4bd10aa..14c7b92 100644 --- a/crates/sbv2_voicevox/Cargo.toml +++ b/crates/sbv2_voicevox/Cargo.toml @@ -12,3 +12,4 @@ documentation.workspace = true anyhow.workspace = true axum = "0.8.1" sbv2_core = { version = "0.2.0-alpha6", path = "../sbv2_core" } +tokio = { version = "1.44.1", features = ["full"] } diff --git a/crates/sbv2_voicevox/src/main.rs b/crates/sbv2_voicevox/src/main.rs index 8d6f05f..911d91b 100644 --- a/crates/sbv2_voicevox/src/main.rs +++ b/crates/sbv2_voicevox/src/main.rs @@ -1,5 +1,10 @@ +use axum::{routing::get, Router}; +use tokio::net::TcpListener; +#[tokio::main] async fn main() -> anyhow::Result<()> { - println!("Hello, world!"); + let app = Router::new().route("/", get(|| async { "Hello, world!" })); + let listener = TcpListener::bind("0.0.0.0:8080").await?; + axum::serve(listener, app).await?; Ok(()) }