Merge pull request #198 from tuna2134/voicevox

Editor APIを追加
delete
2025-12-23 07:59:56 +00:00 · 2025-03-31 23:56:25 +09:00 · 2025-03-31 23:39:44 +09:00 · 2025-03-31 23:36:10 +09:00 · 2025-03-31 23:35:51 +09:00 · 2025-03-31 23:35:39 +09:00
13 changed files with 700 additions and 19 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1425,9 +1425,9 @@ dependencies = [

 [[package]]
 name = "log"
-version = "0.4.26"
+version = "0.4.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e"
+checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"

 [[package]]
 name = "macro_rules_attribute"
@@ -2312,6 +2312,20 @@ dependencies = [
 "zstd",
 ]

+[[package]]
+name = "sbv2_editor"
+version = "0.2.0-alpha6"
+dependencies = [
+ "anyhow",
+ "axum",
+ "dotenvy",
+ "env_logger",
+ "log",
+ "sbv2_core",
+ "serde",
+ "tokio",
+]
+
 [[package]]
 name = "sbv2_wasm"
 version = "0.2.0-alpha6"
@@ -2364,18 +2378,18 @@ dependencies = [

 [[package]]
 name = "serde"
-version = "1.0.218"
+version = "1.0.219"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60"
+checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
 dependencies = [
 "serde_derive",
 ]

 [[package]]
 name = "serde_derive"
-version = "1.0.218"
+version = "1.0.219"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b"
+checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -2732,9 +2746,9 @@ dependencies = [

 [[package]]
 name = "tokio"
-version = "1.43.0"
+version = "1.44.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e"
+checksum = "f382da615b842244d4b8738c82ed1275e6c5dd90c459a30941cd07080b06c91a"
 dependencies = [
 "backtrace",
 "bytes",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [workspace]
 resolver = "2"
-members = ["./crates/sbv2_api", "./crates/sbv2_core", "./crates/sbv2_bindings", "./crates/sbv2_wasm"]
+members = ["./crates/sbv2_api", "./crates/sbv2_core", "./crates/sbv2_bindings", "./crates/sbv2_wasm", "crates/sbv2_editor"]

 [workspace.package]
 version = "0.2.0-alpha6"
--- a/crates/sbv2_core/src/jtalk.rs
+++ b/crates/sbv2_core/src/jtalk.rs
@@ -1,5 +1,5 @@
 use crate::error::{Error, Result};
-use crate::mora::{MORA_KATA_TO_MORA_PHONEMES, VOWELS};
+use crate::mora::{CONSONANTS, MORA_KATA_TO_MORA_PHONEMES, MORA_PHONEMES_TO_MORA_KATA, VOWELS};
 use crate::norm::{replace_punctuation, PUNCTUATIONS};
 use jpreprocess::{kind, DefaultTokenizer, JPreprocess, SystemDictionaryConfig, UserDictionary};
 use once_cell::sync::Lazy;
@@ -76,6 +76,34 @@ static MORA_PATTERN: Lazy<Vec<String>> = Lazy::new(|| {
 });
 static LONG_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\w)(ー*)").unwrap());

+/// Converts the phoneme/tone sequences returned by `g2p` into `(katakana mora, tone)` pairs.
+///
+/// The first element of both inputs is skipped. Every phoneme is visited together with
+/// its successor, so the last element is never emitted either (presumably both are the
+/// `"_"` padding entries produced by `g2p` — TODO confirm). Punctuation is passed through
+/// unchanged together with its tone. Empty input yields an empty result.
+///
+/// # Panics
+///
+/// Panics if a consonant directly follows another consonant, if a consonant and the
+/// phoneme after it carry different tones, or if an assembled mora has no entry in
+/// `MORA_PHONEMES_TO_MORA_KATA`.
+fn phone_tone_to_kana(phones: Vec<String>, tones: Vec<i32>) -> Vec<(String, i32)> {
+    // `get(1..)` instead of `[1..]`: an empty input must not panic on the slice itself.
+    let phones = phones.get(1..).unwrap_or_default();
+    let tones = tones.get(1..).unwrap_or_default();
+    let mut results = Vec::new();
+    let mut current_mora = String::new();
+    // `windows(2)` pairs each element with its successor, exactly like zipping with `skip(1)`.
+    for (phone_pair, tone_pair) in phones.windows(2).zip(tones.windows(2)) {
+        let phone = &phone_pair[0];
+        let (tone, next_tone) = (tone_pair[0], tone_pair[1]);
+        if PUNCTUATIONS.contains(&phone.as_str()) {
+            results.push((phone.clone(), tone));
+            continue;
+        }
+        if CONSONANTS.contains(phone) {
+            // A consonant opens a new mora; it must share its tone with the vowel that follows.
+            assert_eq!(current_mora, "");
+            assert_eq!(tone, next_tone);
+            current_mora.clone_from(phone);
+        } else {
+            // A vowel closes the mora that is currently being assembled.
+            current_mora.push_str(phone);
+            let kana = MORA_PHONEMES_TO_MORA_KATA
+                .get(&current_mora)
+                .unwrap_or_else(|| panic!("no katakana mora for phonemes {current_mora:?}"));
+            results.push((kana.clone(), tone));
+            current_mora.clear();
+        }
+    }
+    results
+}
+
+
 pub struct JTalkProcess {
    jpreprocess: Arc<JPreprocessType>,
    parsed: Vec<String>,
@@ -165,6 +193,11 @@ impl JTalkProcess {
        Ok((phones, tones, new_word2ph))
    }

+    /// Runs `g2p` and converts its phonemes and tones into `(katakana mora, tone)` pairs.
+    ///
+    /// The third value returned by `g2p` is discarded. Errors from `g2p` are propagated.
+    pub fn g2kana_tone(&self) -> Result<Vec<(String, i32)>> {
+        self.g2p()
+            .map(|(phones, tones, _word2ph)| phone_tone_to_kana(phones, tones))
+    }
+
    fn distribute_phone(n_phone: i32, n_word: i32) -> Vec<i32> {
        let mut phones_per_word = vec![0; n_word as usize];
        for _ in 0..n_phone {
--- a/crates/sbv2_core/src/main.rs
+++ b/crates/sbv2_core/src/main.rs
@@ -30,8 +30,7 @@ fn main_inner() -> anyhow::Result<()> {
        }
    }

-    let audio =
-        tts_holder.easy_synthesize(ident, &text, 0, 0, tts::SynthesizeOptions::default())?;
+    let audio = tts_holder.easy_synthesize(ident, text, 0, 0, tts::SynthesizeOptions::default())?;
    fs::write("output.wav", audio)?;

    Ok(())
--- a/crates/sbv2_core/src/mora.rs
+++ b/crates/sbv2_core/src/mora.rs
@@ -25,6 +25,21 @@ static MORA_LIST_ADDITIONAL: Lazy<Vec<Mora>> = Lazy::new(|| {
    data.additional
 });

+/// Reverse lookup from a mora's phoneme spelling (consonant, if any, followed by the
+/// vowel) to its katakana text, built from `MORA_LIST_MINIMUM` only.
+///
+/// If several moras share a spelling, the one listed last wins.
+pub static MORA_PHONEMES_TO_MORA_KATA: Lazy<HashMap<String, String>> = Lazy::new(|| {
+    MORA_LIST_MINIMUM
+        .iter()
+        .map(|mora| {
+            let consonant = mora.consonant.as_deref().unwrap_or("");
+            (format!("{}{}", consonant, mora.vowel), mora.mora.clone())
+        })
+        .collect()
+});
+
 pub static MORA_KATA_TO_MORA_PHONEMES: Lazy<HashMap<String, (Option<String>, String)>> =
    Lazy::new(|| {
        let mut map = HashMap::new();
@@ -37,4 +52,12 @@ pub static MORA_KATA_TO_MORA_PHONEMES: Lazy<HashMap<String, (Option<String>, Str
        map
    });

+/// Every consonant that occurs in `MORA_KATA_TO_MORA_PHONEMES`, sorted and de-duplicated.
+pub static CONSONANTS: Lazy<Vec<String>> = Lazy::new(|| {
+    let mut consonants = MORA_KATA_TO_MORA_PHONEMES
+        .values()
+        .filter_map(|(consonant, _)| consonant.clone())
+        .collect::<Vec<_>>();
+    // Many moras share a consonant and `HashMap` iteration order is arbitrary; sorting and
+    // de-duplicating keeps the list short and deterministic for `contains` lookups.
+    consonants.sort_unstable();
+    consonants.dedup();
+    consonants
+});
+
 pub const VOWELS: [&str; 6] = ["a", "i", "u", "e", "o", "N"];
--- a/crates/sbv2_core/src/tts.rs
+++ b/crates/sbv2_core/src/tts.rs
@@ -41,7 +41,7 @@ pub struct TTSModelHolder {
    tokenizer: Tokenizer,
    bert: Session,
    models: Vec<TTSModel>,
-    jtalk: jtalk::JTalk,
+    pub jtalk: jtalk::JTalk,
    max_loaded_models: Option<usize>,
 }

@@ -205,6 +205,23 @@ impl TTSModelHolder {
    ) -> Result<(Array2<f32>, Array1<i64>, Array1<i64>, Array1<i64>)> {
        crate::tts_util::parse_text_blocking(
            text,
+            None,
+            &self.jtalk,
+            &self.tokenizer,
+            |token_ids, attention_masks| {
+                crate::bert::predict(&mut self.bert, token_ids, attention_masks)
+            },
+        )
+    }
+
+    pub fn parse_text_neo(
+        &mut self,
+        text: String,
+        given_tones: Option<Vec<i32>>,
+    ) -> Result<(Array2<f32>, Array1<i64>, Array1<i64>, Array1<i64>)> {
+        crate::tts_util::parse_text_blocking(
+            &text,
+            given_tones,
            &self.jtalk,
            &self.tokenizer,
            |token_ids, attention_masks| {
@@ -347,6 +364,79 @@ impl TTSModelHolder {
        };
        tts_util::array_to_vec(audio_array)
    }
+
+    /// Synthesizes `text` with the model identified by `ident` and returns the bytes
+    /// produced by `tts_util::array_to_vec`.
+    ///
+    /// When `given_tones` is `Some`, it replaces the tones derived from the text
+    /// (see `parse_text_neo`). With `options.split_sentences` set, `text` is split on
+    /// `'\n'`, empty lines are skipped, every remaining line is synthesized separately,
+    /// and 22050 zero samples are inserted after every line except the last.
+    ///
+    /// NOTE(review): in split mode the same `given_tones` vector is applied to every
+    /// line, which only fits when `text` consists of a single line — confirm that callers
+    /// guarantee this.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the model cannot be found or loaded, if it has no `vits2`
+    /// session, or if text parsing or synthesis fails.
+    pub fn easy_synthesize_neo<I: Into<TTSIdent> + Copy>(
+        &mut self,
+        ident: I,
+        text: &str,
+        given_tones: Option<Vec<i32>>,
+        style_id: i32,
+        speaker_id: i64,
+        options: SynthesizeOptions,
+    ) -> Result<Vec<u8>> {
+        self.find_and_load_model(ident)?;
+        let style_vector = self.get_style_vector(ident, style_id, options.style_weight)?;
+        let audio_array = if options.split_sentences {
+            let texts: Vec<&str> = text.split('\n').collect();
+            let mut audios = vec![];
+            for (i, t) in texts.iter().enumerate() {
+                if t.is_empty() {
+                    continue;
+                }
+                let (bert_ori, phones, tones, lang_ids) =
+                    self.parse_text_neo(t.to_string(), given_tones.clone())?;
+
+                let vits2 = self
+                    .find_model(ident)?
+                    .vits2
+                    .as_mut()
+                    .ok_or_else(|| Error::ModelNotFoundError(ident.into().to_string()))?;
+                let audio = model::synthesize(
+                    vits2,
+                    bert_ori.to_owned(),
+                    phones,
+                    Array1::from_vec(vec![speaker_id]),
+                    tones,
+                    lang_ids,
+                    style_vector.clone(),
+                    options.sdp_ratio,
+                    options.length_scale,
+                    0.677,
+                    0.8,
+                )?;
+                audios.push(audio);
+                if i != texts.len() - 1 {
+                    // Gap of zero samples between consecutive lines.
+                    audios.push(Array3::zeros((1, 1, 22050)));
+                }
+            }
+            concatenate(
+                Axis(2),
+                &audios.iter().map(|x| x.view()).collect::<Vec<_>>(),
+            )?
+        } else {
+            // Honor `given_tones` here as well; calling `parse_text` would silently drop them.
+            let (bert_ori, phones, tones, lang_ids) =
+                self.parse_text_neo(text.to_string(), given_tones)?;
+
+            let vits2 = self
+                .find_model(ident)?
+                .vits2
+                .as_mut()
+                .ok_or_else(|| Error::ModelNotFoundError(ident.into().to_string()))?;
+            model::synthesize(
+                vits2,
+                bert_ori.to_owned(),
+                phones,
+                Array1::from_vec(vec![speaker_id]),
+                tones,
+                lang_ids,
+                style_vector,
+                options.sdp_ratio,
+                options.length_scale,
+                0.677,
+                0.8,
+            )?
+        };
+        tts_util::array_to_vec(audio_array)
+    }
 }

 /// Synthesize options
--- a/crates/sbv2_core/src/tts_util.rs
+++ b/crates/sbv2_core/src/tts_util.rs
@@ -1,10 +1,22 @@
 use std::io::Cursor;

 use crate::error::Result;
+use crate::jtalk::JTalkProcess;
+use crate::mora::MORA_KATA_TO_MORA_PHONEMES;
+use crate::norm::PUNCTUATIONS;
 use crate::{jtalk, nlp, norm, tokenizer, utils};
 use hound::{SampleFormat, WavSpec, WavWriter};
 use ndarray::{concatenate, s, Array, Array1, Array2, Array3, Axis};
 use tokenizers::Tokenizer;
+
+/// Prepares `text` for grapheme-to-phoneme conversion.
+///
+/// The text is passed through `JTalk::num2word` and `norm::normalize_text`, then handed
+/// to `JTalk::process_text`. Returns the normalized text together with the resulting
+/// [`JTalkProcess`].
+///
+/// # Errors
+///
+/// Propagates any error from `num2word` or `process_text`.
+pub fn preprocess_parse_text(text: &str, jtalk: &jtalk::JTalk) -> Result<(String, JTalkProcess)> {
+    let normalized = norm::normalize_text(&jtalk.num2word(text)?);
+    let jtalk_process = jtalk.process_text(&normalized)?;
+    Ok((normalized, jtalk_process))
+}
+
+
 /// Parse text and return the input for synthesize
 ///
 /// # Note
@@ -21,13 +33,9 @@ pub async fn parse_text(
        Box<dyn std::future::Future<Output = Result<ndarray::Array2<f32>>>>,
    >,
 ) -> Result<(Array2<f32>, Array1<i64>, Array1<i64>, Array1<i64>)> {
-    let text = jtalk.num2word(text)?;
-    let normalized_text = norm::normalize_text(&text);
-
-    let process = jtalk.process_text(&normalized_text)?;
+    let (normalized_text, process) = preprocess_parse_text(text, jtalk)?;
    let (phones, tones, mut word2ph) = process.g2p()?;
    let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones);
-
    let phones = utils::intersperse(&phones, 0);
    let tones = utils::intersperse(&tones, 0);
    let lang_ids = utils::intersperse(&lang_ids, 0);
@@ -92,6 +100,7 @@ pub async fn parse_text(
 #[allow(clippy::type_complexity)]
 pub fn parse_text_blocking(
    text: &str,
+    given_tones: Option<Vec<i32>>,
    jtalk: &jtalk::JTalk,
    tokenizer: &Tokenizer,
    bert_predict: impl FnOnce(Vec<i64>, Vec<i64>) -> Result<ndarray::Array2<f32>>,
@@ -100,7 +109,10 @@ pub fn parse_text_blocking(
    let normalized_text = norm::normalize_text(&text);

    let process = jtalk.process_text(&normalized_text)?;
-    let (phones, tones, mut word2ph) = process.g2p()?;
+    let (phones, mut tones, mut word2ph) = process.g2p()?;
+    if let Some(given_tones) = given_tones {
+        tones = given_tones;
+    }
    let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones);

    let phones = utils::intersperse(&phones, 0);
@@ -178,3 +190,23 @@ pub fn array_to_vec(audio_array: Array3<f32>) -> Result<Vec<u8>> {
    writer.finalize()?;
    Ok(cursor.into_inner())
 }
+
+/// Expands `(katakana mora, tone)` pairs into `(phoneme, tone)` pairs.
+///
+/// The result is wrapped in `("_", 0)` entries at both ends. Punctuation is passed
+/// through with its tone forced to `0`; every other mora is split into its optional
+/// consonant and its vowel, both carrying the mora's tone.
+///
+/// # Panics
+///
+/// Panics if a non-punctuation mora has no entry in `MORA_KATA_TO_MORA_PHONEMES`.
+pub fn kata_tone2phone_tone(kata_tone: Vec<(String, i32)>) -> Vec<(String, i32)> {
+    let mut phone_tones = vec![("_".to_string(), 0)];
+    for (mora, tone) in kata_tone {
+        if PUNCTUATIONS.contains(&mora.as_str()) {
+            phone_tones.push((mora, 0));
+            continue;
+        }
+        let (consonant, vowel) = MORA_KATA_TO_MORA_PHONEMES.get(&mora).unwrap();
+        // Consonant first (when present), then the vowel.
+        phone_tones.extend(
+            consonant
+                .iter()
+                .chain(std::iter::once(vowel))
+                .map(|phoneme| (phoneme.to_string(), tone)),
+        );
+    }
+    phone_tones.push(("_".to_string(), 0));
+    phone_tones
+}
--- a/crates/sbv2_editor/Cargo.toml
+++ b/crates/sbv2_editor/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "sbv2_editor"
+version.workspace = true
+edition.workspace = true
+description.workspace = true
+license.workspace = true
+readme.workspace = true
+repository.workspace = true
+documentation.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+axum = "0.8.1"
+dotenvy.workspace = true
+env_logger.workspace = true
+log = "0.4.27"
+sbv2_core = { version = "0.2.0-alpha6", path = "../sbv2_core", features = ["aivmx"] }
+serde = { version = "1.0.219", features = ["derive"] }
+tokio = { version = "1.44.1", features = ["full"] }
--- a/crates/sbv2_editor/README.md
+++ b/crates/sbv2_editor/README.md
@@ -0,0 +1,2 @@
+# sbv2-editor
+sbv2 のエディター向け API サーバーです(VOICEVOX 互換化は見送り、独自のエディター API を提供します)。
--- a/crates/sbv2_editor/query2.json
+++ b/crates/sbv2_editor/query2.json
@@ -0,0 +1,226 @@
+{
+    "accent_phrases": [
+        {
+            "moras": [
+                {
+                    "text": "コ",
+                    "consonant": "k",
+                    "consonant_length": 0.10002632439136505,
+                    "vowel": "o",
+                    "vowel_length": 0.15740256011486053,
+                    "pitch": 5.749961853027344
+                },
+                {
+                    "text": "ン",
+                    "consonant": null,
+                    "consonant_length": null,
+                    "vowel": "N",
+                    "vowel_length": 0.08265873789787292,
+                    "pitch": 5.89122200012207
+                },
+                {
+                    "text": "ニ",
+                    "consonant": "n",
+                    "consonant_length": 0.03657080978155136,
+                    "vowel": "i",
+                    "vowel_length": 0.1175866425037384,
+                    "pitch": 5.969866752624512
+                },
+                {
+                    "text": "チ",
+                    "consonant": "ch",
+                    "consonant_length": 0.09005842357873917,
+                    "vowel": "i",
+                    "vowel_length": 0.08666137605905533,
+                    "pitch": 5.958892822265625
+                },
+                {
+                    "text": "ワ",
+                    "consonant": "w",
+                    "consonant_length": 0.07833231985569,
+                    "vowel": "a",
+                    "vowel_length": 0.21250136196613312,
+                    "pitch": 5.949411392211914
+                }
+            ],
+            "accent": 5,
+            "pause_mora": {
+                "text": "、",
+                "consonant": null,
+                "consonant_length": null,
+                "vowel": "pau",
+                "vowel_length": 0.4723339378833771,
+                "pitch": 0.0
+            },
+            "is_interrogative": false
+        },
+        {
+            "moras": [
+                {
+                    "text": "オ",
+                    "consonant": null,
+                    "consonant_length": null,
+                    "vowel": "o",
+                    "vowel_length": 0.22004225850105286,
+                    "pitch": 5.6870927810668945
+                },
+                {
+                    "text": "ン",
+                    "consonant": null,
+                    "consonant_length": null,
+                    "vowel": "N",
+                    "vowel_length": 0.09161105751991272,
+                    "pitch": 5.93472957611084
+                },
+                {
+                    "text": "セ",
+                    "consonant": "s",
+                    "consonant_length": 0.08924821764230728,
+                    "vowel": "e",
+                    "vowel_length": 0.14142127335071564,
+                    "pitch": 6.121850490570068
+                },
+                {
+                    "text": "エ",
+                    "consonant": null,
+                    "consonant_length": null,
+                    "vowel": "e",
+                    "vowel_length": 0.10636933892965317,
+                    "pitch": 6.157896041870117
+                },
+                {
+                    "text": "ゴ",
+                    "consonant": "g",
+                    "consonant_length": 0.07600915431976318,
+                    "vowel": "o",
+                    "vowel_length": 0.09598273783922195,
+                    "pitch": 6.188933849334717
+                },
+                {
+                    "text": "オ",
+                    "consonant": null,
+                    "consonant_length": null,
+                    "vowel": "o",
+                    "vowel_length": 0.1079121008515358,
+                    "pitch": 6.235202789306641
+                },
+                {
+                    "text": "セ",
+                    "consonant": "s",
+                    "consonant_length": 0.09591838717460632,
+                    "vowel": "e",
+                    "vowel_length": 0.10286372154951096,
+                    "pitch": 6.153214454650879
+                },
+                {
+                    "text": "エ",
+                    "consonant": null,
+                    "consonant_length": null,
+                    "vowel": "e",
+                    "vowel_length": 0.08992656320333481,
+                    "pitch": 6.02571439743042
+                },
+                {
+                    "text": "ノ",
+                    "consonant": "n",
+                    "consonant_length": 0.05660202354192734,
+                    "vowel": "o",
+                    "vowel_length": 0.09676017612218857,
+                    "pitch": 5.711844444274902
+                }
+            ],
+            "accent": 5,
+            "pause_mora": null,
+            "is_interrogative": false
+        },
+        {
+            "moras": [
+                {
+                    "text": "セ",
+                    "consonant": "s",
+                    "consonant_length": 0.07805486768484116,
+                    "vowel": "e",
+                    "vowel_length": 0.09617523103952408,
+                    "pitch": 5.774399280548096
+                },
+                {
+                    "text": "カ",
+                    "consonant": "k",
+                    "consonant_length": 0.06712044775485992,
+                    "vowel": "a",
+                    "vowel_length": 0.148829385638237,
+                    "pitch": 6.063965797424316
+                },
+                {
+                    "text": "イ",
+                    "consonant": null,
+                    "consonant_length": null,
+                    "vowel": "i",
+                    "vowel_length": 0.11061104387044907,
+                    "pitch": 6.040698051452637
+                },
+                {
+                    "text": "エ",
+                    "consonant": null,
+                    "consonant_length": null,
+                    "vowel": "e",
+                    "vowel_length": 0.13046696782112122,
+                    "pitch": 5.806027889251709
+                }
+            ],
+            "accent": 1,
+            "pause_mora": null,
+            "is_interrogative": false
+        },
+        {
+            "moras": [
+                {
+                    "text": "ヨ",
+                    "consonant": "y",
+                    "consonant_length": 0.07194744795560837,
+                    "vowel": "o",
+                    "vowel_length": 0.08622600883245468,
+                    "pitch": 5.694094657897949
+                },
+                {
+                    "text": "オ",
+                    "consonant": null,
+                    "consonant_length": null,
+                    "vowel": "o",
+                    "vowel_length": 0.10635452717542648,
+                    "pitch": 5.787222385406494
+                },
+                {
+                    "text": "コ",
+                    "consonant": "k",
+                    "consonant_length": 0.07077334076166153,
+                    "vowel": "o",
+                    "vowel_length": 0.09248624742031097,
+                    "pitch": 5.793357849121094
+                },
+                {
+                    "text": "ソ",
+                    "consonant": "s",
+                    "consonant_length": 0.08705667406320572,
+                    "vowel": "o",
+                    "vowel_length": 0.2238258570432663,
+                    "pitch": 5.643765449523926
+                }
+            ],
+            "accent": 1,
+            "pause_mora": null,
+            "is_interrogative": false
+        }
+    ],
+    "speedScale": 1.0,
+    "pitchScale": 0.0,
+    "intonationScale": 1.0,
+    "volumeScale": 1.0,
+    "prePhonemeLength": 0.1,
+    "postPhonemeLength": 0.1,
+    "pauseLength": null,
+    "pauseLengthScale": 1.0,
+    "outputSamplingRate": 24000,
+    "outputStereo": false,
+    "kana": "コンニチワ'、オンセエゴ'オセエノ/セ'カイエ/ヨ'オコソ"
+}
--- a/crates/sbv2_editor/src/error.rs
+++ b/crates/sbv2_editor/src/error.rs
@@ -0,0 +1,27 @@
+use axum::{
+    http::StatusCode,
+    response::{IntoResponse, Response},
+};
+
+/// Result type returned by the HTTP handlers; the error side is [`AppError`].
+pub type AppResult<T> = std::result::Result<T, AppError>;
+
+/// Wrapper around `anyhow::Error` that can be turned into an HTTP response.
+pub struct AppError(anyhow::Error);
+
+impl IntoResponse for AppError {
+    /// Renders the error as a `500 Internal Server Error` whose body contains the
+    /// error's display text.
+    fn into_response(self) -> Response {
+        let body = format!("Something went wrong: {}", self.0);
+        (StatusCode::INTERNAL_SERVER_ERROR, body).into_response()
+    }
+}
+
+/// Lets handlers use `?` on any error that converts into `anyhow::Error`.
+impl<E: Into<anyhow::Error>> From<E> for AppError {
+    fn from(source: E) -> Self {
+        AppError(source.into())
+    }
+}
--- a/crates/sbv2_editor/src/main.rs
+++ b/crates/sbv2_editor/src/main.rs
@@ -0,0 +1,197 @@
+use axum::extract::State;
+use axum::{
+    extract::Query,
+    http::header::CONTENT_TYPE,
+    response::IntoResponse,
+    routing::{get, post},
+    Json, Router,
+};
+use sbv2_core::tts_util::kata_tone2phone_tone;
+use sbv2_core::{
+    tts::{SynthesizeOptions, TTSModelHolder},
+    tts_util::preprocess_parse_text,
+};
+use serde::{Deserialize, Serialize};
+use tokio::{fs, net::TcpListener, sync::Mutex};
+
+use std::env;
+use std::sync::Arc;
+
+use error::AppResult;
+
+mod error;
+
+/// Query-string parameters accepted by `create_audio_query`.
+#[derive(Deserialize)]
+struct RequestCreateAudioQuery {
+    /// Raw input text to analyze.
+    text: String,
+}
+
+/// One element of an audio query: a kana string and the tone assigned to it.
+///
+/// Returned by `create_audio_query` and accepted back by `synthesis`.
+#[derive(Serialize, Deserialize)]
+struct AudioQuery {
+    kana: String,
+    tone: i32,
+}
+
+/// JSON body returned by `create_audio_query`.
+#[derive(Serialize)]
+struct ResponseCreateAudioQuery {
+    /// Kana/tone pairs derived from the input text.
+    audio_query: Vec<AudioQuery>,
+    /// The text as returned by `preprocess_parse_text` (i.e. after normalization).
+    text: String,
+}
+
+/// Handler for `GET /audio_query`: analyzes `text` and returns its kana/tone pairs
+/// together with the preprocessed text.
+async fn create_audio_query(
+    State(state): State<AppState>,
+    Query(request): Query<RequestCreateAudioQuery>,
+) -> AppResult<impl IntoResponse> {
+    // Hold the model lock only while the text is being preprocessed.
+    let (text, process) = {
+        let holder = state.tts_model.lock().await;
+        preprocess_parse_text(&request.text, &holder.jtalk)?
+    };
+    let audio_query: Vec<AudioQuery> = process
+        .g2kana_tone()?
+        .into_iter()
+        .map(|(kana, tone)| AudioQuery { kana, tone })
+        .collect();
+    Ok(Json(ResponseCreateAudioQuery { audio_query, text }))
+}
+
+/// JSON body accepted by `synthesis`.
+#[derive(Deserialize)]
+pub struct RequestSynthesis {
+    /// Text to synthesize.
+    text: String,
+    speaker_id: i64,
+    sdp_ratio: f32,
+    length_scale: f32,
+    style_id: i32,
+    /// Kana/tone pairs whose tones replace the ones derived from `text`.
+    audio_query: Vec<AudioQuery>,
+    /// Identifier of the model to synthesize with.
+    ident: String,
+}
+
+/// Handler for `POST /synthesis`: synthesizes `text` using the tones taken from the
+/// submitted audio query and responds with `audio/wav` content.
+async fn synthesis(
+    State(state): State<AppState>,
+    Json(request): Json<RequestSynthesis>,
+) -> AppResult<impl IntoResponse> {
+    let kana_tones: Vec<(String, i32)> = request
+        .audio_query
+        .iter()
+        .map(|entry| (entry.kana.clone(), entry.tone))
+        .collect();
+    // Only the per-phoneme tones are needed; the phonemes are recomputed from the text.
+    let tones: Vec<i32> = kata_tone2phone_tone(kana_tones)
+        .into_iter()
+        .map(|(_, tone)| tone)
+        .collect();
+    let options = SynthesizeOptions {
+        sdp_ratio: request.sdp_ratio,
+        length_scale: request.length_scale,
+        ..Default::default()
+    };
+    // NOTE(review): synthesis runs synchronously inside the async handler while the lock
+    // is held — confirm this is acceptable for the expected load.
+    let mut holder = state.tts_model.lock().await;
+    let buffer = holder.easy_synthesize_neo(
+        &request.ident,
+        &request.text,
+        Some(tones),
+        request.style_id,
+        request.speaker_id,
+        options,
+    )?;
+    drop(holder);
+    Ok(([(CONTENT_TYPE, "audio/wav")], buffer))
+}
+
+/// State shared by all handlers: the model holder behind an async mutex.
+#[derive(Clone)]
+struct AppState {
+    tts_model: Arc<Mutex<TTSModelHolder>>,
+}
+
+impl AppState {
+    /// Builds the shared state and loads every model found in the models directory.
+    ///
+    /// Environment variables read:
+    /// - `BERT_MODEL_PATH`, `TOKENIZER_PATH`: files passed to `TTSModelHolder::new` (required).
+    /// - `HOLDER_MAX_LOADED_MODElS`: optional; ignored when unset or unparsable.
+    /// - `MODELS_PATH`: directory to scan, defaults to `models`.
+    ///
+    /// The directory may contain `model_<name>.onnx` files (each paired with
+    /// `style_vectors_<name>.json`), `<name>.sbv2` files and `<name>.aivmx` files.
+    /// A model that fails to load is logged and skipped; it does not abort start-up.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if a required environment variable is missing, if the BERT model
+    /// or tokenizer cannot be read, if `TTSModelHolder::new` fails, or if the models
+    /// directory cannot be opened.
+    pub async fn new() -> anyhow::Result<Self> {
+        let mut tts_model = TTSModelHolder::new(
+            &fs::read(env::var("BERT_MODEL_PATH")?).await?,
+            &fs::read(env::var("TOKENIZER_PATH")?).await?,
+            // NOTE(review): the lowercase `l` in `MODElS` looks like a typo, but renaming the
+            // variable would break existing configurations — confirm before changing it.
+            env::var("HOLDER_MAX_LOADED_MODElS")
+                .ok()
+                .and_then(|x| x.parse().ok()),
+        )?;
+        let models = env::var("MODELS_PATH").unwrap_or_else(|_| "models".to_string());
+        let mut f = fs::read_dir(&models).await?;
+        let mut entries = vec![];
+        // Best effort: scanning stops at the first directory entry that cannot be read.
+        while let Ok(Some(e)) = f.next_entry().await {
+            let name = e.file_name().to_string_lossy().to_string();
+            // `strip_prefix`/`strip_suffix` work on character boundaries, so model names
+            // containing non-ASCII characters no longer cause an out-of-range slice panic.
+            if let Some(entry) = name
+                .strip_prefix("model_")
+                .and_then(|rest| rest.strip_suffix(".onnx"))
+            {
+                // ONNX models are loaded after the scan, together with their style vectors.
+                entries.push(entry.to_string());
+            } else if let Some(entry) = name.strip_suffix(".sbv2") {
+                log::info!("Try loading: {entry}");
+                let sbv2_bytes = match fs::read(format!("{models}/{entry}.sbv2")).await {
+                    Ok(b) => b,
+                    Err(e) => {
+                        log::warn!("Error loading sbv2_bytes from file {entry}: {e}");
+                        continue;
+                    }
+                };
+                // Report success only when the load actually succeeded.
+                match tts_model.load_sbv2file(entry, sbv2_bytes) {
+                    Ok(_) => log::info!("Loaded: {entry}"),
+                    Err(e) => log::warn!("Error loading {entry}: {e}"),
+                }
+            } else if let Some(entry) = name.strip_suffix(".aivmx") {
+                log::info!("Try loading: {entry}");
+                let aivmx_bytes = match fs::read(format!("{models}/{entry}.aivmx")).await {
+                    Ok(b) => b,
+                    Err(e) => {
+                        log::warn!("Error loading aivmx bytes from file {entry}: {e}");
+                        continue;
+                    }
+                };
+                match tts_model.load_aivmx(entry, aivmx_bytes) {
+                    Ok(_) => log::info!("Loaded: {entry}"),
+                    Err(e) => log::error!("Error loading {entry}: {e}"),
+                }
+            }
+        }
+        for entry in entries {
+            log::info!("Try loading: {entry}");
+            let style_vectors_bytes =
+                match fs::read(format!("{models}/style_vectors_{entry}.json")).await {
+                    Ok(b) => b,
+                    Err(e) => {
+                        log::warn!("Error loading style_vectors_bytes from file {entry}: {e}");
+                        continue;
+                    }
+                };
+            let vits2_bytes = match fs::read(format!("{models}/model_{entry}.onnx")).await {
+                Ok(b) => b,
+                Err(e) => {
+                    log::warn!("Error loading vits2_bytes from file {entry}: {e}");
+                    continue;
+                }
+            };
+            match tts_model.load(&entry, style_vectors_bytes, vits2_bytes) {
+                Ok(_) => log::info!("Loaded: {entry}"),
+                Err(e) => log::warn!("Error loading {entry}: {e}"),
+            }
+        }
+        Ok(Self {
+            tts_model: Arc::new(Mutex::new(tts_model)),
+        })
+    }
+}
+
+/// Entry point: loads `.env` (overriding existing variables), initializes logging,
+/// builds the application state and serves the API on `0.0.0.0:8080`.
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    dotenvy::dotenv_override().ok();
+    env_logger::init();
+
+    let state = AppState::new().await?;
+    let router = Router::new()
+        .route("/", get(|| async { "Hello, world!" }))
+        .route("/audio_query", get(create_audio_query))
+        .route("/synthesis", post(synthesis))
+        .with_state(state);
+
+    let listener = TcpListener::bind("0.0.0.0:8080").await?;
+    axum::serve(listener, router).await?;
+    Ok(())
+}
--- a/test.py
+++ b/test.py
@@ -0,0 +1,19 @@
+import requests
+
+BASE_URL = "http://localhost:8080"
+# Fail instead of hanging forever if the server does not answer.
+TIMEOUT_SECONDS = 120
+
+# Step 1: ask the server for the kana/tone pairs of the input text.
+query_response = requests.get(
+    f"{BASE_URL}/audio_query",
+    params={"text": "こんにちは、今日はいい天気ですね。"},
+    timeout=TIMEOUT_SECONDS,
+)
+# Stop on an HTTP error rather than trying to parse an error message as JSON.
+query_response.raise_for_status()
+query = query_response.json()
+print(query)
+
+# Step 2: synthesize audio from the text and the (unmodified) audio query.
+synthesis_response = requests.post(
+    f"{BASE_URL}/synthesis",
+    json={
+        "text": query["text"],
+        "ident": "tsukuyomi",
+        "speaker_id": 0,
+        "style_id": 0,
+        "sdp_ratio": 0.5,
+        "length_scale": 0.5,
+        "audio_query": query["audio_query"],
+    },
+    timeout=TIMEOUT_SECONDS,
+)
+# Without this check a 500 error body would be written to test.wav.
+synthesis_response.raise_for_status()
+with open("test.wav", "wb") as f:
+    f.write(synthesis_response.content)
Author	SHA1	Message	Date
tuna2134@コマリン親衛隊	f081b2ed22	Merge pull request #198 from tuna2134/voicevox Editor APIを追加	2025-03-31 23:56:25 +09:00
Masato Kikuchi	103eb51ca8	delete	2025-03-31 23:39:44 +09:00
Masato Kikuchi	01541ff381	delete unimport	2025-03-31 23:36:10 +09:00
Masato Kikuchi	70c2341afd	format	2025-03-31 23:35:51 +09:00
Masato Kikuchi	a5d783bd65	fix: bug	2025-03-31 23:35:39 +09:00
Masato Kikuchi	633dfc305e	delete mut	2025-03-31 23:04:23 +09:00
Masato Kikuchi	53d7daf11a	fix	2025-03-31 23:03:30 +09:00
Masato Kikuchi	5abfe732e4	fix bug	2025-03-31 22:45:55 +09:00
tuna2134@コマリン親衛隊	48aef6cef4	tts.rs を更新	2025-03-29 11:02:23 +09:00
tuna2134@コマリン親衛隊	64fc74eee6	fix: bug	2025-03-29 10:58:24 +09:00
Masato Kikuchi	6e01103c5d	format	2025-03-29 10:50:40 +09:00
Masato Kikuchi	00e95cd77c	feat: synthesis	2025-03-29 10:50:30 +09:00
Masato Kikuchi	01f2aaa406	no voicevox	2025-03-28 20:14:51 +09:00
Masato Kikuchi	3785faf81e	fix	2025-03-28 20:08:07 +09:00
Masato Kikuchi	70e16f95ad	fix: voicevox化は難しいので、独自のエディター開発をする。	2025-03-28 20:06:00 +09:00
Masato Kikuchi	a67df43fc7	fix	2025-03-27 14:42:43 +09:00
Masato Kikuchi	472d1c600f	fix: add route	2025-03-27 13:59:00 +09:00
Masato Kikuchi	acf94a1283	format	2025-03-27 13:53:52 +09:00
Masato Kikuchi	dd5c536f39	feat: g2kana_tone	2025-03-27 13:53:27 +09:00
Masato Kikuchi	07637f587d	fix: type	2025-03-27 13:23:53 +09:00
Masato Kikuchi	e8dbf956e1	fix: forget to give return	2025-03-27 13:21:07 +09:00
Masato Kikuchi	2687af1a9b	clippy	2025-03-27 13:18:22 +09:00
Masato Kikuchi	e915e2bc84	feat: phone_tone_to_kana	2025-03-27 13:17:20 +09:00
Masato Kikuchi	22ed557395	oh	2025-03-27 09:59:08 +09:00
Masato Kikuchi	b8f0477318	feat: audio query request	2025-03-26 16:30:31 +09:00
Masato Kikuchi	f4de3e15ae	initial commit: voicevox	2025-03-26 16:14:29 +09:00
Masato Kikuchi	fc944b9d33	split the code for support voicevox	2025-03-26 15:14:22 +09:00