Compare commits

...

33 Commits

Author SHA1 Message Date
tuna2134@コマリン親衛隊
f081b2ed22 Merge pull request #198 from tuna2134/voicevox
Editor APIを追加
2025-03-31 23:56:25 +09:00
Masato Kikuchi
103eb51ca8 delete 2025-03-31 23:39:44 +09:00
Masato Kikuchi
01541ff381 delete unimport 2025-03-31 23:36:10 +09:00
Masato Kikuchi
70c2341afd format 2025-03-31 23:35:51 +09:00
Masato Kikuchi
a5d783bd65 fix: bug 2025-03-31 23:35:39 +09:00
Masato Kikuchi
633dfc305e delete mut 2025-03-31 23:04:23 +09:00
Masato Kikuchi
53d7daf11a fix 2025-03-31 23:03:30 +09:00
Masato Kikuchi
5abfe732e4 fix bug 2025-03-31 22:45:55 +09:00
tuna2134@コマリン親衛隊
48aef6cef4 tts.rs を更新 2025-03-29 11:02:23 +09:00
tuna2134@コマリン親衛隊
64fc74eee6 fix: bug 2025-03-29 10:58:24 +09:00
Masato Kikuchi
6e01103c5d format 2025-03-29 10:50:40 +09:00
Masato Kikuchi
00e95cd77c feat: synthesis 2025-03-29 10:50:30 +09:00
Masato Kikuchi
01f2aaa406 no voicevox 2025-03-28 20:14:51 +09:00
Masato Kikuchi
3785faf81e fix 2025-03-28 20:08:07 +09:00
Masato Kikuchi
70e16f95ad fix: voicevox化は難しいので、独自のエディター開発をする。 2025-03-28 20:06:00 +09:00
Masato Kikuchi
a67df43fc7 fix 2025-03-27 14:42:43 +09:00
Masato Kikuchi
472d1c600f fix: add route 2025-03-27 13:59:00 +09:00
Masato Kikuchi
acf94a1283 format 2025-03-27 13:53:52 +09:00
Masato Kikuchi
dd5c536f39 feat: g2kana_tone 2025-03-27 13:53:27 +09:00
Masato Kikuchi
07637f587d fix: type 2025-03-27 13:23:53 +09:00
Masato Kikuchi
e8dbf956e1 fix: forget to give return 2025-03-27 13:21:07 +09:00
Masato Kikuchi
2687af1a9b clippy 2025-03-27 13:18:22 +09:00
Masato Kikuchi
e915e2bc84 feat: phone_tone_to_kana 2025-03-27 13:17:20 +09:00
Masato Kikuchi
22ed557395 oh 2025-03-27 09:59:08 +09:00
Masato Kikuchi
b8f0477318 feat: audio query request 2025-03-26 16:30:31 +09:00
Masato Kikuchi
f4de3e15ae initial commit: voicevox 2025-03-26 16:14:29 +09:00
Masato Kikuchi
fc944b9d33 split the code for support voicevox 2025-03-26 15:14:22 +09:00
tuna2134@コマリン親衛隊
99a4b130af Merge pull request #189 from tuna2134/renovate/ureq-3.x-lockfile 2025-03-01 16:59:08 +09:00
renovate[bot]
d430a6cb51 chore(deps): update rust crate ureq to v3.0.8 2025-02-28 18:57:35 +00:00
tuna2134@コマリン親衛隊
61aae68d2d Merge pull request #188 from tuna2134/tuna2134-patch-1
Fix example
2025-02-26 15:03:18 +09:00
tuna2134@コマリン親衛隊
abb40d4d2d Fix example 2025-02-26 14:38:09 +09:00
tuna2134@コマリン親衛隊
adb699efe7 Merge pull request #186 from tuna2134/renovate/pyo3-0.x-lockfile 2025-02-26 05:35:36 +09:00
renovate[bot]
00fa8025d7 fix(deps): update rust crate pyo3 to v0.23.5 2025-02-25 13:49:05 +00:00
14 changed files with 715 additions and 34 deletions

58
Cargo.lock generated
View File

@@ -1425,9 +1425,9 @@ dependencies = [
[[package]]
name = "log"
version = "0.4.26"
version = "0.4.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e"
checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
[[package]]
name = "macro_rules_attribute"
@@ -1943,9 +1943,9 @@ dependencies = [
[[package]]
name = "pyo3"
version = "0.23.4"
version = "0.23.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57fe09249128b3173d092de9523eaa75136bf7ba85e0d69eca241c7939c933cc"
checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872"
dependencies = [
"anyhow",
"cfg-if",
@@ -1962,9 +1962,9 @@ dependencies = [
[[package]]
name = "pyo3-build-config"
version = "0.23.4"
version = "0.23.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1cd3927b5a78757a0d71aa9dff669f903b1eb64b54142a9bd9f757f8fde65fd7"
checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb"
dependencies = [
"once_cell",
"target-lexicon",
@@ -1972,9 +1972,9 @@ dependencies = [
[[package]]
name = "pyo3-ffi"
version = "0.23.4"
version = "0.23.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dab6bb2102bd8f991e7749f130a70d05dd557613e39ed2deeee8e9ca0c4d548d"
checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d"
dependencies = [
"libc",
"pyo3-build-config",
@@ -1982,9 +1982,9 @@ dependencies = [
[[package]]
name = "pyo3-macros"
version = "0.23.4"
version = "0.23.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91871864b353fd5ffcb3f91f2f703a22a9797c91b9ab497b1acac7b07ae509c7"
checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da"
dependencies = [
"proc-macro2",
"pyo3-macros-backend",
@@ -1994,9 +1994,9 @@ dependencies = [
[[package]]
name = "pyo3-macros-backend"
version = "0.23.4"
version = "0.23.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43abc3b80bc20f3facd86cd3c60beed58c3e2aa26213f3cda368de39c60a27e4"
checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028"
dependencies = [
"heck",
"proc-macro2",
@@ -2312,6 +2312,20 @@ dependencies = [
"zstd",
]
[[package]]
name = "sbv2_editor"
version = "0.2.0-alpha6"
dependencies = [
"anyhow",
"axum",
"dotenvy",
"env_logger",
"log",
"sbv2_core",
"serde",
"tokio",
]
[[package]]
name = "sbv2_wasm"
version = "0.2.0-alpha6"
@@ -2364,18 +2378,18 @@ dependencies = [
[[package]]
name = "serde"
version = "1.0.218"
version = "1.0.219"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60"
checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.218"
version = "1.0.219"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b"
checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
dependencies = [
"proc-macro2",
"quote",
@@ -2732,9 +2746,9 @@ dependencies = [
[[package]]
name = "tokio"
version = "1.43.0"
version = "1.44.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e"
checksum = "f382da615b842244d4b8738c82ed1275e6c5dd90c459a30941cd07080b06c91a"
dependencies = [
"backtrace",
"bytes",
@@ -2926,9 +2940,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "ureq"
version = "3.0.6"
version = "3.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca2e2dbdf4e95780e5d41804fab88b928a24585721018409eb429b1d29356bde"
checksum = "06f78313c985f2fba11100dd06d60dd402d0cabb458af4d94791b8e09c025323"
dependencies = [
"base64 0.22.1",
"der",
@@ -2948,9 +2962,9 @@ dependencies = [
[[package]]
name = "ureq-proto"
version = "0.3.0"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c51fe73e1d8c4e06bb2698286f7e7453c6fc90528d6d2e7fc36bb4e87fe09b1"
checksum = "64adb55464bad1ab1aa9229133d0d59d2f679180f4d15f0d9debe616f541f25e"
dependencies = [
"base64 0.22.1",
"http",

View File

@@ -1,6 +1,6 @@
[workspace]
resolver = "2"
members = ["./crates/sbv2_api", "./crates/sbv2_core", "./crates/sbv2_bindings", "./crates/sbv2_wasm"]
members = ["./crates/sbv2_api", "./crates/sbv2_core", "./crates/sbv2_bindings", "./crates/sbv2_wasm", "crates/sbv2_editor"]
[workspace.package]
version = "0.2.0-alpha6"

View File

@@ -1,5 +1,5 @@
use crate::error::{Error, Result};
use crate::mora::{MORA_KATA_TO_MORA_PHONEMES, VOWELS};
use crate::mora::{CONSONANTS, MORA_KATA_TO_MORA_PHONEMES, MORA_PHONEMES_TO_MORA_KATA, VOWELS};
use crate::norm::{replace_punctuation, PUNCTUATIONS};
use jpreprocess::{kind, DefaultTokenizer, JPreprocess, SystemDictionaryConfig, UserDictionary};
use once_cell::sync::Lazy;
@@ -76,6 +76,34 @@ static MORA_PATTERN: Lazy<Vec<String>> = Lazy::new(|| {
});
static LONG_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\w)(ー*)").unwrap());
fn phone_tone_to_kana(phones: Vec<String>, tones: Vec<i32>) -> Vec<(String, i32)> {
let phones = &phones[1..];
let tones = &tones[1..];
let mut results = Vec::new();
let mut current_mora = String::new();
for ((phone, next_phone), (&tone, &next_tone)) in phones
.iter()
.zip(phones.iter().skip(1))
.zip(tones.iter().zip(tones.iter().skip(1)))
{
if PUNCTUATIONS.contains(&phone.clone().as_str()) {
results.push((phone.to_string(), tone));
continue;
}
if CONSONANTS.contains(&phone.clone()) {
assert_eq!(current_mora, "");
assert_eq!(tone, next_tone);
current_mora = phone.to_string()
} else {
current_mora += phone;
let kana = MORA_PHONEMES_TO_MORA_KATA.get(&current_mora).unwrap();
results.push((kana.to_string(), tone));
current_mora = String::new();
}
}
results
}
pub struct JTalkProcess {
jpreprocess: Arc<JPreprocessType>,
parsed: Vec<String>,
@@ -165,6 +193,11 @@ impl JTalkProcess {
Ok((phones, tones, new_word2ph))
}
pub fn g2kana_tone(&self) -> Result<Vec<(String, i32)>> {
let (phones, tones, _) = self.g2p()?;
Ok(phone_tone_to_kana(phones, tones))
}
fn distribute_phone(n_phone: i32, n_word: i32) -> Vec<i32> {
let mut phones_per_word = vec![0; n_word as usize];
for _ in 0..n_phone {

View File

@@ -30,8 +30,7 @@ fn main_inner() -> anyhow::Result<()> {
}
}
let audio =
tts_holder.easy_synthesize(ident, &text, 0, 0, tts::SynthesizeOptions::default())?;
let audio = tts_holder.easy_synthesize(ident, text, 0, 0, tts::SynthesizeOptions::default())?;
fs::write("output.wav", audio)?;
Ok(())

View File

@@ -25,6 +25,21 @@ static MORA_LIST_ADDITIONAL: Lazy<Vec<Mora>> = Lazy::new(|| {
data.additional
});
pub static MORA_PHONEMES_TO_MORA_KATA: Lazy<HashMap<String, String>> = Lazy::new(|| {
let mut map = HashMap::new();
for mora in MORA_LIST_MINIMUM.iter() {
map.insert(
format!(
"{}{}",
mora.consonant.clone().unwrap_or("".to_string()),
mora.vowel
),
mora.mora.clone(),
);
}
map
});
pub static MORA_KATA_TO_MORA_PHONEMES: Lazy<HashMap<String, (Option<String>, String)>> =
Lazy::new(|| {
let mut map = HashMap::new();
@@ -37,4 +52,12 @@ pub static MORA_KATA_TO_MORA_PHONEMES: Lazy<HashMap<String, (Option<String>, Str
map
});
pub static CONSONANTS: Lazy<Vec<String>> = Lazy::new(|| {
let consonants = MORA_KATA_TO_MORA_PHONEMES
.values()
.filter_map(|(consonant, _)| consonant.clone())
.collect::<Vec<_>>();
consonants
});
pub const VOWELS: [&str; 6] = ["a", "i", "u", "e", "o", "N"];

View File

@@ -41,7 +41,7 @@ pub struct TTSModelHolder {
tokenizer: Tokenizer,
bert: Session,
models: Vec<TTSModel>,
jtalk: jtalk::JTalk,
pub jtalk: jtalk::JTalk,
max_loaded_models: Option<usize>,
}
@@ -205,6 +205,23 @@ impl TTSModelHolder {
) -> Result<(Array2<f32>, Array1<i64>, Array1<i64>, Array1<i64>)> {
crate::tts_util::parse_text_blocking(
text,
None,
&self.jtalk,
&self.tokenizer,
|token_ids, attention_masks| {
crate::bert::predict(&mut self.bert, token_ids, attention_masks)
},
)
}
pub fn parse_text_neo(
&mut self,
text: String,
given_tones: Option<Vec<i32>>,
) -> Result<(Array2<f32>, Array1<i64>, Array1<i64>, Array1<i64>)> {
crate::tts_util::parse_text_blocking(
&text,
given_tones,
&self.jtalk,
&self.tokenizer,
|token_ids, attention_masks| {
@@ -347,6 +364,79 @@ impl TTSModelHolder {
};
tts_util::array_to_vec(audio_array)
}
pub fn easy_synthesize_neo<I: Into<TTSIdent> + Copy>(
&mut self,
ident: I,
text: &str,
given_tones: Option<Vec<i32>>,
style_id: i32,
speaker_id: i64,
options: SynthesizeOptions,
) -> Result<Vec<u8>> {
self.find_and_load_model(ident)?;
let style_vector = self.get_style_vector(ident, style_id, options.style_weight)?;
let audio_array = if options.split_sentences {
let texts: Vec<&str> = text.split('\n').collect();
let mut audios = vec![];
for (i, t) in texts.iter().enumerate() {
if t.is_empty() {
continue;
}
let (bert_ori, phones, tones, lang_ids) =
self.parse_text_neo(t.to_string(), given_tones.clone())?;
let vits2 = self
.find_model(ident)?
.vits2
.as_mut()
.ok_or(Error::ModelNotFoundError(ident.into().to_string()))?;
let audio = model::synthesize(
vits2,
bert_ori.to_owned(),
phones,
Array1::from_vec(vec![speaker_id]),
tones,
lang_ids,
style_vector.clone(),
options.sdp_ratio,
options.length_scale,
0.677,
0.8,
)?;
audios.push(audio.clone());
if i != texts.len() - 1 {
audios.push(Array3::zeros((1, 1, 22050)));
}
}
concatenate(
Axis(2),
&audios.iter().map(|x| x.view()).collect::<Vec<_>>(),
)?
} else {
let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?;
let vits2 = self
.find_model(ident)?
.vits2
.as_mut()
.ok_or(Error::ModelNotFoundError(ident.into().to_string()))?;
model::synthesize(
vits2,
bert_ori.to_owned(),
phones,
Array1::from_vec(vec![speaker_id]),
tones,
lang_ids,
style_vector,
options.sdp_ratio,
options.length_scale,
0.677,
0.8,
)?
};
tts_util::array_to_vec(audio_array)
}
}
/// Synthesize options

View File

@@ -1,10 +1,22 @@
use std::io::Cursor;
use crate::error::Result;
use crate::jtalk::JTalkProcess;
use crate::mora::MORA_KATA_TO_MORA_PHONEMES;
use crate::norm::PUNCTUATIONS;
use crate::{jtalk, nlp, norm, tokenizer, utils};
use hound::{SampleFormat, WavSpec, WavWriter};
use ndarray::{concatenate, s, Array, Array1, Array2, Array3, Axis};
use tokenizers::Tokenizer;
pub fn preprocess_parse_text(text: &str, jtalk: &jtalk::JTalk) -> Result<(String, JTalkProcess)> {
let text = jtalk.num2word(text)?;
let normalized_text = norm::normalize_text(&text);
let process = jtalk.process_text(&normalized_text)?;
Ok((normalized_text, process))
}
/// Parse text and return the input for synthesize
///
/// # Note
@@ -21,13 +33,9 @@ pub async fn parse_text(
Box<dyn std::future::Future<Output = Result<ndarray::Array2<f32>>>>,
>,
) -> Result<(Array2<f32>, Array1<i64>, Array1<i64>, Array1<i64>)> {
let text = jtalk.num2word(text)?;
let normalized_text = norm::normalize_text(&text);
let process = jtalk.process_text(&normalized_text)?;
let (normalized_text, process) = preprocess_parse_text(text, jtalk)?;
let (phones, tones, mut word2ph) = process.g2p()?;
let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones);
let phones = utils::intersperse(&phones, 0);
let tones = utils::intersperse(&tones, 0);
let lang_ids = utils::intersperse(&lang_ids, 0);
@@ -92,6 +100,7 @@ pub async fn parse_text(
#[allow(clippy::type_complexity)]
pub fn parse_text_blocking(
text: &str,
given_tones: Option<Vec<i32>>,
jtalk: &jtalk::JTalk,
tokenizer: &Tokenizer,
bert_predict: impl FnOnce(Vec<i64>, Vec<i64>) -> Result<ndarray::Array2<f32>>,
@@ -100,7 +109,10 @@ pub fn parse_text_blocking(
let normalized_text = norm::normalize_text(&text);
let process = jtalk.process_text(&normalized_text)?;
let (phones, tones, mut word2ph) = process.g2p()?;
let (phones, mut tones, mut word2ph) = process.g2p()?;
if let Some(given_tones) = given_tones {
tones = given_tones;
}
let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones);
let phones = utils::intersperse(&phones, 0);
@@ -178,3 +190,23 @@ pub fn array_to_vec(audio_array: Array3<f32>) -> Result<Vec<u8>> {
writer.finalize()?;
Ok(cursor.into_inner())
}
pub fn kata_tone2phone_tone(kata_tone: Vec<(String, i32)>) -> Vec<(String, i32)> {
let mut results = vec![("_".to_string(), 0)];
for (mora, tone) in kata_tone {
if PUNCTUATIONS.contains(&mora.as_str()) {
results.push((mora, 0));
continue;
} else {
let (consonant, vowel) = MORA_KATA_TO_MORA_PHONEMES.get(&mora).unwrap();
if let Some(consonant) = consonant {
results.push((consonant.to_string(), tone));
results.push((vowel.to_string(), tone));
} else {
results.push((vowel.to_string(), tone));
}
}
}
results.push(("_".to_string(), 0));
results
}

View File

@@ -0,0 +1,19 @@
[package]
name = "sbv2_editor"
version.workspace = true
edition.workspace = true
description.workspace = true
license.workspace = true
readme.workspace = true
repository.workspace = true
documentation.workspace = true
[dependencies]
anyhow.workspace = true
axum = "0.8.1"
dotenvy.workspace = true
env_logger.workspace = true
log = "0.4.27"
sbv2_core = { version = "0.2.0-alpha6", path = "../sbv2_core", features = ["aivmx"] }
serde = { version = "1.0.219", features = ["derive"] }
tokio = { version = "1.44.1", features = ["full"] }

View File

@@ -0,0 +1,2 @@
# sbv2-voicevox
sbv2-apiをvoicevox化します。

View File

@@ -0,0 +1,226 @@
{
"accent_phrases": [
{
"moras": [
{
"text": "コ",
"consonant": "k",
"consonant_length": 0.10002632439136505,
"vowel": "o",
"vowel_length": 0.15740256011486053,
"pitch": 5.749961853027344
},
{
"text": "ン",
"consonant": null,
"consonant_length": null,
"vowel": "N",
"vowel_length": 0.08265873789787292,
"pitch": 5.89122200012207
},
{
"text": "ニ",
"consonant": "n",
"consonant_length": 0.03657080978155136,
"vowel": "i",
"vowel_length": 0.1175866425037384,
"pitch": 5.969866752624512
},
{
"text": "チ",
"consonant": "ch",
"consonant_length": 0.09005842357873917,
"vowel": "i",
"vowel_length": 0.08666137605905533,
"pitch": 5.958892822265625
},
{
"text": "ワ",
"consonant": "w",
"consonant_length": 0.07833231985569,
"vowel": "a",
"vowel_length": 0.21250136196613312,
"pitch": 5.949411392211914
}
],
"accent": 5,
"pause_mora": {
"text": "、",
"consonant": null,
"consonant_length": null,
"vowel": "pau",
"vowel_length": 0.4723339378833771,
"pitch": 0.0
},
"is_interrogative": false
},
{
"moras": [
{
"text": "オ",
"consonant": null,
"consonant_length": null,
"vowel": "o",
"vowel_length": 0.22004225850105286,
"pitch": 5.6870927810668945
},
{
"text": "ン",
"consonant": null,
"consonant_length": null,
"vowel": "N",
"vowel_length": 0.09161105751991272,
"pitch": 5.93472957611084
},
{
"text": "セ",
"consonant": "s",
"consonant_length": 0.08924821764230728,
"vowel": "e",
"vowel_length": 0.14142127335071564,
"pitch": 6.121850490570068
},
{
"text": "エ",
"consonant": null,
"consonant_length": null,
"vowel": "e",
"vowel_length": 0.10636933892965317,
"pitch": 6.157896041870117
},
{
"text": "ゴ",
"consonant": "g",
"consonant_length": 0.07600915431976318,
"vowel": "o",
"vowel_length": 0.09598273783922195,
"pitch": 6.188933849334717
},
{
"text": "オ",
"consonant": null,
"consonant_length": null,
"vowel": "o",
"vowel_length": 0.1079121008515358,
"pitch": 6.235202789306641
},
{
"text": "セ",
"consonant": "s",
"consonant_length": 0.09591838717460632,
"vowel": "e",
"vowel_length": 0.10286372154951096,
"pitch": 6.153214454650879
},
{
"text": "エ",
"consonant": null,
"consonant_length": null,
"vowel": "e",
"vowel_length": 0.08992656320333481,
"pitch": 6.02571439743042
},
{
"text": "",
"consonant": "n",
"consonant_length": 0.05660202354192734,
"vowel": "o",
"vowel_length": 0.09676017612218857,
"pitch": 5.711844444274902
}
],
"accent": 5,
"pause_mora": null,
"is_interrogative": false
},
{
"moras": [
{
"text": "セ",
"consonant": "s",
"consonant_length": 0.07805486768484116,
"vowel": "e",
"vowel_length": 0.09617523103952408,
"pitch": 5.774399280548096
},
{
"text": "カ",
"consonant": "k",
"consonant_length": 0.06712044775485992,
"vowel": "a",
"vowel_length": 0.148829385638237,
"pitch": 6.063965797424316
},
{
"text": "イ",
"consonant": null,
"consonant_length": null,
"vowel": "i",
"vowel_length": 0.11061104387044907,
"pitch": 6.040698051452637
},
{
"text": "エ",
"consonant": null,
"consonant_length": null,
"vowel": "e",
"vowel_length": 0.13046696782112122,
"pitch": 5.806027889251709
}
],
"accent": 1,
"pause_mora": null,
"is_interrogative": false
},
{
"moras": [
{
"text": "ヨ",
"consonant": "y",
"consonant_length": 0.07194744795560837,
"vowel": "o",
"vowel_length": 0.08622600883245468,
"pitch": 5.694094657897949
},
{
"text": "オ",
"consonant": null,
"consonant_length": null,
"vowel": "o",
"vowel_length": 0.10635452717542648,
"pitch": 5.787222385406494
},
{
"text": "コ",
"consonant": "k",
"consonant_length": 0.07077334076166153,
"vowel": "o",
"vowel_length": 0.09248624742031097,
"pitch": 5.793357849121094
},
{
"text": "ソ",
"consonant": "s",
"consonant_length": 0.08705667406320572,
"vowel": "o",
"vowel_length": 0.2238258570432663,
"pitch": 5.643765449523926
}
],
"accent": 1,
"pause_mora": null,
"is_interrogative": false
}
],
"speedScale": 1.0,
"pitchScale": 0.0,
"intonationScale": 1.0,
"volumeScale": 1.0,
"prePhonemeLength": 0.1,
"postPhonemeLength": 0.1,
"pauseLength": null,
"pauseLengthScale": 1.0,
"outputSamplingRate": 24000,
"outputStereo": false,
"kana": "コンニチワ'、オンセエゴ'オセエノ/セ'カイエ/ヨ'オコソ"
}

View File

@@ -0,0 +1,27 @@
use axum::{
http::StatusCode,
response::{IntoResponse, Response},
};
pub type AppResult<T> = std::result::Result<T, AppError>;
pub struct AppError(anyhow::Error);
impl IntoResponse for AppError {
fn into_response(self) -> Response {
(
StatusCode::INTERNAL_SERVER_ERROR,
format!("Something went wrong: {}", self.0),
)
.into_response()
}
}
impl<E> From<E> for AppError
where
E: Into<anyhow::Error>,
{
fn from(err: E) -> Self {
Self(err.into())
}
}

View File

@@ -0,0 +1,197 @@
use axum::extract::State;
use axum::{
extract::Query,
http::header::CONTENT_TYPE,
response::IntoResponse,
routing::{get, post},
Json, Router,
};
use sbv2_core::tts_util::kata_tone2phone_tone;
use sbv2_core::{
tts::{SynthesizeOptions, TTSModelHolder},
tts_util::preprocess_parse_text,
};
use serde::{Deserialize, Serialize};
use tokio::{fs, net::TcpListener, sync::Mutex};
use std::env;
use std::sync::Arc;
use error::AppResult;
mod error;
#[derive(Deserialize)]
struct RequestCreateAudioQuery {
text: String,
}
#[derive(Serialize, Deserialize)]
struct AudioQuery {
kana: String,
tone: i32,
}
#[derive(Serialize)]
struct ResponseCreateAudioQuery {
audio_query: Vec<AudioQuery>,
text: String,
}
async fn create_audio_query(
State(state): State<AppState>,
Query(request): Query<RequestCreateAudioQuery>,
) -> AppResult<impl IntoResponse> {
let (text, process) = {
let tts_model = state.tts_model.lock().await;
preprocess_parse_text(&request.text, &tts_model.jtalk)?
};
let kana_tone_list = process.g2kana_tone()?;
let audio_query = kana_tone_list
.iter()
.map(|(kana, tone)| AudioQuery {
kana: kana.clone(),
tone: *tone,
})
.collect::<Vec<_>>();
Ok(Json(ResponseCreateAudioQuery { audio_query, text }))
}
#[derive(Deserialize)]
pub struct RequestSynthesis {
text: String,
speaker_id: i64,
sdp_ratio: f32,
length_scale: f32,
style_id: i32,
audio_query: Vec<AudioQuery>,
ident: String,
}
async fn synthesis(
State(state): State<AppState>,
Json(request): Json<RequestSynthesis>,
) -> AppResult<impl IntoResponse> {
let phone_tone = request
.audio_query
.iter()
.map(|query| (query.kana.clone(), query.tone))
.collect::<Vec<_>>();
let phone_tone = kata_tone2phone_tone(phone_tone);
let tones = phone_tone.iter().map(|(_, tone)| *tone).collect::<Vec<_>>();
let buffer = {
let mut tts_model = state.tts_model.lock().await;
tts_model.easy_synthesize_neo(
&request.ident,
&request.text,
Some(tones),
request.style_id,
request.speaker_id,
SynthesizeOptions {
sdp_ratio: request.sdp_ratio,
length_scale: request.length_scale,
..Default::default()
},
)?
};
Ok(([(CONTENT_TYPE, "audio/wav")], buffer))
}
#[derive(Clone)]
struct AppState {
tts_model: Arc<Mutex<TTSModelHolder>>,
}
impl AppState {
pub async fn new() -> anyhow::Result<Self> {
let mut tts_model = TTSModelHolder::new(
&fs::read(env::var("BERT_MODEL_PATH")?).await?,
&fs::read(env::var("TOKENIZER_PATH")?).await?,
env::var("HOLDER_MAX_LOADED_MODElS")
.ok()
.and_then(|x| x.parse().ok()),
)?;
let models = env::var("MODELS_PATH").unwrap_or("models".to_string());
let mut f = fs::read_dir(&models).await?;
let mut entries = vec![];
while let Ok(Some(e)) = f.next_entry().await {
let name = e.file_name().to_string_lossy().to_string();
if name.ends_with(".onnx") && name.starts_with("model_") {
let name_len = name.len();
let name = name.chars();
entries.push(
name.collect::<Vec<_>>()[6..name_len - 5]
.iter()
.collect::<String>(),
);
} else if name.ends_with(".sbv2") {
let entry = &name[..name.len() - 5];
log::info!("Try loading: {entry}");
let sbv2_bytes = match fs::read(format!("{models}/{entry}.sbv2")).await {
Ok(b) => b,
Err(e) => {
log::warn!("Error loading sbv2_bytes from file {entry}: {e}");
continue;
}
};
if let Err(e) = tts_model.load_sbv2file(entry, sbv2_bytes) {
log::warn!("Error loading {entry}: {e}");
};
log::info!("Loaded: {entry}");
} else if name.ends_with(".aivmx") {
let entry = &name[..name.len() - 6];
log::info!("Try loading: {entry}");
let aivmx_bytes = match fs::read(format!("{models}/{entry}.aivmx")).await {
Ok(b) => b,
Err(e) => {
log::warn!("Error loading aivmx bytes from file {entry}: {e}");
continue;
}
};
if let Err(e) = tts_model.load_aivmx(entry, aivmx_bytes) {
log::error!("Error loading {entry}: {e}");
}
log::info!("Loaded: {entry}");
}
}
for entry in entries {
log::info!("Try loading: {entry}");
let style_vectors_bytes =
match fs::read(format!("{models}/style_vectors_{entry}.json")).await {
Ok(b) => b,
Err(e) => {
log::warn!("Error loading style_vectors_bytes from file {entry}: {e}");
continue;
}
};
let vits2_bytes = match fs::read(format!("{models}/model_{entry}.onnx")).await {
Ok(b) => b,
Err(e) => {
log::warn!("Error loading vits2_bytes from file {entry}: {e}");
continue;
}
};
if let Err(e) = tts_model.load(&entry, style_vectors_bytes, vits2_bytes) {
log::warn!("Error loading {entry}: {e}");
};
log::info!("Loaded: {entry}");
}
Ok(Self {
tts_model: Arc::new(Mutex::new(tts_model)),
})
}
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
dotenvy::dotenv_override().ok();
env_logger::init();
let app = Router::new()
.route("/", get(|| async { "Hello, world!" }))
.route("/audio_query", get(create_audio_query))
.route("/synthesis", post(synthesis))
.with_state(AppState::new().await?);
let listener = TcpListener::bind("0.0.0.0:8080").await?;
axum::serve(listener, app).await?;
Ok(())
}

View File

@@ -12,7 +12,7 @@ def main():
style_vector = model.get_style_vector("amitaro", 0, 1.0)
with open("output.wav", "wb") as f:
f.write(
model.synthesize("おはようございます。", "amitaro", style_vector, 0.0, 0.5)
model.synthesize("おはようございます。", "amitaro", 0, 0, 0.0, 0.5)
)

19
test.py Normal file
View File

@@ -0,0 +1,19 @@
import requests
data = (requests.get("http://localhost:8080/audio_query", params={
"text": "こんにちは、今日はいい天気ですね。",
})).json()
print(data)
data = (requests.post("http://localhost:8080/synthesis", json={
"text": data["text"],
"ident": "tsukuyomi",
"speaker_id": 0,
"style_id": 0,
"sdp_ratio": 0.5,
"length_scale": 0.5,
"audio_query": data["audio_query"],
})).content
with open("test.wav", "wb") as f:
f.write(data)