This commit is contained in:
tuna2134
2024-09-09 15:31:25 +00:00
parent 30c2bd1e77
commit a5138fc5b6
3 changed files with 51 additions and 19 deletions

View File

@@ -1,8 +1,8 @@
pub mod bert;
pub mod error;
pub mod mora;
pub mod norm;
pub mod text;
pub mod mora;
pub fn add(left: usize, right: usize) -> usize {
left + right

View File

@@ -1,5 +1,5 @@
use serde::{Deserialize, Serialize};
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
#[derive(Debug, Serialize, Deserialize)]
@@ -25,10 +25,16 @@ static MORA_LIST_ADDITIONAL: Lazy<Vec<Mora>> = Lazy::new(|| {
data.additional
});
// Lazily built lookup table: `mora.mora` -> (`mora.consonant`, `mora.vowel`).
// Entries come from `MORA_LIST_MINIMUM` first, then `MORA_LIST_ADDITIONAL`, so a key present
// in both lists ends up with the value from the additional list.
// NOTE(review): the same static is defined again directly below with identical logic (only the
// line wrapping differs); this copy looks like the pre-format side of a diff — keep only one.
pub static MORA_KATA_TO_MORA_PHONEMES: Lazy<HashMap<String, (Option<String>, String)>> = Lazy::new(|| {
let mut map = HashMap::new();
for mora in MORA_LIST_MINIMUM.iter().chain(MORA_LIST_ADDITIONAL.iter()) {
map.insert(mora.mora.clone(), (mora.consonant.clone(), mora.vowel.clone()));
}
map
});
/// Lookup table from a mora string to its `(consonant, vowel)` phoneme pair.
///
/// The table is built on first access by walking `MORA_LIST_MINIMUM` followed by
/// `MORA_LIST_ADDITIONAL`. When the same mora appears more than once, the entry seen last
/// replaces the earlier one. The consonant is optional; the vowel is always present.
pub static MORA_KATA_TO_MORA_PHONEMES: Lazy<HashMap<String, (Option<String>, String)>> =
    Lazy::new(|| {
        MORA_LIST_MINIMUM
            .iter()
            .chain(MORA_LIST_ADDITIONAL.iter())
            .map(|entry| {
                let phonemes = (entry.consonant.clone(), entry.vowel.clone());
                (entry.mora.clone(), phonemes)
            })
            .collect()
    });
/// Phoneme symbols that are treated as vowels: `a`, `i`, `u`, `e`, `o` and `N`.
// NOTE(review): `N` is presumably the moraic nasal — confirm against the phoneme inventory.
pub const VOWELS: [&str; 6] = ["a", "i", "u", "e", "o", "N"];

View File

@@ -1,6 +1,6 @@
use crate::error::{Error, Result};
use crate::mora::{MORA_KATA_TO_MORA_PHONEMES, VOWELS};
use crate::norm::{replace_punctuation, PUNCTUATIONS};
use crate::mora::MORA_KATA_TO_MORA_PHONEMES;
use jpreprocess::*;
use once_cell::sync::Lazy;
use regex::Regex;
@@ -113,23 +113,44 @@ impl JTalkProcess {
/// Runs the grapheme-to-phoneme steps for this input and prints the intermediate results.
///
/// The method converts the text into parallel text/katakana sequences, turns every katakana
/// element into a phoneme list, post-processes those lists with `handle_long`, and dumps the
/// sequences to stdout with `{:?}`. Nothing is returned to the caller yet beyond success.
///
/// # Errors
///
/// Returns the error from `g2phone_tone_wo_punct`, `text_to_seq_kata`, or the first failing
/// `kata_to_phoneme_list` call. The latter used to be unwrapped and therefore panicked.
pub fn g2p(&self) -> Result<()> {
    // The value is not consumed yet; the call is kept so that its error still propagates.
    let phone_tone_list_wo_punct = self.g2phone_tone_wo_punct()?;
    let (seq_text, seq_kata) = self.text_to_seq_kata()?;
    println!("{:?}", seq_text);
    // Collecting into `Result` stops at the first element that cannot be converted and hands
    // that error to the caller instead of aborting the process.
    let phoneme_lists = seq_kata
        .iter()
        .map(|kata| JTalkProcess::kata_to_phoneme_list(kata.clone()))
        .collect::<Result<Vec<_>>>()?;
    let sep_phonemes = JTalkProcess::handle_long(phoneme_lists);
    println!("{:?}", sep_phonemes);
    println!("{:?}", seq_kata);
    Ok(())
}
// Post-processes per-element phoneme lists, filling placeholder entries from the phoneme that
// precedes them (the previous list's last phoneme, or the previous entry in the same list).
//
// NOTE(review): this span carries two interleaved versions of the function. The three lines
// directly below (immutable `sep_phonemes`, `enumerate()` loop), the later
// `if phonemes[0] == "" {` line and the `vec![]` line look like the pre-change side of a diff;
// the `mut sep_phonemes` / index-loop lines look like the replacement. Only one set can compile.
// NOTE(review): every `""` literal here may have lost a non-ASCII character (e.g. a long-vowel
// mark) during extraction — as written, the `else` branches assign the value already present.
fn handle_long(sep_phonemes: Vec<Vec<String>>) -> Vec<Vec<String>> {
for (i, phonemes) in sep_phonemes.iter().enumerate() {
if phonemes.len() == 0 {
// Replacement version: takes ownership, patches entries in place, returns the same vector.
fn handle_long(mut sep_phonemes: Vec<Vec<String>>) -> Vec<Vec<String>> {
for i in 0..sep_phonemes.len() {
// Empty lists have nothing to patch.
if sep_phonemes[i].len() == 0 {
continue;
}
if phonemes[0] == "" {
// A placeholder in first position is resolved against the previous list.
if sep_phonemes[i][0] == "" {
if i != 0 {
// NOTE(review): `.last().unwrap()` panics when the previous list is empty; the emptiness
// check above only covers the current list.
let prev_phoneme = sep_phonemes[i - 1].last().unwrap();
// Only a phoneme listed in `VOWELS` is copied forward; otherwise the placeholder stays.
if VOWELS.contains(&prev_phoneme.as_str()) {
sep_phonemes[i][0] = prev_phoneme.clone();
} else {
sep_phonemes[i][0] = "".to_string();
}
} else {
// First list: there is no previous phoneme to copy, so the placeholder stays.
sep_phonemes[i][0] = "".to_string();
}
}
// Remaining placeholders take the last character of the entry just before them.
if sep_phonemes[i].contains(&"".to_string()) {
for e in 0..sep_phonemes[i].len() {
if sep_phonemes[i][e] == "" {
// NOTE(review): when the placeholder is still at index 0 (both fallback branches above leave
// it there), `e - 1` underflows `usize` and this line panics — needs a guard for `e == 0`.
sep_phonemes[i][e] =
sep_phonemes[i][e - 1].chars().last().unwrap().to_string();
}
}
}
}
vec![]
sep_phonemes
}
fn kata_to_phoneme_list(mut text: String) -> Result<Vec<String>> {
@@ -157,8 +178,11 @@ impl JTalkProcess {
if PUNCTUATIONS.contains(&text.as_str()) {
return Ok(text.chars().map(|x| x.to_string()).collect());
}
if KATAKANA_PATTERN.is_match(&text) {
return Err(Error::ValueError(format!("Input must be katakana only: {}", text)));
if !KATAKANA_PATTERN.is_match(&text) {
return Err(Error::ValueError(format!(
"Input must be katakana only: {}",
text
)));
}
fn mora2phonemes(mora: &str) -> String {
@@ -182,7 +206,9 @@ impl JTalkProcess {
}
result
};
text = LONG_PATTERN.replace_all(&text, long_replacement).to_string();
text = LONG_PATTERN
.replace_all(&text, long_replacement)
.to_string();
return Ok(text.trim().split(" ").map(|x| x.to_string()).collect());
}