This commit is contained in:
tuna2134
2024-09-09 12:03:55 +00:00
parent fee041da12
commit 5d9bea6a2a
2 changed files with 30 additions and 14 deletions

View File

@@ -1,7 +1,6 @@
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use std::collections::HashMap; use std::collections::HashMap;
static REPLACE_MAP: Lazy<HashMap<&str, &str>> = Lazy::new(|| { static REPLACE_MAP: Lazy<HashMap<&str, &str>> = Lazy::new(|| {
let mut map = HashMap::new(); let mut map = HashMap::new();
map.insert("", ","); map.insert("", ",");
@@ -69,14 +68,15 @@ __PUNCTUATION_CLEANUP_PATTERN = re.compile(
+ "".join(PUNCTUATIONS) + r"]+", # fmt: skip + "".join(PUNCTUATIONS) + r"]+", # fmt: skip
) )
*/ */
pub static PUNCTUATIONS: [&str; 7] = ["!", "?", "…", ",", ".", "'", "-"];
static PUNCTUATION_CLEANUP_PATTERN: Lazy<regex::Regex> = Lazy::new(|| { static PUNCTUATION_CLEANUP_PATTERN: Lazy<regex::Regex> = Lazy::new(|| {
let pattern = ( let pattern = (r"[^\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FFF}\u{3400}-\u{4DBF}\u{3005}"
r"[^\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FFF}\u{3400}-\u{4DBF}\u{3005}".to_owned() .to_owned()
+ r"\u{0041}-\u{005A}\u{0061}-\u{007A}" + r"\u{0041}-\u{005A}\u{0061}-\u{007A}"
+ r"\u{FF21}-\u{FF3A}\u{FF41}-\u{FF5A}" + r"\u{FF21}-\u{FF3A}\u{FF41}-\u{FF5A}"
+ r"\u{0370}-\u{03FF}\u{1F00}-\u{1FFF}" + r"\u{0370}-\u{03FF}\u{1F00}-\u{1FFF}"
+ r"[!?\u{2026},.'-]+" + &PUNCTUATIONS.join("") + r"]+");
);
regex::Regex::new(&pattern).unwrap() regex::Regex::new(&pattern).unwrap()
}); });
@@ -84,5 +84,7 @@ pub fn replace_punctuation(mut text: String) -> String {
for (k, v) in REPLACE_MAP.iter() { for (k, v) in REPLACE_MAP.iter() {
text = text.replace(k, v); text = text.replace(k, v);
} }
text.to_string() PUNCTUATION_CLEANUP_PATTERN
} .replace_all(&text, "")
.to_string()
}

View File

@@ -1,4 +1,5 @@
use crate::error::{Error, Result}; use crate::error::{Error, Result};
use crate::norm::{replace_punctuation, PUNCTUATIONS};
use jpreprocess::*; use jpreprocess::*;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use regex::Regex; use regex::Regex;
@@ -106,16 +107,29 @@ impl JTalkProcess {
Ok(()) Ok(())
} }
fn text_to_seq_kata(&self) -> Result<()> { fn text_to_seq_kata(&self) -> Result<(Vec<String>, Vec<String>)> {
// let seq_kata: Vec<_> = vec![]; let mut seq_kata = vec![];
// let seq_text: Vec<_> = vec![]; let mut seq_text = vec![];
for parts in &self.parsed { for parts in &self.parsed {
let (string, mut pron) = self.parse_to_string_and_pron(parts.clone()); let (string, pron) = self.parse_to_string_and_pron(parts.clone());
println!("{} {}", string, pron); let mut yomi = pron.replace('', "");
pron = pron.replace('', ""); let word = replace_punctuation(string);
assert!(yomi != "", "Empty yomi: {}", word);
if yomi == "" {
if !word.chars().all(|x| PUNCTUATIONS.contains(&x.to_string().as_str())) {
yomi = "'".repeat(word.len());
} else {
yomi = word.clone();
}
} else if yomi == "" {
assert!(word == "?", "yomi `` comes from: {}", word);
yomi = "?".to_string();
}
seq_text.push(word);
seq_kata.push(yomi);
} }
Ok(()) Ok((seq_text, seq_kata))
} }
fn parse_to_string_and_pron(&self, parts: String) -> (String, String) { fn parse_to_string_and_pron(&self, parts: String) -> (String, String) {