diff --git a/sbv2_core/src/norm.rs b/sbv2_core/src/norm.rs index 3bce32f..49c91ab 100644 --- a/sbv2_core/src/norm.rs +++ b/sbv2_core/src/norm.rs @@ -1,7 +1,6 @@ use once_cell::sync::Lazy; use std::collections::HashMap; - static REPLACE_MAP: Lazy> = Lazy::new(|| { let mut map = HashMap::new(); map.insert(":", ","); @@ -69,14 +68,15 @@ __PUNCTUATION_CLEANUP_PATTERN = re.compile( + "".join(PUNCTUATIONS) + r"]+", # fmt: skip ) */ + +pub static PUNCTUATIONS: [&str; 7] = ["!", "?", "…", ",", ".", "'", "-"]; static PUNCTUATION_CLEANUP_PATTERN: Lazy = Lazy::new(|| { - let pattern = ( - r"[^\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FFF}\u{3400}-\u{4DBF}\u{3005}".to_owned() + let pattern = (r"[^\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FFF}\u{3400}-\u{4DBF}\u{3005}" + .to_owned() + r"\u{0041}-\u{005A}\u{0061}-\u{007A}" + r"\u{FF21}-\u{FF3A}\u{FF41}-\u{FF5A}" + r"\u{0370}-\u{03FF}\u{1F00}-\u{1FFF}" - + r"[!?\u{2026},.'-]+" - ); + + &PUNCTUATIONS.join("") + r"]+"); regex::Regex::new(&pattern).unwrap() }); @@ -84,5 +84,7 @@ pub fn replace_punctuation(mut text: String) -> String { for (k, v) in REPLACE_MAP.iter() { text = text.replace(k, v); } - text.to_string() -} \ No newline at end of file + PUNCTUATION_CLEANUP_PATTERN + .replace_all(&text, "") + .to_string() +} diff --git a/sbv2_core/src/text.rs b/sbv2_core/src/text.rs index 45def9a..7a45d51 100644 --- a/sbv2_core/src/text.rs +++ b/sbv2_core/src/text.rs @@ -1,4 +1,5 @@ use crate::error::{Error, Result}; +use crate::norm::{replace_punctuation, PUNCTUATIONS}; use jpreprocess::*; use once_cell::sync::Lazy; use regex::Regex; @@ -106,16 +107,29 @@ impl JTalkProcess { Ok(()) } - fn text_to_seq_kata(&self) -> Result<()> { - // let seq_kata: Vec<_> = vec![]; - // let seq_text: Vec<_> = vec![]; + fn text_to_seq_kata(&self) -> Result<(Vec, Vec)> { + let mut seq_kata = vec![]; + let mut seq_text = vec![]; for parts in &self.parsed { - let (string, mut pron) = self.parse_to_string_and_pron(parts.clone()); - println!("{} {}", string, pron); - pron = pron.replace('’', ""); + let (string, pron) = self.parse_to_string_and_pron(parts.clone()); + let mut yomi = pron.replace('’', ""); + let word = replace_punctuation(string); + assert!(yomi != "", "Empty yomi: {}", word); + if yomi == "、" { + if !word.chars().all(|x| PUNCTUATIONS.contains(&x.to_string().as_str())) { + yomi = "'".repeat(word.len()); + } else { + yomi = word.clone(); + } + } else if yomi == "?" { + assert!(word == "?", "yomi `?` comes from: {}", word); + yomi = "?".to_string(); + } + seq_text.push(word); + seq_kata.push(yomi); } - Ok(()) + Ok((seq_text, seq_kata)) } fn parse_to_string_and_pron(&self, parts: String) -> (String, String) {