mirror of
https://github.com/neodyland/sbv2-api.git
synced 2026-05-14 04:40:40 +00:00
added
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
use once_cell::sync::Lazy;
|
||||
use std::collections::HashMap;
|
||||
|
||||
|
||||
static REPLACE_MAP: Lazy<HashMap<&str, &str>> = Lazy::new(|| {
|
||||
let mut map = HashMap::new();
|
||||
map.insert(":", ",");
|
||||
@@ -69,14 +68,15 @@ __PUNCTUATION_CLEANUP_PATTERN = re.compile(
|
||||
+ "".join(PUNCTUATIONS) + r"]+", # fmt: skip
|
||||
)
|
||||
*/
|
||||
|
||||
pub static PUNCTUATIONS: [&str; 7] = ["!", "?", "…", ",", ".", "'", "-"];
|
||||
static PUNCTUATION_CLEANUP_PATTERN: Lazy<regex::Regex> = Lazy::new(|| {
|
||||
let pattern = (
|
||||
r"[^\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FFF}\u{3400}-\u{4DBF}\u{3005}".to_owned()
|
||||
let pattern = (r"[^\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FFF}\u{3400}-\u{4DBF}\u{3005}"
|
||||
.to_owned()
|
||||
+ r"\u{0041}-\u{005A}\u{0061}-\u{007A}"
|
||||
+ r"\u{FF21}-\u{FF3A}\u{FF41}-\u{FF5A}"
|
||||
+ r"\u{0370}-\u{03FF}\u{1F00}-\u{1FFF}"
|
||||
+ r"[!?\u{2026},.'-]+"
|
||||
);
|
||||
+ &PUNCTUATIONS.join("") + r"]+");
|
||||
regex::Regex::new(&pattern).unwrap()
|
||||
});
|
||||
|
||||
@@ -84,5 +84,7 @@ pub fn replace_punctuation(mut text: String) -> String {
|
||||
for (k, v) in REPLACE_MAP.iter() {
|
||||
text = text.replace(k, v);
|
||||
}
|
||||
text.to_string()
|
||||
}
|
||||
PUNCTUATION_CLEANUP_PATTERN
|
||||
.replace_all(&text, "")
|
||||
.to_string()
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use crate::error::{Error, Result};
|
||||
use crate::norm::{replace_punctuation, PUNCTUATIONS};
|
||||
use jpreprocess::*;
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
@@ -106,16 +107,29 @@ impl JTalkProcess {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn text_to_seq_kata(&self) -> Result<()> {
|
||||
// let seq_kata: Vec<_> = vec![];
|
||||
// let seq_text: Vec<_> = vec![];
|
||||
fn text_to_seq_kata(&self) -> Result<(Vec<String>, Vec<String>)> {
|
||||
let mut seq_kata = vec![];
|
||||
let mut seq_text = vec![];
|
||||
|
||||
for parts in &self.parsed {
|
||||
let (string, mut pron) = self.parse_to_string_and_pron(parts.clone());
|
||||
println!("{} {}", string, pron);
|
||||
pron = pron.replace('’', "");
|
||||
let (string, pron) = self.parse_to_string_and_pron(parts.clone());
|
||||
let mut yomi = pron.replace('’', "");
|
||||
let word = replace_punctuation(string);
|
||||
assert!(yomi != "", "Empty yomi: {}", word);
|
||||
if yomi == "、" {
|
||||
if !word.chars().all(|x| PUNCTUATIONS.contains(&x.to_string().as_str())) {
|
||||
yomi = "'".repeat(word.len());
|
||||
} else {
|
||||
yomi = word.clone();
|
||||
}
|
||||
} else if yomi == "?" {
|
||||
assert!(word == "?", "yomi `?` comes from: {}", word);
|
||||
yomi = "?".to_string();
|
||||
}
|
||||
seq_text.push(word);
|
||||
seq_kata.push(yomi);
|
||||
}
|
||||
Ok(())
|
||||
Ok((seq_text, seq_kata))
|
||||
}
|
||||
|
||||
fn parse_to_string_and_pron(&self, parts: String) -> (String, String) {
|
||||
|
||||
Reference in New Issue
Block a user