mirror of
https://github.com/neodyland/sbv2-api.git
synced 2026-01-07 06:52:57 +00:00
aded
This commit is contained in:
@@ -1,7 +1,6 @@
|
|||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
|
||||||
static REPLACE_MAP: Lazy<HashMap<&str, &str>> = Lazy::new(|| {
|
static REPLACE_MAP: Lazy<HashMap<&str, &str>> = Lazy::new(|| {
|
||||||
let mut map = HashMap::new();
|
let mut map = HashMap::new();
|
||||||
map.insert(":", ",");
|
map.insert(":", ",");
|
||||||
@@ -69,14 +68,15 @@ __PUNCTUATION_CLEANUP_PATTERN = re.compile(
|
|||||||
+ "".join(PUNCTUATIONS) + r"]+", # fmt: skip
|
+ "".join(PUNCTUATIONS) + r"]+", # fmt: skip
|
||||||
)
|
)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
pub static PUNCTUATIONS: [&str; 7] = ["!", "?", "…", ",", ".", "'", "-"];
|
||||||
static PUNCTUATION_CLEANUP_PATTERN: Lazy<regex::Regex> = Lazy::new(|| {
|
static PUNCTUATION_CLEANUP_PATTERN: Lazy<regex::Regex> = Lazy::new(|| {
|
||||||
let pattern = (
|
let pattern = (r"[^\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FFF}\u{3400}-\u{4DBF}\u{3005}"
|
||||||
r"[^\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FFF}\u{3400}-\u{4DBF}\u{3005}".to_owned()
|
.to_owned()
|
||||||
+ r"\u{0041}-\u{005A}\u{0061}-\u{007A}"
|
+ r"\u{0041}-\u{005A}\u{0061}-\u{007A}"
|
||||||
+ r"\u{FF21}-\u{FF3A}\u{FF41}-\u{FF5A}"
|
+ r"\u{FF21}-\u{FF3A}\u{FF41}-\u{FF5A}"
|
||||||
+ r"\u{0370}-\u{03FF}\u{1F00}-\u{1FFF}"
|
+ r"\u{0370}-\u{03FF}\u{1F00}-\u{1FFF}"
|
||||||
+ r"[!?\u{2026},.'-]+"
|
+ &PUNCTUATIONS.join("") + r"]+");
|
||||||
);
|
|
||||||
regex::Regex::new(&pattern).unwrap()
|
regex::Regex::new(&pattern).unwrap()
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -84,5 +84,7 @@ pub fn replace_punctuation(mut text: String) -> String {
|
|||||||
for (k, v) in REPLACE_MAP.iter() {
|
for (k, v) in REPLACE_MAP.iter() {
|
||||||
text = text.replace(k, v);
|
text = text.replace(k, v);
|
||||||
}
|
}
|
||||||
text.to_string()
|
PUNCTUATION_CLEANUP_PATTERN
|
||||||
}
|
.replace_all(&text, "")
|
||||||
|
.to_string()
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use crate::error::{Error, Result};
|
use crate::error::{Error, Result};
|
||||||
|
use crate::norm::{replace_punctuation, PUNCTUATIONS};
|
||||||
use jpreprocess::*;
|
use jpreprocess::*;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
@@ -106,16 +107,29 @@ impl JTalkProcess {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn text_to_seq_kata(&self) -> Result<()> {
|
fn text_to_seq_kata(&self) -> Result<(Vec<String>, Vec<String>)> {
|
||||||
// let seq_kata: Vec<_> = vec![];
|
let mut seq_kata = vec![];
|
||||||
// let seq_text: Vec<_> = vec![];
|
let mut seq_text = vec![];
|
||||||
|
|
||||||
for parts in &self.parsed {
|
for parts in &self.parsed {
|
||||||
let (string, mut pron) = self.parse_to_string_and_pron(parts.clone());
|
let (string, pron) = self.parse_to_string_and_pron(parts.clone());
|
||||||
println!("{} {}", string, pron);
|
let mut yomi = pron.replace('’', "");
|
||||||
pron = pron.replace('’', "");
|
let word = replace_punctuation(string);
|
||||||
|
assert!(yomi != "", "Empty yomi: {}", word);
|
||||||
|
if yomi == "、" {
|
||||||
|
if !word.chars().all(|x| PUNCTUATIONS.contains(&x.to_string().as_str())) {
|
||||||
|
yomi = "'".repeat(word.len());
|
||||||
|
} else {
|
||||||
|
yomi = word.clone();
|
||||||
|
}
|
||||||
|
} else if yomi == "?" {
|
||||||
|
assert!(word == "?", "yomi `?` comes from: {}", word);
|
||||||
|
yomi = "?".to_string();
|
||||||
|
}
|
||||||
|
seq_text.push(word);
|
||||||
|
seq_kata.push(yomi);
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok((seq_text, seq_kata))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_to_string_and_pron(&self, parts: String) -> (String, String) {
|
fn parse_to_string_and_pron(&self, parts: String) -> (String, String) {
|
||||||
|
|||||||
Reference in New Issue
Block a user