This commit is contained in:
tuna2134
2024-09-09 09:33:48 +00:00
parent 3e601662d9
commit 57f55f8038
3 changed files with 104 additions and 9 deletions

View File

@@ -1,5 +1,6 @@
pub mod bert;
pub mod error;
pub mod norm;
pub mod text;
pub fn add(left: usize, right: usize) -> usize {

55
sbv2_core/src/norm.rs Normal file
View File

@@ -0,0 +1,55 @@
use once_cell::sync::Lazy;
use std::collections::HashMap;
/// Lookup table from punctuation / symbol variants to a small ASCII set
/// (`,`, `.`, `!`, `?`, `...`, `'`, `-`).
///
/// Non-ASCII keys are written as `\u{…}` escapes so the table survives tools
/// that drop or re-encode such characters.
///
/// NOTE(review): the full-width / CJK keys below were reconstructed from the
/// replacement values and from the upstream Style-Bert-VITS2 normalizer table;
/// in the reviewed text they had all degraded to the empty string, which
/// collapsed them into a single `""` entry. Confirm against upstream.
static REPLACE_MAP: Lazy<HashMap<&str, &str>> = Lazy::new(|| {
    HashMap::from([
        // Clause separators and sentence terminators.
        ("\u{ff1a}", ","), // Fullwidth Colon
        ("\u{ff1b}", ","), // Fullwidth Semicolon
        ("\u{ff0c}", ","), // Fullwidth Comma
        ("\u{3002}", "."), // Ideographic Full Stop
        ("\u{ff01}", "!"), // Fullwidth Exclamation Mark
        ("\u{ff1f}", "?"), // Fullwidth Question Mark
        ("\n", "."),
        ("\u{ff0e}", "."), // Fullwidth Full Stop
        // Ellipsis variants.
        ("\u{2026}", "..."), // Horizontal Ellipsis
        ("···", "..."),
        ("・・・", "..."),
        // Middle dots and the ideographic comma.
        ("·", ","),
        ("\u{30fb}", ","), // Katakana Middle Dot
        ("\u{3001}", ","), // Ideographic Comma
        ("$", "."),
        // Every quote / bracket variant becomes an ASCII apostrophe.
        ("\u{201c}", "'"), // Left Double Quotation Mark
        ("\u{201d}", "'"), // Right Double Quotation Mark
        ("\"", "'"),
        ("\u{2018}", "'"), // Left Single Quotation Mark
        ("\u{2019}", "'"), // Right Single Quotation Mark
        ("\u{ff08}", "'"), // Fullwidth Left Parenthesis
        ("\u{ff09}", "'"), // Fullwidth Right Parenthesis
        ("(", "'"),
        (")", "'"),
        ("\u{300a}", "'"), // Left Double Angle Bracket
        ("\u{300b}", "'"), // Right Double Angle Bracket
        ("\u{3010}", "'"), // Left Black Lenticular Bracket
        ("\u{3011}", "'"), // Right Black Lenticular Bracket
        ("[", "'"),
        ("]", "'"),
        // Convert every hyphen / dash variant that remains after NFKC
        // normalization to the plain ASCII hyphen-minus (U+002D).
        ("\u{02d7}", "\u{002d}"), // Modifier Letter Minus Sign
        ("\u{2010}", "\u{002d}"), // Hyphen
        ("\u{2012}", "\u{002d}"), // Figure Dash
        ("\u{2013}", "\u{002d}"), // En Dash
        ("\u{2014}", "\u{002d}"), // Em Dash
        ("\u{2015}", "\u{002d}"), // Horizontal Bar
        ("\u{2043}", "\u{002d}"), // Hyphen Bullet
        ("\u{2212}", "\u{002d}"), // Minus Sign
        ("\u{23af}", "\u{002d}"), // Horizontal Line Extension
        ("\u{23e4}", "\u{002d}"), // Straightness
        ("\u{2500}", "\u{002d}"), // Box Drawings Light Horizontal
        ("\u{2501}", "\u{002d}"), // Box Drawings Heavy Horizontal
        ("\u{2e3a}", "\u{002d}"), // Two-Em Dash
        ("\u{2e3b}", "\u{002d}"), // Three-Em Dash
        // Corner brackets.
        ("\u{300c}", "'"), // Left Corner Bracket
        ("\u{300d}", "'"), // Right Corner Bracket
    ])
});

View File

@@ -3,6 +3,7 @@ use jpreprocess::*;
use once_cell::sync::Lazy;
use regex::Regex;
use std::collections::HashSet;
use std::sync::Arc;
use tokenizers::Tokenizer;
type JPreprocessType = JPreprocess<DefaultFetcher>;
@@ -42,15 +43,36 @@ macro_rules! hash_set {
}
pub struct JTalk {
pub jpreprocess: JPreprocessType,
pub jpreprocess: Arc<JPreprocessType>,
}
impl JTalk {
    /// Creates a `JTalk` whose frontend is obtained from `get_jtalk`.
    ///
    /// # Errors
    /// Propagates any error returned by `get_jtalk`.
    pub fn new() -> Result<Self> {
        // Build the frontend exactly once; a second, immediately shadowed
        // `get_jtalk()` call would only construct and drop a spare instance.
        let jpreprocess = Arc::new(get_jtalk()?);
        Ok(Self { jpreprocess })
    }

    /// Runs the frontend over `text` and hands the parsed result to a
    /// `JTalkProcess`, whose `g2p` does the remaining work.
    ///
    /// # Errors
    /// Propagates errors from `run_frontend` and from `JTalkProcess::g2p`.
    pub fn g2p(&self, text: &str) -> Result<()> {
        let parsed = self.jpreprocess.run_frontend(text)?;
        // Only the reference count is bumped here; the frontend is shared.
        let jtalk_process = JTalkProcess::new(Arc::clone(&self.jpreprocess), parsed);
        jtalk_process.g2p()?;
        Ok(())
    }
}
/// Per-call processing state: a shared frontend handle plus the parsed
/// entries produced for one input text.
struct JTalkProcess {
    // Shared handle to the frontend; used by `g2p_prosody` to build labels.
    jpreprocess: Arc<JPreprocessType>,
    // One string per parsed entry; `parse_to_string_and_pron` splits each on ','.
    parsed: Vec<String>,
}
impl JTalkProcess {
fn new(jpreprocess: Arc<JPreprocessType>, parsed: Vec<String>) -> Self {
Self {
jpreprocess,
parsed,
}
}
fn fix_phone_tone(&self, phone_tone_list: Vec<(String, i32)>) -> Result<Vec<(String, i32)>> {
let tone_values: HashSet<i32> = phone_tone_list
.iter()
@@ -78,13 +100,31 @@ impl JTalk {
}
}
pub fn g2p(&self, text: &str) -> Result<()> {
let phone_tone_list_wo_punct = self.g2phone_tone_wo_punct(text)?;
pub fn g2p(&self) -> Result<()> {
let phone_tone_list_wo_punct = self.g2phone_tone_wo_punct()?;
self.text_to_seq_kata()?;
Ok(())
}
fn g2phone_tone_wo_punct(&self, text: &str) -> Result<Vec<(String, i32)>> {
let prosodies = self.g2p_prosody(text)?;
/// Walks every parsed entry, splits it into its two extracted fields and
/// prints them to stdout. Nothing is collected or returned yet.
///
/// # Panics
/// Panics if `parse_to_string_and_pron` panics on an entry.
fn text_to_seq_kata(&self) -> Result<()> {
    for entry in self.parsed.iter() {
        let (surface, reading) = self.parse_to_string_and_pron(entry.clone());
        println!("{} {}", surface, reading);
        // NOTE(review): the search pattern is empty, so this replacement leaves
        // the text unchanged; the pattern looks like it lost a character —
        // confirm. The result is not used afterwards.
        let _reading = reading.replace("", "");
    }
    Ok(())
}
/// Splits one comma-separated entry and returns fields 0 and 9 as owned
/// strings (the caller treats them as the surface string and pronunciation).
///
/// The split is consumed lazily, so only the two returned fields are
/// allocated instead of one `String` per field.
///
/// # Panics
/// Panics if `parts` has fewer than ten comma-separated fields.
fn parse_to_string_and_pron(&self, parts: String) -> (String, String) {
    let mut fields = parts.split(',');
    // `split` always yields at least one item, so the fallback is never taken.
    let surface = fields.next().unwrap_or_default();
    // One field is already consumed, so `nth(8)` is overall field index 9.
    let pron = fields
        .nth(8)
        .expect("parsed entry must have at least 10 comma-separated fields");
    (surface.to_string(), pron.to_string())
}
fn g2phone_tone_wo_punct(&self) -> Result<Vec<(String, i32)>> {
let prosodies = self.g2p_prosody()?;
let mut results: Vec<(String, i32)> = Vec::new();
let mut current_phrase: Vec<(String, i32)> = Vec::new();
@@ -117,8 +157,8 @@ impl JTalk {
Ok(results)
}
fn g2p_prosody(&self, text: &str) -> Result<Vec<String>> {
let labels = self.jpreprocess.extract_fullcontext(text)?;
fn g2p_prosody(&self) -> Result<Vec<String>> {
let labels = self.jpreprocess.make_label(self.parsed.clone());
let mut phones: Vec<String> = Vec::new();
for (i, label) in labels.iter().enumerate() {
@@ -177,7 +217,6 @@ pub fn normalize_text(text: &str) -> String {
// 日本語のテキストを正規化する
let text = text.replace('~', "");
let text = text.replace('', "");
text.replace('〜', "")
}