new: cleaned_sequence

This commit is contained in:
tuna2134
2024-09-10 00:06:57 +00:00
parent 724915b6a8
commit 6762a87760
5 changed files with 50 additions and 7 deletions

View File

@@ -128,7 +128,6 @@ impl JTalkProcess {
let mut phone_tone_list =
JTalkProcess::align_tones(phone_w_punct, phone_tone_list_wo_punct)?;
println!("{:?}", phone_tone_list);
let mut sep_tokenized: Vec<Vec<String>> = Vec::new();
for i in 0..seq_text.len() {

View File

@@ -1,8 +1,9 @@
pub mod bert;
pub mod error;
pub mod jtalk;
pub mod mora;
pub mod nlp;
pub mod norm;
pub mod text;
pub fn add(left: usize, right: usize) -> usize {
left + right

View File

@@ -1,19 +1,19 @@
use sbv2_core::{bert, error, text};
use sbv2_core::{bert, error, jtalk};
fn main() -> error::Result<()> {
let text = "こんにちは,世界!";
let normalized_text = text::normalize_text(text);
let normalized_text = jtalk::normalize_text(text);
println!("{}", normalized_text);
let jtalk = text::JTalk::new()?;
let jtalk = jtalk::JTalk::new()?;
let (phones, tones, _) = jtalk.g2p(&normalized_text)?;
println!("{:?}", tones);
let tokenizer = text::get_tokenizer()?;
let tokenizer = jtalk::get_tokenizer()?;
println!("{:?}", tokenizer);
let (token_ids, attention_masks) = text::tokenize(&normalized_text, &tokenizer)?;
let (token_ids, attention_masks) = jtalk::tokenize(&normalized_text, &tokenizer)?;
println!("{:?}", token_ids);
let session = bert::load_model()?;

24
sbv2_core/src/nlp.rs Normal file
View File

@@ -0,0 +1,24 @@
use crate::norm::SYMBOLS;
use once_cell::sync::Lazy;
use std::collections::HashMap;
/// Lazily built lookup from a symbol string to its index in `SYMBOLS`.
///
/// Built by walking `SYMBOLS` front to back, so if a symbol occurs more than
/// once the index of its last occurrence is the one that is kept.
static SYMBOL_TO_ID: Lazy<HashMap<String, i32>> = Lazy::new(|| {
    SYMBOLS
        .iter()
        .enumerate()
        .map(|(index, symbol)| (symbol.to_string(), index as i32))
        .collect()
});
/// Converts cleaned phone symbols and their tones into integer sequences.
///
/// Returns `(phone_ids, shifted_tones, lang_ids)` where
/// * `phone_ids` holds the `SYMBOL_TO_ID` entry of every phone, in order,
/// * `shifted_tones` holds every input tone plus 6,
/// * `lang_ids` holds the value `1` repeated once per phone.
///
/// # Panics
///
/// Panics if any phone is not a key of `SYMBOL_TO_ID`.
pub fn cleaned_text_to_sequence(
    cleaned_phones: Vec<String>,
    tones: Vec<i32>,
) -> (Vec<i32>, Vec<i32>, Vec<i32>) {
    // NOTE(review): these presumably mirror the Japanese tone start offset and
    // language id of the Python implementation this was ported from — confirm.
    const TONE_OFFSET: i32 = 6;
    const LANGUAGE_ID: i32 = 1;

    let phones: Vec<i32> = cleaned_phones
        .iter()
        .map(|phone| {
            // `HashMap::get` hands back `&i32`; copy the id out so the iterator
            // yields owned `i32`s that can be collected into `Vec<i32>`.
            SYMBOL_TO_ID
                .get(phone)
                .copied()
                .unwrap_or_else(|| panic!("unknown phone symbol: {:?}", phone))
        })
        .collect();
    let tones: Vec<i32> = tones.iter().map(|tone| tone + TONE_OFFSET).collect();
    let lang_ids: Vec<i32> = vec![LANGUAGE_ID; phones.len()];
    (phones, tones, lang_ids)
}

View File

@@ -69,7 +69,26 @@ __PUNCTUATION_CLEANUP_PATTERN = re.compile(
)
*/
/// The 42 phone symbol strings of the `JP` symbol set.
///
/// Entries are listed in ascending byte order (the uppercase `"N"` sorts
/// before every lowercase entry).
// NOTE(review): presumably the Japanese phoneme inventory ported from the
// Python implementation quoted above — confirm the list matches upstream.
pub const JP_SYMBOLS: [&str; 42] = [
"N", "a", "a:", "b", "by", "ch", "d", "dy", "e", "e:", "f", "g", "gy", "h", "hy", "i", "i:",
"j", "k", "ky", "m", "my", "n", "ny", "o", "o:", "p", "py", "q", "r", "ry", "s", "sh", "t",
"ts", "ty", "u", "u:", "w", "y", "z", "zy",
];
/// Punctuation marks that are kept as standalone symbols.
///
/// The third entry is the horizontal ellipsis (U+2026); an empty string there
/// would register a symbol that can never match any input.
pub static PUNCTUATIONS: [&str; 7] = ["!", "?", "…", ",", ".", "'", "-"];
/// Every entry of `PUNCTUATIONS`, in order, followed by the two extra
/// symbols `"SP"` and `"UNK"`.
pub static PUNCTUATION_SYMBOLS: Lazy<Vec<&str>> = Lazy::new(|| {
    PUNCTUATIONS
        .iter()
        .copied()
        .chain(["SP", "UNK"])
        .collect()
});
/// Symbol placed at the very front of `SYMBOLS` (index 0).
// NOTE(review): presumably the padding token of the symbol table — confirm.
const PAD: &str = "_";

/// The full symbol table: `PAD`, then every entry of `JP_SYMBOLS` exactly
/// once, then every entry of `PUNCTUATION_SYMBOLS`.
///
/// A symbol's position in this list is the id that lookups built from it
/// will report, so the order of entries matters.
// NOTE(review): the Python table this is ported from presumably also merges
// the symbol sets of other languages before the punctuation; ids will only
// line up with a pretrained model once those are included — confirm.
pub static SYMBOLS: Lazy<Vec<&str>> = Lazy::new(|| {
    let mut symbols = vec![PAD];
    symbols.extend_from_slice(&JP_SYMBOLS);
    symbols.extend(PUNCTUATION_SYMBOLS.iter().copied());
    symbols
});
static PUNCTUATION_CLEANUP_PATTERN: Lazy<regex::Regex> = Lazy::new(|| {
let pattern = r"[^\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FFF}\u{3400}-\u{4DBF}\u{3005}"
.to_owned()