mirror of
https://github.com/neodyland/sbv2-api.git
synced 2026-05-14 12:50:40 +00:00
new: cleaned_sequence
This commit is contained in:
@@ -128,7 +128,6 @@ impl JTalkProcess {
|
||||
|
||||
let mut phone_tone_list =
|
||||
JTalkProcess::align_tones(phone_w_punct, phone_tone_list_wo_punct)?;
|
||||
println!("{:?}", phone_tone_list);
|
||||
|
||||
let mut sep_tokenized: Vec<Vec<String>> = Vec::new();
|
||||
for i in 0..seq_text.len() {
|
||||
@@ -1,8 +1,9 @@
|
||||
pub mod bert;
|
||||
pub mod error;
|
||||
pub mod jtalk;
|
||||
pub mod mora;
|
||||
pub mod nlp;
|
||||
pub mod norm;
|
||||
pub mod text;
|
||||
|
||||
pub fn add(left: usize, right: usize) -> usize {
|
||||
left + right
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
use sbv2_core::{bert, error, text};
|
||||
use sbv2_core::{bert, error, jtalk};
|
||||
|
||||
fn main() -> error::Result<()> {
|
||||
let text = "こんにちは,世界!";
|
||||
|
||||
let normalized_text = text::normalize_text(text);
|
||||
let normalized_text = jtalk::normalize_text(text);
|
||||
println!("{}", normalized_text);
|
||||
|
||||
let jtalk = text::JTalk::new()?;
|
||||
let jtalk = jtalk::JTalk::new()?;
|
||||
let (phones, tones, _) = jtalk.g2p(&normalized_text)?;
|
||||
println!("{:?}", tones);
|
||||
|
||||
let tokenizer = text::get_tokenizer()?;
|
||||
let tokenizer = jtalk::get_tokenizer()?;
|
||||
println!("{:?}", tokenizer);
|
||||
|
||||
let (token_ids, attention_masks) = text::tokenize(&normalized_text, &tokenizer)?;
|
||||
let (token_ids, attention_masks) = jtalk::tokenize(&normalized_text, &tokenizer)?;
|
||||
println!("{:?}", token_ids);
|
||||
|
||||
let session = bert::load_model()?;
|
||||
|
||||
24
sbv2_core/src/nlp.rs
Normal file
24
sbv2_core/src/nlp.rs
Normal file
@@ -0,0 +1,24 @@
|
||||
use crate::norm::SYMBOLS;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::collections::HashMap;
|
||||
|
||||
static SYMBOL_TO_ID: Lazy<HashMap<String, i32>> = Lazy::new(|| {
|
||||
let mut map = HashMap::new();
|
||||
for (i, symbols) in SYMBOLS.iter().enumerate() {
|
||||
map.insert(symbols.to_string(), i as i32);
|
||||
}
|
||||
map
|
||||
});
|
||||
|
||||
pub fn cleaned_text_to_sequence(
|
||||
cleaned_phones: Vec<String>,
|
||||
tones: Vec<i32>,
|
||||
) -> (Vec<i32>, Vec<i32>, Vec<i32>) {
|
||||
let phones: Vec<i32> = cleaned_phones
|
||||
.iter()
|
||||
.map(|phone| SYMBOL_TO_ID.get(phone).unwrap())
|
||||
.collect();
|
||||
let tones: Vec<i32> = tones.iter().map(|tone| tone + 6).collect();
|
||||
let lang_ids: Vec<i32> = vec![1; phones.len()];
|
||||
(phones, tones, lang_ids)
|
||||
}
|
||||
@@ -69,7 +69,26 @@ __PUNCTUATION_CLEANUP_PATTERN = re.compile(
|
||||
)
|
||||
*/
|
||||
|
||||
/// Phone symbols of the Japanese symbol set, in table order.
///
/// The order matters: positions in this array feed into the ids assigned by
/// the symbol table built from it.
pub const JP_SYMBOLS: [&str; 42] = [
    "N", "a", "a:", "b", "by", "ch", "d", "dy", "e", "e:", "f", "g", "gy", "h", "hy", "i", "i:",
    "j", "k", "ky", "m", "my", "n", "ny", "o", "o:", "p", "py", "q", "r", "ry", "s", "sh", "t",
    "ts", "ty", "u", "u:", "w", "y", "z", "zy",
];
|
||||
|
||||
/// Punctuation marks that are kept as symbols, in table order.
pub static PUNCTUATIONS: [&str; 7] = ["!", "?", "…", ",", ".", "'", "-"];
|
||||
pub static PUNCTUATION_SYMBOLS: Lazy<Vec<&str>> = Lazy::new(|| {
|
||||
let mut symbols = PUNCTUATIONS.to_vec();
|
||||
symbols.append(&mut vec!["SP", "UNK"]);
|
||||
symbols
|
||||
});
|
||||
/// Symbol intended to represent padding (name-based reading — confirm against
/// the reference symbol table).
const PAD: &str = "_";
|
||||
pub static SYMBOLS: Lazy<Vec<&str>> = Lazy::new(|| {
|
||||
let mut symbols = JP_SYMBOLS.to_vec();
|
||||
symbols.append(&mut JP_SYMBOLS.to_vec());
|
||||
symbols.append(&mut PUNCTUATION_SYMBOLS.to_vec());
|
||||
symbols
|
||||
});
|
||||
|
||||
static PUNCTUATION_CLEANUP_PATTERN: Lazy<regex::Regex> = Lazy::new(|| {
|
||||
let pattern = r"[^\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FFF}\u{3400}-\u{4DBF}\u{3005}"
|
||||
.to_owned()
|
||||
|
||||
Reference in New Issue
Block a user