add g2p

2025-12-22 23:49:58 +00:00 · 2024-09-09 08:46:48 +00:00
parent ac94add3ed
commit e075937ee7
11 changed files with 24211 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
 target
 models/*.onnx
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -0,0 +1,6 @@
 [workspace]
 resolver = "2"
 members = ["sbv2_core"]
 [workspace.dependencies]
 anyhow = "1.0.86"
--- a/models/.gitkeep
+++ b/models/.gitkeep
--- a/sbv2_core/Cargo.toml
+++ b/sbv2_core/Cargo.toml
@@ -0,0 +1,14 @@
 [package]
 name = "sbv2_core"
 version = "0.1.0"
 edition = "2021"
 [dependencies]
 anyhow.workspace = true
 jpreprocess = { version = "0.10.0", features = ["naist-jdic"] }
 ndarray = "0.16.1"
 once_cell = "1.19.0"
 ort = { git = "https://github.com/pykeio/ort.git", version = "2.0.0-rc.5" }
 regex = "1.10.6"
 thiserror = "1.0.63"
 tokenizers = "0.20.0"
--- a/sbv2_core/src/bert.rs
+++ b/sbv2_core/src/bert.rs
@@ -0,0 +1,26 @@
 use crate::error::Result;
 use ndarray::Array2;
 use ort::{GraphOptimizationLevel, Session};
 /// Builds an ONNX Runtime session from the model file `models/debert.onnx`.
 ///
 /// The session is configured with graph optimization level 1 and a single
 /// intra-op thread.
 ///
 /// # Errors
 /// Returns an error if the session builder rejects an option or the model file
 /// cannot be loaded.
 pub fn load_model() -> Result<Session> {
    let builder = Session::builder()?
        .with_optimization_level(GraphOptimizationLevel::Level1)?
        .with_intra_threads(1)?;
    builder
        .commit_from_file("models/debert.onnx")
        .map_err(Into::into)
 }
 pub fn predict(session: &Session, token_ids: Vec<i64>, attention_masks: Vec<i64>) -> Result<()> {
    let outputs = session.run(
        ort::inputs! {
            "input_ids" => Array2::from_shape_vec((1, token_ids.len()), token_ids).unwrap(),
            "attention_mask" => Array2::from_shape_vec((1, attention_masks.len()), attention_masks).unwrap(),
        }?
    )?;
    let output = outputs.get("output").unwrap();
    println!("{:?}", output);
    Ok(())
 }
--- a/sbv2_core/src/error.rs
+++ b/sbv2_core/src/error.rs
@@ -0,0 +1,17 @@
 use thiserror::Error;
 /// Errors produced by this crate.
 ///
 /// Every variant except `ValueError` wraps an error type from a dependency and
 /// can be created from it through the `From` impl generated by `#[from]`.
 #[derive(Error, Debug)]
 pub enum Error {
    /// An error reported by the `tokenizers` crate.
    #[error("Tokenizer error: {0}")]
    TokenizerError(#[from] tokenizers::Error),
    /// An error reported by the `jpreprocess` crate.
    #[error("JPreprocess error: {0}")]
    JPreprocessError(#[from] jpreprocess::error::JPreprocessError),
    /// An error reported by the `ort` (ONNX Runtime) crate.
    #[error("ONNX error: {0}")]
    OrtError(#[from] ort::Error),
    /// A shape error reported by `ndarray`.
    #[error("NDArray error: {0}")]
    NdArrayError(#[from] ndarray::ShapeError),
    /// A free-form error carrying a message describing an invalid value.
    #[error("Value error: {0}")]
    ValueError(String),
 }
 /// Shorthand for `std::result::Result` with this module's `Error` as the error type.
 pub type Result<T> = std::result::Result<T, Error>;
--- a/sbv2_core/src/lib.rs
+++ b/sbv2_core/src/lib.rs
@@ -0,0 +1,18 @@
 pub mod bert;
 pub mod error;
 pub mod text;
 /// Returns the sum of `left` and `right`.
 pub fn add(left: usize, right: usize) -> usize {
    [left, right].iter().sum()
 }
 #[cfg(test)]
 mod tests {
    use super::add;

    /// Checks that `add` sums its two arguments.
    #[test]
    fn it_works() {
        assert_eq!(add(2, 2), 4);
    }
 }
--- a/sbv2_core/src/main.rs
+++ b/sbv2_core/src/main.rs
@@ -0,0 +1,24 @@
 use sbv2_core::{bert, error, text};
 /// Demo entry point: normalizes a sample sentence, runs G2P on it, tokenizes
 /// it, and feeds the tokens to the BERT model, printing each intermediate value.
 fn main() -> error::Result<()> {
    let input = "こんにちは,世界!";
    let normalized_text = text::normalize_text(input);
    println!("{}", normalized_text);

    // Grapheme-to-phoneme step.
    let g2p_engine = text::JTalk::new()?;
    let phones = g2p_engine.g2p(&normalized_text)?;
    println!("{:?}", phones);

    // Tokenization step.
    let tokenizer = text::get_tokenizer()?;
    println!("{:?}", tokenizer);
    let (token_ids, attention_masks) = text::tokenize(&normalized_text, &tokenizer)?;
    println!("{:?}", token_ids);

    // Model inference step.
    let bert_session = bert::load_model()?;
    bert::predict(&bert_session, token_ids, attention_masks)
 }
--- a/sbv2_core/src/text.rs
+++ b/sbv2_core/src/text.rs
@@ -0,0 +1,202 @@
 use crate::error::{Error, Result};
 use jpreprocess::*;
 use once_cell::sync::Lazy;
 use regex::Regex;
 use std::collections::HashSet;
 use tokenizers::Tokenizer;
 type JPreprocessType = JPreprocess<DefaultFetcher>;
 fn get_jtalk() -> Result<JPreprocessType> {
    let config = JPreprocessConfig {
        dictionary: SystemDictionaryConfig::Bundled(kind::JPreprocessDictionaryKind::NaistJdic),
        user_dictionary: None,
    };
    let jpreprocess = JPreprocess::from_config(config)?;
    Ok(jpreprocess)
 }
 // Regular expressions applied to label strings in `g2p_prosody`. Each one is
 // compiled lazily on first use. The A1/A2/A3/E3/F1/P3 names presumably refer to
 // fields of the full-context label format — confirm against the JPreprocess docs.

 // Captures a run of digits and `-` found between `/A:` and the next `+`.
 static JTALK_G2P_G_A1_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"/A:([0-9\-]+)\+").unwrap());
 // Captures a run of digits enclosed by `+` on both sides.
 static JTALK_G2P_G_A2_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"\+(\d+)\+").unwrap());
 // Captures a run of digits between `+` and `/`.
 static JTALK_G2P_G_A3_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"\+(\d+)/").unwrap());
 // Captures a run of digits between `!` and `_`.
 static JTALK_G2P_G_E3_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"!(\d+)_").unwrap());
 // Captures a run of digits between `/F:` and `_`.
 static JTALK_G2P_G_F1_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"/F:(\d+)_").unwrap());
 // Captures the shortest text between a `-` and the following `+`.
 static JTALK_G2P_G_P3_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"\-(.*?)\+").unwrap());
 /// Extracts the first capture group of `regex` from `text` and parses it as `i32`.
 ///
 /// Returns `-50` when `regex` does not match `text`.
 ///
 /// # Panics
 /// Panics if the captured text is not a valid `i32`.
 fn numeric_feature_by_regex(regex: &Regex, text: &str) -> i32 {
    regex
        .captures(text)
        .map_or(-50, |caps| caps[1].parse::<i32>().unwrap())
 }
 /// Builds a `HashSet` from a comma-separated list of elements; a trailing comma
 /// is accepted and an empty list yields an empty set.
 macro_rules! hash_set {
    ($($elem:expr),* $(,)?) => {
        HashSet::from([$($elem),*])
    };
 }
 /// Grapheme-to-phoneme front end that wraps a JPreprocess instance.
 pub struct JTalk {
    /// The JPreprocess instance from which full-context labels are extracted.
    pub jpreprocess: JPreprocessType,
 }
 impl JTalk {
    pub fn new() -> Result<Self> {
        let jpreprocess = get_jtalk()?;
        Ok(Self { jpreprocess })
    }
    fn fix_phone_tone(&self, phone_tone_list: Vec<(String, i32)>) -> Result<Vec<(String, i32)>> {
        let tone_values: HashSet<i32> = phone_tone_list
            .iter()
            .map(|(_letter, tone)| tone.clone())
            .collect();
        if tone_values.len() == 1 {
            assert!(tone_values == hash_set![0], "{:?}", tone_values);
            return Ok(phone_tone_list);
        } else if tone_values.len() == 2 {
            if tone_values == hash_set![0, 1] {
                return Ok(phone_tone_list);
            } else if tone_values == hash_set![-1, 0] {
                return Ok(phone_tone_list
                    .iter()
                    .map(|x| {
                        let new_tone = if x.1 == -1 { 0 } else { x.1 };
                        (x.0.clone(), new_tone)
                    })
                    .collect());
            } else {
                return Err(Error::ValueError("Invalid tone values 0".to_string()));
            }
        } else {
            return Err(Error::ValueError("Invalid tone values 1".to_string()));
        }
    }
    pub fn g2p(&self, text: &str) -> Result<()> {
        let phone_tone_list_wo_punct = self.g2phone_tone_wo_punct(text)?;
        Ok(())
    }
    fn g2phone_tone_wo_punct(&self, text: &str) -> Result<Vec<(String, i32)>> {
        let prosodies = self.g2p_prosody(text)?;
        let mut results: Vec<(String, i32)> = Vec::new();
        let mut current_phrase: Vec<(String, i32)> = Vec::new();
        let mut current_tone = 0;
        for (i, letter) in prosodies.iter().enumerate() {
            if letter == "^" {
                assert!(i == 0);
            } else if vec!["$", "?", "_", "#"].contains(&letter.as_str()) {
                results.extend(self.fix_phone_tone(current_phrase.clone())?);
                if vec!["$", "?"].contains(&letter.as_str()) {
                    assert!(i == prosodies.len() - 1);
                }
                current_phrase = Vec::new();
                current_tone = 0;
            } else if letter == "[" {
                current_tone += 1;
            } else if letter == "]" {
                current_tone -= 1;
            } else {
                let new_letter = if letter == "cl" {
                    "q".to_string()
                } else {
                    letter.clone()
                };
                current_phrase.push((new_letter, current_tone));
            }
        }
        Ok(results)
    }
    fn g2p_prosody(&self, text: &str) -> Result<Vec<String>> {
        let labels = self.jpreprocess.extract_fullcontext(text)?;
        let mut phones: Vec<String> = Vec::new();
        for (i, label) in labels.iter().enumerate() {
            let mut p3 = {
                let label_text = label.to_string();
                let mattched = JTALK_G2P_G_P3_PATTERN.captures(&label_text).unwrap();
                mattched[1].to_string()
            };
            if "AIUEO".contains(&p3) {
                // 文字をlowerする
                p3 = p3.to_lowercase();
            }
            if p3 == "sil" {
                assert!(i == 0 || i == labels.len() - 1);
                if i == 0 {
                    phones.push("^".to_string());
                } else if i == labels.len() - 1 {
                    let e3 = numeric_feature_by_regex(&JTALK_G2P_G_E3_PATTERN, &label.to_string());
                    if e3 == 0 {
                        phones.push("$".to_string());
                    } else if e3 == 1 {
                        phones.push("?".to_string());
                    }
                }
                continue;
            } else if p3 == "pau" {
                phones.push("_".to_string());
                continue;
            } else {
                phones.push(p3.clone());
            }
            let a1 = numeric_feature_by_regex(&JTALK_G2P_G_A1_PATTERN, &label.to_string());
            let a2 = numeric_feature_by_regex(&JTALK_G2P_G_A2_PATTERN, &label.to_string());
            let a3 = numeric_feature_by_regex(&JTALK_G2P_G_A3_PATTERN, &label.to_string());
            let f1 = numeric_feature_by_regex(&JTALK_G2P_G_F1_PATTERN, &label.to_string());
            let a2_next =
                numeric_feature_by_regex(&JTALK_G2P_G_A2_PATTERN, &labels[i + 1].to_string());
            if a3 == 1 && a2_next == 1 && "aeiouAEIOUNcl".contains(&p3) {
                phones.push("#".to_string());
            } else if a1 == 0 && a2_next == a2 + 1 && a2 != f1 {
                phones.push("]".to_string());
            } else if a2 == 1 && a2_next == 2 {
                phones.push("[".to_string());
            }
        }
        Ok(phones)
    }
 }
 /// Normalizes Japanese text by replacing every `~`, `～` and `〜` with `ー`.
 pub fn normalize_text(text: &str) -> String {
    text.chars()
        .map(|c| match c {
            '~' | '～' | '〜' => 'ー',
            other => other,
        })
        .collect()
 }
 pub fn get_tokenizer() -> Result<Tokenizer> {
    let tokenizer = Tokenizer::from_file("tokenizer.json")?;
    Ok(tokenizer)
 }
 /// Encodes `text` one character at a time and returns `(token_ids, attention_masks)`.
 ///
 /// The id list starts with `1` and ends with `2`, each paired with an attention
 /// mask of `1` (presumably the tokenizer's start/end special tokens — confirm
 /// against `tokenizer.json`). Characters are encoded without special tokens.
 ///
 /// # Errors
 /// Returns an error if the tokenizer fails to encode a character.
 pub fn tokenize(text: &str, tokenizer: &Tokenizer) -> Result<(Vec<i64>, Vec<i64>)> {
    let mut token_ids: Vec<i64> = vec![1];
    let mut attention_masks: Vec<i64> = vec![1];
    for ch in text.chars() {
        let encoding = tokenizer.encode(ch.to_string(), false)?;
        for &id in encoding.get_ids() {
            token_ids.push(i64::from(id));
        }
        for &mask in encoding.get_attention_mask() {
            attention_masks.push(i64::from(mask));
        }
    }
    token_ids.push(2);
    attention_masks.push(1);
    Ok((token_ids, attention_masks))
 }
--- a/tokenizer.json
+++ b/tokenizer.json