From 30c2bd1e77020cd52d707d18dba2e1a1da57f4e7 Mon Sep 17 00:00:00 2001 From: tuna2134 Date: Mon, 9 Sep 2024 15:05:56 +0000 Subject: [PATCH] ok --- Cargo.lock | 10 +- sbv2_core/Cargo.toml | 2 + sbv2_core/convert.py | 194 +++++++++ sbv2_core/src/lib.rs | 1 + sbv2_core/src/mora.rs | 34 ++ sbv2_core/src/mora_list.json | 816 +++++++++++++++++++++++++++++++++++ sbv2_core/src/text.rs | 80 ++++ 7 files changed, 1133 insertions(+), 4 deletions(-) create mode 100644 sbv2_core/convert.py create mode 100644 sbv2_core/src/mora.rs create mode 100644 sbv2_core/src/mora_list.json diff --git a/Cargo.lock b/Cargo.lock index d5e3771..e9d2f58 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1293,24 +1293,26 @@ dependencies = [ "once_cell", "ort", "regex", + "serde", + "serde_json", "thiserror", "tokenizers", ] [[package]] name = "serde" -version = "1.0.209" +version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09" +checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.209" +version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170" +checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", diff --git a/sbv2_core/Cargo.toml b/sbv2_core/Cargo.toml index 00be740..5159738 100644 --- a/sbv2_core/Cargo.toml +++ b/sbv2_core/Cargo.toml @@ -10,5 +10,7 @@ ndarray = "0.16.1" once_cell = "1.19.0" ort = { git = "https://github.com/pykeio/ort.git", version = "2.0.0-rc.5" } regex = "1.10.6" +serde = { version = "1.0.210", features = ["derive"] } +serde_json = "1.0.128" thiserror = "1.0.63" tokenizers = "0.20.0" diff --git a/sbv2_core/convert.py b/sbv2_core/convert.py new file mode 100644 index 0000000..0cb8a28 --- /dev/null +++ 
b/sbv2_core/convert.py
@@ -0,0 +1,191 @@
+"""Generate src/mora_list.json, the katakana mora table embedded by src/mora.rs."""
+
+import json
+
+# Each entry is (katakana mora, consonant or None, vowel).
+__MORA_LIST_MINIMUM: list[tuple[str, str | None, str]] = [
+    ("ヴォ", "v", "o"),
+    ("ヴェ", "v", "e"),
+    ("ヴィ", "v", "i"),
+    ("ヴァ", "v", "a"),
+    ("ヴ", "v", "u"),
+    ("ン", None, "N"),
+    ("ワ", "w", "a"),
+    ("ロ", "r", "o"),
+    ("レ", "r", "e"),
+    ("ル", "r", "u"),
+    ("リョ", "ry", "o"),
+    ("リュ", "ry", "u"),
+    ("リャ", "ry", "a"),
+    ("リェ", "ry", "e"),
+    ("リ", "r", "i"),
+    ("ラ", "r", "a"),
+    ("ヨ", "y", "o"),
+    ("ユ", "y", "u"),
+    ("ヤ", "y", "a"),
+    ("モ", "m", "o"),
+    ("メ", "m", "e"),
+    ("ム", "m", "u"),
+    ("ミョ", "my", "o"),
+    ("ミュ", "my", "u"),
+    ("ミャ", "my", "a"),
+    ("ミェ", "my", "e"),
+    ("ミ", "m", "i"),
+    ("マ", "m", "a"),
+    ("ポ", "p", "o"),
+    ("ボ", "b", "o"),
+    ("ホ", "h", "o"),
+    ("ペ", "p", "e"),
+    ("ベ", "b", "e"),
+    ("ヘ", "h", "e"),
+    ("プ", "p", "u"),
+    ("ブ", "b", "u"),
+    ("フォ", "f", "o"),
+    ("フェ", "f", "e"),
+    ("フィ", "f", "i"),
+    ("ファ", "f", "a"),
+    ("フ", "f", "u"),
+    ("ピョ", "py", "o"),
+    ("ピュ", "py", "u"),
+    ("ピャ", "py", "a"),
+    ("ピェ", "py", "e"),
+    ("ピ", "p", "i"),
+    ("ビョ", "by", "o"),
+    ("ビュ", "by", "u"),
+    ("ビャ", "by", "a"),
+    ("ビェ", "by", "e"),
+    ("ビ", "b", "i"),
+    ("ヒョ", "hy", "o"),
+    ("ヒュ", "hy", "u"),
+    ("ヒャ", "hy", "a"),
+    ("ヒェ", "hy", "e"),
+    ("ヒ", "h", "i"),
+    ("パ", "p", "a"),
+    ("バ", "b", "a"),
+    ("ハ", "h", "a"),
+    ("ノ", "n", "o"),
+    ("ネ", "n", "e"),
+    ("ヌ", "n", "u"),
+    ("ニョ", "ny", "o"),
+    ("ニュ", "ny", "u"),
+    ("ニャ", "ny", "a"),
+    ("ニェ", "ny", "e"),
+    ("ニ", "n", "i"),
+    ("ナ", "n", "a"),
+    ("ドゥ", "d", "u"),
+    ("ド", "d", "o"),
+    ("トゥ", "t", "u"),
+    ("ト", "t", "o"),
+    ("デョ", "dy", "o"),
+    ("デュ", "dy", "u"),
+    ("デャ", "dy", "a"),
+    # ("デェ", "dy", "e"),
+    ("ディ", "d", "i"),
+    ("デ", "d", "e"),
+    ("テョ", "ty", "o"),
+    ("テュ", "ty", "u"),
+    ("テャ", "ty", "a"),
+    ("ティ", "t", "i"),
+    ("テ", "t", "e"),
+    ("ツォ", "ts", "o"),
+    ("ツェ", "ts", "e"),
+    ("ツィ", "ts", "i"),
+    ("ツァ", "ts", "a"),
+    ("ツ", "ts", "u"),
+    ("ッ", None, "q"),  # changed from "cl" to "q"
+    ("チョ", "ch", "o"),
+    ("チュ", "ch", "u"),
+    ("チャ", "ch", "a"),
+    ("チェ", "ch", "e"),
+    ("チ", "ch", "i"),
+    ("ダ", "d", "a"),
+    ("タ", "t", "a"),
+    ("ゾ", "z", "o"),
+    ("ソ", "s", "o"),
+    ("ゼ", "z", "e"),
+    ("セ", "s", "e"),
+    ("ズィ", "z", "i"),
+    ("ズ", "z", "u"),
+    ("スィ", "s", "i"),
+    ("ス", "s", "u"),
+    ("ジョ", "j", "o"),
+    ("ジュ", "j", "u"),
+    ("ジャ", "j", "a"),
+    ("ジェ", "j", "e"),
+    ("ジ", "j", "i"),
+    ("ショ", "sh", "o"),
+    ("シュ", "sh", "u"),
+    ("シャ", "sh", "a"),
+    ("シェ", "sh", "e"),
+    ("シ", "sh", "i"),
+    ("ザ", "z", "a"),
+    ("サ", "s", "a"),
+    ("ゴ", "g", "o"),
+    ("コ", "k", "o"),
+    ("ゲ", "g", "e"),
+    ("ケ", "k", "e"),
+    ("グヮ", "gw", "a"),
+    ("グ", "g", "u"),
+    ("クヮ", "kw", "a"),
+    ("ク", "k", "u"),
+    ("ギョ", "gy", "o"),
+    ("ギュ", "gy", "u"),
+    ("ギャ", "gy", "a"),
+    ("ギェ", "gy", "e"),
+    ("ギ", "g", "i"),
+    ("キョ", "ky", "o"),
+    ("キュ", "ky", "u"),
+    ("キャ", "ky", "a"),
+    ("キェ", "ky", "e"),
+    ("キ", "k", "i"),
+    ("ガ", "g", "a"),
+    ("カ", "k", "a"),
+    ("オ", None, "o"),
+    ("エ", None, "e"),
+    ("ウォ", "w", "o"),
+    ("ウェ", "w", "e"),
+    ("ウィ", "w", "i"),
+    ("ウ", None, "u"),
+    ("イェ", "y", "e"),
+    ("イ", None, "i"),
+    ("ア", None, "a"),
+]
+__MORA_LIST_ADDITIONAL: list[tuple[str, str | None, str]] = [
+    ("ヴョ", "by", "o"),
+    ("ヴュ", "by", "u"),
+    ("ヴャ", "by", "a"),
+    ("ヲ", None, "o"),
+    ("ヱ", None, "e"),
+    ("ヰ", None, "i"),
+    ("ヮ", "w", "a"),
+    ("ョ", "y", "o"),
+    ("ュ", "y", "u"),
+    ("ヅ", "z", "u"),
+    ("ヂ", "j", "i"),
+    ("ヶ", "k", "e"),
+    ("ャ", "y", "a"),
+    ("ォ", None, "o"),
+    ("ェ", None, "e"),
+    ("ゥ", None, "u"),
+    ("ィ", None, "i"),
+    ("ァ", None, "a"),
+]
+
+
+def _to_records(moras: list[tuple[str, str | None, str]]) -> list[dict[str, str | None]]:
+    """Convert (mora, consonant, vowel) tuples into JSON-ready dicts."""
+    return [
+        {"mora": mora, "consonant": consonant, "vowel": vowel}
+        for mora, consonant, vowel in moras
+    ]
+
+
+data = {
+    "minimum": _to_records(__MORA_LIST_MINIMUM),
+    "additional": _to_records(__MORA_LIST_ADDITIONAL),
+}
+
+# Write UTF-8 explicitly: ensure_ascii=False emits raw katakana, and the platform default
+# encoding is not UTF-8 everywhere, while Rust's include_str! requires UTF-8.
+with open("src/mora_list.json", "w", encoding="utf-8") as f:
+    json.dump(data, f, ensure_ascii=False, indent=4)
\ No newline at end of file
diff --git a/sbv2_core/src/lib.rs b/sbv2_core/src/lib.rs
index 2b6d280..476b053 100644
--- a/sbv2_core/src/lib.rs
+++ b/sbv2_core/src/lib.rs
@@ -2,6 +2,7 @@ pub mod bert;
 pub mod error;
 pub mod norm;
 pub mod text;
+pub mod mora;
 
 pub fn add(left: usize, right: usize) -> usize {
     left + right
diff --git a/sbv2_core/src/mora.rs b/sbv2_core/src/mora.rs
new file mode 100644
index 0000000..6feb701
--- /dev/null
+++ b/sbv2_core/src/mora.rs
@@ -0,0 +1,33 @@
+use once_cell::sync::Lazy;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+/// One katakana mora and the phonemes it maps to.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Mora {
+    pub mora: String,
+    pub consonant: Option<String>,
+    pub vowel: String,
+}
+
+/// Top-level layout of `mora_list.json`.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct MoraFile {
+    pub minimum: Vec<Mora>,
+    pub additional: Vec<Mora>,
+}
+
+/// Maps a katakana mora to its `(consonant, vowel)` phonemes.
+///
+/// The embedded JSON is parsed once. `minimum` entries are inserted first, then `additional`,
+/// so an `additional` entry would replace a `minimum` entry with the same key.
+pub static MORA_KATA_TO_MORA_PHONEMES: Lazy<HashMap<String, (Option<String>, String)>> =
+    Lazy::new(|| {
+        let data: MoraFile = serde_json::from_str(include_str!("./mora_list.json"))
+            .expect("embedded mora_list.json must match MoraFile");
+        data.minimum
+            .into_iter()
+            .chain(data.additional)
+            .map(|entry| (entry.mora, (entry.consonant, entry.vowel)))
+            .collect()
+    });
\ No newline at end of file
diff --git a/sbv2_core/src/mora_list.json b/sbv2_core/src/mora_list.json
new file mode 100644
index 0000000..7cd2308
--- /dev/null
+++ b/sbv2_core/src/mora_list.json
@@ -0,0 +1,816 @@
+{
+    "minimum": [
+        {
+            "mora": "ヴォ",
+            "consonant": "v",
+            "vowel": "o"
+        },
+        {
+            "mora": "ヴェ",
+            "consonant": "v",
+            "vowel": "e"
+        },
+        {
+            "mora": "ヴィ",
+            "consonant": "v",
+            "vowel": "i"
+        },
+        {
+            "mora": "ヴァ",
+            "consonant": "v",
+            "vowel": "a"
+        },
+        {
+            "mora": "ヴ",
+            "consonant": "v",
+            "vowel": "u"
+        },
+        {
+            "mora": "ン",
+            "consonant": null,
+            "vowel": "N"
+        },
+        {
+            "mora": "ワ",
+            "consonant": "w",
+            "vowel": "a"
+        },
+        {
+
"mora": "ロ", + "consonant": "r", + "vowel": "o" + }, + { + "mora": "レ", + "consonant": "r", + "vowel": "e" + }, + { + "mora": "ル", + "consonant": "r", + "vowel": "u" + }, + { + "mora": "リョ", + "consonant": "ry", + "vowel": "o" + }, + { + "mora": "リュ", + "consonant": "ry", + "vowel": "u" + }, + { + "mora": "リャ", + "consonant": "ry", + "vowel": "a" + }, + { + "mora": "リェ", + "consonant": "ry", + "vowel": "e" + }, + { + "mora": "リ", + "consonant": "r", + "vowel": "i" + }, + { + "mora": "ラ", + "consonant": "r", + "vowel": "a" + }, + { + "mora": "ヨ", + "consonant": "y", + "vowel": "o" + }, + { + "mora": "ユ", + "consonant": "y", + "vowel": "u" + }, + { + "mora": "ヤ", + "consonant": "y", + "vowel": "a" + }, + { + "mora": "モ", + "consonant": "m", + "vowel": "o" + }, + { + "mora": "メ", + "consonant": "m", + "vowel": "e" + }, + { + "mora": "ム", + "consonant": "m", + "vowel": "u" + }, + { + "mora": "ミョ", + "consonant": "my", + "vowel": "o" + }, + { + "mora": "ミュ", + "consonant": "my", + "vowel": "u" + }, + { + "mora": "ミャ", + "consonant": "my", + "vowel": "a" + }, + { + "mora": "ミェ", + "consonant": "my", + "vowel": "e" + }, + { + "mora": "ミ", + "consonant": "m", + "vowel": "i" + }, + { + "mora": "マ", + "consonant": "m", + "vowel": "a" + }, + { + "mora": "ポ", + "consonant": "p", + "vowel": "o" + }, + { + "mora": "ボ", + "consonant": "b", + "vowel": "o" + }, + { + "mora": "ホ", + "consonant": "h", + "vowel": "o" + }, + { + "mora": "ペ", + "consonant": "p", + "vowel": "e" + }, + { + "mora": "ベ", + "consonant": "b", + "vowel": "e" + }, + { + "mora": "ヘ", + "consonant": "h", + "vowel": "e" + }, + { + "mora": "プ", + "consonant": "p", + "vowel": "u" + }, + { + "mora": "ブ", + "consonant": "b", + "vowel": "u" + }, + { + "mora": "フォ", + "consonant": "f", + "vowel": "o" + }, + { + "mora": "フェ", + "consonant": "f", + "vowel": "e" + }, + { + "mora": "フィ", + "consonant": "f", + "vowel": "i" + }, + { + "mora": "ファ", + "consonant": "f", + "vowel": "a" + }, + { + "mora": "フ", + "consonant": "f", 
+ "vowel": "u" + }, + { + "mora": "ピョ", + "consonant": "py", + "vowel": "o" + }, + { + "mora": "ピュ", + "consonant": "py", + "vowel": "u" + }, + { + "mora": "ピャ", + "consonant": "py", + "vowel": "a" + }, + { + "mora": "ピェ", + "consonant": "py", + "vowel": "e" + }, + { + "mora": "ピ", + "consonant": "p", + "vowel": "i" + }, + { + "mora": "ビョ", + "consonant": "by", + "vowel": "o" + }, + { + "mora": "ビュ", + "consonant": "by", + "vowel": "u" + }, + { + "mora": "ビャ", + "consonant": "by", + "vowel": "a" + }, + { + "mora": "ビェ", + "consonant": "by", + "vowel": "e" + }, + { + "mora": "ビ", + "consonant": "b", + "vowel": "i" + }, + { + "mora": "ヒョ", + "consonant": "hy", + "vowel": "o" + }, + { + "mora": "ヒュ", + "consonant": "hy", + "vowel": "u" + }, + { + "mora": "ヒャ", + "consonant": "hy", + "vowel": "a" + }, + { + "mora": "ヒェ", + "consonant": "hy", + "vowel": "e" + }, + { + "mora": "ヒ", + "consonant": "h", + "vowel": "i" + }, + { + "mora": "パ", + "consonant": "p", + "vowel": "a" + }, + { + "mora": "バ", + "consonant": "b", + "vowel": "a" + }, + { + "mora": "ハ", + "consonant": "h", + "vowel": "a" + }, + { + "mora": "ノ", + "consonant": "n", + "vowel": "o" + }, + { + "mora": "ネ", + "consonant": "n", + "vowel": "e" + }, + { + "mora": "ヌ", + "consonant": "n", + "vowel": "u" + }, + { + "mora": "ニョ", + "consonant": "ny", + "vowel": "o" + }, + { + "mora": "ニュ", + "consonant": "ny", + "vowel": "u" + }, + { + "mora": "ニャ", + "consonant": "ny", + "vowel": "a" + }, + { + "mora": "ニェ", + "consonant": "ny", + "vowel": "e" + }, + { + "mora": "ニ", + "consonant": "n", + "vowel": "i" + }, + { + "mora": "ナ", + "consonant": "n", + "vowel": "a" + }, + { + "mora": "ドゥ", + "consonant": "d", + "vowel": "u" + }, + { + "mora": "ド", + "consonant": "d", + "vowel": "o" + }, + { + "mora": "トゥ", + "consonant": "t", + "vowel": "u" + }, + { + "mora": "ト", + "consonant": "t", + "vowel": "o" + }, + { + "mora": "デョ", + "consonant": "dy", + "vowel": "o" + }, + { + "mora": "デュ", + "consonant": "dy", + "vowel": "u" 
+ }, + { + "mora": "デャ", + "consonant": "dy", + "vowel": "a" + }, + { + "mora": "ディ", + "consonant": "d", + "vowel": "i" + }, + { + "mora": "デ", + "consonant": "d", + "vowel": "e" + }, + { + "mora": "テョ", + "consonant": "ty", + "vowel": "o" + }, + { + "mora": "テュ", + "consonant": "ty", + "vowel": "u" + }, + { + "mora": "テャ", + "consonant": "ty", + "vowel": "a" + }, + { + "mora": "ティ", + "consonant": "t", + "vowel": "i" + }, + { + "mora": "テ", + "consonant": "t", + "vowel": "e" + }, + { + "mora": "ツォ", + "consonant": "ts", + "vowel": "o" + }, + { + "mora": "ツェ", + "consonant": "ts", + "vowel": "e" + }, + { + "mora": "ツィ", + "consonant": "ts", + "vowel": "i" + }, + { + "mora": "ツァ", + "consonant": "ts", + "vowel": "a" + }, + { + "mora": "ツ", + "consonant": "ts", + "vowel": "u" + }, + { + "mora": "ッ", + "consonant": null, + "vowel": "q" + }, + { + "mora": "チョ", + "consonant": "ch", + "vowel": "o" + }, + { + "mora": "チュ", + "consonant": "ch", + "vowel": "u" + }, + { + "mora": "チャ", + "consonant": "ch", + "vowel": "a" + }, + { + "mora": "チェ", + "consonant": "ch", + "vowel": "e" + }, + { + "mora": "チ", + "consonant": "ch", + "vowel": "i" + }, + { + "mora": "ダ", + "consonant": "d", + "vowel": "a" + }, + { + "mora": "タ", + "consonant": "t", + "vowel": "a" + }, + { + "mora": "ゾ", + "consonant": "z", + "vowel": "o" + }, + { + "mora": "ソ", + "consonant": "s", + "vowel": "o" + }, + { + "mora": "ゼ", + "consonant": "z", + "vowel": "e" + }, + { + "mora": "セ", + "consonant": "s", + "vowel": "e" + }, + { + "mora": "ズィ", + "consonant": "z", + "vowel": "i" + }, + { + "mora": "ズ", + "consonant": "z", + "vowel": "u" + }, + { + "mora": "スィ", + "consonant": "s", + "vowel": "i" + }, + { + "mora": "ス", + "consonant": "s", + "vowel": "u" + }, + { + "mora": "ジョ", + "consonant": "j", + "vowel": "o" + }, + { + "mora": "ジュ", + "consonant": "j", + "vowel": "u" + }, + { + "mora": "ジャ", + "consonant": "j", + "vowel": "a" + }, + { + "mora": "ジェ", + "consonant": "j", + "vowel": "e" + }, + { + 
"mora": "ジ", + "consonant": "j", + "vowel": "i" + }, + { + "mora": "ショ", + "consonant": "sh", + "vowel": "o" + }, + { + "mora": "シュ", + "consonant": "sh", + "vowel": "u" + }, + { + "mora": "シャ", + "consonant": "sh", + "vowel": "a" + }, + { + "mora": "シェ", + "consonant": "sh", + "vowel": "e" + }, + { + "mora": "シ", + "consonant": "sh", + "vowel": "i" + }, + { + "mora": "ザ", + "consonant": "z", + "vowel": "a" + }, + { + "mora": "サ", + "consonant": "s", + "vowel": "a" + }, + { + "mora": "ゴ", + "consonant": "g", + "vowel": "o" + }, + { + "mora": "コ", + "consonant": "k", + "vowel": "o" + }, + { + "mora": "ゲ", + "consonant": "g", + "vowel": "e" + }, + { + "mora": "ケ", + "consonant": "k", + "vowel": "e" + }, + { + "mora": "グヮ", + "consonant": "gw", + "vowel": "a" + }, + { + "mora": "グ", + "consonant": "g", + "vowel": "u" + }, + { + "mora": "クヮ", + "consonant": "kw", + "vowel": "a" + }, + { + "mora": "ク", + "consonant": "k", + "vowel": "u" + }, + { + "mora": "ギョ", + "consonant": "gy", + "vowel": "o" + }, + { + "mora": "ギュ", + "consonant": "gy", + "vowel": "u" + }, + { + "mora": "ギャ", + "consonant": "gy", + "vowel": "a" + }, + { + "mora": "ギェ", + "consonant": "gy", + "vowel": "e" + }, + { + "mora": "ギ", + "consonant": "g", + "vowel": "i" + }, + { + "mora": "キョ", + "consonant": "ky", + "vowel": "o" + }, + { + "mora": "キュ", + "consonant": "ky", + "vowel": "u" + }, + { + "mora": "キャ", + "consonant": "ky", + "vowel": "a" + }, + { + "mora": "キェ", + "consonant": "ky", + "vowel": "e" + }, + { + "mora": "キ", + "consonant": "k", + "vowel": "i" + }, + { + "mora": "ガ", + "consonant": "g", + "vowel": "a" + }, + { + "mora": "カ", + "consonant": "k", + "vowel": "a" + }, + { + "mora": "オ", + "consonant": null, + "vowel": "o" + }, + { + "mora": "エ", + "consonant": null, + "vowel": "e" + }, + { + "mora": "ウォ", + "consonant": "w", + "vowel": "o" + }, + { + "mora": "ウェ", + "consonant": "w", + "vowel": "e" + }, + { + "mora": "ウィ", + "consonant": "w", + "vowel": "i" + }, + { + "mora": "ウ", + 
"consonant": null, + "vowel": "u" + }, + { + "mora": "イェ", + "consonant": "y", + "vowel": "e" + }, + { + "mora": "イ", + "consonant": null, + "vowel": "i" + }, + { + "mora": "ア", + "consonant": null, + "vowel": "a" + } + ], + "additional": [ + { + "mora": "ヴョ", + "consonant": "by", + "vowel": "o" + }, + { + "mora": "ヴュ", + "consonant": "by", + "vowel": "u" + }, + { + "mora": "ヴャ", + "consonant": "by", + "vowel": "a" + }, + { + "mora": "ヲ", + "consonant": null, + "vowel": "o" + }, + { + "mora": "ヱ", + "consonant": null, + "vowel": "e" + }, + { + "mora": "ヰ", + "consonant": null, + "vowel": "i" + }, + { + "mora": "ヮ", + "consonant": "w", + "vowel": "a" + }, + { + "mora": "ョ", + "consonant": "y", + "vowel": "o" + }, + { + "mora": "ュ", + "consonant": "y", + "vowel": "u" + }, + { + "mora": "ヅ", + "consonant": "z", + "vowel": "u" + }, + { + "mora": "ヂ", + "consonant": "j", + "vowel": "i" + }, + { + "mora": "ヶ", + "consonant": "k", + "vowel": "e" + }, + { + "mora": "ャ", + "consonant": "y", + "vowel": "a" + }, + { + "mora": "ォ", + "consonant": null, + "vowel": "o" + }, + { + "mora": "ェ", + "consonant": null, + "vowel": "e" + }, + { + "mora": "ゥ", + "consonant": null, + "vowel": "u" + }, + { + "mora": "ィ", + "consonant": null, + "vowel": "i" + }, + { + "mora": "ァ", + "consonant": null, + "vowel": "a" + } + ] +} \ No newline at end of file diff --git a/sbv2_core/src/text.rs b/sbv2_core/src/text.rs index 0abed04..47fbf0b 100644 --- a/sbv2_core/src/text.rs +++ b/sbv2_core/src/text.rs @@ -1,5 +1,6 @@ use crate::error::{Error, Result}; use crate::norm::{replace_punctuation, PUNCTUATIONS}; +use crate::mora::MORA_KATA_TO_MORA_PHONEMES; use jpreprocess::*; use once_cell::sync::Lazy; use regex::Regex; @@ -61,6 +62,14 @@ impl JTalk { } } +static KATAKANA_PATTERN: Lazy = Lazy::new(|| Regex::new(r"[\u30A0-\u30FF]+").unwrap()); +static MORA_PATTERN: Lazy> = Lazy::new(|| { + let mut sorted_keys: Vec = MORA_KATA_TO_MORA_PHONEMES.keys().cloned().collect(); + sorted_keys.sort_by(|a, b| 
b.len().cmp(&a.len())); + sorted_keys +}); +static LONG_PATTERN: Lazy = Lazy::new(|| Regex::new(r"(\w)(ー*)").unwrap()); + struct JTalkProcess { jpreprocess: Arc, parsed: Vec, @@ -104,9 +113,80 @@ impl JTalkProcess { pub fn g2p(&self) -> Result<()> { let phone_tone_list_wo_punct = self.g2phone_tone_wo_punct()?; let (seq_text, seq_kata) = self.text_to_seq_kata()?; + println!("{:?}", seq_text); + println!("{:?}", seq_kata); Ok(()) } + fn handle_long(sep_phonemes: Vec>) -> Vec> { + for (i, phonemes) in sep_phonemes.iter().enumerate() { + if phonemes.len() == 0 { + continue; + } + if phonemes[0] == "ー" { + if i != 0 { + let prev_phoneme = sep_phonemes[i - 1].last().unwrap(); + } + } + } + vec![] + } + + fn kata_to_phoneme_list(mut text: String) -> Result> { + /* + if set(text).issubset(set(PUNCTUATIONS)): + return list(text) + # `text` がカタカナ(`ー`含む)のみからなるかどうかをチェック + if __KATAKANA_PATTERN.fullmatch(text) is None: + raise ValueError(f"Input must be katakana only: {text}") + + def mora2phonemes(mora: str) -> str: + consonant, vowel = MORA_KATA_TO_MORA_PHONEMES[mora] + if consonant is None: + return f" {vowel}" + return f" {consonant} {vowel}" + + spaced_phonemes = __MORA_PATTERN.sub(lambda m: mora2phonemes(m.group()), text) + + # 長音記号「ー」の処理 + long_replacement = lambda m: m.group(1) + (" " + m.group(1)) * len(m.group(2)) # type: ignore + spaced_phonemes = __LONG_PATTERN.sub(long_replacement, spaced_phonemes) + + return spaced_phonemes.strip().split(" ") + */ + if PUNCTUATIONS.contains(&text.as_str()) { + return Ok(text.chars().map(|x| x.to_string()).collect()); + } + if KATAKANA_PATTERN.is_match(&text) { + return Err(Error::ValueError(format!("Input must be katakana only: {}", text))); + } + + fn mora2phonemes(mora: &str) -> String { + let (consonant, vowel) = MORA_KATA_TO_MORA_PHONEMES.get(mora).unwrap(); + if consonant.is_none() { + return format!(" {}", vowel); + } + format!(" {} {}", consonant.as_ref().unwrap(), vowel) + } + + for mora in MORA_PATTERN.iter() { + let mora 
= mora.to_string(); + let phonemes = mora2phonemes(&mora); + text = text.replace(&mora, &phonemes); + } + + let long_replacement = |m: ®ex::Captures| { + let mut result = m.get(1).unwrap().as_str().to_string(); + for _ in 0..m.get(2).unwrap().as_str().len() { + result += &format!(" {}", m.get(1).unwrap().as_str()); + } + result + }; + text = LONG_PATTERN.replace_all(&text, long_replacement).to_string(); + + return Ok(text.trim().split(" ").map(|x| x.to_string()).collect()); + } + fn text_to_seq_kata(&self) -> Result<(Vec, Vec)> { let mut seq_kata = vec![]; let mut seq_text = vec![];