From 596eec654df57020f3f8eee3672ba8c2cd42d304 Mon Sep 17 00:00:00 2001 From: Googlefan Date: Mon, 30 Sep 2024 08:04:37 +0000 Subject: [PATCH] feat: sbv2 wasm --- .gitignore | 3 +- Cargo.lock | 23 ++ sbv2_core/src/tts.rs | 2 +- sbv2_core/src/tts_util.rs | 82 +++++- sbv2_wasm/Cargo.toml | 1 + sbv2_wasm/README.md | 2 + sbv2_wasm/biome.json | 31 +++ sbv2_wasm/build.sh | 4 +- sbv2_wasm/example.js | 11 + sbv2_wasm/package.json | 29 ++ sbv2_wasm/pnpm-lock.yaml | 493 ++++++++++++++++++++++++++++++++++ sbv2_wasm/src-js/index.ts | 106 ++++++++ sbv2_wasm/src/array_helper.rs | 102 +++++++ sbv2_wasm/src/lib.rs | 105 +++++--- sbv2_wasm/tsconfig.json | 15 ++ 15 files changed, 961 insertions(+), 48 deletions(-) create mode 100644 sbv2_wasm/README.md create mode 100644 sbv2_wasm/biome.json mode change 100644 => 100755 sbv2_wasm/build.sh create mode 100644 sbv2_wasm/example.js create mode 100644 sbv2_wasm/package.json create mode 100644 sbv2_wasm/pnpm-lock.yaml create mode 100644 sbv2_wasm/src-js/index.ts create mode 100644 sbv2_wasm/src/array_helper.rs create mode 100644 sbv2_wasm/tsconfig.json diff --git a/.gitignore b/.gitignore index 15193c7..1b8d2ff 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ models/ venv/ .env output.wav -node_modules \ No newline at end of file +node_modules +dist/ \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 27c2650..a8be14b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1967,6 +1967,7 @@ dependencies = [ "once_cell", "sbv2_core", "wasm-bindgen", + "wasm-bindgen-futures", ] [[package]] @@ -2524,6 +2525,18 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61e9300f63a621e96ed275155c108eb6f843b6a26d053f122ab69724559dc8ed" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.93" @@ -2553,6 +2566,16 @@ version = 
"0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" +[[package]] +name = "web-sys" +version = "0.3.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "webpki-roots" version = "0.26.5" diff --git a/sbv2_core/src/tts.rs b/sbv2_core/src/tts.rs index 4e1eabb..29c4da6 100644 --- a/sbv2_core/src/tts.rs +++ b/sbv2_core/src/tts.rs @@ -128,7 +128,7 @@ impl TTSModelHolder { &self, text: &str, ) -> Result<(Array2, Array1, Array1, Array1)> { - crate::tts_util::parse_text( + crate::tts_util::parse_text_blocking( text, &self.jtalk, &self.tokenizer, diff --git a/sbv2_core/src/tts_util.rs b/sbv2_core/src/tts_util.rs index eee5474..24b059a 100644 --- a/sbv2_core/src/tts_util.rs +++ b/sbv2_core/src/tts_util.rs @@ -10,7 +10,87 @@ use tokenizers::Tokenizer; /// # Note /// This function is for low-level usage, use `easy_synthesize` for high-level usage. 
#[allow(clippy::type_complexity)] -pub fn parse_text( +pub async fn parse_text( + text: &str, + jtalk: &jtalk::JTalk, + tokenizer: &Tokenizer, + bert_predict: impl FnOnce( + Vec, + Vec, + ) -> std::pin::Pin< + Box>>>, + >, +) -> Result<(Array2, Array1, Array1, Array1)> { + let text = jtalk.num2word(text)?; + let normalized_text = norm::normalize_text(&text); + + let process = jtalk.process_text(&normalized_text)?; + let (phones, tones, mut word2ph) = process.g2p()?; + let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones); + + let phones = utils::intersperse(&phones, 0); + let tones = utils::intersperse(&tones, 0); + let lang_ids = utils::intersperse(&lang_ids, 0); + for item in &mut word2ph { + *item *= 2; + } + word2ph[0] += 1; + + let text = { + let (seq_text, _) = process.text_to_seq_kata()?; + seq_text.join("") + }; + let (token_ids, attention_masks) = tokenizer::tokenize(&text, tokenizer)?; + + let bert_content = bert_predict(token_ids, attention_masks).await?; + + assert!( + word2ph.len() == text.chars().count() + 2, + "{} {}", + word2ph.len(), + normalized_text.chars().count() + ); + + let mut phone_level_feature = vec![]; + for (i, reps) in word2ph.iter().enumerate() { + let repeat_feature = { + let (reps_rows, reps_cols) = (*reps, 1); + let arr_len = bert_content.slice(s![i, ..]).len(); + + let mut results: Array2 = Array::zeros((reps_rows as usize, arr_len * reps_cols)); + + for j in 0..reps_rows { + for k in 0..reps_cols { + let mut view = results.slice_mut(s![j, k * arr_len..(k + 1) * arr_len]); + view.assign(&bert_content.slice(s![i, ..])); + } + } + results + }; + phone_level_feature.push(repeat_feature); + } + let phone_level_feature = concatenate( + Axis(0), + &phone_level_feature + .iter() + .map(|x| x.view()) + .collect::>(), + )?; + let bert_ori = phone_level_feature.t(); + Ok(( + bert_ori.to_owned(), + phones.into(), + tones.into(), + lang_ids.into(), + )) +} + +/// Parse text and return the input for synthesize +/// +/// 
# Note +/// This function is for low-level usage, use `easy_synthesize` for high-level usage. +#[allow(clippy::type_complexity)] +pub fn parse_text_blocking( text: &str, jtalk: &jtalk::JTalk, tokenizer: &Tokenizer, diff --git a/sbv2_wasm/Cargo.toml b/sbv2_wasm/Cargo.toml index a44af89..4930456 100644 --- a/sbv2_wasm/Cargo.toml +++ b/sbv2_wasm/Cargo.toml @@ -12,6 +12,7 @@ sbv2_core = { path = "../sbv2_core", default-features = false, features = ["no_s once_cell.workspace = true js-sys = "0.3.70" ndarray.workspace = true +wasm-bindgen-futures = "0.4.43" [profile.release] lto = true diff --git a/sbv2_wasm/README.md b/sbv2_wasm/README.md new file mode 100644 index 0000000..f647cda --- /dev/null +++ b/sbv2_wasm/README.md @@ -0,0 +1,2 @@ +# StyleBertVITS2 wasm +refer to https://github.com/tuna2134/sbv2-api \ No newline at end of file diff --git a/sbv2_wasm/biome.json b/sbv2_wasm/biome.json new file mode 100644 index 0000000..8dbca02 --- /dev/null +++ b/sbv2_wasm/biome.json @@ -0,0 +1,31 @@ +{ + "$schema": "https://biomejs.dev/schemas/1.9.2/schema.json", + "vcs": { + "enabled": false, + "clientKind": "git", + "useIgnoreFile": false + }, + "files": { + "ignoreUnknown": false, + "ignore": [] + }, + "formatter": { + "enabled": true, + "indentStyle": "tab", + "ignore": ["dist/", "pkg/"] + }, + "organizeImports": { + "enabled": true + }, + "linter": { + "enabled": true, + "rules": { + "recommended": true + } + }, + "javascript": { + "formatter": { + "quoteStyle": "double" + } + } +} diff --git a/sbv2_wasm/build.sh b/sbv2_wasm/build.sh old mode 100644 new mode 100755 index 2240af3..1d23f47 --- a/sbv2_wasm/build.sh +++ b/sbv2_wasm/build.sh @@ -1,2 +1,4 @@ wasm-pack build --target web sbv2_wasm -wasm-opt -O3 -o sbv2_wasm/pkg/sbv2_wasm_bg.wasm sbv2_wasm/pkg/sbv2_wasm_bg.wasm \ No newline at end of file +wasm-opt -O3 -o sbv2_wasm/pkg/sbv2_wasm_bg.wasm sbv2_wasm/pkg/sbv2_wasm_bg.wasm +mkdir -p sbv2_wasm/dist +cp sbv2_wasm/pkg/sbv2_wasm_bg.wasm 
sbv2_wasm/dist/sbv2_wasm_bg.wasm \ No newline at end of file diff --git a/sbv2_wasm/example.js b/sbv2_wasm/example.js new file mode 100644 index 0000000..1c4fdc6 --- /dev/null +++ b/sbv2_wasm/example.js @@ -0,0 +1,11 @@ +import { ModelHolder } from "./dist/index.js"; +import fs from "node:fs/promises"; + +await ModelHolder.globalInit(await fs.readFile("./dist/sbv2_wasm_bg.wasm")); +const holder = await ModelHolder.create( + (await fs.readFile("../models/tokenizer.json")).toString("utf-8"), + await fs.readFile("../models/deberta.onnx"), +); +await holder.load("tsukuyomi", await fs.readFile("../models/iroha2.sbv2")); +await fs.writeFile("out.wav", await holder.synthesize("tsukuyomi", "おはよう")); +holder.unload("tsukuyomi"); diff --git a/sbv2_wasm/package.json b/sbv2_wasm/package.json new file mode 100644 index 0000000..51e1ac6 --- /dev/null +++ b/sbv2_wasm/package.json @@ -0,0 +1,29 @@ +{ + "name": "sbv2", + "version": "0.1.0", + "description": "Style Bert VITS2 wasm", + "main": "dist/index.js", + "types": "dist/index.d.ts", + "type": "module", + "scripts": { + "build": "tsc && esbuild src-js/index.ts --outfile=dist/index.js --minify --format=esm --bundle --external:onnxruntime-web", + "format": "biome format --write ." 
+ }, + "keywords": [], + "author": "tuna2134", + "license": "MIT", + "devDependencies": { + "@biomejs/biome": "^1.9.2", + "@types/node": "^22.7.4", + "esbuild": "^0.24.0", + "typescript": "^5.6.2" + }, + "dependencies": { + "onnxruntime-web": "^1.19.2" + }, + "files": [ + "dist/*", + "package.json", + "README.md" + ] +} diff --git a/sbv2_wasm/pnpm-lock.yaml b/sbv2_wasm/pnpm-lock.yaml new file mode 100644 index 0000000..c68b448 --- /dev/null +++ b/sbv2_wasm/pnpm-lock.yaml @@ -0,0 +1,493 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + devDependencies: + '@biomejs/biome': + specifier: ^1.9.2 + version: 1.9.2 + '@types/node': + specifier: ^22.7.4 + version: 22.7.4 + esbuild: + specifier: ^0.24.0 + version: 0.24.0 + onnxruntime-web: + specifier: ^1.19.2 + version: 1.19.2 + typescript: + specifier: ^5.6.2 + version: 5.6.2 + +packages: + + '@biomejs/biome@1.9.2': + resolution: {integrity: sha512-4j2Gfwft8Jqp1X0qLYvK4TEy4xhTo4o6rlvJPsjPeEame8gsmbGQfOPBkw7ur+7/Z/f0HZmCZKqbMvR7vTXQYQ==} + engines: {node: '>=14.21.3'} + hasBin: true + + '@biomejs/cli-darwin-arm64@1.9.2': + resolution: {integrity: sha512-rbs9uJHFmhqB3Td0Ro+1wmeZOHhAPTL3WHr8NtaVczUmDhXkRDWScaxicG9+vhSLj1iLrW47itiK6xiIJy6vaA==} + engines: {node: '>=14.21.3'} + cpu: [arm64] + os: [darwin] + + '@biomejs/cli-darwin-x64@1.9.2': + resolution: {integrity: sha512-BlfULKijNaMigQ9GH9fqJVt+3JTDOSiZeWOQtG/1S1sa8Lp046JHG3wRJVOvekTPL9q/CNFW1NVG8J0JN+L1OA==} + engines: {node: '>=14.21.3'} + cpu: [x64] + os: [darwin] + + '@biomejs/cli-linux-arm64-musl@1.9.2': + resolution: {integrity: sha512-ZATvbUWhNxegSALUnCKWqetTZqrK72r2RsFD19OK5jXDj/7o1hzI1KzDNG78LloZxftrwr3uI9SqCLh06shSZw==} + engines: {node: '>=14.21.3'} + cpu: [arm64] + os: [linux] + + '@biomejs/cli-linux-arm64@1.9.2': + resolution: {integrity: sha512-T8TJuSxuBDeQCQzxZu2o3OU4eyLumTofhCxxFd3+aH2AEWVMnH7Z/c3QP1lHI5RRMBP9xIJeMORqDQ5j+gVZzw==} + engines: {node: '>=14.21.3'} + cpu: [arm64] + os: 
[linux] + + '@biomejs/cli-linux-x64-musl@1.9.2': + resolution: {integrity: sha512-CjPM6jT1miV5pry9C7qv8YJk0FIZvZd86QRD3atvDgfgeh9WQU0k2Aoo0xUcPdTnoz0WNwRtDicHxwik63MmSg==} + engines: {node: '>=14.21.3'} + cpu: [x64] + os: [linux] + + '@biomejs/cli-linux-x64@1.9.2': + resolution: {integrity: sha512-T0cPk3C3Jr2pVlsuQVTBqk2qPjTm8cYcTD9p/wmR9MeVqui1C/xTVfOIwd3miRODFMrJaVQ8MYSXnVIhV9jTjg==} + engines: {node: '>=14.21.3'} + cpu: [x64] + os: [linux] + + '@biomejs/cli-win32-arm64@1.9.2': + resolution: {integrity: sha512-2x7gSty75bNIeD23ZRPXyox6Z/V0M71ObeJtvQBhi1fgrvPdtkEuw7/0wEHg6buNCubzOFuN9WYJm6FKoUHfhg==} + engines: {node: '>=14.21.3'} + cpu: [arm64] + os: [win32] + + '@biomejs/cli-win32-x64@1.9.2': + resolution: {integrity: sha512-JC3XvdYcjmu1FmAehVwVV0SebLpeNTnO2ZaMdGCSOdS7f8O9Fq14T2P1gTG1Q29Q8Dt1S03hh0IdVpIZykOL8g==} + engines: {node: '>=14.21.3'} + cpu: [x64] + os: [win32] + + '@esbuild/aix-ppc64@0.24.0': + resolution: {integrity: sha512-WtKdFM7ls47zkKHFVzMz8opM7LkcsIp9amDUBIAWirg70RM71WRSjdILPsY5Uv1D42ZpUfaPILDlfactHgsRkw==} + engines: {node: '>=18'} + cpu: [ppc64] + os: [aix] + + '@esbuild/android-arm64@0.24.0': + resolution: {integrity: sha512-Vsm497xFM7tTIPYK9bNTYJyF/lsP590Qc1WxJdlB6ljCbdZKU9SY8i7+Iin4kyhV/KV5J2rOKsBQbB77Ab7L/w==} + engines: {node: '>=18'} + cpu: [arm64] + os: [android] + + '@esbuild/android-arm@0.24.0': + resolution: {integrity: sha512-arAtTPo76fJ/ICkXWetLCc9EwEHKaeya4vMrReVlEIUCAUncH7M4bhMQ+M9Vf+FFOZJdTNMXNBrWwW+OXWpSew==} + engines: {node: '>=18'} + cpu: [arm] + os: [android] + + '@esbuild/android-x64@0.24.0': + resolution: {integrity: sha512-t8GrvnFkiIY7pa7mMgJd7p8p8qqYIz1NYiAoKc75Zyv73L3DZW++oYMSHPRarcotTKuSs6m3hTOa5CKHaS02TQ==} + engines: {node: '>=18'} + cpu: [x64] + os: [android] + + '@esbuild/darwin-arm64@0.24.0': + resolution: {integrity: sha512-CKyDpRbK1hXwv79soeTJNHb5EiG6ct3efd/FTPdzOWdbZZfGhpbcqIpiD0+vwmpu0wTIL97ZRPZu8vUt46nBSw==} + engines: {node: '>=18'} + cpu: [arm64] + os: [darwin] + + '@esbuild/darwin-x64@0.24.0': + resolution: 
{integrity: sha512-rgtz6flkVkh58od4PwTRqxbKH9cOjaXCMZgWD905JOzjFKW+7EiUObfd/Kav+A6Gyud6WZk9w+xu6QLytdi2OA==} + engines: {node: '>=18'} + cpu: [x64] + os: [darwin] + + '@esbuild/freebsd-arm64@0.24.0': + resolution: {integrity: sha512-6Mtdq5nHggwfDNLAHkPlyLBpE5L6hwsuXZX8XNmHno9JuL2+bg2BX5tRkwjyfn6sKbxZTq68suOjgWqCicvPXA==} + engines: {node: '>=18'} + cpu: [arm64] + os: [freebsd] + + '@esbuild/freebsd-x64@0.24.0': + resolution: {integrity: sha512-D3H+xh3/zphoX8ck4S2RxKR6gHlHDXXzOf6f/9dbFt/NRBDIE33+cVa49Kil4WUjxMGW0ZIYBYtaGCa2+OsQwQ==} + engines: {node: '>=18'} + cpu: [x64] + os: [freebsd] + + '@esbuild/linux-arm64@0.24.0': + resolution: {integrity: sha512-TDijPXTOeE3eaMkRYpcy3LarIg13dS9wWHRdwYRnzlwlA370rNdZqbcp0WTyyV/k2zSxfko52+C7jU5F9Tfj1g==} + engines: {node: '>=18'} + cpu: [arm64] + os: [linux] + + '@esbuild/linux-arm@0.24.0': + resolution: {integrity: sha512-gJKIi2IjRo5G6Glxb8d3DzYXlxdEj2NlkixPsqePSZMhLudqPhtZ4BUrpIuTjJYXxvF9njql+vRjB2oaC9XpBw==} + engines: {node: '>=18'} + cpu: [arm] + os: [linux] + + '@esbuild/linux-ia32@0.24.0': + resolution: {integrity: sha512-K40ip1LAcA0byL05TbCQ4yJ4swvnbzHscRmUilrmP9Am7//0UjPreh4lpYzvThT2Quw66MhjG//20mrufm40mA==} + engines: {node: '>=18'} + cpu: [ia32] + os: [linux] + + '@esbuild/linux-loong64@0.24.0': + resolution: {integrity: sha512-0mswrYP/9ai+CU0BzBfPMZ8RVm3RGAN/lmOMgW4aFUSOQBjA31UP8Mr6DDhWSuMwj7jaWOT0p0WoZ6jeHhrD7g==} + engines: {node: '>=18'} + cpu: [loong64] + os: [linux] + + '@esbuild/linux-mips64el@0.24.0': + resolution: {integrity: sha512-hIKvXm0/3w/5+RDtCJeXqMZGkI2s4oMUGj3/jM0QzhgIASWrGO5/RlzAzm5nNh/awHE0A19h/CvHQe6FaBNrRA==} + engines: {node: '>=18'} + cpu: [mips64el] + os: [linux] + + '@esbuild/linux-ppc64@0.24.0': + resolution: {integrity: sha512-HcZh5BNq0aC52UoocJxaKORfFODWXZxtBaaZNuN3PUX3MoDsChsZqopzi5UupRhPHSEHotoiptqikjN/B77mYQ==} + engines: {node: '>=18'} + cpu: [ppc64] + os: [linux] + + '@esbuild/linux-riscv64@0.24.0': + resolution: {integrity: 
sha512-bEh7dMn/h3QxeR2KTy1DUszQjUrIHPZKyO6aN1X4BCnhfYhuQqedHaa5MxSQA/06j3GpiIlFGSsy1c7Gf9padw==} + engines: {node: '>=18'} + cpu: [riscv64] + os: [linux] + + '@esbuild/linux-s390x@0.24.0': + resolution: {integrity: sha512-ZcQ6+qRkw1UcZGPyrCiHHkmBaj9SiCD8Oqd556HldP+QlpUIe2Wgn3ehQGVoPOvZvtHm8HPx+bH20c9pvbkX3g==} + engines: {node: '>=18'} + cpu: [s390x] + os: [linux] + + '@esbuild/linux-x64@0.24.0': + resolution: {integrity: sha512-vbutsFqQ+foy3wSSbmjBXXIJ6PL3scghJoM8zCL142cGaZKAdCZHyf+Bpu/MmX9zT9Q0zFBVKb36Ma5Fzfa8xA==} + engines: {node: '>=18'} + cpu: [x64] + os: [linux] + + '@esbuild/netbsd-x64@0.24.0': + resolution: {integrity: sha512-hjQ0R/ulkO8fCYFsG0FZoH+pWgTTDreqpqY7UnQntnaKv95uP5iW3+dChxnx7C3trQQU40S+OgWhUVwCjVFLvg==} + engines: {node: '>=18'} + cpu: [x64] + os: [netbsd] + + '@esbuild/openbsd-arm64@0.24.0': + resolution: {integrity: sha512-MD9uzzkPQbYehwcN583yx3Tu5M8EIoTD+tUgKF982WYL9Pf5rKy9ltgD0eUgs8pvKnmizxjXZyLt0z6DC3rRXg==} + engines: {node: '>=18'} + cpu: [arm64] + os: [openbsd] + + '@esbuild/openbsd-x64@0.24.0': + resolution: {integrity: sha512-4ir0aY1NGUhIC1hdoCzr1+5b43mw99uNwVzhIq1OY3QcEwPDO3B7WNXBzaKY5Nsf1+N11i1eOfFcq+D/gOS15Q==} + engines: {node: '>=18'} + cpu: [x64] + os: [openbsd] + + '@esbuild/sunos-x64@0.24.0': + resolution: {integrity: sha512-jVzdzsbM5xrotH+W5f1s+JtUy1UWgjU0Cf4wMvffTB8m6wP5/kx0KiaLHlbJO+dMgtxKV8RQ/JvtlFcdZ1zCPA==} + engines: {node: '>=18'} + cpu: [x64] + os: [sunos] + + '@esbuild/win32-arm64@0.24.0': + resolution: {integrity: sha512-iKc8GAslzRpBytO2/aN3d2yb2z8XTVfNV0PjGlCxKo5SgWmNXx82I/Q3aG1tFfS+A2igVCY97TJ8tnYwpUWLCA==} + engines: {node: '>=18'} + cpu: [arm64] + os: [win32] + + '@esbuild/win32-ia32@0.24.0': + resolution: {integrity: sha512-vQW36KZolfIudCcTnaTpmLQ24Ha1RjygBo39/aLkM2kmjkWmZGEJ5Gn9l5/7tzXA42QGIoWbICfg6KLLkIw6yw==} + engines: {node: '>=18'} + cpu: [ia32] + os: [win32] + + '@esbuild/win32-x64@0.24.0': + resolution: {integrity: 
sha512-7IAFPrjSQIJrGsK6flwg7NFmwBoSTyF3rl7If0hNUFQU4ilTsEPL6GuMuU9BfIWVVGuRnuIidkSMC+c0Otu8IA==} + engines: {node: '>=18'} + cpu: [x64] + os: [win32] + + '@protobufjs/aspromise@1.1.2': + resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==} + + '@protobufjs/base64@1.1.2': + resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==} + + '@protobufjs/codegen@2.0.4': + resolution: {integrity: sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==} + + '@protobufjs/eventemitter@1.1.0': + resolution: {integrity: sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==} + + '@protobufjs/fetch@1.1.0': + resolution: {integrity: sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==} + + '@protobufjs/float@1.0.2': + resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==} + + '@protobufjs/inquire@1.1.0': + resolution: {integrity: sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==} + + '@protobufjs/path@1.1.2': + resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==} + + '@protobufjs/pool@1.1.0': + resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==} + + '@protobufjs/utf8@1.1.0': + resolution: {integrity: sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==} + + '@types/node@22.7.4': + resolution: {integrity: sha512-y+NPi1rFzDs1NdQHHToqeiX2TIS79SWEAw9GYhkkx8bD0ChpfqC+n2j5OXOCpzfojBEBt6DnEnnG9MY0zk1XLg==} + + esbuild@0.24.0: + resolution: {integrity: sha512-FuLPevChGDshgSicjisSooU0cemp/sGXR841D5LHMB7mTVOmsEHcAxaH3irL53+8YDIeVNQEySh4DaYU/iuPqQ==} + 
engines: {node: '>=18'} + hasBin: true + + flatbuffers@1.12.0: + resolution: {integrity: sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ==} + + guid-typescript@1.0.9: + resolution: {integrity: sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==} + + long@5.2.3: + resolution: {integrity: sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q==} + + onnxruntime-common@1.19.2: + resolution: {integrity: sha512-a4R7wYEVFbZBlp0BfhpbFWqe4opCor3KM+5Wm22Az3NGDcQMiU2hfG/0MfnBs+1ZrlSGmlgWeMcXQkDk1UFb8Q==} + + onnxruntime-web@1.19.2: + resolution: {integrity: sha512-r0ok6KpTUXR4WA+rHvUiZn7JoH02e8iS7XE1p5bXk7q3E0UaRFfYvpMNUHqEPiTBMuIssfBxDCQjUihV8dDFPg==} + + platform@1.3.6: + resolution: {integrity: sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==} + + protobufjs@7.4.0: + resolution: {integrity: sha512-mRUWCc3KUU4w1jU8sGxICXH/gNS94DvI1gxqDvBzhj1JpcsimQkYiOJfwsPUykUI5ZaspFbSgmBLER8IrQ3tqw==} + engines: {node: '>=12.0.0'} + + typescript@5.6.2: + resolution: {integrity: sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==} + engines: {node: '>=14.17'} + hasBin: true + + undici-types@6.19.8: + resolution: {integrity: sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==} + +snapshots: + + '@biomejs/biome@1.9.2': + optionalDependencies: + '@biomejs/cli-darwin-arm64': 1.9.2 + '@biomejs/cli-darwin-x64': 1.9.2 + '@biomejs/cli-linux-arm64': 1.9.2 + '@biomejs/cli-linux-arm64-musl': 1.9.2 + '@biomejs/cli-linux-x64': 1.9.2 + '@biomejs/cli-linux-x64-musl': 1.9.2 + '@biomejs/cli-win32-arm64': 1.9.2 + '@biomejs/cli-win32-x64': 1.9.2 + + '@biomejs/cli-darwin-arm64@1.9.2': + optional: true + + '@biomejs/cli-darwin-x64@1.9.2': + optional: true + + '@biomejs/cli-linux-arm64-musl@1.9.2': + optional: true + + 
'@biomejs/cli-linux-arm64@1.9.2': + optional: true + + '@biomejs/cli-linux-x64-musl@1.9.2': + optional: true + + '@biomejs/cli-linux-x64@1.9.2': + optional: true + + '@biomejs/cli-win32-arm64@1.9.2': + optional: true + + '@biomejs/cli-win32-x64@1.9.2': + optional: true + + '@esbuild/aix-ppc64@0.24.0': + optional: true + + '@esbuild/android-arm64@0.24.0': + optional: true + + '@esbuild/android-arm@0.24.0': + optional: true + + '@esbuild/android-x64@0.24.0': + optional: true + + '@esbuild/darwin-arm64@0.24.0': + optional: true + + '@esbuild/darwin-x64@0.24.0': + optional: true + + '@esbuild/freebsd-arm64@0.24.0': + optional: true + + '@esbuild/freebsd-x64@0.24.0': + optional: true + + '@esbuild/linux-arm64@0.24.0': + optional: true + + '@esbuild/linux-arm@0.24.0': + optional: true + + '@esbuild/linux-ia32@0.24.0': + optional: true + + '@esbuild/linux-loong64@0.24.0': + optional: true + + '@esbuild/linux-mips64el@0.24.0': + optional: true + + '@esbuild/linux-ppc64@0.24.0': + optional: true + + '@esbuild/linux-riscv64@0.24.0': + optional: true + + '@esbuild/linux-s390x@0.24.0': + optional: true + + '@esbuild/linux-x64@0.24.0': + optional: true + + '@esbuild/netbsd-x64@0.24.0': + optional: true + + '@esbuild/openbsd-arm64@0.24.0': + optional: true + + '@esbuild/openbsd-x64@0.24.0': + optional: true + + '@esbuild/sunos-x64@0.24.0': + optional: true + + '@esbuild/win32-arm64@0.24.0': + optional: true + + '@esbuild/win32-ia32@0.24.0': + optional: true + + '@esbuild/win32-x64@0.24.0': + optional: true + + '@protobufjs/aspromise@1.1.2': {} + + '@protobufjs/base64@1.1.2': {} + + '@protobufjs/codegen@2.0.4': {} + + '@protobufjs/eventemitter@1.1.0': {} + + '@protobufjs/fetch@1.1.0': + dependencies: + '@protobufjs/aspromise': 1.1.2 + '@protobufjs/inquire': 1.1.0 + + '@protobufjs/float@1.0.2': {} + + '@protobufjs/inquire@1.1.0': {} + + '@protobufjs/path@1.1.2': {} + + '@protobufjs/pool@1.1.0': {} + + '@protobufjs/utf8@1.1.0': {} + + '@types/node@22.7.4': + dependencies: + 
undici-types: 6.19.8 + + esbuild@0.24.0: + optionalDependencies: + '@esbuild/aix-ppc64': 0.24.0 + '@esbuild/android-arm': 0.24.0 + '@esbuild/android-arm64': 0.24.0 + '@esbuild/android-x64': 0.24.0 + '@esbuild/darwin-arm64': 0.24.0 + '@esbuild/darwin-x64': 0.24.0 + '@esbuild/freebsd-arm64': 0.24.0 + '@esbuild/freebsd-x64': 0.24.0 + '@esbuild/linux-arm': 0.24.0 + '@esbuild/linux-arm64': 0.24.0 + '@esbuild/linux-ia32': 0.24.0 + '@esbuild/linux-loong64': 0.24.0 + '@esbuild/linux-mips64el': 0.24.0 + '@esbuild/linux-ppc64': 0.24.0 + '@esbuild/linux-riscv64': 0.24.0 + '@esbuild/linux-s390x': 0.24.0 + '@esbuild/linux-x64': 0.24.0 + '@esbuild/netbsd-x64': 0.24.0 + '@esbuild/openbsd-arm64': 0.24.0 + '@esbuild/openbsd-x64': 0.24.0 + '@esbuild/sunos-x64': 0.24.0 + '@esbuild/win32-arm64': 0.24.0 + '@esbuild/win32-ia32': 0.24.0 + '@esbuild/win32-x64': 0.24.0 + + flatbuffers@1.12.0: {} + + guid-typescript@1.0.9: {} + + long@5.2.3: {} + + onnxruntime-common@1.19.2: {} + + onnxruntime-web@1.19.2: + dependencies: + flatbuffers: 1.12.0 + guid-typescript: 1.0.9 + long: 5.2.3 + onnxruntime-common: 1.19.2 + platform: 1.3.6 + protobufjs: 7.4.0 + + platform@1.3.6: {} + + protobufjs@7.4.0: + dependencies: + '@protobufjs/aspromise': 1.1.2 + '@protobufjs/base64': 1.1.2 + '@protobufjs/codegen': 2.0.4 + '@protobufjs/eventemitter': 1.1.0 + '@protobufjs/fetch': 1.1.0 + '@protobufjs/float': 1.0.2 + '@protobufjs/inquire': 1.1.0 + '@protobufjs/path': 1.1.2 + '@protobufjs/pool': 1.1.0 + '@protobufjs/utf8': 1.1.0 + '@types/node': 22.7.4 + long: 5.2.3 + + typescript@5.6.2: {} + + undici-types@6.19.8: {} diff --git a/sbv2_wasm/src-js/index.ts b/sbv2_wasm/src-js/index.ts new file mode 100644 index 0000000..27249f4 --- /dev/null +++ b/sbv2_wasm/src-js/index.ts @@ -0,0 +1,106 @@ +import * as wasm from "../pkg/sbv2_wasm.js"; +import { InferenceSession, Tensor } from "onnxruntime-web"; + +export class ModelHolder { + private models: Map = + new Map(); + constructor( + private tok: wasm.TokenizerWrap, + 
private deberta: InferenceSession, + ) {} + public static async globalInit(buf: ArrayBufferLike) { + await wasm.default(buf); + } + public static async create(tok: string, deberta: ArrayBufferLike) { + return new ModelHolder( + await wasm.load_tokenizer(tok), + await InferenceSession.create(deberta, { + executionProviders: ["webnn", "webgpu", "webgl", "wasm", "cpu"], + graphOptimizationLevel: "all", + }), + ); + } + public async synthesize( + name: string, + text: string, + style_id: number = 0, + style_weight: number = 1.0, + sdp_ratio: number = 0.4, + speed: number = 1.0, + ) { + const mod = this.models.get(name); + if (!mod) throw new Error(`No model named ${name}`); + const [vits2, style] = mod; + return wasm.synthesize( + text, + this.tok, + async (a: BigInt64Array, b: BigInt64Array) => { + try { + const res = ( + await this.deberta.run({ + input_ids: new Tensor("int64", a, [1, a.length]), + attention_mask: new Tensor("int64", b, [1, b.length]), + }) + )["output"]; + return [new Uint32Array(res.dims), await res.getData(true)]; + } catch (e) { + console.warn(e); + throw e; + } + }, + async ( + [a_shape, a_array]: any, + b_d: any, + c_d: any, + d_d: any, + e_d: any, + f: number, + g: number, + ) => { + try { + const a = new Tensor("float32", a_array, [1, ...a_shape]); + const b = new Tensor("int64", b_d, [1, b_d.length]); + const c = new Tensor("int64", c_d, [1, c_d.length]); + const d = new Tensor("int64", d_d, [1, d_d.length]); + const e = new Tensor("float32", e_d, [1, e_d.length]); + const res = ( + await vits2.run({ + x_tst: b, + x_tst_lengths: new Tensor("int64", [b_d.length]), + sid: new Tensor("int64", [0]), + tones: c, + language: d, + bert: a, + style_vec: e, + sdp_ratio: new Tensor("float32", [f]), + length_scale: new Tensor("float32", [g]), + }) + ).output; + return [new Uint32Array(res.dims), await res.getData(true)]; + } catch (e) { + console.warn(e); + throw e; + } + }, + sdp_ratio, + 1.0 / speed, + style_id, + style_weight, + style, + ); + } + 
public async load(name: string, b: Uint8Array) { + const [style, vits2_b] = wasm.load_sbv2file(b); + const vits2 = await InferenceSession.create(vits2_b as Uint8Array, { + executionProviders: ["webnn", "webgpu", "webgl", "wasm", "cpu"], + graphOptimizationLevel: "all", + }); + this.models.set(name, [vits2, style]); + } + public async unload(name: string) { + return this.models.delete(name); + } + public modelList() { + return this.models.keys(); + } +} diff --git a/sbv2_wasm/src/array_helper.rs b/sbv2_wasm/src/array_helper.rs new file mode 100644 index 0000000..4e00d2d --- /dev/null +++ b/sbv2_wasm/src/array_helper.rs @@ -0,0 +1,102 @@ +pub fn vec8_to_array8(v: Vec) -> js_sys::Uint8Array { + let arr = js_sys::Uint8Array::new_with_length(v.len() as u32); + arr.copy_from(&v); + arr +} + +pub fn vec_f32_to_array_f32(v: Vec) -> js_sys::Float32Array { + let arr = js_sys::Float32Array::new_with_length(v.len() as u32); + arr.copy_from(&v); + arr +} + +pub fn array8_to_vec8(buf: js_sys::Uint8Array) -> Vec { + let mut body = vec![0; buf.length() as usize]; + buf.copy_to(&mut body[..]); + body +} + +pub fn vec64_to_array64(v: Vec) -> js_sys::BigInt64Array { + let arr = js_sys::BigInt64Array::new_with_length(v.len() as u32); + arr.copy_from(&v); + arr +} + +pub fn vec_to_array(v: Vec) -> js_sys::Array { + let arr = js_sys::Array::new_with_length(v.len() as u32); + for (i, v) in v.into_iter().enumerate() { + arr.set(i as u32, v); + } + arr +} + +struct A { + shape: Vec, + data: Vec, +} + +impl TryFrom for A { + type Error = sbv2_core::error::Error; + + fn try_from(value: wasm_bindgen::JsValue) -> Result { + let value: js_sys::Array = value.into(); + let mut shape = vec![]; + let mut data = vec![]; + for (i, v) in value.iter().enumerate() { + match i { + 0 => { + let v: js_sys::Uint32Array = v.into(); + shape = vec![0; v.length() as usize]; + v.copy_to(&mut shape); + } + 1 => { + let v: js_sys::Float32Array = v.into(); + data = vec![0.0; v.length() as usize]; + v.copy_to(&mut 
data); + } + _ => {} + }; + } + Ok(A { shape, data }) + } +} + +pub fn array_to_array2_f32( + a: wasm_bindgen::JsValue, +) -> sbv2_core::error::Result> { + let a = A::try_from(a)?; + if a.shape.len() != 2 { + return Err(sbv2_core::error::Error::OtherError( + "Length mismatch".to_string(), + )); + } + let shape = [a.shape[0] as usize, a.shape[1] as usize]; + let arr = ndarray::Array2::from_shape_vec(shape, a.data.to_vec()) + .map_err(|e| sbv2_core::error::Error::OtherError(e.to_string()))?; + Ok(arr) +} +pub fn array_to_array3_f32( + a: wasm_bindgen::JsValue, +) -> sbv2_core::error::Result> { + let a = A::try_from(a)?; + if a.shape.len() != 3 { + return Err(sbv2_core::error::Error::OtherError( + "Length mismatch".to_string(), + )); + } + let shape = [ + a.shape[0] as usize, + a.shape[1] as usize, + a.shape[2] as usize, + ]; + let arr = ndarray::Array3::from_shape_vec(shape, a.data.to_vec()) + .map_err(|e| sbv2_core::error::Error::OtherError(e.to_string()))?; + Ok(arr) +} + +pub fn array2_f32_to_array(a: ndarray::Array2) -> js_sys::Array { + let shape: Vec = a.shape().iter().map(|f| (*f as u32).into()).collect(); + let typed_array = js_sys::Float32Array::new_with_length(a.len() as u32); + typed_array.copy_from(&a.into_flat().to_vec()); + vec_to_array(vec![vec_to_array(shape).into(), typed_array.into()]) +} diff --git a/sbv2_wasm/src/lib.rs b/sbv2_wasm/src/lib.rs index 69d2ac6..9ca5a4d 100644 --- a/sbv2_wasm/src/lib.rs +++ b/sbv2_wasm/src/lib.rs @@ -1,6 +1,8 @@ use once_cell::sync::Lazy; use sbv2_core::*; use wasm_bindgen::prelude::*; +use wasm_bindgen_futures::JsFuture; +mod array_helper; static JTALK: Lazy = Lazy::new(|| jtalk::JTalk::new().unwrap()); @@ -28,25 +30,20 @@ pub struct StyleVectorWrap { #[wasm_bindgen] pub fn load_sbv2file(buf: js_sys::Uint8Array) -> Result { - let mut body = vec![0; buf.length() as usize]; - buf.copy_to(&mut body[..]); - let (style_vectors, vits2) = sbv2file::parse_sbv2file(body)?; - let buf = 
js_sys::Uint8Array::new_with_length(vits2.len() as u32); - buf.copy_from(&vits2); - let arr = js_sys::Array::new_with_length(2); - arr.set( - 0, + let (style_vectors, vits2) = sbv2file::parse_sbv2file(array_helper::array8_to_vec8(buf))?; + let buf = array_helper::vec8_to_array8(vits2); + Ok(array_helper::vec_to_array(vec![ StyleVectorWrap { style_vector: style::load_style(style_vectors)?, } .into(), - ); - arr.set(1, buf.into()); - Ok(arr) + buf.into(), + ])) } +#[allow(clippy::too_many_arguments)] #[wasm_bindgen] -pub fn synthesize( +pub async fn synthesize( text: &str, tokenizer: &TokenizerWrap, bert_predict_fn: js_sys::Function, @@ -57,39 +54,61 @@ pub fn synthesize( style_weight: f32, style_vectors: &StyleVectorWrap, ) -> Result { - fn synthesize_wrap( - bert_ori: ndarray::Array2, - x_tst: ndarray::Array1, - tones: ndarray::Array1, - lang_ids: ndarray::Array1, - style_vector: ndarray::Array1, - sdp_ratio: f32, - length_scale: f32, - ) -> error::Result> { - todo!() - } + let synthesize_wrap = |bert_ori: ndarray::Array2, + x_tst: ndarray::Array1, + tones: ndarray::Array1, + lang_ids: ndarray::Array1, + style_vector: ndarray::Array1, + sdp_ratio: f32, + length_scale: f32| async move { + let arr = array_helper::vec_to_array(vec![ + array_helper::array2_f32_to_array(bert_ori).into(), + array_helper::vec64_to_array64(x_tst.to_vec()).into(), + array_helper::vec64_to_array64(tones.to_vec()).into(), + array_helper::vec64_to_array64(lang_ids.to_vec()).into(), + array_helper::vec_f32_to_array_f32(style_vector.to_vec()).into(), + sdp_ratio.into(), + length_scale.into(), + ]); + let res = synthesize_fn + .apply(&js_sys::Object::new().into(), &arr) + .map_err(|e| { + error::Error::OtherError(e.as_string().unwrap_or("unknown".to_string())) + })?; + let res = JsFuture::from(Into::::into(res)) + .await + .map_err(|e| { + sbv2_core::error::Error::OtherError(e.as_string().unwrap_or("unknown".to_string())) + })?; + array_helper::array_to_array3_f32(res) + }; let (bert_ori, phones, 
tones, lang_ids) = tts_util::parse_text( text, &JTALK, &tokenizer.tokenizer, |token_ids: Vec, attention_masks: Vec| { - let token_ids_ = js_sys::BigInt64Array::new_with_length(token_ids.len() as u32); - token_ids_.copy_from(&token_ids); - let attention_masks_ = - js_sys::BigInt64Array::new_with_length(attention_masks.len() as u32); - attention_masks_.copy_from(&attention_masks); - let arr = js_sys::Array::new_with_length(2); - arr.set(0, token_ids_.into()); - arr.set(1, attention_masks_.into()); - let res = bert_predict_fn - .apply(&js_sys::Object::new().into(), &arr) - .map_err(|e| { - error::Error::OtherError(e.as_string().unwrap_or("unknown".to_string())) - })?; - let res: js_sys::Array = res.into(); - Ok(todo!()) + Box::pin(async move { + let arr = array_helper::vec_to_array(vec![ + array_helper::vec64_to_array64(token_ids).into(), + array_helper::vec64_to_array64(attention_masks).into(), + ]); + let res = bert_predict_fn + .apply(&js_sys::Object::new().into(), &arr) + .map_err(|e| { + error::Error::OtherError(e.as_string().unwrap_or("unknown".to_string())) + })?; + let res = JsFuture::from(Into::::into(res)) + .await + .map_err(|e| { + sbv2_core::error::Error::OtherError( + e.as_string().unwrap_or("unknown".to_string()), + ) + })?; + array_helper::array_to_array2_f32(res) + }) }, - )?; + ) + .await?; let audio = synthesize_wrap( bert_ori.to_owned(), phones, @@ -98,9 +117,7 @@ pub fn synthesize( style::get_style_vector(&style_vectors.style_vector, style_id, style_weight)?, sdp_ratio, length_scale, - )?; - let vec = tts_util::array_to_vec(audio)?; - let buf = js_sys::Uint8Array::new_with_length(vec.len() as u32); - buf.copy_from(&vec); - Ok(buf) + ) + .await?; + Ok(array_helper::vec8_to_array8(tts_util::array_to_vec(audio)?)) } diff --git a/sbv2_wasm/tsconfig.json b/sbv2_wasm/tsconfig.json new file mode 100644 index 0000000..08a51fb --- /dev/null +++ b/sbv2_wasm/tsconfig.json @@ -0,0 +1,15 @@ +{ + "compilerOptions": { + "target": "ESNext", + "module": "ESNext", 
+ "rootDir": "./src-js", + "outDir": "./dist", + "moduleResolution": "node", + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "strict": true, + "skipLibCheck": true, + "declaration": true, + "emitDeclarationOnly": true + } +}