mirror of
https://github.com/neodyland/sbv2-api.git
synced 2025-12-25 00:29:57 +00:00
Compare commits
81 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4744f02792 | ||
|
|
5de9514546 | ||
|
|
dc88251d41 | ||
|
|
1550ce6ee4 | ||
|
|
c1bebea69b | ||
|
|
af5a550b8f | ||
|
|
febfd0d84f | ||
|
|
55698f4a61 | ||
|
|
b0155f5ffa | ||
|
|
0e9c7b6522 | ||
|
|
b0d8be32b6 | ||
|
|
f76f5e6d1c | ||
|
|
e8cc450693 | ||
|
|
6f0fcd491c | ||
|
|
5cf4149024 | ||
|
|
65303173a8 | ||
|
|
30e4cde3ed | ||
|
|
596eec654d | ||
|
|
ee292315e1 | ||
|
|
731c751455 | ||
|
|
497bdd79ea | ||
|
|
b887fae47b | ||
|
|
ca0b8553e4 | ||
|
|
29b14895bb | ||
|
|
c2910ad9e8 | ||
|
|
5c092e8cbb | ||
|
|
d380e549c4 | ||
|
|
395f5b0004 | ||
|
|
f5609035b7 | ||
|
|
1e9f25dcb1 | ||
|
|
321ca4e749 | ||
|
|
bb23bd145b | ||
|
|
30e79d0df6 | ||
|
|
04c21aa97c | ||
|
|
6f388052ae | ||
|
|
04af3abad5 | ||
|
|
414e42db50 | ||
|
|
b8b0198ca8 | ||
|
|
a99fd39834 | ||
|
|
886ab78eeb | ||
|
|
c85f474dbf | ||
|
|
6d160d7ae8 | ||
|
|
ee927d65cb | ||
|
|
6e7d641ecb | ||
|
|
eb249aad81 | ||
|
|
f79a67138f | ||
|
|
09945e2c1c | ||
|
|
821b4c7fb3 | ||
|
|
ec06c35929 | ||
|
|
1373aef4b2 | ||
|
|
e2e49fd0e8 | ||
|
|
0cf9f87cc9 | ||
|
|
5e500b2c42 | ||
|
|
136375e5b6 | ||
|
|
aade119ddb | ||
|
|
55cedb2f6d | ||
|
|
f2940f4ebe | ||
|
|
96a5ab0672 | ||
|
|
64cbd151a6 | ||
|
|
3103fcef17 | ||
|
|
dd8ae77edc | ||
|
|
ee4c4ab8ad | ||
|
|
79120e4aee | ||
|
|
c947df2105 | ||
|
|
dcbb19fcdd | ||
|
|
b5601410f8 | ||
|
|
a3160ea2e8 | ||
|
|
1a978c3fe3 | ||
|
|
5837b66759 | ||
|
|
962fa9a49d | ||
|
|
290fb37c16 | ||
|
|
0c926751a4 | ||
|
|
da86aa811d | ||
|
|
4e0edaebcd | ||
|
|
1d7d65ae21 | ||
|
|
3112e3e8ec | ||
|
|
5724251fb5 | ||
|
|
3b1182f07d | ||
|
|
4ed463b05b | ||
|
|
c641bc7529 | ||
|
|
0bb3c5b8ea |
13
.github/pull_request_template.md
vendored
Normal file
13
.github/pull_request_template.md
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
## 概要
|
||||
<!--
|
||||
ここに本PRの説明をしてください。
|
||||
-->
|
||||
|
||||
## 関連issue
|
||||
<!--
|
||||
ここに該当するissueの番号を書いてください。
|
||||
#nの前にfixesを置くと、プルリクがマージされた時に自動的に該当issueもクローズします。
|
||||
-->
|
||||
|
||||
## 確認
|
||||
- [ ] 動作確認しましたか?
|
||||
166
.github/workflows/CI.yml
vendored
Normal file
166
.github/workflows/CI.yml
vendored
Normal file
@@ -0,0 +1,166 @@
|
||||
# This file was originally generated by maturin v1.7.1
# To update, run
#
#    maturin generate-ci github
#
# NOTE: the `push-docker` job and the macOS runner labels below are maintained
# by hand; re-apply them after regenerating this file.
name: CI

on:
  push:
    branches:
      - main
      - master
    tags:
      - '*'
  pull_request:
  workflow_dispatch:

permissions:
  contents: read
  # Required by the `release` job for PyPI trusted publishing.
  id-token: write

jobs:
  # Build manylinux wheels for the Python bindings.
  linux:
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      matrix:
        platform:
          - runner: ubuntu-latest
            target: x86_64
          - runner: ubuntu-latest
            target: aarch64
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: 3.x
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.platform.target }}
          args: --release --out dist --find-interpreter
          sccache: 'true'
          manylinux: auto
          working-directory: sbv2_bindings
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-linux-${{ matrix.platform.target }}
          path: sbv2_bindings/dist

  # Build Windows wheels for the Python bindings.
  windows:
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      matrix:
        platform:
          - runner: windows-latest
            target: x64
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: 3.x
          architecture: ${{ matrix.platform.target }}
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.platform.target }}
          args: --release --out dist --find-interpreter
          sccache: 'true'
          working-directory: sbv2_bindings
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-windows-${{ matrix.platform.target }}
          path: sbv2_bindings/dist

  # Build macOS wheels for the Python bindings.
  macos:
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      matrix:
        platform:
          # The macos-12 image has been retired from GitHub-hosted runners, so
          # the x86_64 build uses the Intel macOS 15 image instead.
          - runner: macos-15-intel
            target: x86_64
          - runner: macos-14
            target: aarch64
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: 3.x
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.platform.target }}
          args: --release --out dist --find-interpreter
          sccache: 'true'
          working-directory: sbv2_bindings
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-macos-${{ matrix.platform.target }}
          path: sbv2_bindings/dist

  # Build the source distribution.
  sdist:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Build sdist
        uses: PyO3/maturin-action@v1
        with:
          command: sdist
          args: --out dist
          working-directory: sbv2_bindings
      - name: Upload sdist
        uses: actions/upload-artifact@v4
        with:
          name: wheels-sdist
          path: sbv2_bindings/dist

  # Publish every wheel and the sdist to PyPI when a tag is pushed.
  release:
    name: Release
    runs-on: ubuntu-latest
    if: "startsWith(github.ref, 'refs/tags/')"
    needs: [linux, windows, macos, sdist]
    environment: release
    steps:
      - uses: actions/download-artifact@v4
      - name: Publish to PyPI
        uses: PyO3/maturin-action@v1
        with:
          command: upload
          args: --non-interactive --skip-existing wheels-*/*

  # Build and push the cpu/cuda images to GHCR when a tag is pushed.
  push-docker:
    runs-on: ubuntu-latest
    if: "startsWith(github.ref, 'refs/tags/')"
    permissions:
      contents: read
      packages: write
    strategy:
      matrix:
        tag: [cpu, cuda]
    steps:
      - uses: actions/checkout@v4
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push image
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          tags: |
            ghcr.io/${{ github.repository }}:${{ matrix.tag }}
          file: docker/${{ matrix.tag }}.Dockerfile
          # Both platforms are built in ONE invocation so a single multi-arch
          # manifest is pushed. Splitting platforms across matrix jobs makes
          # each job push a single-arch image to the same tag, and whichever
          # finishes last silently overwrites the other.
          platforms: linux/amd64,linux/arm64
|
||||
36
.github/workflows/build.yml
vendored
36
.github/workflows/build.yml
vendored
@@ -1,36 +0,0 @@
|
||||
name: Push to github container register
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [created]
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
push-docker:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
strategy:
|
||||
matrix:
|
||||
tag: [cpu, cuda]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Login to GitHub Container Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: Build and push image
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
ghcr.io/${{ github.repository }}:${{ matrix.tag }}
|
||||
file: docker/${{ matrix.tag }}.Dockerfile
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -4,4 +4,5 @@ models/
|
||||
venv/
|
||||
.env
|
||||
output.wav
|
||||
node_modules
|
||||
node_modules
|
||||
dist/
|
||||
675
Cargo.lock
generated
675
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
10
Cargo.toml
10
Cargo.toml
@@ -1,9 +1,15 @@
|
||||
[workspace]
|
||||
resolver = "2"
|
||||
members = ["sbv2_api", "sbv2_core", "sbv2_bindings"]
|
||||
members = ["sbv2_api", "sbv2_core", "sbv2_bindings", "sbv2_wasm"]
|
||||
|
||||
[workspace.dependencies]
|
||||
anyhow = "1.0.86"
|
||||
dotenvy = "0.15.7"
|
||||
env_logger = "0.11.5"
|
||||
ndarray = "0.16.1"
|
||||
ndarray = "0.16.1"
|
||||
once_cell = "1.19.0"
|
||||
|
||||
[profile.release]
|
||||
lto = true
|
||||
debug = false
|
||||
strip = true
|
||||
|
||||
21
README.md
21
README.md
@@ -1,5 +1,8 @@
|
||||
# SBV2-API
|
||||
|
||||
## 注意:本バージョンはアルファ版です。
|
||||
安定版を利用したい場合は[こちら](https://github.com/tuna2134/sbv2-api/tree/v0.1.x)をご覧ください。
|
||||
|
||||
## プログラミングに詳しくない方向け
|
||||
|
||||
[こちら](https://github.com/tuna2134/sbv2-gui?tab=readme-ov-file)を参照してください。
|
||||
@@ -27,6 +30,8 @@ JP-Extra しか対応していません。(基本的に対応する予定もあ
|
||||
- [x] GPU 対応(DirectML)
|
||||
- [x] GPU 対応(CoreML)
|
||||
- [ ] WASM 変換(依存ライブラリの関係により現在は不可)
|
||||
- [x] arm64のdockerサポート
|
||||
- [ ] MeCabを利用する
|
||||
|
||||
## 構造説明
|
||||
|
||||
@@ -57,6 +62,22 @@ docker run -it --rm -p 3000:3000 --name sbv2 \
|
||||
ghcr.io/tuna2134/sbv2-api:cpu
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary>Apple Silicon搭載のMac(M1以降)の場合</summary>
|
||||
docker上で動作させる場合、.envのADDRをlocalhostから0.0.0.0に変更してください。
|
||||
|
||||
```yaml
|
||||
ADDR=0.0.0.0:3000
|
||||
```
|
||||
|
||||
CPUの場合は
|
||||
```bash
|
||||
docker run --platform linux/amd64 -it --rm -p 3000:3000 --name sbv2 \
|
||||
-v ./models:/work/models --env-file .env \
|
||||
ghcr.io/tuna2134/sbv2-api:cpu
|
||||
```
|
||||
</details>
|
||||
|
||||
CUDAの場合は
|
||||
```sh
|
||||
docker run -it --rm -p 3000:3000 --name sbv2 \
|
||||
|
||||
15
content.txt
15
content.txt
@@ -1 +1,14 @@
|
||||
10,000年前までコロナが流行っていました
|
||||
悪徳貴族として名高いヴェレット家の長男――オウガ・ヴェレットは転生者である。
|
||||
ブラック企業に勤め、過労死した彼には一つの夢があった。
|
||||
|
||||
「可愛いハーレム作って、美味い物を食べる。領民の税金で楽して好き放題な生活を送ってみせる!」
|
||||
|
||||
素晴らしき異世界ライフを夢見た彼は実現へ向けて、努力を始めた。
|
||||
ハーレムを築くためにいじめられてる平民の子を助けて恩を売ってやったり。
|
||||
労働力を手に入れるために多くの孤児を雇って教育したり。
|
||||
反乱が起きても鎮圧できるように魔法学院へ通って魔法を極める。
|
||||
|
||||
「クックック……! 順調、順調! 未来は明るいなぁ!」
|
||||
|
||||
――オウガはまだ知らない。
|
||||
楽な生活を送るためにしてきたことが評価され、世間から『聖者』様として呼ばれる未来を。
|
||||
@@ -126,11 +126,13 @@ torch.onnx.export(
|
||||
f"../models/model_{out_name}.onnx",
|
||||
verbose=True,
|
||||
dynamic_axes={
|
||||
"x_tst": {1: "batch_size"},
|
||||
"x_tst": {0: "batch_size", 1: "x_tst_max_length"},
|
||||
"x_tst_lengths": {0: "batch_size"},
|
||||
"tones": {1: "batch_size"},
|
||||
"language": {1: "batch_size"},
|
||||
"bert": {2: "batch_size"},
|
||||
"sid": {0: "batch_size"},
|
||||
"tones": {0: "batch_size", 1: "x_tst_max_length"},
|
||||
"language": {0: "batch_size", 1: "x_tst_max_length"},
|
||||
"bert": {0: "batch_size", 2: "x_tst_max_length"},
|
||||
"style_vec": {0: "batch_size"},
|
||||
},
|
||||
input_names=[
|
||||
"x_tst",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "sbv2_api"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0-alpha"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
@@ -9,9 +9,11 @@ axum = "0.7.5"
|
||||
dotenvy.workspace = true
|
||||
env_logger.workspace = true
|
||||
log = "0.4.22"
|
||||
sbv2_core = { version = "0.1.3", path = "../sbv2_core" }
|
||||
sbv2_core = { version = "0.2.0-alpha", path = "../sbv2_core" }
|
||||
serde = { version = "1.0.210", features = ["derive"] }
|
||||
tokio = { version = "1.40.0", features = ["full"] }
|
||||
utoipa = { version = "4.2.3", features = ["axum_extras"] }
|
||||
utoipa-scalar = { version = "0.1.0", features = ["axum"] }
|
||||
|
||||
[features]
|
||||
coreml = ["sbv2_core/coreml"]
|
||||
@@ -19,4 +21,4 @@ cuda = ["sbv2_core/cuda"]
|
||||
cuda_tf32 = ["sbv2_core/cuda_tf32"]
|
||||
dynamic = ["sbv2_core/dynamic"]
|
||||
directml = ["sbv2_core/directml"]
|
||||
tensorrt = ["sbv2_core/tensorrt"]
|
||||
tensorrt = ["sbv2_core/tensorrt"]
|
||||
|
||||
5
sbv2_api/build.rs
Normal file
5
sbv2_api/build.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
/// Build script entry point.
///
/// When the crate is compiled with the `coreml` feature, asks Cargo to pass
/// `-fapple-link-rtlib` to the linker. Without that feature it emits nothing.
// NOTE(review): presumably the extra linker flag is needed for symbols pulled
// in by the CoreML execution provider on Apple targets — confirm.
fn main() {
    let coreml_enabled = cfg!(feature = "coreml");
    if !coreml_enabled {
        return;
    }
    println!("cargo:rustc-link-arg=-fapple-link-rtlib");
}
|
||||
@@ -11,10 +11,23 @@ use std::env;
|
||||
use std::sync::Arc;
|
||||
use tokio::fs;
|
||||
use tokio::sync::Mutex;
|
||||
use utoipa::{OpenApi, ToSchema};
|
||||
use utoipa_scalar::{Scalar, Servable};
|
||||
|
||||
mod error;
|
||||
use crate::error::AppResult;
|
||||
|
||||
#[derive(OpenApi)]
|
||||
#[openapi(paths(models, synthesize), components(schemas(SynthesizeRequest)))]
|
||||
struct ApiDoc;
|
||||
|
||||
#[utoipa::path(
|
||||
get,
|
||||
path = "/models",
|
||||
responses(
|
||||
(status = 200, description = "Return model list", body = Vec<String>),
|
||||
)
|
||||
)]
|
||||
async fn models(State(state): State<AppState>) -> AppResult<impl IntoResponse> {
|
||||
Ok(Json(state.tts_model.lock().await.models()))
|
||||
}
|
||||
@@ -27,7 +40,7 @@ fn length_default() -> f32 {
|
||||
1.0
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
#[derive(Deserialize, ToSchema)]
|
||||
struct SynthesizeRequest {
|
||||
text: String,
|
||||
ident: String,
|
||||
@@ -37,6 +50,14 @@ struct SynthesizeRequest {
|
||||
length_scale: f32,
|
||||
}
|
||||
|
||||
#[utoipa::path(
|
||||
post,
|
||||
path = "/synthesize",
|
||||
request_body = SynthesizeRequest,
|
||||
responses(
|
||||
(status = 200, description = "Return audio/wav", body = Vec<u8>, content_type = "audio/wav")
|
||||
)
|
||||
)]
|
||||
async fn synthesize(
|
||||
State(state): State<AppState>,
|
||||
Json(SynthesizeRequest {
|
||||
@@ -139,7 +160,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
.route("/", get(|| async { "Hello, World!" }))
|
||||
.route("/synthesize", post(synthesize))
|
||||
.route("/models", get(models))
|
||||
.with_state(AppState::new().await?);
|
||||
.with_state(AppState::new().await?)
|
||||
.merge(Scalar::with_url("/docs", ApiDoc::openapi()));
|
||||
let addr = env::var("ADDR").unwrap_or("0.0.0.0:3000".to_string());
|
||||
let listener = tokio::net::TcpListener::bind(&addr).await?;
|
||||
log::info!("Listening on {addr}");
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "sbv2_bindings"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0-alpha1"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
@@ -12,4 +12,4 @@ crate-type = ["cdylib"]
|
||||
anyhow.workspace = true
|
||||
ndarray.workspace = true
|
||||
pyo3 = { version = "0.22.0", features = ["anyhow"] }
|
||||
sbv2_core = { version = "0.1.3", path = "../sbv2_core" }
|
||||
sbv2_core = { version = "0.2.0-alpha", path = "../sbv2_core" }
|
||||
|
||||
@@ -8,11 +8,13 @@ def main():
|
||||
|
||||
model.load_sbv2file_from_path("amitaro", "../models/amitaro.sbv2")
|
||||
print("All setup is done!")
|
||||
|
||||
|
||||
style_vector = model.get_style_vector("amitaro", 0, 1.0)
|
||||
with open("output.wav", "wb") as f:
|
||||
f.write(model.synthesize("おはようございます。", "amitaro", style_vector, 0.0, 0.5))
|
||||
f.write(
|
||||
model.synthesize("おはようございます。", "amitaro", style_vector, 0.0, 0.5)
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
@@ -11,5 +11,7 @@ classifiers = [
|
||||
"Programming Language :: Python :: Implementation :: PyPy",
|
||||
]
|
||||
dynamic = ["version"]
|
||||
|
||||
[tool.maturin]
|
||||
features = ["pyo3/extension-module"]
|
||||
strip = true
|
||||
@@ -1,6 +1,6 @@
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::PyBytes;
|
||||
use sbv2_core::tts::TTSModelHolder;
|
||||
use sbv2_core::tts::{SynthesizeOptions, TTSModelHolder};
|
||||
|
||||
use crate::style::StyleVector;
|
||||
|
||||
@@ -109,8 +109,8 @@ impl TTSModel {
|
||||
/// テキスト
|
||||
/// ident : str
|
||||
/// 識別子
|
||||
/// style_vector : StyleVector
|
||||
/// スタイルベクトル
|
||||
/// style_id : int
|
||||
/// スタイルID
|
||||
/// sdp_ratio : float
|
||||
/// SDP比率
|
||||
/// length_scale : float
|
||||
@@ -125,21 +125,24 @@ impl TTSModel {
|
||||
py: Python<'p>,
|
||||
text: String,
|
||||
ident: String,
|
||||
style_vector: StyleVector,
|
||||
style_id: i32,
|
||||
sdp_ratio: f32,
|
||||
length_scale: f32,
|
||||
) -> anyhow::Result<Bound<PyBytes>> {
|
||||
let (bert_ori, phones, tones, lang_ids) = self.model.parse_text(&text)?;
|
||||
let data = self.model.synthesize(
|
||||
ident,
|
||||
bert_ori,
|
||||
phones,
|
||||
tones,
|
||||
lang_ids,
|
||||
style_vector.get(),
|
||||
sdp_ratio,
|
||||
length_scale,
|
||||
let data = self.model.easy_synthesize(
|
||||
ident.as_str(),
|
||||
&text,
|
||||
style_id,
|
||||
SynthesizeOptions {
|
||||
sdp_ratio,
|
||||
length_scale,
|
||||
..Default::default()
|
||||
},
|
||||
)?;
|
||||
Ok(PyBytes::new_bound(py, &data))
|
||||
}
|
||||
|
||||
fn unload(&mut self, ident: String) -> bool {
|
||||
self.model.unload(ident)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "sbv2_core"
|
||||
description = "Style-Bert-VITSの推論ライブラリ"
|
||||
version = "0.1.3"
|
||||
version = "0.2.0-alpha1"
|
||||
edition = "2021"
|
||||
license = "MIT"
|
||||
readme = "../README.md"
|
||||
@@ -16,20 +16,25 @@ hound = "3.5.1"
|
||||
jpreprocess = { version = "0.10.0", features = ["naist-jdic"] }
|
||||
ndarray.workspace = true
|
||||
num_cpus = "1.16.0"
|
||||
once_cell = "1.19.0"
|
||||
ort = { git = "https://github.com/pykeio/ort.git", version = "2.0.0-rc.6" }
|
||||
once_cell.workspace = true
|
||||
ort = { git = "https://github.com/pykeio/ort.git", version = "2.0.0-rc.6", optional = true }
|
||||
regex = "1.10.6"
|
||||
serde = { version = "1.0.210", features = ["derive"] }
|
||||
serde_json = "1.0.128"
|
||||
tar = "0.4.41"
|
||||
thiserror = "1.0.63"
|
||||
tokenizers = "0.20.0"
|
||||
tokenizers = { version = "0.20.0", default-features = false }
|
||||
vibrato = { version = "0.5.1", optional = true }
|
||||
zstd = "0.13.2"
|
||||
|
||||
[features]
|
||||
cuda = ["ort/cuda"]
|
||||
cuda_tf32 = []
|
||||
dynamic = ["ort/load-dynamic"]
|
||||
directml = ["ort/directml"]
|
||||
tensorrt = ["ort/tensorrt"]
|
||||
coreml = ["ort/coreml"]
|
||||
cuda = ["ort/cuda", "std"]
|
||||
cuda_tf32 = ["std", "cuda"]
|
||||
std = ["dep:ort", "tokenizers/progressbar", "tokenizers/onig", "tokenizers/esaxx_fast"]
|
||||
dynamic = ["ort/load-dynamic", "std"]
|
||||
directml = ["ort/directml", "std"]
|
||||
tensorrt = ["ort/tensorrt", "std"]
|
||||
coreml = ["ort/coreml", "std"]
|
||||
default = ["std"]
|
||||
no_std = ["tokenizers/unstable_wasm"]
|
||||
mecab = ["vibrato"]
|
||||
|
||||
@@ -6,6 +6,7 @@ pub enum Error {
|
||||
TokenizerError(#[from] tokenizers::Error),
|
||||
#[error("JPreprocess error: {0}")]
|
||||
JPreprocessError(#[from] jpreprocess::error::JPreprocessError),
|
||||
#[cfg(feature = "std")]
|
||||
#[error("ONNX error: {0}")]
|
||||
OrtError(#[from] ort::Error),
|
||||
#[error("NDArray error: {0}")]
|
||||
@@ -20,6 +21,8 @@ pub enum Error {
|
||||
HoundError(#[from] hound::Error),
|
||||
#[error("model not found error")]
|
||||
ModelNotFoundError(String),
|
||||
#[error("other")]
|
||||
OtherError(String),
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
@@ -19,21 +19,6 @@ fn initialize_jtalk() -> Result<JPreprocessType> {
|
||||
Ok(jpreprocess)
|
||||
}
|
||||
|
||||
static JTALK_G2P_G_A1_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"/A:([0-9\-]+)\+").unwrap());
|
||||
static JTALK_G2P_G_A2_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"\+(\d+)\+").unwrap());
|
||||
static JTALK_G2P_G_A3_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"\+(\d+)/").unwrap());
|
||||
static JTALK_G2P_G_E3_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"!(\d+)_").unwrap());
|
||||
static JTALK_G2P_G_F1_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"/F:(\d+)_").unwrap());
|
||||
static JTALK_G2P_G_P3_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"\-(.*?)\+").unwrap());
|
||||
|
||||
fn numeric_feature_by_regex(regex: &Regex, text: &str) -> i32 {
|
||||
if let Some(mat) = regex.captures(text) {
|
||||
mat[1].parse::<i32>().unwrap()
|
||||
} else {
|
||||
-50
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! hash_set {
|
||||
($($elem:expr),* $(,)?) => {{
|
||||
let mut set = HashSet::new();
|
||||
@@ -239,7 +224,10 @@ impl JTalkProcess {
|
||||
}
|
||||
|
||||
fn kata_to_phoneme_list(mut text: String) -> Result<Vec<String>> {
|
||||
if PUNCTUATIONS.contains(&text.as_str()) {
|
||||
let chars: HashSet<String> = text.chars().map(|x| x.to_string()).collect();
|
||||
if chars.is_subset(&HashSet::from_iter(
|
||||
PUNCTUATIONS.iter().map(|x| x.to_string()),
|
||||
)) {
|
||||
return Ok(text.chars().map(|x| x.to_string()).collect());
|
||||
}
|
||||
if !KATAKANA_PATTERN.is_match(&text) {
|
||||
@@ -351,11 +339,7 @@ impl JTalkProcess {
|
||||
|
||||
let mut phones: Vec<String> = Vec::new();
|
||||
for (i, label) in labels.iter().enumerate() {
|
||||
let mut p3 = {
|
||||
let label_text = label.to_string();
|
||||
let mattched = JTALK_G2P_G_P3_PATTERN.captures(&label_text).unwrap();
|
||||
mattched[1].to_string()
|
||||
};
|
||||
let mut p3 = label.phoneme.c.clone().unwrap();
|
||||
if "AIUEO".contains(&p3) {
|
||||
// 文字をlowerする
|
||||
p3 = p3.to_lowercase();
|
||||
@@ -365,10 +349,10 @@ impl JTalkProcess {
|
||||
if i == 0 {
|
||||
phones.push("^".to_string());
|
||||
} else if i == labels.len() - 1 {
|
||||
let e3 = numeric_feature_by_regex(&JTALK_G2P_G_E3_PATTERN, &label.to_string());
|
||||
if e3 == 0 {
|
||||
let e3 = label.accent_phrase_prev.clone().unwrap().is_interrogative;
|
||||
if e3 {
|
||||
phones.push("$".to_string());
|
||||
} else if e3 == 1 {
|
||||
} else {
|
||||
phones.push("?".to_string());
|
||||
}
|
||||
}
|
||||
@@ -380,14 +364,33 @@ impl JTalkProcess {
|
||||
phones.push(p3.clone());
|
||||
}
|
||||
|
||||
let a1 = numeric_feature_by_regex(&JTALK_G2P_G_A1_PATTERN, &label.to_string());
|
||||
let a2 = numeric_feature_by_regex(&JTALK_G2P_G_A2_PATTERN, &label.to_string());
|
||||
let a3 = numeric_feature_by_regex(&JTALK_G2P_G_A3_PATTERN, &label.to_string());
|
||||
let a1 = if let Some(mora) = &label.mora {
|
||||
mora.relative_accent_position as i32
|
||||
} else {
|
||||
-50
|
||||
};
|
||||
let a2 = if let Some(mora) = &label.mora {
|
||||
mora.position_forward as i32
|
||||
} else {
|
||||
-50
|
||||
};
|
||||
let a3 = if let Some(mora) = &label.mora {
|
||||
mora.position_backward as i32
|
||||
} else {
|
||||
-50
|
||||
};
|
||||
|
||||
let f1 = numeric_feature_by_regex(&JTALK_G2P_G_F1_PATTERN, &label.to_string());
|
||||
let f1 = if let Some(accent_phrase) = &label.accent_phrase_curr {
|
||||
accent_phrase.mora_count as i32
|
||||
} else {
|
||||
-50
|
||||
};
|
||||
|
||||
let a2_next =
|
||||
numeric_feature_by_regex(&JTALK_G2P_G_A2_PATTERN, &labels[i + 1].to_string());
|
||||
let a2_next = if let Some(mora) = &labels[i + 1].mora {
|
||||
mora.position_forward as i32
|
||||
} else {
|
||||
-50
|
||||
};
|
||||
|
||||
if a3 == 1 && a2_next == 1 && "aeiouAEIOUNcl".contains(&p3) {
|
||||
phones.push("#".to_string());
|
||||
|
||||
@@ -1,11 +1,19 @@
|
||||
#[cfg(feature = "std")]
|
||||
pub mod bert;
|
||||
pub mod error;
|
||||
pub mod jtalk;
|
||||
#[cfg(feature = "std")]
|
||||
pub mod model;
|
||||
pub mod mora;
|
||||
pub mod nlp;
|
||||
pub mod norm;
|
||||
pub mod sbv2file;
|
||||
pub mod style;
|
||||
pub mod tokenizer;
|
||||
#[cfg(feature = "std")]
|
||||
pub mod tts;
|
||||
pub mod tts_util;
|
||||
pub mod utils;
|
||||
|
||||
#[cfg(feature = "mecab")]
|
||||
pub mod mecab;
|
||||
@@ -1,9 +1,9 @@
|
||||
use std::env;
|
||||
use std::fs;
|
||||
|
||||
use sbv2_core::tts;
|
||||
use std::env;
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
#[cfg(feature = "std")]
|
||||
fn main_inner() -> anyhow::Result<()> {
|
||||
use sbv2_core::tts;
|
||||
dotenvy::dotenv_override().ok();
|
||||
env_logger::init();
|
||||
let text = fs::read_to_string("content.txt")?;
|
||||
@@ -19,3 +19,13 @@ fn main() -> anyhow::Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
#[cfg(not(feature = "std"))]
|
||||
fn main_inner() -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() {
|
||||
if let Err(e) = main_inner() {
|
||||
println!("Error: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
0
sbv2_core/src/mecab.rs
Normal file
0
sbv2_core/src/mecab.rs
Normal file
37
sbv2_core/src/sbv2file.rs
Normal file
37
sbv2_core/src/sbv2file.rs
Normal file
@@ -0,0 +1,37 @@
|
||||
use std::io::{Cursor, Read};
|
||||
|
||||
use tar::Archive;
|
||||
use zstd::decode_all;
|
||||
|
||||
use crate::error::{Error, Result};
|
||||
|
||||
/// Parse a .sbv2 file binary
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rs
|
||||
/// parse_sbv2file("tsukuyomi", std::fs::read("tsukuyomi.sbv2")?)?;
|
||||
/// ```
|
||||
pub fn parse_sbv2file<P: AsRef<[u8]>>(sbv2_bytes: P) -> Result<(Vec<u8>, Vec<u8>)> {
|
||||
let mut arc = Archive::new(Cursor::new(decode_all(Cursor::new(sbv2_bytes.as_ref()))?));
|
||||
let mut vits2 = None;
|
||||
let mut style_vectors = None;
|
||||
let mut et = arc.entries()?;
|
||||
while let Some(Ok(mut e)) = et.next() {
|
||||
let pth = String::from_utf8_lossy(&e.path_bytes()).to_string();
|
||||
let mut b = Vec::with_capacity(e.size() as usize);
|
||||
e.read_to_end(&mut b)?;
|
||||
match pth.as_str() {
|
||||
"model.onnx" => vits2 = Some(b),
|
||||
"style_vectors.json" => style_vectors = Some(b),
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
if style_vectors.is_none() {
|
||||
return Err(Error::ModelNotFoundError("style_vectors".to_string()));
|
||||
}
|
||||
if vits2.is_none() {
|
||||
return Err(Error::ModelNotFoundError("vits2".to_string()));
|
||||
}
|
||||
Ok((style_vectors.unwrap(), vits2.unwrap()))
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::error::Result;
|
||||
use tokenizers::Tokenizer;
|
||||
pub use tokenizers::Tokenizer;
|
||||
|
||||
pub fn get_tokenizer<P: AsRef<[u8]>>(p: P) -> Result<Tokenizer> {
|
||||
let tokenizer = Tokenizer::from_bytes(p)?;
|
||||
|
||||
@@ -1,12 +1,8 @@
|
||||
use crate::error::{Error, Result};
|
||||
use crate::{bert, jtalk, model, nlp, norm, style, tokenizer, utils};
|
||||
use hound::{SampleFormat, WavSpec, WavWriter};
|
||||
use ndarray::{concatenate, s, Array, Array1, Array2, Array3, Axis};
|
||||
use crate::{jtalk, model, style, tokenizer, tts_util};
|
||||
use ndarray::{concatenate, Array1, Array2, Array3, Axis};
|
||||
use ort::Session;
|
||||
use std::io::{Cursor, Read};
|
||||
use tar::Archive;
|
||||
use tokenizers::Tokenizer;
|
||||
use zstd::decode_all;
|
||||
|
||||
#[derive(PartialEq, Eq, Clone)]
|
||||
pub struct TTSIdent(String);
|
||||
@@ -33,6 +29,7 @@ pub struct TTSModel {
|
||||
ident: TTSIdent,
|
||||
}
|
||||
|
||||
/// High-level Style-Bert-VITS2's API
|
||||
pub struct TTSModelHolder {
|
||||
tokenizer: Tokenizer,
|
||||
bert: Session,
|
||||
@@ -41,6 +38,13 @@ pub struct TTSModelHolder {
|
||||
}
|
||||
|
||||
impl TTSModelHolder {
|
||||
/// Initialize a new TTSModelHolder
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rs
|
||||
/// let mut tts_holder = TTSModelHolder::new(std::fs::read("deberta.onnx")?, std::fs::read("tokenizer.json")?)?;
|
||||
/// ```
|
||||
pub fn new<P: AsRef<[u8]>>(bert_model_bytes: P, tokenizer_bytes: P) -> Result<Self> {
|
||||
let bert = model::load_model(bert_model_bytes, true)?;
|
||||
let jtalk = jtalk::JTalk::new()?;
|
||||
@@ -53,39 +57,35 @@ impl TTSModelHolder {
|
||||
})
|
||||
}
|
||||
|
||||
/// Return a list of model names
|
||||
pub fn models(&self) -> Vec<String> {
|
||||
self.models.iter().map(|m| m.ident.to_string()).collect()
|
||||
}
|
||||
|
||||
/// Load a .sbv2 file binary
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rs
|
||||
/// tts_holder.load_sbv2file("tsukuyomi", std::fs::read("tsukuyomi.sbv2")?)?;
|
||||
/// ```
|
||||
pub fn load_sbv2file<I: Into<TTSIdent>, P: AsRef<[u8]>>(
|
||||
&mut self,
|
||||
ident: I,
|
||||
sbv2_bytes: P,
|
||||
) -> Result<()> {
|
||||
let mut arc = Archive::new(Cursor::new(decode_all(Cursor::new(sbv2_bytes.as_ref()))?));
|
||||
let mut vits2 = None;
|
||||
let mut style_vectors = None;
|
||||
let mut et = arc.entries()?;
|
||||
while let Some(Ok(mut e)) = et.next() {
|
||||
let pth = String::from_utf8_lossy(&e.path_bytes()).to_string();
|
||||
let mut b = Vec::with_capacity(e.size() as usize);
|
||||
e.read_to_end(&mut b)?;
|
||||
match pth.as_str() {
|
||||
"model.onnx" => vits2 = Some(b),
|
||||
"style_vectors.json" => style_vectors = Some(b),
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
if style_vectors.is_none() {
|
||||
return Err(Error::ModelNotFoundError("style_vectors".to_string()));
|
||||
}
|
||||
if vits2.is_none() {
|
||||
return Err(Error::ModelNotFoundError("vits2".to_string()));
|
||||
}
|
||||
self.load(ident, style_vectors.unwrap(), vits2.unwrap())?;
|
||||
let (style_vectors, vits2) = crate::sbv2file::parse_sbv2file(sbv2_bytes)?;
|
||||
self.load(ident, style_vectors, vits2)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load a style vector and onnx model binary
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rs
|
||||
/// tts_holder.load("tsukuyomi", std::fs::read("style_vectors.json")?, std::fs::read("model.onnx")?)?;
|
||||
/// ```
|
||||
pub fn load<I: Into<TTSIdent>, P: AsRef<[u8]>>(
|
||||
&mut self,
|
||||
ident: I,
|
||||
@@ -103,6 +103,7 @@ impl TTSModelHolder {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Unload a model
|
||||
pub fn unload<I: Into<TTSIdent>>(&mut self, ident: I) -> bool {
|
||||
let ident = ident.into();
|
||||
if let Some((i, _)) = self
|
||||
@@ -118,74 +119,23 @@ impl TTSModelHolder {
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse text and return the input for synthesize
|
||||
///
|
||||
/// # Note
|
||||
/// This function is for low-level usage, use `easy_synthesize` for high-level usage.
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub fn parse_text(
|
||||
&self,
|
||||
text: &str,
|
||||
) -> Result<(Array2<f32>, Array1<i64>, Array1<i64>, Array1<i64>)> {
|
||||
let text = self.jtalk.num2word(text)?;
|
||||
let normalized_text = norm::normalize_text(&text);
|
||||
|
||||
let process = self.jtalk.process_text(&normalized_text)?;
|
||||
let (phones, tones, mut word2ph) = process.g2p()?;
|
||||
let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones);
|
||||
|
||||
let phones = utils::intersperse(&phones, 0);
|
||||
let tones = utils::intersperse(&tones, 0);
|
||||
let lang_ids = utils::intersperse(&lang_ids, 0);
|
||||
for item in &mut word2ph {
|
||||
*item *= 2;
|
||||
}
|
||||
word2ph[0] += 1;
|
||||
|
||||
let text = {
|
||||
let (seq_text, _) = process.text_to_seq_kata()?;
|
||||
seq_text.join("")
|
||||
};
|
||||
let (token_ids, attention_masks) = tokenizer::tokenize(&text, &self.tokenizer)?;
|
||||
|
||||
let bert_content = bert::predict(&self.bert, token_ids, attention_masks)?;
|
||||
|
||||
assert!(
|
||||
word2ph.len() == text.chars().count() + 2,
|
||||
"{} {}",
|
||||
word2ph.len(),
|
||||
normalized_text.chars().count()
|
||||
);
|
||||
|
||||
let mut phone_level_feature = vec![];
|
||||
for (i, reps) in word2ph.iter().enumerate() {
|
||||
let repeat_feature = {
|
||||
let (reps_rows, reps_cols) = (*reps, 1);
|
||||
let arr_len = bert_content.slice(s![i, ..]).len();
|
||||
|
||||
let mut results: Array2<f32> =
|
||||
Array::zeros((reps_rows as usize, arr_len * reps_cols));
|
||||
|
||||
for j in 0..reps_rows {
|
||||
for k in 0..reps_cols {
|
||||
let mut view = results.slice_mut(s![j, k * arr_len..(k + 1) * arr_len]);
|
||||
view.assign(&bert_content.slice(s![i, ..]));
|
||||
}
|
||||
}
|
||||
results
|
||||
};
|
||||
phone_level_feature.push(repeat_feature);
|
||||
}
|
||||
let phone_level_feature = concatenate(
|
||||
Axis(0),
|
||||
&phone_level_feature
|
||||
.iter()
|
||||
.map(|x| x.view())
|
||||
.collect::<Vec<_>>(),
|
||||
)?;
|
||||
let bert_ori = phone_level_feature.t();
|
||||
Ok((
|
||||
bert_ori.to_owned(),
|
||||
phones.into(),
|
||||
tones.into(),
|
||||
lang_ids.into(),
|
||||
))
|
||||
crate::tts_util::parse_text_blocking(
|
||||
text,
|
||||
&self.jtalk,
|
||||
&self.tokenizer,
|
||||
|token_ids, attention_masks| {
|
||||
crate::bert::predict(&self.bert, token_ids, attention_masks)
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
fn find_model<I: Into<TTSIdent>>(&self, ident: I) -> Result<&TTSModel> {
|
||||
@@ -196,6 +146,10 @@ impl TTSModelHolder {
|
||||
.ok_or(Error::ModelNotFoundError(ident.to_string()))
|
||||
}
|
||||
|
||||
/// Get style vector by style id and weight
|
||||
///
|
||||
/// # Note
|
||||
/// This function is for low-level usage, use `easy_synthesize` for high-level usage.
|
||||
pub fn get_style_vector<I: Into<TTSIdent>>(
|
||||
&self,
|
||||
ident: I,
|
||||
@@ -205,6 +159,13 @@ impl TTSModelHolder {
|
||||
style::get_style_vector(&self.find_model(ident)?.style_vectors, style_id, weight)
|
||||
}
|
||||
|
||||
/// Synthesize text to audio
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rs
|
||||
/// let audio = tts_holder.easy_synthesize("tsukuyomi", "こんにちは", 0, SynthesizeOptions::default())?;
|
||||
/// ```
|
||||
pub fn easy_synthesize<I: Into<TTSIdent> + Copy>(
|
||||
&self,
|
||||
ident: I,
|
||||
@@ -253,28 +214,13 @@ impl TTSModelHolder {
|
||||
options.length_scale,
|
||||
)?
|
||||
};
|
||||
Self::array_to_vec(audio_array)
|
||||
}
|
||||
|
||||
fn array_to_vec(audio_array: Array3<f32>) -> Result<Vec<u8>> {
|
||||
let spec = WavSpec {
|
||||
channels: 1,
|
||||
sample_rate: 44100,
|
||||
bits_per_sample: 32,
|
||||
sample_format: SampleFormat::Float,
|
||||
};
|
||||
let mut cursor = Cursor::new(Vec::new());
|
||||
let mut writer = WavWriter::new(&mut cursor, spec)?;
|
||||
for i in 0..audio_array.shape()[0] {
|
||||
let output = audio_array.slice(s![i, 0, ..]).to_vec();
|
||||
for sample in output {
|
||||
writer.write_sample(sample)?;
|
||||
}
|
||||
}
|
||||
writer.finalize()?;
|
||||
Ok(cursor.into_inner())
|
||||
tts_util::array_to_vec(audio_array)
|
||||
}
|
||||
|
||||
/// Synthesize text to audio
|
||||
///
|
||||
/// # Note
|
||||
/// This function is for low-level usage, use `easy_synthesize` for high-level usage.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn synthesize<I: Into<TTSIdent>>(
|
||||
&self,
|
||||
@@ -297,10 +243,17 @@ impl TTSModelHolder {
|
||||
sdp_ratio,
|
||||
length_scale,
|
||||
)?;
|
||||
Self::array_to_vec(audio_array)
|
||||
tts_util::array_to_vec(audio_array)
|
||||
}
|
||||
}
|
||||
|
||||
/// Synthesize options
|
||||
///
|
||||
/// # Fields
|
||||
/// - `sdp_ratio`: SDP ratio
|
||||
/// - `length_scale`: Length scale
|
||||
/// - `style_weight`: Style weight
|
||||
/// - `split_sentences`: Split sentences
|
||||
pub struct SynthesizeOptions {
|
||||
pub sdp_ratio: f32,
|
||||
pub length_scale: f32,
|
||||
|
||||
180
sbv2_core/src/tts_util.rs
Normal file
180
sbv2_core/src/tts_util.rs
Normal file
@@ -0,0 +1,180 @@
|
||||
use std::io::Cursor;
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::{jtalk, nlp, norm, tokenizer, utils};
|
||||
use hound::{SampleFormat, WavSpec, WavWriter};
|
||||
use ndarray::{concatenate, s, Array, Array1, Array2, Array3, Axis};
|
||||
use tokenizers::Tokenizer;
|
||||
/// Parse text and return the input for synthesize
|
||||
///
|
||||
/// # Note
|
||||
/// This function is for low-level usage, use `easy_synthesize` for high-level usage.
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub async fn parse_text(
|
||||
text: &str,
|
||||
jtalk: &jtalk::JTalk,
|
||||
tokenizer: &Tokenizer,
|
||||
bert_predict: impl FnOnce(
|
||||
Vec<i64>,
|
||||
Vec<i64>,
|
||||
) -> std::pin::Pin<
|
||||
Box<dyn std::future::Future<Output = Result<ndarray::Array2<f32>>>>,
|
||||
>,
|
||||
) -> Result<(Array2<f32>, Array1<i64>, Array1<i64>, Array1<i64>)> {
|
||||
let text = jtalk.num2word(text)?;
|
||||
let normalized_text = norm::normalize_text(&text);
|
||||
|
||||
let process = jtalk.process_text(&normalized_text)?;
|
||||
let (phones, tones, mut word2ph) = process.g2p()?;
|
||||
let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones);
|
||||
|
||||
let phones = utils::intersperse(&phones, 0);
|
||||
let tones = utils::intersperse(&tones, 0);
|
||||
let lang_ids = utils::intersperse(&lang_ids, 0);
|
||||
for item in &mut word2ph {
|
||||
*item *= 2;
|
||||
}
|
||||
word2ph[0] += 1;
|
||||
|
||||
let text = {
|
||||
let (seq_text, _) = process.text_to_seq_kata()?;
|
||||
seq_text.join("")
|
||||
};
|
||||
let (token_ids, attention_masks) = tokenizer::tokenize(&text, tokenizer)?;
|
||||
|
||||
let bert_content = bert_predict(token_ids, attention_masks).await?;
|
||||
|
||||
assert!(
|
||||
word2ph.len() == text.chars().count() + 2,
|
||||
"{} {}",
|
||||
word2ph.len(),
|
||||
normalized_text.chars().count()
|
||||
);
|
||||
|
||||
let mut phone_level_feature = vec![];
|
||||
for (i, reps) in word2ph.iter().enumerate() {
|
||||
let repeat_feature = {
|
||||
let (reps_rows, reps_cols) = (*reps, 1);
|
||||
let arr_len = bert_content.slice(s![i, ..]).len();
|
||||
|
||||
let mut results: Array2<f32> = Array::zeros((reps_rows as usize, arr_len * reps_cols));
|
||||
|
||||
for j in 0..reps_rows {
|
||||
for k in 0..reps_cols {
|
||||
let mut view = results.slice_mut(s![j, k * arr_len..(k + 1) * arr_len]);
|
||||
view.assign(&bert_content.slice(s![i, ..]));
|
||||
}
|
||||
}
|
||||
results
|
||||
};
|
||||
phone_level_feature.push(repeat_feature);
|
||||
}
|
||||
let phone_level_feature = concatenate(
|
||||
Axis(0),
|
||||
&phone_level_feature
|
||||
.iter()
|
||||
.map(|x| x.view())
|
||||
.collect::<Vec<_>>(),
|
||||
)?;
|
||||
let bert_ori = phone_level_feature.t();
|
||||
Ok((
|
||||
bert_ori.to_owned(),
|
||||
phones.into(),
|
||||
tones.into(),
|
||||
lang_ids.into(),
|
||||
))
|
||||
}
|
||||
|
||||
/// Parse text and return the input for synthesize
|
||||
///
|
||||
/// # Note
|
||||
/// This function is for low-level usage, use `easy_synthesize` for high-level usage.
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub fn parse_text_blocking(
|
||||
text: &str,
|
||||
jtalk: &jtalk::JTalk,
|
||||
tokenizer: &Tokenizer,
|
||||
bert_predict: impl FnOnce(Vec<i64>, Vec<i64>) -> Result<ndarray::Array2<f32>>,
|
||||
) -> Result<(Array2<f32>, Array1<i64>, Array1<i64>, Array1<i64>)> {
|
||||
let text = jtalk.num2word(text)?;
|
||||
let normalized_text = norm::normalize_text(&text);
|
||||
|
||||
let process = jtalk.process_text(&normalized_text)?;
|
||||
let (phones, tones, mut word2ph) = process.g2p()?;
|
||||
let (phones, tones, lang_ids) = nlp::cleaned_text_to_sequence(phones, tones);
|
||||
|
||||
let phones = utils::intersperse(&phones, 0);
|
||||
let tones = utils::intersperse(&tones, 0);
|
||||
let lang_ids = utils::intersperse(&lang_ids, 0);
|
||||
for item in &mut word2ph {
|
||||
*item *= 2;
|
||||
}
|
||||
word2ph[0] += 1;
|
||||
|
||||
let text = {
|
||||
let (seq_text, _) = process.text_to_seq_kata()?;
|
||||
seq_text.join("")
|
||||
};
|
||||
let (token_ids, attention_masks) = tokenizer::tokenize(&text, tokenizer)?;
|
||||
|
||||
let bert_content = bert_predict(token_ids, attention_masks)?;
|
||||
|
||||
assert!(
|
||||
word2ph.len() == text.chars().count() + 2,
|
||||
"{} {}",
|
||||
word2ph.len(),
|
||||
normalized_text.chars().count()
|
||||
);
|
||||
|
||||
let mut phone_level_feature = vec![];
|
||||
for (i, reps) in word2ph.iter().enumerate() {
|
||||
let repeat_feature = {
|
||||
let (reps_rows, reps_cols) = (*reps, 1);
|
||||
let arr_len = bert_content.slice(s![i, ..]).len();
|
||||
|
||||
let mut results: Array2<f32> = Array::zeros((reps_rows as usize, arr_len * reps_cols));
|
||||
|
||||
for j in 0..reps_rows {
|
||||
for k in 0..reps_cols {
|
||||
let mut view = results.slice_mut(s![j, k * arr_len..(k + 1) * arr_len]);
|
||||
view.assign(&bert_content.slice(s![i, ..]));
|
||||
}
|
||||
}
|
||||
results
|
||||
};
|
||||
phone_level_feature.push(repeat_feature);
|
||||
}
|
||||
let phone_level_feature = concatenate(
|
||||
Axis(0),
|
||||
&phone_level_feature
|
||||
.iter()
|
||||
.map(|x| x.view())
|
||||
.collect::<Vec<_>>(),
|
||||
)?;
|
||||
let bert_ori = phone_level_feature.t();
|
||||
Ok((
|
||||
bert_ori.to_owned(),
|
||||
phones.into(),
|
||||
tones.into(),
|
||||
lang_ids.into(),
|
||||
))
|
||||
}
|
||||
|
||||
pub fn array_to_vec(audio_array: Array3<f32>) -> Result<Vec<u8>> {
|
||||
let spec = WavSpec {
|
||||
channels: 1,
|
||||
sample_rate: 44100,
|
||||
bits_per_sample: 32,
|
||||
sample_format: SampleFormat::Float,
|
||||
};
|
||||
let mut cursor = Cursor::new(Vec::new());
|
||||
let mut writer = WavWriter::new(&mut cursor, spec)?;
|
||||
for i in 0..audio_array.shape()[0] {
|
||||
let output = audio_array.slice(s![i, 0, ..]).to_vec();
|
||||
for sample in output {
|
||||
writer.write_sample(sample)?;
|
||||
}
|
||||
}
|
||||
writer.finalize()?;
|
||||
Ok(cursor.into_inner())
|
||||
}
|
||||
19
sbv2_wasm/Cargo.toml
Normal file
19
sbv2_wasm/Cargo.toml
Normal file
@@ -0,0 +1,19 @@
|
||||
[package]
|
||||
name = "sbv2_wasm"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[lib]
|
||||
crate-type = ["cdylib", "rlib"]
|
||||
|
||||
[dependencies]
|
||||
wasm-bindgen = "0.2.93"
|
||||
sbv2_core = { path = "../sbv2_core", default-features = false, features = ["no_std"] }
|
||||
once_cell.workspace = true
|
||||
js-sys = "0.3.70"
|
||||
ndarray.workspace = true
|
||||
wasm-bindgen-futures = "0.4.43"
|
||||
|
||||
[profile.release]
|
||||
lto = true
|
||||
opt-level = "s"
|
||||
2
sbv2_wasm/README.md
Normal file
2
sbv2_wasm/README.md
Normal file
@@ -0,0 +1,2 @@
|
||||
# StyleBertVITS2 wasm
|
||||
See https://github.com/tuna2134/sbv2-api for documentation and usage.
|
||||
31
sbv2_wasm/biome.json
Normal file
31
sbv2_wasm/biome.json
Normal file
@@ -0,0 +1,31 @@
|
||||
{
|
||||
"$schema": "https://biomejs.dev/schemas/1.9.2/schema.json",
|
||||
"vcs": {
|
||||
"enabled": false,
|
||||
"clientKind": "git",
|
||||
"useIgnoreFile": false
|
||||
},
|
||||
"files": {
|
||||
"ignoreUnknown": false,
|
||||
"ignore": []
|
||||
},
|
||||
"formatter": {
|
||||
"enabled": true,
|
||||
"indentStyle": "tab",
|
||||
"ignore": ["dist/", "pkg/"]
|
||||
},
|
||||
"organizeImports": {
|
||||
"enabled": true
|
||||
},
|
||||
"linter": {
|
||||
"enabled": true,
|
||||
"rules": {
|
||||
"recommended": true
|
||||
}
|
||||
},
|
||||
"javascript": {
|
||||
"formatter": {
|
||||
"quoteStyle": "double"
|
||||
}
|
||||
}
|
||||
}
|
||||
4
sbv2_wasm/build.sh
Executable file
4
sbv2_wasm/build.sh
Executable file
@@ -0,0 +1,4 @@
|
||||
#!/bin/sh
# Build the sbv2_wasm package and stage the optimized module under sbv2_wasm/dist.
# All paths are relative to the directory that contains the sbv2_wasm crate
# (presumably the repository root).

# Stop at the first failing step so a broken build never copies a stale artifact.
set -eu

wasm-pack build --target web sbv2_wasm
# Optimize the generated module in place.
wasm-opt -O3 -o sbv2_wasm/pkg/sbv2_wasm_bg.wasm sbv2_wasm/pkg/sbv2_wasm_bg.wasm
mkdir -p sbv2_wasm/dist
# Copy from the same pkg directory that wasm-opt just wrote to.
cp sbv2_wasm/pkg/sbv2_wasm_bg.wasm sbv2_wasm/dist/sbv2_wasm_bg.wasm
|
||||
51
sbv2_wasm/example.html
Normal file
51
sbv2_wasm/example.html
Normal file
@@ -0,0 +1,51 @@
|
||||
<!doctype html>
<html lang="en">

<head>
	<meta charset="UTF-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1.0" />
	<title>Style Bert VITS2 Web</title>
	<script type="importmap">
		{
			"imports": {
				"onnxruntime-web": "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.19.2/dist/ort.all.min.mjs",
				"sbv2": "https://cdn.jsdelivr.net/npm/sbv2@0.1.1+esm"
			}
		}
	</script>
	<!--
		Register the handler before the loader runs. Module scripts without `async`
		execute in document order, so `window.onready` is always defined by the time
		the loader below checks for it.
	-->
	<script type="module">
		window.onready = async function (holder) {
			await holder.load(
				"amitaro",
				await (await fetch("/models/amitaro.sbv2")).arrayBuffer(),
			);
			const wave = await holder.synthesize("amitaro", "おはよう");
			console.log(wave);
		};
	</script>
	<!-- Loader: initialise the wasm module, create the holder, then hand it to the handler. -->
	<script type="module">
		import { ModelHolder } from "sbv2";
		await ModelHolder.globalInit(
			await (
				await fetch("https://esm.sh/sbv2@0.1.1/dist/sbv2_wasm_bg.wasm", { cache: "force-cache" })
			).arrayBuffer(),
		);
		const holder = await ModelHolder.create(
			await (
				await fetch("/models/tokenizer.json", { cache: "force-cache" })
			).text(),
			await (
				await fetch("/models/deberta.onnx", { cache: "force-cache" })
			).arrayBuffer(),
		);
		if (typeof window.onready === "function") {
			window.onready(holder);
		}
	</script>
</head>

<body>
	<div id="root"></div>
</body>

</html>
|
||||
11
sbv2_wasm/example.js
Normal file
11
sbv2_wasm/example.js
Normal file
@@ -0,0 +1,11 @@
|
||||
// Node.js example: synthesize one utterance and write it to out.wav.
import { ModelHolder } from "./dist/index.js";
import fs from "node:fs/promises";

// globalInit is async; wait for the wasm module to finish initialising before any
// other ModelHolder call touches it.
await ModelHolder.globalInit(await fs.readFile("./dist/sbv2_wasm_bg.wasm"));
const holder = await ModelHolder.create(
	(await fs.readFile("../models/tokenizer.json")).toString("utf-8"),
	await fs.readFile("../models/deberta.onnx"),
);
// The first argument is the name the model is registered under; synthesize and
// unload below refer to the model by that same name.
await holder.load("tsukuyomi", await fs.readFile("../models/iroha2.sbv2"));
await fs.writeFile("out.wav", await holder.synthesize("tsukuyomi", "おはよう"));
holder.unload("tsukuyomi");
|
||||
25
sbv2_wasm/package.json
Normal file
25
sbv2_wasm/package.json
Normal file
@@ -0,0 +1,25 @@
|
||||
{
|
||||
"name": "sbv2",
|
||||
"version": "0.1.1",
|
||||
"description": "Style Bert VITS2 wasm",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"build": "tsc && esbuild src-js/index.ts --outfile=dist/index.js --minify --format=esm --bundle --external:onnxruntime-web",
|
||||
"format": "biome format --write ."
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "tuna2134",
|
||||
"license": "MIT",
|
||||
"devDependencies": {
|
||||
"@biomejs/biome": "^1.9.2",
|
||||
"@types/node": "^22.7.4",
|
||||
"esbuild": "^0.24.0",
|
||||
"typescript": "^5.6.2"
|
||||
},
|
||||
"dependencies": {
|
||||
"onnxruntime-web": "^1.19.2"
|
||||
},
|
||||
"files": ["dist/*", "package.json", "README.md"]
|
||||
}
|
||||
494
sbv2_wasm/pnpm-lock.yaml
generated
Normal file
494
sbv2_wasm/pnpm-lock.yaml
generated
Normal file
@@ -0,0 +1,494 @@
|
||||
lockfileVersion: '9.0'
|
||||
|
||||
settings:
|
||||
autoInstallPeers: true
|
||||
excludeLinksFromLockfile: false
|
||||
|
||||
importers:
|
||||
|
||||
.:
|
||||
dependencies:
|
||||
onnxruntime-web:
|
||||
specifier: ^1.19.2
|
||||
version: 1.19.2
|
||||
devDependencies:
|
||||
'@biomejs/biome':
|
||||
specifier: ^1.9.2
|
||||
version: 1.9.3
|
||||
'@types/node':
|
||||
specifier: ^22.7.4
|
||||
version: 22.7.4
|
||||
esbuild:
|
||||
specifier: ^0.24.0
|
||||
version: 0.24.0
|
||||
typescript:
|
||||
specifier: ^5.6.2
|
||||
version: 5.6.2
|
||||
|
||||
packages:
|
||||
|
||||
'@biomejs/biome@1.9.3':
|
||||
resolution: {integrity: sha512-POjAPz0APAmX33WOQFGQrwLvlu7WLV4CFJMlB12b6ZSg+2q6fYu9kZwLCOA+x83zXfcPd1RpuWOKJW0GbBwLIQ==}
|
||||
engines: {node: '>=14.21.3'}
|
||||
hasBin: true
|
||||
|
||||
'@biomejs/cli-darwin-arm64@1.9.3':
|
||||
resolution: {integrity: sha512-QZzD2XrjJDUyIZK+aR2i5DDxCJfdwiYbUKu9GzkCUJpL78uSelAHAPy7m0GuPMVtF/Uo+OKv97W3P9nuWZangQ==}
|
||||
engines: {node: '>=14.21.3'}
|
||||
cpu: [arm64]
|
||||
os: [darwin]
|
||||
|
||||
'@biomejs/cli-darwin-x64@1.9.3':
|
||||
resolution: {integrity: sha512-vSCoIBJE0BN3SWDFuAY/tRavpUtNoqiceJ5PrU3xDfsLcm/U6N93JSM0M9OAiC/X7mPPfejtr6Yc9vSgWlEgVw==}
|
||||
engines: {node: '>=14.21.3'}
|
||||
cpu: [x64]
|
||||
os: [darwin]
|
||||
|
||||
'@biomejs/cli-linux-arm64-musl@1.9.3':
|
||||
resolution: {integrity: sha512-VBzyhaqqqwP3bAkkBrhVq50i3Uj9+RWuj+pYmXrMDgjS5+SKYGE56BwNw4l8hR3SmYbLSbEo15GcV043CDSk+Q==}
|
||||
engines: {node: '>=14.21.3'}
|
||||
cpu: [arm64]
|
||||
os: [linux]
|
||||
|
||||
'@biomejs/cli-linux-arm64@1.9.3':
|
||||
resolution: {integrity: sha512-vJkAimD2+sVviNTbaWOGqEBy31cW0ZB52KtpVIbkuma7PlfII3tsLhFa+cwbRAcRBkobBBhqZ06hXoZAN8NODQ==}
|
||||
engines: {node: '>=14.21.3'}
|
||||
cpu: [arm64]
|
||||
os: [linux]
|
||||
|
||||
'@biomejs/cli-linux-x64-musl@1.9.3':
|
||||
resolution: {integrity: sha512-TJmnOG2+NOGM72mlczEsNki9UT+XAsMFAOo8J0me/N47EJ/vkLXxf481evfHLlxMejTY6IN8SdRSiPVLv6AHlA==}
|
||||
engines: {node: '>=14.21.3'}
|
||||
cpu: [x64]
|
||||
os: [linux]
|
||||
|
||||
'@biomejs/cli-linux-x64@1.9.3':
|
||||
resolution: {integrity: sha512-x220V4c+romd26Mu1ptU+EudMXVS4xmzKxPVb9mgnfYlN4Yx9vD5NZraSx/onJnd3Gh/y8iPUdU5CDZJKg9COA==}
|
||||
engines: {node: '>=14.21.3'}
|
||||
cpu: [x64]
|
||||
os: [linux]
|
||||
|
||||
'@biomejs/cli-win32-arm64@1.9.3':
|
||||
resolution: {integrity: sha512-lg/yZis2HdQGsycUvHWSzo9kOvnGgvtrYRgoCEwPBwwAL8/6crOp3+f47tPwI/LI1dZrhSji7PNsGKGHbwyAhw==}
|
||||
engines: {node: '>=14.21.3'}
|
||||
cpu: [arm64]
|
||||
os: [win32]
|
||||
|
||||
'@biomejs/cli-win32-x64@1.9.3':
|
||||
resolution: {integrity: sha512-cQMy2zanBkVLpmmxXdK6YePzmZx0s5Z7KEnwmrW54rcXK3myCNbQa09SwGZ8i/8sLw0H9F3X7K4rxVNGU8/D4Q==}
|
||||
engines: {node: '>=14.21.3'}
|
||||
cpu: [x64]
|
||||
os: [win32]
|
||||
|
||||
'@esbuild/aix-ppc64@0.24.0':
|
||||
resolution: {integrity: sha512-WtKdFM7ls47zkKHFVzMz8opM7LkcsIp9amDUBIAWirg70RM71WRSjdILPsY5Uv1D42ZpUfaPILDlfactHgsRkw==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [ppc64]
|
||||
os: [aix]
|
||||
|
||||
'@esbuild/android-arm64@0.24.0':
|
||||
resolution: {integrity: sha512-Vsm497xFM7tTIPYK9bNTYJyF/lsP590Qc1WxJdlB6ljCbdZKU9SY8i7+Iin4kyhV/KV5J2rOKsBQbB77Ab7L/w==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [arm64]
|
||||
os: [android]
|
||||
|
||||
'@esbuild/android-arm@0.24.0':
|
||||
resolution: {integrity: sha512-arAtTPo76fJ/ICkXWetLCc9EwEHKaeya4vMrReVlEIUCAUncH7M4bhMQ+M9Vf+FFOZJdTNMXNBrWwW+OXWpSew==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [arm]
|
||||
os: [android]
|
||||
|
||||
'@esbuild/android-x64@0.24.0':
|
||||
resolution: {integrity: sha512-t8GrvnFkiIY7pa7mMgJd7p8p8qqYIz1NYiAoKc75Zyv73L3DZW++oYMSHPRarcotTKuSs6m3hTOa5CKHaS02TQ==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [x64]
|
||||
os: [android]
|
||||
|
||||
'@esbuild/darwin-arm64@0.24.0':
|
||||
resolution: {integrity: sha512-CKyDpRbK1hXwv79soeTJNHb5EiG6ct3efd/FTPdzOWdbZZfGhpbcqIpiD0+vwmpu0wTIL97ZRPZu8vUt46nBSw==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [arm64]
|
||||
os: [darwin]
|
||||
|
||||
'@esbuild/darwin-x64@0.24.0':
|
||||
resolution: {integrity: sha512-rgtz6flkVkh58od4PwTRqxbKH9cOjaXCMZgWD905JOzjFKW+7EiUObfd/Kav+A6Gyud6WZk9w+xu6QLytdi2OA==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [x64]
|
||||
os: [darwin]
|
||||
|
||||
'@esbuild/freebsd-arm64@0.24.0':
|
||||
resolution: {integrity: sha512-6Mtdq5nHggwfDNLAHkPlyLBpE5L6hwsuXZX8XNmHno9JuL2+bg2BX5tRkwjyfn6sKbxZTq68suOjgWqCicvPXA==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [arm64]
|
||||
os: [freebsd]
|
||||
|
||||
'@esbuild/freebsd-x64@0.24.0':
|
||||
resolution: {integrity: sha512-D3H+xh3/zphoX8ck4S2RxKR6gHlHDXXzOf6f/9dbFt/NRBDIE33+cVa49Kil4WUjxMGW0ZIYBYtaGCa2+OsQwQ==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [x64]
|
||||
os: [freebsd]
|
||||
|
||||
'@esbuild/linux-arm64@0.24.0':
|
||||
resolution: {integrity: sha512-TDijPXTOeE3eaMkRYpcy3LarIg13dS9wWHRdwYRnzlwlA370rNdZqbcp0WTyyV/k2zSxfko52+C7jU5F9Tfj1g==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [arm64]
|
||||
os: [linux]
|
||||
|
||||
'@esbuild/linux-arm@0.24.0':
|
||||
resolution: {integrity: sha512-gJKIi2IjRo5G6Glxb8d3DzYXlxdEj2NlkixPsqePSZMhLudqPhtZ4BUrpIuTjJYXxvF9njql+vRjB2oaC9XpBw==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [arm]
|
||||
os: [linux]
|
||||
|
||||
'@esbuild/linux-ia32@0.24.0':
|
||||
resolution: {integrity: sha512-K40ip1LAcA0byL05TbCQ4yJ4swvnbzHscRmUilrmP9Am7//0UjPreh4lpYzvThT2Quw66MhjG//20mrufm40mA==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [ia32]
|
||||
os: [linux]
|
||||
|
||||
'@esbuild/linux-loong64@0.24.0':
|
||||
resolution: {integrity: sha512-0mswrYP/9ai+CU0BzBfPMZ8RVm3RGAN/lmOMgW4aFUSOQBjA31UP8Mr6DDhWSuMwj7jaWOT0p0WoZ6jeHhrD7g==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [loong64]
|
||||
os: [linux]
|
||||
|
||||
'@esbuild/linux-mips64el@0.24.0':
|
||||
resolution: {integrity: sha512-hIKvXm0/3w/5+RDtCJeXqMZGkI2s4oMUGj3/jM0QzhgIASWrGO5/RlzAzm5nNh/awHE0A19h/CvHQe6FaBNrRA==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [mips64el]
|
||||
os: [linux]
|
||||
|
||||
'@esbuild/linux-ppc64@0.24.0':
|
||||
resolution: {integrity: sha512-HcZh5BNq0aC52UoocJxaKORfFODWXZxtBaaZNuN3PUX3MoDsChsZqopzi5UupRhPHSEHotoiptqikjN/B77mYQ==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [ppc64]
|
||||
os: [linux]
|
||||
|
||||
'@esbuild/linux-riscv64@0.24.0':
|
||||
resolution: {integrity: sha512-bEh7dMn/h3QxeR2KTy1DUszQjUrIHPZKyO6aN1X4BCnhfYhuQqedHaa5MxSQA/06j3GpiIlFGSsy1c7Gf9padw==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [riscv64]
|
||||
os: [linux]
|
||||
|
||||
'@esbuild/linux-s390x@0.24.0':
|
||||
resolution: {integrity: sha512-ZcQ6+qRkw1UcZGPyrCiHHkmBaj9SiCD8Oqd556HldP+QlpUIe2Wgn3ehQGVoPOvZvtHm8HPx+bH20c9pvbkX3g==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [s390x]
|
||||
os: [linux]
|
||||
|
||||
'@esbuild/linux-x64@0.24.0':
|
||||
resolution: {integrity: sha512-vbutsFqQ+foy3wSSbmjBXXIJ6PL3scghJoM8zCL142cGaZKAdCZHyf+Bpu/MmX9zT9Q0zFBVKb36Ma5Fzfa8xA==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [x64]
|
||||
os: [linux]
|
||||
|
||||
'@esbuild/netbsd-x64@0.24.0':
|
||||
resolution: {integrity: sha512-hjQ0R/ulkO8fCYFsG0FZoH+pWgTTDreqpqY7UnQntnaKv95uP5iW3+dChxnx7C3trQQU40S+OgWhUVwCjVFLvg==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [x64]
|
||||
os: [netbsd]
|
||||
|
||||
'@esbuild/openbsd-arm64@0.24.0':
|
||||
resolution: {integrity: sha512-MD9uzzkPQbYehwcN583yx3Tu5M8EIoTD+tUgKF982WYL9Pf5rKy9ltgD0eUgs8pvKnmizxjXZyLt0z6DC3rRXg==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [arm64]
|
||||
os: [openbsd]
|
||||
|
||||
'@esbuild/openbsd-x64@0.24.0':
|
||||
resolution: {integrity: sha512-4ir0aY1NGUhIC1hdoCzr1+5b43mw99uNwVzhIq1OY3QcEwPDO3B7WNXBzaKY5Nsf1+N11i1eOfFcq+D/gOS15Q==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [x64]
|
||||
os: [openbsd]
|
||||
|
||||
'@esbuild/sunos-x64@0.24.0':
|
||||
resolution: {integrity: sha512-jVzdzsbM5xrotH+W5f1s+JtUy1UWgjU0Cf4wMvffTB8m6wP5/kx0KiaLHlbJO+dMgtxKV8RQ/JvtlFcdZ1zCPA==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [x64]
|
||||
os: [sunos]
|
||||
|
||||
'@esbuild/win32-arm64@0.24.0':
|
||||
resolution: {integrity: sha512-iKc8GAslzRpBytO2/aN3d2yb2z8XTVfNV0PjGlCxKo5SgWmNXx82I/Q3aG1tFfS+A2igVCY97TJ8tnYwpUWLCA==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [arm64]
|
||||
os: [win32]
|
||||
|
||||
'@esbuild/win32-ia32@0.24.0':
|
||||
resolution: {integrity: sha512-vQW36KZolfIudCcTnaTpmLQ24Ha1RjygBo39/aLkM2kmjkWmZGEJ5Gn9l5/7tzXA42QGIoWbICfg6KLLkIw6yw==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [ia32]
|
||||
os: [win32]
|
||||
|
||||
'@esbuild/win32-x64@0.24.0':
|
||||
resolution: {integrity: sha512-7IAFPrjSQIJrGsK6flwg7NFmwBoSTyF3rl7If0hNUFQU4ilTsEPL6GuMuU9BfIWVVGuRnuIidkSMC+c0Otu8IA==}
|
||||
engines: {node: '>=18'}
|
||||
cpu: [x64]
|
||||
os: [win32]
|
||||
|
||||
'@protobufjs/aspromise@1.1.2':
|
||||
resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==}
|
||||
|
||||
'@protobufjs/base64@1.1.2':
|
||||
resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==}
|
||||
|
||||
'@protobufjs/codegen@2.0.4':
|
||||
resolution: {integrity: sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==}
|
||||
|
||||
'@protobufjs/eventemitter@1.1.0':
|
||||
resolution: {integrity: sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==}
|
||||
|
||||
'@protobufjs/fetch@1.1.0':
|
||||
resolution: {integrity: sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==}
|
||||
|
||||
'@protobufjs/float@1.0.2':
|
||||
resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==}
|
||||
|
||||
'@protobufjs/inquire@1.1.0':
|
||||
resolution: {integrity: sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==}
|
||||
|
||||
'@protobufjs/path@1.1.2':
|
||||
resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==}
|
||||
|
||||
'@protobufjs/pool@1.1.0':
|
||||
resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==}
|
||||
|
||||
'@protobufjs/utf8@1.1.0':
|
||||
resolution: {integrity: sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==}
|
||||
|
||||
'@types/node@22.7.4':
|
||||
resolution: {integrity: sha512-y+NPi1rFzDs1NdQHHToqeiX2TIS79SWEAw9GYhkkx8bD0ChpfqC+n2j5OXOCpzfojBEBt6DnEnnG9MY0zk1XLg==}
|
||||
|
||||
esbuild@0.24.0:
|
||||
resolution: {integrity: sha512-FuLPevChGDshgSicjisSooU0cemp/sGXR841D5LHMB7mTVOmsEHcAxaH3irL53+8YDIeVNQEySh4DaYU/iuPqQ==}
|
||||
engines: {node: '>=18'}
|
||||
hasBin: true
|
||||
|
||||
flatbuffers@1.12.0:
|
||||
resolution: {integrity: sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ==}
|
||||
|
||||
guid-typescript@1.0.9:
|
||||
resolution: {integrity: sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==}
|
||||
|
||||
long@5.2.3:
|
||||
resolution: {integrity: sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q==}
|
||||
|
||||
onnxruntime-common@1.19.2:
|
||||
resolution: {integrity: sha512-a4R7wYEVFbZBlp0BfhpbFWqe4opCor3KM+5Wm22Az3NGDcQMiU2hfG/0MfnBs+1ZrlSGmlgWeMcXQkDk1UFb8Q==}
|
||||
|
||||
onnxruntime-web@1.19.2:
|
||||
resolution: {integrity: sha512-r0ok6KpTUXR4WA+rHvUiZn7JoH02e8iS7XE1p5bXk7q3E0UaRFfYvpMNUHqEPiTBMuIssfBxDCQjUihV8dDFPg==}
|
||||
|
||||
platform@1.3.6:
|
||||
resolution: {integrity: sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==}
|
||||
|
||||
protobufjs@7.4.0:
|
||||
resolution: {integrity: sha512-mRUWCc3KUU4w1jU8sGxICXH/gNS94DvI1gxqDvBzhj1JpcsimQkYiOJfwsPUykUI5ZaspFbSgmBLER8IrQ3tqw==}
|
||||
engines: {node: '>=12.0.0'}
|
||||
|
||||
typescript@5.6.2:
|
||||
resolution: {integrity: sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==}
|
||||
engines: {node: '>=14.17'}
|
||||
hasBin: true
|
||||
|
||||
undici-types@6.19.8:
|
||||
resolution: {integrity: sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==}
|
||||
|
||||
snapshots:
|
||||
|
||||
'@biomejs/biome@1.9.3':
|
||||
optionalDependencies:
|
||||
'@biomejs/cli-darwin-arm64': 1.9.3
|
||||
'@biomejs/cli-darwin-x64': 1.9.3
|
||||
'@biomejs/cli-linux-arm64': 1.9.3
|
||||
'@biomejs/cli-linux-arm64-musl': 1.9.3
|
||||
'@biomejs/cli-linux-x64': 1.9.3
|
||||
'@biomejs/cli-linux-x64-musl': 1.9.3
|
||||
'@biomejs/cli-win32-arm64': 1.9.3
|
||||
'@biomejs/cli-win32-x64': 1.9.3
|
||||
|
||||
'@biomejs/cli-darwin-arm64@1.9.3':
|
||||
optional: true
|
||||
|
||||
'@biomejs/cli-darwin-x64@1.9.3':
|
||||
optional: true
|
||||
|
||||
'@biomejs/cli-linux-arm64-musl@1.9.3':
|
||||
optional: true
|
||||
|
||||
'@biomejs/cli-linux-arm64@1.9.3':
|
||||
optional: true
|
||||
|
||||
'@biomejs/cli-linux-x64-musl@1.9.3':
|
||||
optional: true
|
||||
|
||||
'@biomejs/cli-linux-x64@1.9.3':
|
||||
optional: true
|
||||
|
||||
'@biomejs/cli-win32-arm64@1.9.3':
|
||||
optional: true
|
||||
|
||||
'@biomejs/cli-win32-x64@1.9.3':
|
||||
optional: true
|
||||
|
||||
'@esbuild/aix-ppc64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/android-arm64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/android-arm@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/android-x64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/darwin-arm64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/darwin-x64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/freebsd-arm64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/freebsd-x64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/linux-arm64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/linux-arm@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/linux-ia32@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/linux-loong64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/linux-mips64el@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/linux-ppc64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/linux-riscv64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/linux-s390x@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/linux-x64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/netbsd-x64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/openbsd-arm64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/openbsd-x64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/sunos-x64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/win32-arm64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/win32-ia32@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@esbuild/win32-x64@0.24.0':
|
||||
optional: true
|
||||
|
||||
'@protobufjs/aspromise@1.1.2': {}
|
||||
|
||||
'@protobufjs/base64@1.1.2': {}
|
||||
|
||||
'@protobufjs/codegen@2.0.4': {}
|
||||
|
||||
'@protobufjs/eventemitter@1.1.0': {}
|
||||
|
||||
'@protobufjs/fetch@1.1.0':
|
||||
dependencies:
|
||||
'@protobufjs/aspromise': 1.1.2
|
||||
'@protobufjs/inquire': 1.1.0
|
||||
|
||||
'@protobufjs/float@1.0.2': {}
|
||||
|
||||
'@protobufjs/inquire@1.1.0': {}
|
||||
|
||||
'@protobufjs/path@1.1.2': {}
|
||||
|
||||
'@protobufjs/pool@1.1.0': {}
|
||||
|
||||
'@protobufjs/utf8@1.1.0': {}
|
||||
|
||||
'@types/node@22.7.4':
|
||||
dependencies:
|
||||
undici-types: 6.19.8
|
||||
|
||||
esbuild@0.24.0:
|
||||
optionalDependencies:
|
||||
'@esbuild/aix-ppc64': 0.24.0
|
||||
'@esbuild/android-arm': 0.24.0
|
||||
'@esbuild/android-arm64': 0.24.0
|
||||
'@esbuild/android-x64': 0.24.0
|
||||
'@esbuild/darwin-arm64': 0.24.0
|
||||
'@esbuild/darwin-x64': 0.24.0
|
||||
'@esbuild/freebsd-arm64': 0.24.0
|
||||
'@esbuild/freebsd-x64': 0.24.0
|
||||
'@esbuild/linux-arm': 0.24.0
|
||||
'@esbuild/linux-arm64': 0.24.0
|
||||
'@esbuild/linux-ia32': 0.24.0
|
||||
'@esbuild/linux-loong64': 0.24.0
|
||||
'@esbuild/linux-mips64el': 0.24.0
|
||||
'@esbuild/linux-ppc64': 0.24.0
|
||||
'@esbuild/linux-riscv64': 0.24.0
|
||||
'@esbuild/linux-s390x': 0.24.0
|
||||
'@esbuild/linux-x64': 0.24.0
|
||||
'@esbuild/netbsd-x64': 0.24.0
|
||||
'@esbuild/openbsd-arm64': 0.24.0
|
||||
'@esbuild/openbsd-x64': 0.24.0
|
||||
'@esbuild/sunos-x64': 0.24.0
|
||||
'@esbuild/win32-arm64': 0.24.0
|
||||
'@esbuild/win32-ia32': 0.24.0
|
||||
'@esbuild/win32-x64': 0.24.0
|
||||
|
||||
flatbuffers@1.12.0: {}
|
||||
|
||||
guid-typescript@1.0.9: {}
|
||||
|
||||
long@5.2.3: {}
|
||||
|
||||
onnxruntime-common@1.19.2: {}
|
||||
|
||||
onnxruntime-web@1.19.2:
|
||||
dependencies:
|
||||
flatbuffers: 1.12.0
|
||||
guid-typescript: 1.0.9
|
||||
long: 5.2.3
|
||||
onnxruntime-common: 1.19.2
|
||||
platform: 1.3.6
|
||||
protobufjs: 7.4.0
|
||||
|
||||
platform@1.3.6: {}
|
||||
|
||||
protobufjs@7.4.0:
|
||||
dependencies:
|
||||
'@protobufjs/aspromise': 1.1.2
|
||||
'@protobufjs/base64': 1.1.2
|
||||
'@protobufjs/codegen': 2.0.4
|
||||
'@protobufjs/eventemitter': 1.1.0
|
||||
'@protobufjs/fetch': 1.1.0
|
||||
'@protobufjs/float': 1.0.2
|
||||
'@protobufjs/inquire': 1.1.0
|
||||
'@protobufjs/path': 1.1.2
|
||||
'@protobufjs/pool': 1.1.0
|
||||
'@protobufjs/utf8': 1.1.0
|
||||
'@types/node': 22.7.4
|
||||
long: 5.2.3
|
||||
|
||||
typescript@5.6.2: {}
|
||||
|
||||
undici-types@6.19.8: {}
|
||||
106
sbv2_wasm/src-js/index.ts
Normal file
106
sbv2_wasm/src-js/index.ts
Normal file
@@ -0,0 +1,106 @@
|
||||
import * as wasm from "../pkg/sbv2_wasm.js";
|
||||
import { InferenceSession, Tensor } from "onnxruntime-web";
|
||||
|
||||
/**
 * Owns the tokenizer and the DeBERTa inference session shared by all voices,
 * plus a registry of loaded voice models, and drives synthesis through the
 * wasm core.
 */
export class ModelHolder {
  /** Loaded voice models keyed by name: `[vits2 session, style vectors]`. */
  private models: Map<string, [InferenceSession, wasm.StyleVectorWrap]> =
    new Map();

  constructor(
    private tok: wasm.TokenizerWrap,
    private deberta: InferenceSession,
  ) {}

  /**
   * Initializes the wasm module. Must complete before any other call.
   *
   * @param buf Buffer handed to the wasm module's default initializer.
   */
  public static async globalInit(buf: ArrayBufferLike) {
    await wasm.default(buf);
  }

  /**
   * Creates a holder from a tokenizer definition and a DeBERTa ONNX model.
   *
   * @param tok Tokenizer definition as a string.
   * @param deberta Serialized ONNX model used for the BERT feature pass.
   */
  public static async create(tok: string, deberta: ArrayBufferLike) {
    return new ModelHolder(
      wasm.load_tokenizer(tok),
      await InferenceSession.create(deberta, {
        executionProviders: ["webnn", "webgpu", "wasm", "cpu"],
        graphOptimizationLevel: "all",
      }),
    );
  }

  /**
   * Synthesizes `text` with the model registered under `name`.
   *
   * @param name Name the model was registered with via {@link load}.
   * @param text Text to synthesize.
   * @param style_id Index of the style vector to use.
   * @param style_weight Weight applied to the selected style.
   * @param sdp_ratio Forwarded to the model as `sdp_ratio`.
   * @param speed Speaking speed; the model receives `1 / speed` as its
   *   length scale, so it must be a finite number greater than zero.
   * @returns The bytes produced by the wasm `synthesize` function.
   * @throws Error if no model is registered under `name`.
   * @throws RangeError if `speed` is not a finite positive number.
   */
  public async synthesize(
    name: string,
    text: string,
    style_id: number = 0,
    style_weight: number = 1.0,
    sdp_ratio: number = 0.4,
    speed: number = 1.0,
  ) {
    const mod = this.models.get(name);
    if (!mod) throw new Error(`No model named ${name}`);
    // 1 / speed would otherwise become Infinity, negative or NaN and be fed
    // straight into the model as its length scale.
    if (!Number.isFinite(speed) || speed <= 0) {
      throw new RangeError(`speed must be a finite number > 0, got ${speed}`);
    }
    const [vits2, style] = mod;
    return wasm.synthesize(
      text,
      this.tok,
      // BERT callback: receives token ids and attention mask, resolves to
      // `[shape, data]` of the DeBERTa output.
      async (a: BigInt64Array, b: BigInt64Array) => {
        try {
          const res = (
            await this.deberta.run({
              input_ids: new Tensor("int64", a, [1, a.length]),
              attention_mask: new Tensor("int64", b, [1, b.length]),
            })
          )["output"];
          return [new Uint32Array(res.dims), await res.getData(true)];
        } catch (e) {
          // Logged here as well so the original error object stays visible
          // in the console after crossing the wasm boundary.
          console.warn(e);
          throw e;
        }
      },
      // Synthesis callback: receives the prepared model inputs and resolves
      // to `[shape, data]` of the vits2 output.
      async (
        [a_shape, a_array]: any,
        b_d: any,
        c_d: any,
        d_d: any,
        e_d: any,
        f: number,
        g: number,
      ) => {
        try {
          const a = new Tensor("float32", a_array, [1, ...a_shape]);
          const b = new Tensor("int64", b_d, [1, b_d.length]);
          const c = new Tensor("int64", c_d, [1, c_d.length]);
          const d = new Tensor("int64", d_d, [1, d_d.length]);
          const e = new Tensor("float32", e_d, [1, e_d.length]);
          const res = (
            await vits2.run({
              x_tst: b,
              x_tst_lengths: new Tensor("int64", [b_d.length]),
              sid: new Tensor("int64", [0]),
              tones: c,
              language: d,
              bert: a,
              style_vec: e,
              sdp_ratio: new Tensor("float32", [f]),
              length_scale: new Tensor("float32", [g]),
            })
          ).output;
          return [new Uint32Array(res.dims), await res.getData(true)];
        } catch (e) {
          console.warn(e);
          throw e;
        }
      },
      sdp_ratio,
      1.0 / speed,
      style_id,
      style_weight,
      style,
    );
  }

  /**
   * Parses an sbv2 file and registers the contained model under `name`.
   * A model previously registered under the same name is replaced and its
   * resources are released.
   *
   * @param name Name to register the model under.
   * @param b Contents of the sbv2 file.
   */
  public async load(name: string, b: Uint8Array) {
    const [style, vits2_b] = wasm.load_sbv2file(b);
    let vits2: InferenceSession;
    try {
      vits2 = await InferenceSession.create(vits2_b as Uint8Array, {
        executionProviders: ["webnn", "webgpu", "wasm", "cpu"],
        graphOptimizationLevel: "all",
      });
    } catch (e) {
      // The style vectors live in wasm memory and are not garbage collected.
      (style as wasm.StyleVectorWrap).free();
      throw e;
    }
    const previous = this.models.get(name);
    this.models.set(name, [vits2, style]);
    if (previous) await ModelHolder.dispose(previous);
  }

  /**
   * Removes the model registered under `name` and releases its resources.
   * NOTE(review): callers should make sure no synthesis using this model is
   * still in flight when unloading it.
   *
   * @returns `true` if a model was registered under `name`, else `false`.
   */
  public async unload(name: string) {
    const entry = this.models.get(name);
    if (!entry) return false;
    this.models.delete(name);
    await ModelHolder.dispose(entry);
    return true;
  }

  /** Returns an iterator over the names of all registered models. */
  public modelList() {
    return this.models.keys();
  }

  /** Releases the inference session and the wasm-side style vectors. */
  private static async dispose([session, style]: [
    InferenceSession,
    wasm.StyleVectorWrap,
  ]) {
    try {
      await session.release();
    } finally {
      style.free();
    }
  }
}
|
||||
102
sbv2_wasm/src/array_helper.rs
Normal file
102
sbv2_wasm/src/array_helper.rs
Normal file
@@ -0,0 +1,102 @@
|
||||
/// Copies `v` into a freshly allocated JavaScript `Uint8Array`.
pub fn vec8_to_array8(v: Vec<u8>) -> js_sys::Uint8Array {
    // `From<&[u8]>` allocates a new JS-owned array and copies the bytes into it.
    js_sys::Uint8Array::from(v.as_slice())
}
|
||||
|
||||
/// Copies `v` into a freshly allocated JavaScript `Float32Array`.
pub fn vec_f32_to_array_f32(v: Vec<f32>) -> js_sys::Float32Array {
    // `From<&[f32]>` allocates a new JS-owned array and copies the values into it.
    js_sys::Float32Array::from(v.as_slice())
}
|
||||
|
||||
pub fn array8_to_vec8(buf: js_sys::Uint8Array) -> Vec<u8> {
|
||||
let mut body = vec![0; buf.length() as usize];
|
||||
buf.copy_to(&mut body[..]);
|
||||
body
|
||||
}
|
||||
|
||||
/// Copies `v` into a freshly allocated JavaScript `BigInt64Array`.
pub fn vec64_to_array64(v: Vec<i64>) -> js_sys::BigInt64Array {
    // `From<&[i64]>` allocates a new JS-owned array and copies the values into it.
    js_sys::BigInt64Array::from(v.as_slice())
}
|
||||
|
||||
/// Builds a JavaScript `Array` holding the given values in order.
pub fn vec_to_array(v: Vec<wasm_bindgen::JsValue>) -> js_sys::Array {
    v.into_iter().collect::<js_sys::Array>()
}
|
||||
|
||||
struct A {
|
||||
shape: Vec<u32>,
|
||||
data: Vec<f32>,
|
||||
}
|
||||
|
||||
impl TryFrom<wasm_bindgen::JsValue> for A {
|
||||
type Error = sbv2_core::error::Error;
|
||||
|
||||
fn try_from(value: wasm_bindgen::JsValue) -> Result<Self, Self::Error> {
|
||||
let value: js_sys::Array = value.into();
|
||||
let mut shape = vec![];
|
||||
let mut data = vec![];
|
||||
for (i, v) in value.iter().enumerate() {
|
||||
match i {
|
||||
0 => {
|
||||
let v: js_sys::Uint32Array = v.into();
|
||||
shape = vec![0; v.length() as usize];
|
||||
v.copy_to(&mut shape);
|
||||
}
|
||||
1 => {
|
||||
let v: js_sys::Float32Array = v.into();
|
||||
data = vec![0.0; v.length() as usize];
|
||||
v.copy_to(&mut data);
|
||||
}
|
||||
_ => {}
|
||||
};
|
||||
}
|
||||
Ok(A { shape, data })
|
||||
}
|
||||
}
|
||||
|
||||
pub fn array_to_array2_f32(
|
||||
a: wasm_bindgen::JsValue,
|
||||
) -> sbv2_core::error::Result<ndarray::Array2<f32>> {
|
||||
let a = A::try_from(a)?;
|
||||
if a.shape.len() != 2 {
|
||||
return Err(sbv2_core::error::Error::OtherError(
|
||||
"Length mismatch".to_string(),
|
||||
));
|
||||
}
|
||||
let shape = [a.shape[0] as usize, a.shape[1] as usize];
|
||||
let arr = ndarray::Array2::from_shape_vec(shape, a.data.to_vec())
|
||||
.map_err(|e| sbv2_core::error::Error::OtherError(e.to_string()))?;
|
||||
Ok(arr)
|
||||
}
|
||||
pub fn array_to_array3_f32(
|
||||
a: wasm_bindgen::JsValue,
|
||||
) -> sbv2_core::error::Result<ndarray::Array3<f32>> {
|
||||
let a = A::try_from(a)?;
|
||||
if a.shape.len() != 3 {
|
||||
return Err(sbv2_core::error::Error::OtherError(
|
||||
"Length mismatch".to_string(),
|
||||
));
|
||||
}
|
||||
let shape = [
|
||||
a.shape[0] as usize,
|
||||
a.shape[1] as usize,
|
||||
a.shape[2] as usize,
|
||||
];
|
||||
let arr = ndarray::Array3::from_shape_vec(shape, a.data.to_vec())
|
||||
.map_err(|e| sbv2_core::error::Error::OtherError(e.to_string()))?;
|
||||
Ok(arr)
|
||||
}
|
||||
|
||||
pub fn array2_f32_to_array(a: ndarray::Array2<f32>) -> js_sys::Array {
|
||||
let shape: Vec<wasm_bindgen::JsValue> = a.shape().iter().map(|f| (*f as u32).into()).collect();
|
||||
let typed_array = js_sys::Float32Array::new_with_length(a.len() as u32);
|
||||
typed_array.copy_from(&a.into_flat().to_vec());
|
||||
vec_to_array(vec![vec_to_array(shape).into(), typed_array.into()])
|
||||
}
|
||||
123
sbv2_wasm/src/lib.rs
Normal file
123
sbv2_wasm/src/lib.rs
Normal file
@@ -0,0 +1,123 @@
|
||||
use once_cell::sync::Lazy;
|
||||
use sbv2_core::*;
|
||||
use wasm_bindgen::prelude::*;
|
||||
use wasm_bindgen_futures::JsFuture;
|
||||
mod array_helper;
|
||||
|
||||
/// Shared `JTalk` instance, constructed lazily on first access.
///
/// # Panics
/// The first access panics if `jtalk::JTalk::new()` does not succeed, because
/// its result is unwrapped.
static JTALK: Lazy<jtalk::JTalk> = Lazy::new(|| jtalk::JTalk::new().unwrap());
|
||||
|
||||
#[wasm_bindgen]
|
||||
pub struct TokenizerWrap {
|
||||
tokenizer: tokenizer::Tokenizer,
|
||||
}
|
||||
|
||||
#[wasm_bindgen]
|
||||
pub fn load_tokenizer(s: js_sys::JsString) -> Result<TokenizerWrap, JsError> {
|
||||
if let Some(s) = s.as_string() {
|
||||
Ok(TokenizerWrap {
|
||||
tokenizer: tokenizer::Tokenizer::from_bytes(s.as_bytes())
|
||||
.map_err(|e| JsError::new(&e.to_string()))?,
|
||||
})
|
||||
} else {
|
||||
Err(JsError::new("invalid utf8"))
|
||||
}
|
||||
}
|
||||
|
||||
#[wasm_bindgen]
|
||||
pub struct StyleVectorWrap {
|
||||
style_vector: ndarray::Array2<f32>,
|
||||
}
|
||||
|
||||
#[wasm_bindgen]
|
||||
pub fn load_sbv2file(buf: js_sys::Uint8Array) -> Result<js_sys::Array, JsError> {
|
||||
let (style_vectors, vits2) = sbv2file::parse_sbv2file(array_helper::array8_to_vec8(buf))?;
|
||||
let buf = array_helper::vec8_to_array8(vits2);
|
||||
Ok(array_helper::vec_to_array(vec![
|
||||
StyleVectorWrap {
|
||||
style_vector: style::load_style(style_vectors)?,
|
||||
}
|
||||
.into(),
|
||||
buf.into(),
|
||||
]))
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[wasm_bindgen]
|
||||
pub async fn synthesize(
|
||||
text: &str,
|
||||
tokenizer: &TokenizerWrap,
|
||||
bert_predict_fn: js_sys::Function,
|
||||
synthesize_fn: js_sys::Function,
|
||||
sdp_ratio: f32,
|
||||
length_scale: f32,
|
||||
style_id: i32,
|
||||
style_weight: f32,
|
||||
style_vectors: &StyleVectorWrap,
|
||||
) -> Result<js_sys::Uint8Array, JsError> {
|
||||
let synthesize_wrap = |bert_ori: ndarray::Array2<f32>,
|
||||
x_tst: ndarray::Array1<i64>,
|
||||
tones: ndarray::Array1<i64>,
|
||||
lang_ids: ndarray::Array1<i64>,
|
||||
style_vector: ndarray::Array1<f32>,
|
||||
sdp_ratio: f32,
|
||||
length_scale: f32| async move {
|
||||
let arr = array_helper::vec_to_array(vec![
|
||||
array_helper::array2_f32_to_array(bert_ori).into(),
|
||||
array_helper::vec64_to_array64(x_tst.to_vec()).into(),
|
||||
array_helper::vec64_to_array64(tones.to_vec()).into(),
|
||||
array_helper::vec64_to_array64(lang_ids.to_vec()).into(),
|
||||
array_helper::vec_f32_to_array_f32(style_vector.to_vec()).into(),
|
||||
sdp_ratio.into(),
|
||||
length_scale.into(),
|
||||
]);
|
||||
let res = synthesize_fn
|
||||
.apply(&js_sys::Object::new().into(), &arr)
|
||||
.map_err(|e| {
|
||||
error::Error::OtherError(e.as_string().unwrap_or("unknown".to_string()))
|
||||
})?;
|
||||
let res = JsFuture::from(Into::<js_sys::Promise>::into(res))
|
||||
.await
|
||||
.map_err(|e| {
|
||||
sbv2_core::error::Error::OtherError(e.as_string().unwrap_or("unknown".to_string()))
|
||||
})?;
|
||||
array_helper::array_to_array3_f32(res)
|
||||
};
|
||||
let (bert_ori, phones, tones, lang_ids) = tts_util::parse_text(
|
||||
text,
|
||||
&JTALK,
|
||||
&tokenizer.tokenizer,
|
||||
|token_ids: Vec<i64>, attention_masks: Vec<i64>| {
|
||||
Box::pin(async move {
|
||||
let arr = array_helper::vec_to_array(vec![
|
||||
array_helper::vec64_to_array64(token_ids).into(),
|
||||
array_helper::vec64_to_array64(attention_masks).into(),
|
||||
]);
|
||||
let res = bert_predict_fn
|
||||
.apply(&js_sys::Object::new().into(), &arr)
|
||||
.map_err(|e| {
|
||||
error::Error::OtherError(e.as_string().unwrap_or("unknown".to_string()))
|
||||
})?;
|
||||
let res = JsFuture::from(Into::<js_sys::Promise>::into(res))
|
||||
.await
|
||||
.map_err(|e| {
|
||||
sbv2_core::error::Error::OtherError(
|
||||
e.as_string().unwrap_or("unknown".to_string()),
|
||||
)
|
||||
})?;
|
||||
array_helper::array_to_array2_f32(res)
|
||||
})
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let audio = synthesize_wrap(
|
||||
bert_ori.to_owned(),
|
||||
phones,
|
||||
tones,
|
||||
lang_ids,
|
||||
style::get_style_vector(&style_vectors.style_vector, style_id, style_weight)?,
|
||||
sdp_ratio,
|
||||
length_scale,
|
||||
)
|
||||
.await?;
|
||||
Ok(array_helper::vec8_to_array8(tts_util::array_to_vec(audio)?))
|
||||
}
|
||||
15
sbv2_wasm/tsconfig.json
Normal file
15
sbv2_wasm/tsconfig.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ESNext",
|
||||
"module": "ESNext",
|
||||
"rootDir": "./src-js",
|
||||
"outDir": "./dist",
|
||||
"moduleResolution": "node",
|
||||
"esModuleInterop": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"strict": true,
|
||||
"skipLibCheck": true,
|
||||
"declaration": true,
|
||||
"emitDeclarationOnly": true
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user