This commit is contained in:
Googlefan
2025-02-22 08:00:17 +00:00
parent 14d631eeaa
commit 506ee4d883
60 changed files with 927 additions and 517 deletions

5
scripts/.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
*.json
venv/
tmp/
*.safetensors
*.npy

36
scripts/convert/README.md Normal file
View File

@@ -0,0 +1,36 @@
# Conversion Guide
## Setup for Beginners
Skip this section if you already know what you are doing.
1. Install Python. This is tested with 3.11.8, but most recent versions should work.
2. `cd convert`
3. `python -m venv venv`
4. `source venv/bin/activate`
5. `pip install -r requirements.txt`
## Model Conversion
1. Locate the file of the model you want to convert (the one ending in `.safetensors`).
2. Likewise, locate the files named `config.json` and `style_vectors.npy`.
3. Run the following command:
```sh
python convert_model.py --style_file "path to style_vectors.npy" --config_file "path to config.json" --model_file "path to the .safetensors file"
```
4. A file named `models/<name>.sbv2` is produced. Drop it into the GUI version's model folder to use it.
## Deberta Conversion
If this section means nothing to you, you most likely don't need to run it.
Just set up a venv, install the requirements, and run `python convert_deberta.py`.
If `models/deberta.onnx` and `models/tokenizer.json` are produced, it worked.

50
scripts/convert/convert_deberta.py Normal file
View File

@@ -0,0 +1,50 @@
from argparse import ArgumentParser

import torch
from torch import nn
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers.convert_slow_tokenizer import BertConverter

from style_bert_vits2.constants import Languages
from style_bert_vits2.nlp import bert_models

parser = ArgumentParser()
parser.add_argument("--model", default="ku-nlp/deberta-v2-large-japanese-char-wwm")
args = parser.parse_args()
model_name = args.model

# Convert the slow tokenizer to a fast one and save it as tokenizer.json.
bert_models.load_tokenizer(Languages.JP, model_name)
tokenizer = bert_models.load_tokenizer(Languages.JP)
converter = BertConverter(tokenizer)
tokenizer = converter.converted()
tokenizer.save("../models/tokenizer.json")


class ORTDeberta(nn.Module):
    """Wrapper whose forward() returns the hidden states used as BERT features."""

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)

    def forward(self, input_ids, token_type_ids, attention_mask):
        inputs = {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "attention_mask": attention_mask,
        }
        res = self.model(**inputs, output_hidden_states=True)
        # Keep the third-to-last hidden state and drop the batch dimension.
        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
        return res


model = ORTDeberta(model_name)

# Trace the wrapper with a short example sentence and export it to ONNX.
inputs = AutoTokenizer.from_pretrained(model_name)(
    "今日はいい天気ですね", return_tensors="pt"
)
torch.onnx.export(
    model,
    (inputs["input_ids"], inputs["token_type_ids"], inputs["attention_mask"]),
    "../models/deberta.onnx",
    input_names=["input_ids", "token_type_ids", "attention_mask"],
    output_names=["output"],
    verbose=True,
    # Axis 1 is the token axis; marking it dynamic lets the exported model
    # accept inputs of any length.
    dynamic_axes={"input_ids": {1: "batch_size"}, "attention_mask": {1: "batch_size"}},
)
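A quick way to sanity-check the exported files is to run them through `onnxruntime` (listed in the requirements). This is a minimal sketch, not part of the commit; it reuses the exact sentence the model was traced with, since only `input_ids` and `attention_mask` are exported with a dynamic token axis:

```python
# Hypothetical sanity check for the exported files -- not part of this commit.
import numpy as np
import onnxruntime as ort
from tokenizers import Tokenizer

tok = Tokenizer.from_file("../models/tokenizer.json")
# Use the same sentence as the export above; token_type_ids has a fixed shape.
enc = tok.encode("今日はいい天気ですね")

sess = ort.InferenceSession("../models/deberta.onnx")
(features,) = sess.run(
    None,
    {
        "input_ids": np.array([enc.ids], dtype=np.int64),
        "token_type_ids": np.array([enc.type_ids], dtype=np.int64),
        "attention_mask": np.array([enc.attention_mask], dtype=np.int64),
    },
)
print(features.shape)  # expected: (sequence_length, hidden_size)
```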

169
scripts/convert/convert_model.py Normal file
View File

@@ -0,0 +1,169 @@
import json
import os
from argparse import ArgumentParser
from io import BytesIO
from tarfile import TarInfo, open as taropen

import numpy as np
import torch
from zstandard import ZstdCompressor

from style_bert_vits2.constants import (
    DEFAULT_ASSIST_TEXT_WEIGHT,
    DEFAULT_STYLE,
    DEFAULT_STYLE_WEIGHT,
    Languages,
)
from style_bert_vits2.models.hyper_parameters import HyperParameters
from style_bert_vits2.models.infer import get_net_g, get_text
from style_bert_vits2.nlp import bert_models
from style_bert_vits2.tts_model import TTSModel

parser = ArgumentParser()
parser.add_argument("--style_file", required=True)
parser.add_argument("--config_file", required=True)
parser.add_argument("--model_file", required=True)
args = parser.parse_args()
style_file = args.style_file
config_file = args.config_file
model_file = args.model_file

# Load the Japanese BERT model and tokenizer used to compute text features.
bert_models.load_model(Languages.JP, "ku-nlp/deberta-v2-large-japanese-char-wwm")
bert_models.load_tokenizer(Languages.JP, "ku-nlp/deberta-v2-large-japanese-char-wwm")

# Dump the style vectors to JSON so they can be bundled into the .sbv2 archive.
array = np.load(style_file)
data = array.tolist()
hyper_parameters = HyperParameters.load_from_json(config_file)
out_name = hyper_parameters.model_name
with open(f"../models/style_vectors_{out_name}.json", "w") as f:
    json.dump(
        {
            "data": data,
            "shape": array.shape,
        },
        f,
    )

# Build example inputs from a short sentence; they are only used for tracing.
text = "今日はいい天気ですね。"
bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
    text,
    Languages.JP,
    hyper_parameters,
    "cpu",
    assist_text=None,
    assist_text_weight=DEFAULT_ASSIST_TEXT_WEIGHT,
    given_phone=None,
    given_tone=None,
)
tts_model = TTSModel(
    model_path=model_file,
    config_path=config_file,
    style_vec_path=style_file,
    device="cpu",
)
device = "cpu"
style_id = tts_model.style2id[DEFAULT_STYLE]


def get_style_vector(style_id, weight):
    # Interpolate between the mean style (row 0) and the selected style.
    style_vectors = np.load(style_file)
    mean = style_vectors[0]
    style_vec = style_vectors[style_id]
    style_vec = mean + (style_vec - mean) * weight
    return style_vec


style_vector = get_style_vector(style_id, DEFAULT_STYLE_WEIGHT)

# Add a batch dimension to every example input.
x_tst = phones.to(device).unsqueeze(0)
tones = tones.to(device).unsqueeze(0)
lang_ids = lang_ids.to(device).unsqueeze(0)
bert = bert.to(device).unsqueeze(0)
ja_bert = ja_bert.to(device).unsqueeze(0)
en_bert = en_bert.to(device).unsqueeze(0)
x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
style_vec_tensor = torch.from_numpy(style_vector).to(device).unsqueeze(0)

model = get_net_g(
    model_file,
    hyper_parameters.version,
    device,
    hyper_parameters,
)


# Replace forward() with a positional wrapper around infer() so that
# torch.onnx.export traces the inference path directly.
def forward(x, x_len, sid, tone, lang, bert, style, length_scale, sdp_ratio):
    return model.infer(
        x,
        x_len,
        sid,
        tone,
        lang,
        bert,
        style,
        sdp_ratio=sdp_ratio,
        length_scale=length_scale,
    )


model.forward = forward

torch.onnx.export(
    model,
    (
        x_tst,
        x_tst_lengths,
        torch.LongTensor([0]).to(device),
        tones,
        lang_ids,
        bert,
        style_vec_tensor,
        torch.tensor(1.0),
        torch.tensor(0.0),
    ),
    f"../models/model_{out_name}.onnx",
    verbose=True,
    dynamic_axes={
        "x_tst": {0: "batch_size", 1: "x_tst_max_length"},
        "x_tst_lengths": {0: "batch_size"},
        "sid": {0: "batch_size"},
        "tones": {0: "batch_size", 1: "x_tst_max_length"},
        "language": {0: "batch_size", 1: "x_tst_max_length"},
        "bert": {0: "batch_size", 2: "x_tst_max_length"},
        "style_vec": {0: "batch_size"},
    },
    input_names=[
        "x_tst",
        "x_tst_lengths",
        "sid",
        "tones",
        "language",
        "bert",
        "style_vec",
        "length_scale",
        "sdp_ratio",
    ],
    output_names=["output"],
)

# Shrink the exported graph in place with onnx-simplifier.
os.system(f"onnxsim ../models/model_{out_name}.onnx ../models/model_{out_name}.onnx")

with open(f"../models/model_{out_name}.onnx", "rb") as f:
    onnxfile = f.read()
with open(f"../models/style_vectors_{out_name}.json", "rb") as f:
    stylefile = f.read()
version = bytes("1", "utf8")

# Pack version, model, and style vectors into a tar archive, then
# zstd-compress it into the final .sbv2 file.
with taropen(f"../models/tmp_{out_name}.sbv2tar", "w") as w:

    def add_tar(f, b):
        t = TarInfo(f)
        t.size = len(b)
        w.addfile(t, BytesIO(b))

    add_tar("version.txt", version)
    add_tar("model.onnx", onnxfile)
    add_tar("style_vectors.json", stylefile)

with open(f"../models/tmp_{out_name}.sbv2tar", "rb") as f:
    compressed = ZstdCompressor(threads=-1, level=22).compress(f.read())
with open(f"../models/{out_name}.sbv2", "wb") as f:
    f.write(compressed)
os.unlink(f"../models/tmp_{out_name}.sbv2tar")
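To double-check a produced archive, a minimal sketch like the following can decompress and list its contents (a hypothetical helper, not part of the commit; the file name depends on your model's `model_name`, and `amitaro.sbv2` is just an example):

```python
# Hypothetical check of a generated .sbv2 archive -- not part of this commit.
from io import BytesIO
from tarfile import open as taropen

from zstandard import ZstdDecompressor

with open("../models/amitaro.sbv2", "rb") as f:  # substitute your model name
    raw = ZstdDecompressor().decompress(f.read())

with taropen(fileobj=BytesIO(raw)) as tar:
    print(tar.getnames())  # expected: version.txt, model.onnx, style_vectors.json
    print("format version:", tar.extractfile("version.txt").read().decode("utf8"))
```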

5
scripts/convert/requirements.txt Normal file
View File

@@ -0,0 +1,5 @@
style-bert-vits2
onnxsim
numpy<2
zstandard
onnxruntime

View File

@@ -0,0 +1,9 @@
FROM rust AS builder
WORKDIR /work
COPY . .
RUN cargo build -r --bin sbv2_api
FROM gcr.io/distroless/cc-debian12
WORKDIR /work
COPY --from=builder /work/target/release/sbv2_api /work/main
COPY --from=builder /work/target/release/*.so /work/
CMD ["/work/main"]

View File

@@ -0,0 +1,10 @@
FROM rust AS builder
WORKDIR /work
COPY . .
RUN cargo build -r --bin sbv2_api -F cuda,cuda_tf32
FROM nvidia/cuda:12.3.2-cudnn9-runtime-ubuntu22.04
WORKDIR /work
COPY --from=builder /work/target/release/sbv2_api /work/main
COPY --from=builder /work/target/release/*.so /work/
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/work
CMD ["/work/main"]

3
scripts/docker/run_cpu.sh Executable file
View File

@@ -0,0 +1,3 @@
docker run -it --rm -p 3000:3000 --name sbv2 \
-v ./models:/work/models --env-file .env \
ghcr.io/tuna2134/sbv2-api:cpu

4
scripts/docker/run_cuda.sh Executable file
View File

@@ -0,0 +1,4 @@
docker run -it --rm -p 3000:3000 --name sbv2 \
-v ./models:/work/models --env-file .env \
--gpus all \
ghcr.io/tuna2134/sbv2-api:cuda

14
scripts/make_dict.sh Executable file
View File

@@ -0,0 +1,14 @@
#!/bin/bash
set -e
# Fetch the dictionary CSVs from AivisSpeech-Engine at a pinned commit.
git clone https://github.com/Aivis-Project/AivisSpeech-Engine ./scripts/tmp --filter=blob:none -n
cd ./scripts/tmp
git checkout 168b2a1144afe300b0490d9a6dd773ec6e927667 -- resources/dictionaries/*.csv
cd ../..
rm -rf ./crates/sbv2_core/src/dic
cp -r ./scripts/tmp/resources/dictionaries ./crates/sbv2_core/src/dic
rm -rf ./scripts/tmp
# Concatenate the numbered CSVs and build the lindera dictionary from them.
for file in ./crates/sbv2_core/src/dic/0*.csv; do
	/usr/bin/cat "$file"
	echo
done > ./crates/sbv2_core/src/all.csv
lindera build ./crates/sbv2_core/src/all.csv ./crates/sbv2_core/src/dic/all.dic -u -k ipadic

View File

@@ -0,0 +1,180 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 音声合成プログラム\n",
"\n",
"このノートブックでは、`sbv2_bindings` パッケージを使用して音声合成を行います。必要なモデルをダウンロードし、ユーザーが入力したテキストから音声を生成します。音声合成が終わったら、再度テキストの入力を求め、ユーザーが終了するまで繰り返します。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 必要なパッケージのインストール\n",
"%pip install sbv2_bindings\n",
"\n",
"# 必要なモジュールのインポート\n",
"import os\n",
"import urllib.request\n",
"import time\n",
"from sbv2_bindings import TTSModel"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## モデルのダウンロード\n",
"\n",
"モデルファイルとトークナイザーをダウンロードします。ユーザーが独自のモデルを使用したい場合は、該当するURLまたはローカルパスを指定してください。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# モデルの URL またはローカルパスの指定\n",
"user_sbv2_model_url = \"\" # カスタムモデルのURLがあればここに指定\n",
"user_sbv2_model_path = \"\" # カスタムモデルのローカルパスがあればここに指定\n",
"\n",
"# モデル用のディレクトリを作成\n",
"model_dir = 'models'\n",
"os.makedirs(model_dir, exist_ok=True)\n",
"\n",
"# ダウンロードするファイルの URL\n",
"file_urls = [\n",
" \"https://huggingface.co/googlefan/sbv2_onnx_models/resolve/main/tokenizer.json\",\n",
" \"https://huggingface.co/googlefan/sbv2_onnx_models/resolve/main/deberta.onnx\",\n",
"]\n",
"\n",
"# モデルのパス決定\n",
"if user_sbv2_model_path:\n",
" sbv2_model_path = user_sbv2_model_path # ローカルモデルのパスを使用\n",
"elif user_sbv2_model_url:\n",
" sbv2_model_filename = os.path.basename(user_sbv2_model_url)\n",
" sbv2_model_path = os.path.join(model_dir, sbv2_model_filename)\n",
" file_urls.append(user_sbv2_model_url)\n",
"else:\n",
" # デフォルトのモデルを使用\n",
" sbv2_model_filename = \"tsukuyomi.sbv2\"\n",
" sbv2_model_path = os.path.join(model_dir, sbv2_model_filename)\n",
" file_urls.append(\"https://huggingface.co/googlefan/sbv2_onnx_models/resolve/main/tsukuyomi.sbv2\")\n",
"\n",
"# ファイルをダウンロード\n",
"for url in file_urls:\n",
" file_name = os.path.join(model_dir, os.path.basename(url))\n",
" if not os.path.exists(file_name):\n",
" print(f\"{file_name} をダウンロードしています...\")\n",
" urllib.request.urlretrieve(url, file_name)\n",
" else:\n",
" print(f\"{file_name} は既に存在します。\")\n",
"\n",
"# ダウンロードまたは使用するファイルを確認\n",
"print(\"\\n使用するファイル:\")\n",
"for file in os.listdir(model_dir):\n",
" print(file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## モデルの読み込みと音声合成\n",
"\n",
"モデルを読み込み、ユーザーが入力したテキストから音声を生成します。話者名は使用する `.sbv2` ファイル名から自動的に取得します。音声合成が終わったら、再度テキストの入力を求め、ユーザーが終了するまで繰り返します。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 音声合成の実行\n",
"def main():\n",
" try:\n",
" print(\"\\nモデルを読み込んでいます...\")\n",
" model = TTSModel.from_path(\n",
" os.path.join(model_dir, \"deberta.onnx\"),\n",
" os.path.join(model_dir, \"tokenizer.json\")\n",
" )\n",
" print(\"モデルの読み込みが完了しました!\")\n",
" except Exception as e:\n",
" print(f\"モデルの読み込みに失敗しました: {e}\")\n",
" return\n",
"\n",
" # 話者名を取得(.sbv2 ファイル名の拡張子を除いた部分)\n",
" speaker_name = os.path.splitext(os.path.basename(sbv2_model_path))[0]\n",
" \n",
" # 指定されたモデルのパスを使用\n",
" try:\n",
" model.load_sbv2file_from_path(speaker_name, sbv2_model_path)\n",
" print(f\"話者 '{speaker_name}' のセットアップが完了しました!\")\n",
" except Exception as e:\n",
" print(f\"SBV2ファイルの読み込みに失敗しました: {e}\")\n",
" return\n",
"\n",
" # 音声合成を繰り返し実行\n",
" while True:\n",
" # 合成したいテキストをユーザーから入力\n",
" user_input = input(\"\\n音声合成したいテキストを入力してください終了するには 'exit' と入力): \")\n",
" \n",
" if user_input.strip().lower() == 'exit':\n",
" print(\"音声合成を終了します。\")\n",
" break\n",
"\n",
" # 出力ファイル名\n",
" output_file = \"output.wav\"\n",
"\n",
" # 音声合成を実行\n",
" try:\n",
" print(\"\\n音声合成を開始します...\")\n",
" start_time = time.time()\n",
"\n",
" audio_data = model.synthesize(user_input, speaker_name, 0, 0.0, 1)\n",
"\n",
" with open(output_file, \"wb\") as f:\n",
" f.write(audio_data)\n",
"\n",
" end_time = time.time()\n",
" elapsed_time = end_time - start_time\n",
"\n",
" print(f\"\\n音声が '{output_file}' に保存されました。\")\n",
" print(f\"音声合成にかかった時間: {elapsed_time:.2f} 秒\")\n",
" except Exception as e:\n",
" print(f\"音声合成に失敗しました: {e}\")\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.x"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

8
scripts/sbv2-test-api.py Normal file
View File

@@ -0,0 +1,8 @@
import requests

# Request synthesis from the local API server and save the returned audio.
res = requests.post(
    "http://localhost:3000/synthesize",
    json={"text": "おはようございます", "ident": "tsukuyomi"},
)
with open("output.wav", "wb") as f:
    f.write(res.content)
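If the server responds with an error, the script above will happily write the error body into `output.wav`. A small optional guard (plain `requests`, nothing specific to this repo) avoids that:

```python
import requests

res = requests.post(
    "http://localhost:3000/synthesize",
    json={"text": "おはようございます", "ident": "tsukuyomi"},
)
# Optional guard, not in the original script: abort on a non-2xx response
# instead of saving the error body as output.wav.
res.raise_for_status()
with open("output.wav", "wb") as f:
    f.write(res.content)
```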

View File

@@ -0,0 +1,20 @@
from sbv2_bindings import TTSModel


def main():
    print("Loading models...")
    model = TTSModel.from_path("./models/deberta.onnx", "./models/tokenizer.json")
    print("Models loaded!")
    model.load_sbv2file_from_path("amitaro", "./models/amitaro.sbv2")
    print("All setup is done!")
    style_vector = model.get_style_vector("amitaro", 0, 1.0)
    with open("output.wav", "wb") as f:
        f.write(
            model.synthesize("おはようございます。", "amitaro", style_vector, 0.0, 0.5)
        )


if __name__ == "__main__":
    main()