mirror of
https://github.com/neodyland/sbv2-api.git
synced 2025-12-27 17:42:57 +00:00
とりあえず完成?
This commit is contained in:
@@ -1,8 +1,6 @@
|
||||
use crate::error::Result;
|
||||
use hound::{SampleFormat, WavSpec, WavWriter};
|
||||
use ndarray::{array, s, Array1, Array2, Axis};
|
||||
use ndarray::{array, Array1, Array2, Array3, Axis};
|
||||
use ort::{GraphOptimizationLevel, Session};
|
||||
use std::io::Cursor;
|
||||
|
||||
#[allow(clippy::vec_init_then_push, unused_variables)]
|
||||
pub fn load_model<P: AsRef<[u8]>>(model_file: P, bert: bool) -> Result<Session> {
|
||||
@@ -59,7 +57,7 @@ pub fn synthesize(
|
||||
style_vector: Array1<f32>,
|
||||
sdp_ratio: f32,
|
||||
length_scale: f32,
|
||||
) -> Result<Vec<u8>> {
|
||||
) -> Result<Array3<f32>> {
|
||||
let bert = bert_ori.insert_axis(Axis(0));
|
||||
let x_tst_lengths: Array1<i64> = array![x_tst.shape()[0] as i64];
|
||||
let x_tst = x_tst.insert_axis(Axis(0));
|
||||
@@ -84,24 +82,8 @@ pub fn synthesize(
|
||||
.try_extract_tensor::<f32>()?
|
||||
.to_owned();
|
||||
|
||||
let buffer = {
|
||||
let spec = WavSpec {
|
||||
channels: 1,
|
||||
sample_rate: 44100,
|
||||
bits_per_sample: 32,
|
||||
sample_format: SampleFormat::Float,
|
||||
};
|
||||
let mut cursor = Cursor::new(Vec::new());
|
||||
let mut writer = WavWriter::new(&mut cursor, spec)?;
|
||||
for i in 0..audio_array.shape()[0] {
|
||||
let output = audio_array.slice(s![i, 0, ..]).to_vec();
|
||||
for sample in output {
|
||||
writer.write_sample(sample)?;
|
||||
}
|
||||
}
|
||||
writer.finalize()?;
|
||||
cursor.into_inner()
|
||||
};
|
||||
|
||||
Ok(buffer)
|
||||
Ok(Array3::from_shape_vec(
|
||||
(audio_array.shape()[0], audio_array.shape()[1], audio_array.shape()[2]),
|
||||
audio_array.into_raw_vec_and_offset().0,
|
||||
)?)
|
||||
}
|
||||
|
||||
@@ -198,16 +198,59 @@ impl TTSModelHolder {
|
||||
style::get_style_vector(&self.find_model(ident)?.style_vectors, style_id, weight)
|
||||
}
|
||||
|
||||
pub fn easy_synthesize<I: Into<TTSIdent>>(
|
||||
pub fn easy_synthesize<I: Into<TTSIdent> + Copy>(
|
||||
&self,
|
||||
ident: I,
|
||||
text: &str,
|
||||
style_id: i32,
|
||||
options: SynthesizeOptions,
|
||||
) -> Result<()> {
|
||||
let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?;
|
||||
) -> Result<Vec<u8>> {
|
||||
let style_vector = self.get_style_vector(ident, style_id, options.style_weight)?;
|
||||
Ok(())
|
||||
let audio_array = if options.split_sentences {
|
||||
let texts: Vec<&str> = text.split("\n").collect();
|
||||
let mut audios = vec![];
|
||||
for (i, t) in texts.iter().enumerate() {
|
||||
if t.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let (bert_ori, phones, tones, lang_ids) = self.parse_text(t)?;
|
||||
let audio = model::synthesize(
|
||||
&self.find_model(ident)?.vits2,
|
||||
bert_ori.to_owned(),
|
||||
phones,
|
||||
tones,
|
||||
lang_ids,
|
||||
style_vector.clone(),
|
||||
options.sdp_ratio,
|
||||
options.length_scale,
|
||||
)?;
|
||||
audios.push(audio);
|
||||
if i != texts.len() - 1 {
|
||||
// 44100 * 0.5s 無音区間
|
||||
audios.push(Array3::zeros((1, 22050, 1)));
|
||||
}
|
||||
}
|
||||
concatenate(
|
||||
Axis(0),
|
||||
&audios
|
||||
.iter()
|
||||
.map(|x| x.view())
|
||||
.collect::<Vec<_>>()
|
||||
)?
|
||||
} else {
|
||||
let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?;
|
||||
model::synthesize(
|
||||
&self.find_model(ident)?.vits2,
|
||||
bert_ori.to_owned(),
|
||||
phones,
|
||||
tones,
|
||||
lang_ids,
|
||||
style_vector,
|
||||
options.sdp_ratio,
|
||||
options.length_scale,
|
||||
)?
|
||||
};
|
||||
Ok(Self::array_to_vec(audio_array)?)
|
||||
}
|
||||
|
||||
fn array_to_vec(audio_array: Array3<f32>) -> Result<Vec<u8>> {
|
||||
@@ -241,7 +284,7 @@ impl TTSModelHolder {
|
||||
sdp_ratio: f32,
|
||||
length_scale: f32,
|
||||
) -> Result<Vec<u8>> {
|
||||
let buffer = model::synthesize(
|
||||
let audio_array = model::synthesize(
|
||||
&self.find_model(ident)?.vits2,
|
||||
bert_ori.to_owned(),
|
||||
phones,
|
||||
@@ -251,7 +294,7 @@ impl TTSModelHolder {
|
||||
sdp_ratio,
|
||||
length_scale,
|
||||
)?;
|
||||
Ok(buffer)
|
||||
Ok(Self::array_to_vec(audio_array)?)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user