とりあえず完成?

This commit is contained in:
tuna2134
2024-09-13 09:05:43 +00:00
parent 94c30aed9b
commit 703e1a99c1
2 changed files with 55 additions and 30 deletions

View File

@@ -1,8 +1,6 @@
use crate::error::Result;
use hound::{SampleFormat, WavSpec, WavWriter};
use ndarray::{array, s, Array1, Array2, Axis};
use ndarray::{array, Array1, Array2, Array3, Axis};
use ort::{GraphOptimizationLevel, Session};
use std::io::Cursor;
#[allow(clippy::vec_init_then_push, unused_variables)]
pub fn load_model<P: AsRef<[u8]>>(model_file: P, bert: bool) -> Result<Session> {
@@ -59,7 +57,7 @@ pub fn synthesize(
style_vector: Array1<f32>,
sdp_ratio: f32,
length_scale: f32,
) -> Result<Vec<u8>> {
) -> Result<Array3<f32>> {
let bert = bert_ori.insert_axis(Axis(0));
let x_tst_lengths: Array1<i64> = array![x_tst.shape()[0] as i64];
let x_tst = x_tst.insert_axis(Axis(0));
@@ -84,24 +82,8 @@ pub fn synthesize(
.try_extract_tensor::<f32>()?
.to_owned();
let buffer = {
let spec = WavSpec {
channels: 1,
sample_rate: 44100,
bits_per_sample: 32,
sample_format: SampleFormat::Float,
};
let mut cursor = Cursor::new(Vec::new());
let mut writer = WavWriter::new(&mut cursor, spec)?;
for i in 0..audio_array.shape()[0] {
let output = audio_array.slice(s![i, 0, ..]).to_vec();
for sample in output {
writer.write_sample(sample)?;
}
}
writer.finalize()?;
cursor.into_inner()
};
Ok(buffer)
Ok(Array3::from_shape_vec(
(audio_array.shape()[0], audio_array.shape()[1], audio_array.shape()[2]),
audio_array.into_raw_vec_and_offset().0,
)?)
}

View File

@@ -198,16 +198,59 @@ impl TTSModelHolder {
style::get_style_vector(&self.find_model(ident)?.style_vectors, style_id, weight)
}
pub fn easy_synthesize<I: Into<TTSIdent>>(
pub fn easy_synthesize<I: Into<TTSIdent> + Copy>(
&self,
ident: I,
text: &str,
style_id: i32,
options: SynthesizeOptions,
) -> Result<()> {
let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?;
) -> Result<Vec<u8>> {
let style_vector = self.get_style_vector(ident, style_id, options.style_weight)?;
Ok(())
let audio_array = if options.split_sentences {
let texts: Vec<&str> = text.split("\n").collect();
let mut audios = vec![];
for (i, t) in texts.iter().enumerate() {
if t.is_empty() {
continue;
}
let (bert_ori, phones, tones, lang_ids) = self.parse_text(t)?;
let audio = model::synthesize(
&self.find_model(ident)?.vits2,
bert_ori.to_owned(),
phones,
tones,
lang_ids,
style_vector.clone(),
options.sdp_ratio,
options.length_scale,
)?;
audios.push(audio);
if i != texts.len() - 1 {
// 44100 * 0.5s 無音区間
audios.push(Array3::zeros((1, 22050, 1)));
}
}
concatenate(
Axis(0),
&audios
.iter()
.map(|x| x.view())
.collect::<Vec<_>>()
)?
} else {
let (bert_ori, phones, tones, lang_ids) = self.parse_text(text)?;
model::synthesize(
&self.find_model(ident)?.vits2,
bert_ori.to_owned(),
phones,
tones,
lang_ids,
style_vector,
options.sdp_ratio,
options.length_scale,
)?
};
Ok(Self::array_to_vec(audio_array)?)
}
fn array_to_vec(audio_array: Array3<f32>) -> Result<Vec<u8>> {
@@ -241,7 +284,7 @@ impl TTSModelHolder {
sdp_ratio: f32,
length_scale: f32,
) -> Result<Vec<u8>> {
let buffer = model::synthesize(
let audio_array = model::synthesize(
&self.find_model(ident)?.vits2,
bert_ori.to_owned(),
phones,
@@ -251,7 +294,7 @@ impl TTSModelHolder {
sdp_ratio,
length_scale,
)?;
Ok(buffer)
Ok(Self::array_to_vec(audio_array)?)
}
}