From 04b378752d9a00e7978fea5015d4a5c2e3e2e9c4 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 19 Nov 2023 00:15:48 +0900 Subject: [PATCH] =?UTF-8?q?`Synthesizer`=E3=81=AE=E6=A7=8B=E9=80=A0?= =?UTF-8?q?=E6=94=B9=E9=9D=A9=E3=82=92=E3=81=99=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/__internal.rs | 1 + .../src/__internal/doctest_fixtures.rs | 2 +- crates/voicevox_core/src/__internal/interp.rs | 46 + .../src/engine/acoustic_feature_extractor.rs | 10 +- crates/voicevox_core/src/engine/mod.rs | 3 +- crates/voicevox_core/src/engine/open_jtalk.rs | 75 +- .../src/engine/synthesis_engine.rs | 698 ----------- crates/voicevox_core/src/error.rs | 10 +- crates/voicevox_core/src/infer/status.rs | 15 +- crates/voicevox_core/src/inference_core.rs | 163 +-- crates/voicevox_core/src/synthesizer.rs | 1038 ++++++++++++++--- crates/voicevox_core_c_api/Cargo.toml | 2 +- .../src/compatible_engine.rs | 58 +- crates/voicevox_core_c_api/src/helpers.rs | 2 +- crates/voicevox_core_c_api/src/lib.rs | 2 +- crates/voicevox_core_c_api/src/result_code.rs | 8 +- .../tests/e2e/snapshots.toml | 2 +- .../LoadOpenjtalkSystemDicException.java | 12 + .../NotLoadedOpenjtalkDictException.java | 12 - crates/voicevox_core_java_api/src/common.rs | 2 +- .../voicevox_core_java_api/src/synthesizer.rs | 61 +- .../python/voicevox_core/__init__.py | 4 +- .../python/voicevox_core/_rust.pyi | 4 +- .../voicevox_core_python_api/src/convert.rs | 6 +- crates/voicevox_core_python_api/src/lib.rs | 4 +- 25 files changed, 1153 insertions(+), 1087 deletions(-) create mode 100644 crates/voicevox_core/src/__internal/interp.rs delete mode 100644 crates/voicevox_core/src/engine/synthesis_engine.rs create mode 100644 crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/LoadOpenjtalkSystemDicException.java delete mode 100644 crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/NotLoadedOpenjtalkDictException.java diff --git a/crates/voicevox_core/src/__internal.rs b/crates/voicevox_core/src/__internal.rs index ff9f5ce3c..b6affb0f2 100644 --- a/crates/voicevox_core/src/__internal.rs +++ b/crates/voicevox_core/src/__internal.rs @@ -1,4 +1,5 @@ pub mod doctest_fixtures; +pub mod interp; // VOICEVOX CORE内のラッパー向けの実装 // FIXME: 要議論: https://github.com/VOICEVOX/voicevox_core/issues/595 diff --git a/crates/voicevox_core/src/__internal/doctest_fixtures.rs b/crates/voicevox_core/src/__internal/doctest_fixtures.rs index 9df517720..1915d3d37 100644 --- a/crates/voicevox_core/src/__internal/doctest_fixtures.rs +++ b/crates/voicevox_core/src/__internal/doctest_fixtures.rs @@ -4,7 +4,7 @@ use crate::{AccelerationMode, InitializeOptions, OpenJtalk, Synthesizer, VoiceMo pub async fn synthesizer_with_sample_voice_model( open_jtalk_dic_dir: impl AsRef, -) -> anyhow::Result { +) -> anyhow::Result>> { let syntesizer = Synthesizer::new( Arc::new(OpenJtalk::new(open_jtalk_dic_dir).unwrap()), &InitializeOptions { diff --git a/crates/voicevox_core/src/__internal/interp.rs b/crates/voicevox_core/src/__internal/interp.rs new file mode 100644 index 000000000..4afb4242d --- /dev/null +++ b/crates/voicevox_core/src/__internal/interp.rs @@ -0,0 +1,46 @@ +use easy_ext::ext; +use ndarray::{Array1, ArrayView1, ArrayView2}; + +use crate::{StyleId, Synthesizer}; + +#[ext(PerformInference)] +impl Synthesizer<()> { + pub fn predict_duration( + &self, + phoneme_list: Array1, + style_id: StyleId, + ) -> crate::Result> { + 
self.predict_duration(phoneme_list, style_id) + } + + #[allow(clippy::too_many_arguments)] + pub fn predict_intonation( + &self, + vowel_phoneme_list: Array1, + consonant_phoneme_list: Array1, + start_accent_list: Array1, + end_accent_list: Array1, + start_accent_phrase_list: Array1, + end_accent_phrase_list: Array1, + style_id: StyleId, + ) -> crate::Result> { + self.predict_intonation( + vowel_phoneme_list, + consonant_phoneme_list, + start_accent_list, + end_accent_list, + start_accent_phrase_list, + end_accent_phrase_list, + style_id, + ) + } + + pub fn decode( + &self, + f0: ArrayView1<'_, f32>, + phoneme: ArrayView2<'_, f32>, + style_id: StyleId, + ) -> crate::Result> { + self.decode(f0, phoneme, style_id) + } +} diff --git a/crates/voicevox_core/src/engine/acoustic_feature_extractor.rs b/crates/voicevox_core/src/engine/acoustic_feature_extractor.rs index 05cdb2d31..5078f34b5 100644 --- a/crates/voicevox_core/src/engine/acoustic_feature_extractor.rs +++ b/crates/voicevox_core/src/engine/acoustic_feature_extractor.rs @@ -4,7 +4,7 @@ use once_cell::sync::Lazy; use std::collections::HashMap; #[rustfmt::skip] -const PHONEME_LIST: &[&str] = &[ +const PHONEME_LIST: [&str; 45] = [ "pau", "A", "E", @@ -70,9 +70,7 @@ pub struct OjtPhoneme { } impl OjtPhoneme { - pub fn num_phoneme() -> usize { - PHONEME_MAP.len() - } + pub(crate) const NUM_PHONEME: usize = PHONEME_LIST.len(); pub fn space_phoneme() -> String { "pau".into() @@ -134,8 +132,8 @@ mod tests { } #[rstest] - fn test_num_phoneme_works() { - assert_eq!(OjtPhoneme::num_phoneme(), 45); + fn test_phoneme_map_has_enough_elements() { + assert_eq!(OjtPhoneme::NUM_PHONEME, PHONEME_MAP.len()); } #[rstest] diff --git a/crates/voicevox_core/src/engine/mod.rs b/crates/voicevox_core/src/engine/mod.rs index 0e02839dc..a0a073bbf 100644 --- a/crates/voicevox_core/src/engine/mod.rs +++ b/crates/voicevox_core/src/engine/mod.rs @@ -4,7 +4,6 @@ mod kana_parser; mod model; mod mora_list; mod open_jtalk; -mod synthesis_engine; use super::*; @@ -12,5 +11,5 @@ pub use self::acoustic_feature_extractor::*; pub use self::full_context_label::*; pub use self::kana_parser::*; pub use self::model::*; +pub(crate) use self::mora_list::mora2text; pub use self::open_jtalk::OpenJtalk; -pub use self::synthesis_engine::*; diff --git a/crates/voicevox_core/src/engine/open_jtalk.rs b/crates/voicevox_core/src/engine/open_jtalk.rs index f74d4130d..327cb634c 100644 --- a/crates/voicevox_core/src/engine/open_jtalk.rs +++ b/crates/voicevox_core/src/engine/open_jtalk.rs @@ -1,8 +1,5 @@ use std::io::Write; -use std::{ - path::{Path, PathBuf}, - sync::Mutex, -}; +use std::{path::Path, sync::Mutex}; use anyhow::anyhow; use tempfile::NamedTempFile; @@ -22,7 +19,7 @@ pub(crate) struct OpenjtalkFunctionError { /// テキスト解析器としてのOpen JTalk。 pub struct OpenJtalk { resources: Mutex, - dict_dir: Option, + dict_dir: String, } struct Resources { @@ -35,24 +32,27 @@ struct Resources { unsafe impl Send for Resources {} impl OpenJtalk { - // FIXME: この関数は廃止し、`Synthesizer`は`Option`という形でこの構造体を持つ - pub fn new_without_dic() -> Self { - Self { - resources: Mutex::new(Resources { - mecab: ManagedResource::initialize(), - njd: ManagedResource::initialize(), - jpcommon: ManagedResource::initialize(), - }), - dict_dir: None, - } - } pub fn new(open_jtalk_dict_dir: impl AsRef) -> crate::result::Result { - let mut s = Self::new_without_dic(); - s.load(open_jtalk_dict_dir).map_err(|()| { - // FIXME: 「システム辞書を読もうとしたけど読めなかった」というエラーをちゃんと用意する - ErrorRepr::NotLoadedOpenjtalkDict - })?; - Ok(s) + let mut resources = 
Resources { + mecab: ManagedResource::initialize(), + njd: ManagedResource::initialize(), + jpcommon: ManagedResource::initialize(), + }; + let dict_dir = open_jtalk_dict_dir + .as_ref() + .to_str() + .unwrap_or_else(|| todo!("Rust APIでは`Utf8Path`で受けるようにする")) + .to_owned(); + + let result = resources.mecab.load(&dict_dir); + if !result { + return Err(ErrorRepr::LoadOpenjtalkSystemDic(dict_dir).into()); + } + + Ok(Self { + resources: resources.into(), + dict_dir, + }) } // 先に`load`を呼ぶ必要がある。 @@ -60,12 +60,6 @@ impl OpenJtalk { /// /// この関数を呼び出した後にユーザー辞書を変更した場合は、再度この関数を呼ぶ必要がある。 pub fn use_user_dict(&self, user_dict: &UserDict) -> crate::result::Result<()> { - let dict_dir = self - .dict_dir - .as_ref() - .and_then(|dict_dir| dict_dir.to_str()) - .ok_or(ErrorRepr::NotLoadedOpenjtalkDict)?; - // ユーザー辞書用のcsvを作成 let mut temp_csv = NamedTempFile::new().map_err(|e| ErrorRepr::UseUserDict(e.into()))?; temp_csv @@ -80,7 +74,7 @@ impl OpenJtalk { mecab_dict_index(&[ "mecab-dict-index", "-d", - dict_dir, + &self.dict_dir, "-u", temp_dict_path.to_str().unwrap(), "-f", @@ -93,7 +87,8 @@ impl OpenJtalk { let Resources { mecab, .. } = &mut *self.resources.lock().unwrap(); - let result = mecab.load_with_userdic(Path::new(dict_dir), Some(Path::new(&temp_dict_path))); + let result = + mecab.load_with_userdic(self.dict_dir.as_ref(), Some(Path::new(&temp_dict_path))); if !result { return Err(ErrorRepr::UseUserDict(anyhow!("辞書のコンパイルに失敗しました")).into()); @@ -150,26 +145,6 @@ impl OpenJtalk { }) } } - - fn load(&mut self, open_jtalk_dict_dir: impl AsRef) -> std::result::Result<(), ()> { - let result = self - .resources - .lock() - .unwrap() - .mecab - .load(open_jtalk_dict_dir.as_ref()); - if result { - self.dict_dir = Some(open_jtalk_dict_dir.as_ref().into()); - Ok(()) - } else { - self.dict_dir = None; - Err(()) - } - } - - pub fn dict_loaded(&self) -> bool { - self.dict_dir.is_some() - } } #[cfg(test)] diff --git a/crates/voicevox_core/src/engine/synthesis_engine.rs b/crates/voicevox_core/src/engine/synthesis_engine.rs deleted file mode 100644 index c70742f16..000000000 --- a/crates/voicevox_core/src/engine/synthesis_engine.rs +++ /dev/null @@ -1,698 +0,0 @@ -use derive_new::new; -use std::io::{Cursor, Write}; -use std::sync::Arc; - -use super::full_context_label::Utterance; -use super::open_jtalk::OpenJtalk; -use super::*; -use crate::infer::InferenceRuntime; -use crate::numerics::F32Ext as _; -use crate::InferenceCore; - -const UNVOICED_MORA_PHONEME_LIST: &[&str] = &["A", "I", "U", "E", "O", "cl", "pau"]; - -const MORA_PHONEME_LIST: &[&str] = &[ - "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau", -]; - -pub const DEFAULT_SAMPLING_RATE: u32 = 24000; - -#[derive(new)] -pub(crate) struct SynthesisEngine { - inference_core: InferenceCore, - open_jtalk: Arc, -} - -impl SynthesisEngine { - pub fn inference_core(&self) -> &InferenceCore { - &self.inference_core - } - - pub async fn create_accent_phrases( - &self, - text: &str, - style_id: StyleId, - ) -> Result> { - if text.is_empty() { - return Ok(Vec::new()); - } - - let utterance = Utterance::extract_full_context_label(&self.open_jtalk, text)?; - - let accent_phrases: Vec = utterance - .breath_groups() - .iter() - .enumerate() - .fold(Vec::new(), |mut accum_vec, (i, breath_group)| { - accum_vec.extend(breath_group.accent_phrases().iter().enumerate().map( - |(j, accent_phrase)| { - let moras = accent_phrase - .moras() - .iter() - .map(|mora| { - let mora_text = mora - .phonemes() - .iter() - .map(|phoneme| phoneme.phoneme().to_string()) - 
.collect::>() - .join(""); - - let (consonant, consonant_length) = - if let Some(consonant) = mora.consonant() { - (Some(consonant.phoneme().to_string()), Some(0.)) - } else { - (None, None) - }; - - MoraModel::new( - mora_to_text(mora_text), - consonant, - consonant_length, - mora.vowel().phoneme().into(), - 0., - 0., - ) - }) - .collect(); - - let pause_mora = if i != utterance.breath_groups().len() - 1 - && j == breath_group.accent_phrases().len() - 1 - { - Some(MoraModel::new( - "、".into(), - None, - None, - "pau".into(), - 0., - 0., - )) - } else { - None - }; - - AccentPhraseModel::new( - moras, - *accent_phrase.accent(), - pause_mora, - *accent_phrase.is_interrogative(), - ) - }, - )); - - accum_vec - }); - - self.replace_mora_data(&accent_phrases, style_id).await - } - - pub async fn replace_mora_data( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - let accent_phrases = self - .replace_phoneme_length(accent_phrases, style_id) - .await?; - self.replace_mora_pitch(&accent_phrases, style_id).await - } - - pub async fn replace_phoneme_length( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - let (_, phoneme_data_list) = Self::initial_process(accent_phrases); - - let (_, _, vowel_indexes_data) = split_mora(&phoneme_data_list); - - let phoneme_list_s: Vec = phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - let phoneme_length = self - .inference_core() - .predict_duration(&phoneme_list_s, style_id) - .await?; - - let mut index = 0; - let new_accent_phrases = accent_phrases - .iter() - .map(|accent_phrase| { - AccentPhraseModel::new( - accent_phrase - .moras() - .iter() - .map(|mora| { - let new_mora = MoraModel::new( - mora.text().clone(), - mora.consonant().clone(), - mora.consonant().as_ref().map(|_| { - phoneme_length[vowel_indexes_data[index + 1] as usize - 1] - }), - mora.vowel().clone(), - phoneme_length[vowel_indexes_data[index + 1] as usize], - *mora.pitch(), - ); - index += 1; - new_mora - }) - .collect(), - *accent_phrase.accent(), - accent_phrase.pause_mora().as_ref().map(|pause_mora| { - let new_pause_mora = MoraModel::new( - pause_mora.text().clone(), - pause_mora.consonant().clone(), - *pause_mora.consonant_length(), - pause_mora.vowel().clone(), - phoneme_length[vowel_indexes_data[index + 1] as usize], - *pause_mora.pitch(), - ); - index += 1; - new_pause_mora - }), - *accent_phrase.is_interrogative(), - ) - }) - .collect(); - - Ok(new_accent_phrases) - } - - pub async fn replace_mora_pitch( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - let (_, phoneme_data_list) = Self::initial_process(accent_phrases); - - let mut base_start_accent_list = vec![0]; - let mut base_end_accent_list = vec![0]; - let mut base_start_accent_phrase_list = vec![0]; - let mut base_end_accent_phrase_list = vec![0]; - for accent_phrase in accent_phrases { - let mut accent = usize::from(*accent_phrase.accent() != 1); - Self::create_one_accent_list(&mut base_start_accent_list, accent_phrase, accent as i32); - - accent = *accent_phrase.accent() - 1; - Self::create_one_accent_list(&mut base_end_accent_list, accent_phrase, accent as i32); - Self::create_one_accent_list(&mut base_start_accent_phrase_list, accent_phrase, 0); - Self::create_one_accent_list(&mut base_end_accent_phrase_list, accent_phrase, -1); - } - base_start_accent_list.push(0); - base_end_accent_list.push(0); - base_start_accent_phrase_list.push(0); - 
base_end_accent_phrase_list.push(0); - - let (consonant_phoneme_data_list, vowel_phoneme_data_list, vowel_indexes) = - split_mora(&phoneme_data_list); - - let consonant_phoneme_list: Vec = consonant_phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - let vowel_phoneme_list: Vec = vowel_phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - - let mut start_accent_list = Vec::with_capacity(vowel_indexes.len()); - let mut end_accent_list = Vec::with_capacity(vowel_indexes.len()); - let mut start_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); - let mut end_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); - - for vowel_index in vowel_indexes { - start_accent_list.push(base_start_accent_list[vowel_index as usize]); - end_accent_list.push(base_end_accent_list[vowel_index as usize]); - start_accent_phrase_list.push(base_start_accent_phrase_list[vowel_index as usize]); - end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]); - } - - let mut f0_list = self - .inference_core() - .predict_intonation( - vowel_phoneme_list.len(), - &vowel_phoneme_list, - &consonant_phoneme_list, - &start_accent_list, - &end_accent_list, - &start_accent_phrase_list, - &end_accent_phrase_list, - style_id, - ) - .await?; - - for i in 0..vowel_phoneme_data_list.len() { - if UNVOICED_MORA_PHONEME_LIST - .iter() - .any(|phoneme| *phoneme == vowel_phoneme_data_list[i].phoneme()) - { - f0_list[i] = 0.; - } - } - - let mut index = 0; - let new_accent_phrases = accent_phrases - .iter() - .map(|accent_phrase| { - AccentPhraseModel::new( - accent_phrase - .moras() - .iter() - .map(|mora| { - let new_mora = MoraModel::new( - mora.text().clone(), - mora.consonant().clone(), - *mora.consonant_length(), - mora.vowel().clone(), - *mora.vowel_length(), - f0_list[index + 1], - ); - index += 1; - new_mora - }) - .collect(), - *accent_phrase.accent(), - accent_phrase.pause_mora().as_ref().map(|pause_mora| { - let new_pause_mora = MoraModel::new( - pause_mora.text().clone(), - pause_mora.consonant().clone(), - *pause_mora.consonant_length(), - pause_mora.vowel().clone(), - *pause_mora.vowel_length(), - f0_list[index + 1], - ); - index += 1; - new_pause_mora - }), - *accent_phrase.is_interrogative(), - ) - }) - .collect(); - - Ok(new_accent_phrases) - } - - pub async fn synthesis( - &self, - query: &AudioQueryModel, - style_id: StyleId, - enable_interrogative_upspeak: bool, - ) -> Result> { - let speed_scale = *query.speed_scale(); - let pitch_scale = *query.pitch_scale(); - let intonation_scale = *query.intonation_scale(); - let pre_phoneme_length = *query.pre_phoneme_length(); - let post_phoneme_length = *query.post_phoneme_length(); - - let accent_phrases = if enable_interrogative_upspeak { - adjust_interrogative_accent_phrases(query.accent_phrases().as_slice()) - } else { - query.accent_phrases().clone() - }; - - let (flatten_moras, phoneme_data_list) = Self::initial_process(&accent_phrases); - - let mut phoneme_length_list = vec![pre_phoneme_length]; - let mut f0_list = vec![0.]; - let mut voiced_list = vec![false]; - { - let mut sum_of_f0_bigger_than_zero = 0.; - let mut count_of_f0_bigger_than_zero = 0; - - for mora in flatten_moras { - let consonant_length = *mora.consonant_length(); - let vowel_length = *mora.vowel_length(); - let pitch = *mora.pitch(); - - if let Some(consonant_length) = consonant_length { - phoneme_length_list.push(consonant_length); - } - phoneme_length_list.push(vowel_length); - - 
let f0_single = pitch * 2.0_f32.powf(pitch_scale); - f0_list.push(f0_single); - - let bigger_than_zero = f0_single > 0.; - voiced_list.push(bigger_than_zero); - - if bigger_than_zero { - sum_of_f0_bigger_than_zero += f0_single; - count_of_f0_bigger_than_zero += 1; - } - } - phoneme_length_list.push(post_phoneme_length); - f0_list.push(0.); - voiced_list.push(false); - let mean_f0 = sum_of_f0_bigger_than_zero / (count_of_f0_bigger_than_zero as f32); - - if !mean_f0.is_nan() { - for i in 0..f0_list.len() { - if voiced_list[i] { - f0_list[i] = (f0_list[i] - mean_f0) * intonation_scale + mean_f0; - } - } - } - } - - let (_, _, vowel_indexes) = split_mora(&phoneme_data_list); - - let mut phoneme: Vec> = Vec::new(); - let mut f0: Vec = Vec::new(); - { - const RATE: f32 = 24000. / 256.; - let mut sum_of_phoneme_length = 0; - let mut count_of_f0 = 0; - let mut vowel_indexes_index = 0; - - for (i, phoneme_length) in phoneme_length_list.iter().enumerate() { - // VOICEVOX ENGINEと挙動を合わせるため、四捨五入ではなく偶数丸めをする - // - // https://github.com/VOICEVOX/voicevox_engine/issues/552 - let phoneme_length = ((*phoneme_length * RATE).round_ties_even_() / speed_scale) - .round_ties_even_() as usize; - let phoneme_id = phoneme_data_list[i].phoneme_id(); - - for _ in 0..phoneme_length { - let mut phonemes_vec = vec![0.; OjtPhoneme::num_phoneme()]; - phonemes_vec[phoneme_id as usize] = 1.; - phoneme.push(phonemes_vec) - } - sum_of_phoneme_length += phoneme_length; - - if i as i64 == vowel_indexes[vowel_indexes_index] { - for _ in 0..sum_of_phoneme_length { - f0.push(f0_list[count_of_f0]); - } - count_of_f0 += 1; - sum_of_phoneme_length = 0; - vowel_indexes_index += 1; - } - } - } - - // 2次元のvectorを1次元に変換し、アドレスを連続させる - let flatten_phoneme = phoneme.into_iter().flatten().collect::>(); - - self.inference_core() - .decode( - f0.len(), - OjtPhoneme::num_phoneme(), - &f0, - &flatten_phoneme, - style_id, - ) - .await - } - - pub async fn synthesis_wave_format( - &self, - query: &AudioQueryModel, - style_id: StyleId, - enable_interrogative_upspeak: bool, - ) -> Result> { - let wave = self - .synthesis(query, style_id, enable_interrogative_upspeak) - .await?; - let volume_scale = *query.volume_scale(); - let output_stereo = *query.output_stereo(); - let output_sampling_rate = *query.output_sampling_rate(); - - // TODO: 44.1kHzなどの対応 - - let num_channels: u16 = if output_stereo { 2 } else { 1 }; - let bit_depth: u16 = 16; - let repeat_count: u32 = - (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; - let block_size: u16 = bit_depth * num_channels / 8; - - let bytes_size = wave.len() as u32 * repeat_count * 2; - let wave_size = bytes_size + 44; - - let buf: Vec = Vec::with_capacity(wave_size as usize); - let mut cur = Cursor::new(buf); - - cur.write_all("RIFF".as_bytes()).unwrap(); - cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap(); - cur.write_all("WAVEfmt ".as_bytes()).unwrap(); - cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt header length - cur.write_all(&1_u16.to_le_bytes()).unwrap(); //linear PCM - cur.write_all(&num_channels.to_le_bytes()).unwrap(); - cur.write_all(&output_sampling_rate.to_le_bytes()).unwrap(); - - let block_rate = output_sampling_rate * block_size as u32; - - cur.write_all(&block_rate.to_le_bytes()).unwrap(); - cur.write_all(&block_size.to_le_bytes()).unwrap(); - cur.write_all(&bit_depth.to_le_bytes()).unwrap(); - cur.write_all("data".as_bytes()).unwrap(); - cur.write_all(&bytes_size.to_le_bytes()).unwrap(); - - for value in wave { - let v = (value * volume_scale).clamp(-1., 
1.); - let data = (v * 0x7fff as f32) as i16; - for _ in 0..repeat_count { - cur.write_all(&data.to_le_bytes()).unwrap(); - } - } - - Ok(cur.into_inner()) - } - - pub fn is_openjtalk_dict_loaded(&self) -> bool { - self.open_jtalk.dict_loaded() - } - - fn initial_process(accent_phrases: &[AccentPhraseModel]) -> (Vec, Vec) { - let flatten_moras = to_flatten_moras(accent_phrases); - - let mut phoneme_strings = vec!["pau".to_string()]; - for mora in flatten_moras.iter() { - if let Some(consonant) = mora.consonant() { - phoneme_strings.push(consonant.clone()) - } - phoneme_strings.push(mora.vowel().clone()); - } - phoneme_strings.push("pau".to_string()); - - let phoneme_data_list = to_phoneme_data_list(&phoneme_strings); - - (flatten_moras, phoneme_data_list) - } - - fn create_one_accent_list( - accent_list: &mut Vec, - accent_phrase: &AccentPhraseModel, - point: i32, - ) { - let mut one_accent_list: Vec = Vec::new(); - - for (i, mora) in accent_phrase.moras().iter().enumerate() { - let value = (i as i32 == point - || (point < 0 && i == (accent_phrase.moras().len() as i32 + point) as usize)) - .into(); - one_accent_list.push(value); - if mora.consonant().is_some() { - one_accent_list.push(value); - } - } - if accent_phrase.pause_mora().is_some() { - one_accent_list.push(0); - } - accent_list.extend(one_accent_list) - } -} - -pub fn to_flatten_moras(accent_phrases: &[AccentPhraseModel]) -> Vec { - let mut flatten_moras = Vec::new(); - - for accent_phrase in accent_phrases { - let moras = accent_phrase.moras(); - for mora in moras { - flatten_moras.push(mora.clone()); - } - if let Some(pause_mora) = accent_phrase.pause_mora() { - flatten_moras.push(pause_mora.clone()); - } - } - - flatten_moras -} - -pub fn to_phoneme_data_list>(phoneme_str_list: &[T]) -> Vec { - OjtPhoneme::convert( - phoneme_str_list - .iter() - .enumerate() - .map(|(i, s)| OjtPhoneme::new(s.as_ref().to_string(), i as f32, i as f32 + 1.)) - .collect::>() - .as_slice(), - ) -} - -pub fn split_mora(phoneme_list: &[OjtPhoneme]) -> (Vec, Vec, Vec) { - let mut vowel_indexes = Vec::new(); - for (i, phoneme) in phoneme_list.iter().enumerate() { - if MORA_PHONEME_LIST - .iter() - .any(|mora_phoneme| *mora_phoneme == phoneme.phoneme()) - { - vowel_indexes.push(i as i64); - } - } - - let vowel_phoneme_list = vowel_indexes - .iter() - .map(|vowel_index| phoneme_list[*vowel_index as usize].clone()) - .collect(); - - let mut consonant_phoneme_list = vec![OjtPhoneme::default()]; - for i in 0..(vowel_indexes.len() - 1) { - let prev = vowel_indexes[i]; - let next = vowel_indexes[i + 1]; - if next - prev == 1 { - consonant_phoneme_list.push(OjtPhoneme::default()); - } else { - consonant_phoneme_list.push(phoneme_list[next as usize - 1].clone()); - } - } - - (consonant_phoneme_list, vowel_phoneme_list, vowel_indexes) -} - -fn mora_to_text(mora: impl AsRef) -> String { - let last_char = mora.as_ref().chars().last().unwrap(); - let mora = if ['A', 'I', 'U', 'E', 'O'].contains(&last_char) { - format!( - "{}{}", - &mora.as_ref()[0..mora.as_ref().len() - 1], - last_char.to_lowercase() - ) - } else { - mora.as_ref().to_string() - }; - // もしカタカナに変換できなければ、引数で与えた文字列がそのまま返ってくる - mora_list::mora2text(&mora).to_string() -} - -fn adjust_interrogative_accent_phrases( - accent_phrases: &[AccentPhraseModel], -) -> Vec { - accent_phrases - .iter() - .map(|accent_phrase| { - AccentPhraseModel::new( - adjust_interrogative_moras(accent_phrase), - *accent_phrase.accent(), - accent_phrase.pause_mora().clone(), - *accent_phrase.is_interrogative(), - ) - }) - 
.collect() -} - -fn adjust_interrogative_moras(accent_phrase: &AccentPhraseModel) -> Vec { - let moras = accent_phrase.moras(); - if *accent_phrase.is_interrogative() && !moras.is_empty() { - let last_mora = moras.last().unwrap(); - let last_mora_pitch = *last_mora.pitch(); - if last_mora_pitch != 0.0 { - let mut new_moras: Vec = Vec::with_capacity(moras.len() + 1); - new_moras.extend_from_slice(moras.as_slice()); - let interrogative_mora = make_interrogative_mora(last_mora); - new_moras.push(interrogative_mora); - return new_moras; - } - } - moras.clone() -} - -fn make_interrogative_mora(last_mora: &MoraModel) -> MoraModel { - const FIX_VOWEL_LENGTH: f32 = 0.15; - const ADJUST_PITCH: f32 = 0.3; - const MAX_PITCH: f32 = 6.5; - - let pitch = (*last_mora.pitch() + ADJUST_PITCH).min(MAX_PITCH); - - MoraModel::new( - mora_to_text(last_mora.vowel()), - None, - None, - last_mora.vowel().clone(), - FIX_VOWEL_LENGTH, - pitch, - ) -} - -#[cfg(test)] -mod tests { - use super::*; - use ::test_util::OPEN_JTALK_DIC_DIR; - use pretty_assertions::assert_eq; - - use crate::{synthesizer::InferenceRuntimeImpl, *}; - - #[rstest] - #[tokio::test] - async fn is_openjtalk_dict_loaded_works() { - let core = InferenceCore::::new(false, 0).unwrap(); - let synthesis_engine = - SynthesisEngine::new(core, OpenJtalk::new(OPEN_JTALK_DIC_DIR).unwrap().into()); - - assert_eq!(synthesis_engine.is_openjtalk_dict_loaded(), true); - } - - #[rstest] - #[tokio::test] - async fn create_accent_phrases_works() { - let core = InferenceCore::::new(false, 0).unwrap(); - - let model = &VoiceModel::sample().await.unwrap(); - core.load_model(model).await.unwrap(); - - let synthesis_engine = - SynthesisEngine::new(core, OpenJtalk::new(OPEN_JTALK_DIC_DIR).unwrap().into()); - - let accent_phrases = synthesis_engine - .create_accent_phrases("同じ、文章、です。完全に、同一です。", StyleId::new(1)) - .await - .unwrap(); - assert_eq!(accent_phrases.len(), 5); - - // 入力テキストに「、」や「。」などの句読点が含まれていたときに - // AccentPhraseModel の pause_mora に期待する値をテスト - - assert!( - accent_phrases[0].pause_mora().is_some(), - "accent_phrases[0].pause_mora() is None" - ); - assert!( - accent_phrases[1].pause_mora().is_some(), - "accent_phrases[1].pause_mora() is None" - ); - assert!( - accent_phrases[2].pause_mora().is_some(), - "accent_phrases[2].pause_mora() is None" - ); - assert!( - accent_phrases[3].pause_mora().is_some(), - "accent_phrases[3].pause_mora() is None" - ); - assert!( - accent_phrases[4].pause_mora().is_none(), // 文末の句読点は削除される - "accent_phrases[4].pause_mora() is not None" - ); - - for accent_phrase in accent_phrases.iter().take(4) { - let pause_mora = accent_phrase.pause_mora().clone().unwrap(); - assert_eq!(pause_mora.text(), "、"); - assert_eq!(pause_mora.consonant(), &None); - assert_eq!(pause_mora.consonant_length(), &None); - assert_eq!(pause_mora.vowel(), "pau"); - assert_ne!( - pause_mora.vowel_length(), - &0.0, - "pause_mora.vowel_length() should not be 0.0" - ); - } - } -} diff --git a/crates/voicevox_core/src/error.rs b/crates/voicevox_core/src/error.rs index 043b51991..decbddca1 100644 --- a/crates/voicevox_core/src/error.rs +++ b/crates/voicevox_core/src/error.rs @@ -28,7 +28,7 @@ impl Error { /// 対応する[`ErrorKind`]を返す。 pub fn kind(&self) -> ErrorKind { match &self.0 { - ErrorRepr::NotLoadedOpenjtalkDict => ErrorKind::NotLoadedOpenjtalkDict, + ErrorRepr::LoadOpenjtalkSystemDic(_) => ErrorKind::LoadOpenjtalkSystemDic, ErrorRepr::GpuSupport => ErrorKind::GpuSupport, ErrorRepr::LoadModel(LoadModelError { context, .. 
}) => match context { LoadModelErrorKind::OpenZipFile => ErrorKind::OpenZipFile, @@ -54,8 +54,8 @@ impl Error { #[derive(Error, Debug)] pub(crate) enum ErrorRepr { - #[error("OpenJTalkの辞書が読み込まれていません")] - NotLoadedOpenjtalkDict, + #[error("ディレクトリ`{_0}`をOpen JTalkのシステム辞書として読むことができませんでした")] + LoadOpenjtalkSystemDic(String), #[error("GPU機能をサポートすることができません")] GpuSupport, @@ -106,8 +106,8 @@ pub(crate) enum ErrorRepr { /// エラーの種類。 #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] pub enum ErrorKind { - /// open_jtalk辞書ファイルが読み込まれていない。 - NotLoadedOpenjtalkDict, + /// Open JTalkのシステム辞書を読むことができなかった。 + LoadOpenjtalkSystemDic, /// GPUモードがサポートされていない。 GpuSupport, /// ZIPファイルを開くことに失敗した。 diff --git a/crates/voicevox_core/src/infer/status.rs b/crates/voicevox_core/src/infer/status.rs index 7903cb8ff..887fa3900 100644 --- a/crates/voicevox_core/src/infer/status.rs +++ b/crates/voicevox_core/src/infer/status.rs @@ -86,14 +86,14 @@ impl Status { self.loaded_models.lock().unwrap().contains_style(style_id) } - pub fn validate_speaker_id(&self, style_id: StyleId) -> bool { - self.is_loaded_model_by_style_id(style_id) - } - + /// 推論を実行する。 + /// + /// CPU/GPU-bound操作であるため、async文脈ではスレッドに包むべきである。 + /// /// # Panics /// /// `self`が`model_id`を含んでいないとき、パニックする。 - pub(crate) async fn run_session( + pub(crate) fn run_session( &self, model_id: &VoiceModelId, input: I, @@ -103,10 +103,7 @@ impl Status { I::Signature: InferenceSignature, { let sess = self.loaded_models.lock().unwrap().get(model_id); - - tokio::task::spawn_blocking(move || sess.run(input)) - .await - .unwrap() + sess.run(input) } } diff --git a/crates/voicevox_core/src/inference_core.rs b/crates/voicevox_core/src/inference_core.rs index 875c9ba64..4c286827a 100644 --- a/crates/voicevox_core/src/inference_core.rs +++ b/crates/voicevox_core/src/inference_core.rs @@ -1,4 +1,5 @@ use enum_map::enum_map; +use ndarray::{Array1, ArrayView1, ArrayView2}; use crate::infer::{ domain::{ @@ -9,12 +10,14 @@ use crate::infer::{ status::Status, InferenceRuntime, InferenceSessionOptions, }; +use itertools::Itertools as _; use super::*; const PHONEME_LENGTH_MINIMAL: f32 = 0.01; pub(crate) struct InferenceCore { + use_gpu: bool, status: Status, } @@ -32,7 +35,7 @@ impl InferenceCore { | InferenceOperationImpl::PredictIntonation => light_session_options, InferenceOperationImpl::Decode => heavy_session_options, }); - Ok(Self { status }) + Ok(Self { use_gpu, status }) } else { Err(ErrorRepr::GpuSupport.into()) } @@ -50,6 +53,10 @@ impl InferenceCore { } } + pub(crate) fn is_use_gpu(&self) -> bool { + self.use_gpu + } + pub async fn load_model(&self, model: &VoiceModel) -> Result<()> { let model_bytes = &model.read_inference_models().await?; self.status.load_model(model, model_bytes).await @@ -70,29 +77,25 @@ impl InferenceCore { self.status.is_loaded_model_by_style_id(style_id) } - pub async fn predict_duration( + /// `predict_duration`を実行する。 + /// + /// CPU-bound操作であるため、async文脈ではスレッドに包むべきである。 + pub fn predict_duration( &self, - phoneme_vector: &[i64], + phoneme_list: Array1, style_id: StyleId, ) -> Result> { - if !self.status.validate_speaker_id(style_id) { - return Err(ErrorRepr::StyleNotFound { style_id }.into()); - } - let (model_id, model_inner_id) = self.status.ids_for(style_id)?; let PredictDurationOutput { phoneme_length: output, - } = self - .status - .run_session( - &model_id, - PredictDurationInput { - phoneme_list: ndarray::arr1(phoneme_vector), - speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), - }, - ) - .await?; + } = 
self.status.run_session( + &model_id, + PredictDurationInput { + phoneme_list, + speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), + }, + )?; let mut output = output.into_raw_vec(); for output_item in output.iter_mut() { @@ -104,58 +107,78 @@ impl InferenceCore { Ok(output) } + /// `predict_intonation`を実行する。 + /// + /// CPU-bound操作であるため、async文脈ではスレッドに包むべきである。 + /// + /// # Panics + /// + /// 長さが合わないとき、パニックする。 #[allow(clippy::too_many_arguments)] - pub async fn predict_intonation( + pub fn predict_intonation( &self, - length: usize, - vowel_phoneme_vector: &[i64], - consonant_phoneme_vector: &[i64], - start_accent_vector: &[i64], - end_accent_vector: &[i64], - start_accent_phrase_vector: &[i64], - end_accent_phrase_vector: &[i64], + vowel_phoneme_list: Array1, + consonant_phoneme_list: Array1, + start_accent_list: Array1, + end_accent_list: Array1, + start_accent_phrase_list: Array1, + end_accent_phrase_list: Array1, style_id: StyleId, ) -> Result> { - if !self.status.validate_speaker_id(style_id) { - return Err(ErrorRepr::StyleNotFound { style_id }.into()); - } - let (model_id, model_inner_id) = self.status.ids_for(style_id)?; - let PredictIntonationOutput { f0_list: output } = self - .status - .run_session( - &model_id, - PredictIntonationInput { - length: ndarray::arr0(length as i64), - vowel_phoneme_list: ndarray::arr1(vowel_phoneme_vector), - consonant_phoneme_list: ndarray::arr1(consonant_phoneme_vector), - start_accent_list: ndarray::arr1(start_accent_vector), - end_accent_list: ndarray::arr1(end_accent_vector), - start_accent_phrase_list: ndarray::arr1(start_accent_phrase_vector), - end_accent_phrase_list: ndarray::arr1(end_accent_phrase_vector), - speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), - }, - ) - .await?; + let Ok(&length) = [ + vowel_phoneme_list.len(), + consonant_phoneme_list.len(), + start_accent_list.len(), + end_accent_list.len(), + start_accent_phrase_list.len(), + end_accent_phrase_list.len(), + ] + .iter() + .unique() + .exactly_one() else { + panic!("different lengths"); + }; + + let PredictIntonationOutput { f0_list: output } = self.status.run_session( + &model_id, + PredictIntonationInput { + length: ndarray::arr0(length as i64), + vowel_phoneme_list, + consonant_phoneme_list, + start_accent_list, + end_accent_list, + start_accent_phrase_list, + end_accent_phrase_list, + speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), + }, + )?; Ok(output.into_raw_vec()) } - pub async fn decode( + /// `predict_intonation`を実行する。 + /// + /// CPU/GPU-bound操作であるため、async文脈ではスレッドに包むべきである。 + /// + /// # Panics + /// + /// `f0`と`phoneme`の長さが合わないとき、パニックする。 + pub fn decode( &self, - length: usize, - phoneme_size: usize, - f0: &[f32], - phoneme_vector: &[f32], + f0: ArrayView1<'_, f32>, + phoneme: ArrayView2<'_, f32>, style_id: StyleId, ) -> Result> { - if !self.status.validate_speaker_id(style_id) { - return Err(ErrorRepr::StyleNotFound { style_id }.into()); - } - let (model_id, model_inner_id) = self.status.ids_for(style_id)?; + let length = f0.len(); + let (phoneme_length, phoneme_size) = phoneme.dim(); + if phoneme_length != length { + panic!("different lengths"); + } + // 音が途切れてしまうのを避けるworkaround処理が入っている // TODO: 改善したらここのpadding処理を取り除く const PADDING_SIZE: f64 = 0.4; @@ -163,30 +186,28 @@ impl InferenceCore { let padding_size = ((PADDING_SIZE * DEFAULT_SAMPLING_RATE) / 256.0).round() as usize; let start_and_end_padding_size = 2 * padding_size; let length_with_padding = length + start_and_end_padding_size; - let f0_with_padding = 
Self::make_f0_with_padding(f0, length_with_padding, padding_size); + let f0_with_padding = + Self::make_f0_with_padding(f0.to_slice().unwrap(), length_with_padding, padding_size); let phoneme_with_padding = Self::make_phoneme_with_padding( - phoneme_vector, + phoneme.to_slice().unwrap(), phoneme_size, length_with_padding, padding_size, ); - let DecodeOutput { wave: output } = self - .status - .run_session( - &model_id, - DecodeInput { - f0: ndarray::arr1(&f0_with_padding) - .into_shape([length_with_padding, 1]) - .unwrap(), - phoneme: ndarray::arr1(&phoneme_with_padding) - .into_shape([length_with_padding, phoneme_size]) - .unwrap(), - speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), - }, - ) - .await?; + let DecodeOutput { wave: output } = self.status.run_session( + &model_id, + DecodeInput { + f0: ndarray::arr1(&f0_with_padding) + .into_shape([length_with_padding, 1]) + .unwrap(), + phoneme: ndarray::arr1(&phoneme_with_padding) + .into_shape([length_with_padding, phoneme_size]) + .unwrap(), + speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), + }, + )?; Ok(Self::trim_padding_from_output( output.into_raw_vec(), diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 47ac0997a..63f02ed9d 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -1,11 +1,18 @@ -use std::sync::Arc; +use std::{ + borrow::Borrow, + io::{Cursor, Write as _}, + sync::Arc, +}; + +use ndarray::{Array1, ArrayView, ArrayView1, ArrayView2}; use crate::{ engine::{ - create_kana, parse_kana, AccentPhraseModel, OpenJtalk, SynthesisEngine, - DEFAULT_SAMPLING_RATE, + self, create_kana, parse_kana, AccentPhraseModel, MoraModel, OjtPhoneme, OpenJtalk, + Utterance, }, infer::runtimes::Onnxruntime, + numerics::F32Ext as _, }; use super::*; @@ -13,6 +20,7 @@ use super::*; /// [`Synthesizer::synthesis`]のオプション。 /// /// [`Synthesizer::synthesis`]: Synthesizer::synthesis +#[derive(Clone, Copy)] pub struct SynthesisOptions { pub enable_interrogative_upspeak: bool, } @@ -34,6 +42,7 @@ impl From<&TtsOptions> for SynthesisOptions { /// [`Synthesizer::tts`]のオプション。 /// /// [`Synthesizer::tts`]: Synthesizer::tts +#[derive(Clone, Copy)] pub struct TtsOptions { pub enable_interrogative_upspeak: bool, } @@ -76,12 +85,9 @@ pub struct InitializeOptions { pub(crate) type InferenceRuntimeImpl = Onnxruntime; /// 音声シンセサイザ。 -pub struct Synthesizer { - synthesis_engine: SynthesisEngine, - use_gpu: bool, -} +pub struct Synthesizer(Arc>); -impl Synthesizer { +impl Synthesizer { /// `Synthesizer`をコンストラクトする。 /// /// # Example @@ -108,7 +114,7 @@ impl Synthesizer { /// # Ok(()) /// # } /// ``` - pub fn new(open_jtalk: Arc, options: &InitializeOptions) -> Result { + pub fn new(open_jtalk: O, options: &InitializeOptions) -> Result { #[cfg(windows)] list_windows_video_cards(); let use_gpu = match options.acceleration_mode { @@ -128,53 +134,44 @@ impl Synthesizer { AccelerationMode::Gpu => true, }; - Ok(Self { - synthesis_engine: SynthesisEngine::new( - InferenceCore::new(use_gpu, options.cpu_num_threads)?, + Ok(Self( + Inner { + inference_core: InferenceCore::new(use_gpu, options.cpu_num_threads)?, open_jtalk, - ), - use_gpu, - }) + } + .into(), + )) } /// ハードウェアアクセラレーションがGPUモードか判定する。 pub fn is_gpu_mode(&self) -> bool { - self.use_gpu + self.0.inference_core.is_use_gpu() } /// 音声モデルを読み込む。 pub async fn load_voice_model(&self, model: &VoiceModel) -> Result<()> { - self.synthesis_engine - .inference_core() - .load_model(model) - .await?; + 
self.0.inference_core.load_model(model).await?; Ok(()) } /// 音声モデルの読み込みを解除する。 pub fn unload_voice_model(&self, voice_model_id: &VoiceModelId) -> Result<()> { - self.synthesis_engine - .inference_core() - .unload_model(voice_model_id) + self.0.inference_core.unload_model(voice_model_id) } /// 指定したIDの音声モデルが読み込まれているか判定する。 pub fn is_loaded_voice_model(&self, voice_model_id: &VoiceModelId) -> bool { - self.synthesis_engine - .inference_core() - .is_loaded_model(voice_model_id) + self.0.inference_core.is_loaded_model(voice_model_id) } #[doc(hidden)] pub fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { - self.synthesis_engine - .inference_core() - .is_model_loaded_by_style_id(style_id) + self.0.inference_core.is_model_loaded_by_style_id(style_id) } /// 今読み込んでいる音声モデルのメタ情報を返す。 pub fn metas(&self) -> VoiceModelMeta { - self.synthesis_engine.inference_core().metas() + self.0.inference_core.metas() } /// AudioQueryから音声合成を行う。 @@ -184,63 +181,52 @@ impl Synthesizer { style_id: StyleId, options: &SynthesisOptions, ) -> Result> { - self.synthesis_engine - .synthesis_wave_format(audio_query, style_id, options.enable_interrogative_upspeak) + let audio_query = audio_query.clone(); + let options = *options; + + self.spawn_blocking(move |inner| inner.blocking_synthesis(&audio_query, style_id, &options)) .await } - #[doc(hidden)] - pub async fn predict_duration( + pub(crate) fn predict_duration( &self, - phoneme_vector: &[i64], + phoneme_list: Array1, style_id: StyleId, ) -> Result> { - self.synthesis_engine - .inference_core() - .predict_duration(phoneme_vector, style_id) - .await + self.0 + .inference_core + .predict_duration(phoneme_list, style_id) } #[allow(clippy::too_many_arguments)] - #[doc(hidden)] - pub async fn predict_intonation( + pub(crate) fn predict_intonation( &self, - length: usize, - vowel_phoneme_vector: &[i64], - consonant_phoneme_vector: &[i64], - start_accent_vector: &[i64], - end_accent_vector: &[i64], - start_accent_phrase_vector: &[i64], - end_accent_phrase_vector: &[i64], + vowel_phoneme_list: Array1, + consonant_phoneme_list: Array1, + start_accent_list: Array1, + end_accent_list: Array1, + start_accent_phrase_list: Array1, + end_accent_phrase_list: Array1, style_id: StyleId, ) -> Result> { - self.synthesis_engine - .inference_core() - .predict_intonation( - length, - vowel_phoneme_vector, - consonant_phoneme_vector, - start_accent_vector, - end_accent_vector, - start_accent_phrase_vector, - end_accent_phrase_vector, - style_id, - ) - .await + self.0.inference_core.predict_intonation( + vowel_phoneme_list, + consonant_phoneme_list, + start_accent_list, + end_accent_list, + start_accent_phrase_list, + end_accent_phrase_list, + style_id, + ) } - #[doc(hidden)] - pub async fn decode( + + pub(crate) fn decode( &self, - length: usize, - phoneme_size: usize, - f0: &[f32], - phoneme_vector: &[f32], + f0: ArrayView1<'_, f32>, + phoneme: ArrayView2<'_, f32>, style_id: StyleId, ) -> Result> { - self.synthesis_engine - .inference_core() - .decode(length, phoneme_size, f0, phoneme_vector, style_id) - .await + self.0.inference_core.decode(f0, phoneme, style_id) } /// AquesTalk風記法からAccentPhrase (アクセント句)の配列を生成する。 @@ -271,12 +257,57 @@ impl Synthesizer { kana: &str, style_id: StyleId, ) -> Result> { - self.synthesis_engine - .replace_mora_data(&parse_kana(kana)?, style_id) - .await + let kana = kana.to_owned(); + + self.spawn_blocking(move |inner| { + inner.blocking_create_accent_phrases_from_kana(&kana, style_id) + }) + .await } - /// 日本語のテキストからAccentPhrase (アクセント句)の配列を生成する。 + /// 
AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。 + pub async fn replace_mora_data( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let accent_phrases = accent_phrases.to_owned(); + + self.spawn_blocking(move |inner| { + inner.blocking_replace_mora_data(&accent_phrases, style_id) + }) + .await + } + + /// AccentPhraseの配列の音素長を、特定の声で生成しなおす。 + pub async fn replace_phoneme_length( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let accent_phrases = accent_phrases.to_owned(); + + self.spawn_blocking(move |inner| { + inner.blocking_replace_phoneme_length(&accent_phrases, style_id) + }) + .await + } + + /// AccentPhraseの配列の音高を、特定の声で生成しなおす。 + pub async fn replace_mora_pitch( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let accent_phrases = accent_phrases.to_owned(); + + self.spawn_blocking(move |inner| { + inner.blocking_replace_mora_pitch(&accent_phrases, style_id) + }) + .await + } + + /// AquesTalk風記法から[AudioQuery]を生成する。 /// /// # Example /// @@ -292,60 +323,55 @@ impl Synthesizer { /// # /// use voicevox_core::StyleId; /// - /// let accent_phrases = synthesizer - /// .create_accent_phrases("こんにちは", StyleId::new(302)) + /// let audio_query = synthesizer + /// .audio_query_from_kana("コンニチワ'", StyleId::new(302)) /// .await?; /// # /// # Ok(()) /// # } /// ``` - pub async fn create_accent_phrases( + /// + /// [AudioQuery]: crate::AudioQueryModel + pub async fn audio_query_from_kana( &self, - text: &str, + kana: &str, style_id: StyleId, - ) -> Result> { - if !self.synthesis_engine.is_openjtalk_dict_loaded() { - return Err(ErrorRepr::NotLoadedOpenjtalkDict.into()); - } - self.synthesis_engine - .create_accent_phrases(text, style_id) - .await - } + ) -> Result { + let kana = kana.to_owned(); - /// AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。 - pub async fn replace_mora_data( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - self.synthesis_engine - .replace_mora_data(accent_phrases, style_id) + self.spawn_blocking(move |inner| inner.blocking_audio_query_from_kana(&kana, style_id)) .await } - /// AccentPhraseの配列の音素長を、特定の声で生成しなおす。 - pub async fn replace_phoneme_length( + /// AquesTalk風記法から音声合成を行う。 + pub async fn tts_from_kana( &self, - accent_phrases: &[AccentPhraseModel], + kana: &str, style_id: StyleId, - ) -> Result> { - self.synthesis_engine - .replace_phoneme_length(accent_phrases, style_id) + options: &TtsOptions, + ) -> Result> { + let kana = kana.to_owned(); + let options = *options; + + self.spawn_blocking(move |inner| inner.blocking_tts_from_kana(&kana, style_id, &options)) .await } - /// AccentPhraseの配列の音高を、特定の声で生成しなおす。 - pub async fn replace_mora_pitch( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - self.synthesis_engine - .replace_mora_pitch(accent_phrases, style_id) + async fn spawn_blocking(&self, f: F) -> Result + where + F: FnOnce(&Inner) -> Result + Send + 'static, + R: Send + 'static, + { + let inner = self.0.clone(); + + tokio::task::spawn_blocking(move || f(&inner)) .await + .unwrap() } +} - /// AquesTalk風記法から[AudioQuery]を生成する。 +impl + Send + Sync + 'static> Synthesizer { + /// 日本語のテキストからAccentPhrase (アクセント句)の配列を生成する。 /// /// # Example /// @@ -361,22 +387,22 @@ impl Synthesizer { /// # /// use voicevox_core::StyleId; /// - /// let audio_query = synthesizer - /// .audio_query_from_kana("コンニチワ'", StyleId::new(302)) + /// let accent_phrases = synthesizer + /// .create_accent_phrases("こんにちは", 
StyleId::new(302)) /// .await?; /// # /// # Ok(()) /// # } /// ``` - /// - /// [AudioQuery]: crate::AudioQueryModel - pub async fn audio_query_from_kana( + pub async fn create_accent_phrases( &self, - kana: &str, + text: &str, style_id: StyleId, - ) -> Result { - let accent_phrases = self.create_accent_phrases_from_kana(kana, style_id).await?; - Ok(AudioQueryModel::from_accent_phrases(accent_phrases).with_kana(Some(kana.to_owned()))) + ) -> Result> { + let text = text.to_owned(); + + self.spawn_blocking(move |inner| inner.blocking_create_accent_phrases(&text, style_id)) + .await } /// 日本語のテキストから[AudioQuery]を生成する。 @@ -405,35 +431,559 @@ impl Synthesizer { /// /// [AudioQuery]: crate::AudioQueryModel pub async fn audio_query(&self, text: &str, style_id: StyleId) -> Result { - let accent_phrases = self.create_accent_phrases(text, style_id).await?; - Ok(AudioQueryModel::from_accent_phrases(accent_phrases)) + let text = text.to_owned(); + + self.spawn_blocking(move |inner| inner.blocking_audio_query(&text, style_id)) + .await } - /// AquesTalk風記法から音声合成を行う。 - pub async fn tts_from_kana( + /// 日本語のテキストから音声合成を行う。 + pub async fn tts( &self, - kana: &str, + text: &str, style_id: StyleId, options: &TtsOptions, ) -> Result> { - let audio_query = &self.audio_query_from_kana(kana, style_id).await?; - self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) + let text = text.to_owned(); + let options = *options; + + self.spawn_blocking(move |inner| inner.blocking_tts(&text, style_id, &options)) .await } +} - /// 日本語のテキストから音声合成を行う。 - pub async fn tts( +struct Inner { + inference_core: InferenceCore, + open_jtalk: O, +} + +impl Inner { + fn blocking_synthesis( &self, - text: &str, + audio_query: &AudioQueryModel, + style_id: StyleId, + options: &SynthesisOptions, + ) -> Result> { + let speed_scale = *audio_query.speed_scale(); + let pitch_scale = *audio_query.pitch_scale(); + let intonation_scale = *audio_query.intonation_scale(); + let pre_phoneme_length = *audio_query.pre_phoneme_length(); + let post_phoneme_length = *audio_query.post_phoneme_length(); + + let accent_phrases = if options.enable_interrogative_upspeak { + adjust_interrogative_accent_phrases(audio_query.accent_phrases().as_slice()) + } else { + audio_query.accent_phrases().clone() + }; + + let (flatten_moras, phoneme_data_list) = initial_process(&accent_phrases); + + let mut phoneme_length_list = vec![pre_phoneme_length]; + let mut f0_list = vec![0.]; + let mut voiced_list = vec![false]; + { + let mut sum_of_f0_bigger_than_zero = 0.; + let mut count_of_f0_bigger_than_zero = 0; + + for mora in flatten_moras { + let consonant_length = *mora.consonant_length(); + let vowel_length = *mora.vowel_length(); + let pitch = *mora.pitch(); + + if let Some(consonant_length) = consonant_length { + phoneme_length_list.push(consonant_length); + } + phoneme_length_list.push(vowel_length); + + let f0_single = pitch * 2.0_f32.powf(pitch_scale); + f0_list.push(f0_single); + + let bigger_than_zero = f0_single > 0.; + voiced_list.push(bigger_than_zero); + + if bigger_than_zero { + sum_of_f0_bigger_than_zero += f0_single; + count_of_f0_bigger_than_zero += 1; + } + } + phoneme_length_list.push(post_phoneme_length); + f0_list.push(0.); + voiced_list.push(false); + let mean_f0 = sum_of_f0_bigger_than_zero / (count_of_f0_bigger_than_zero as f32); + + if !mean_f0.is_nan() { + for i in 0..f0_list.len() { + if voiced_list[i] { + f0_list[i] = (f0_list[i] - mean_f0) * intonation_scale + mean_f0; + } + } + } + } + + let (_, _, vowel_indexes) = 
split_mora(&phoneme_data_list); + + let mut phoneme: Vec<[f32; OjtPhoneme::NUM_PHONEME]> = Vec::new(); + let mut f0: Vec = Vec::new(); + { + const RATE: f32 = 24000. / 256.; + let mut sum_of_phoneme_length = 0; + let mut count_of_f0 = 0; + let mut vowel_indexes_index = 0; + + for (i, phoneme_length) in phoneme_length_list.iter().enumerate() { + // VOICEVOX ENGINEと挙動を合わせるため、四捨五入ではなく偶数丸めをする + // + // https://github.com/VOICEVOX/voicevox_engine/issues/552 + let phoneme_length = ((*phoneme_length * RATE).round_ties_even_() / speed_scale) + .round_ties_even_() as usize; + let phoneme_id = phoneme_data_list[i].phoneme_id(); + + for _ in 0..phoneme_length { + let mut phonemes_vec = [0.; OjtPhoneme::NUM_PHONEME]; + phonemes_vec[phoneme_id as usize] = 1.; + phoneme.push(phonemes_vec) + } + sum_of_phoneme_length += phoneme_length; + + if i as i64 == vowel_indexes[vowel_indexes_index] { + for _ in 0..sum_of_phoneme_length { + f0.push(f0_list[count_of_f0]); + } + count_of_f0 += 1; + sum_of_phoneme_length = 0; + vowel_indexes_index += 1; + } + } + } + + let phoneme = &phoneme.into_iter().flatten().collect::>(); + let phoneme = ArrayView::from_shape((f0.len(), OjtPhoneme::NUM_PHONEME), phoneme).unwrap(); + + let wave = &self + .inference_core + .decode(ndarray::aview1(&f0), phoneme, style_id)?; + + return Ok(to_wav(wave, audio_query)); + + fn adjust_interrogative_accent_phrases( + accent_phrases: &[AccentPhraseModel], + ) -> Vec { + accent_phrases + .iter() + .map(|accent_phrase| { + AccentPhraseModel::new( + adjust_interrogative_moras(accent_phrase), + *accent_phrase.accent(), + accent_phrase.pause_mora().clone(), + *accent_phrase.is_interrogative(), + ) + }) + .collect() + } + + fn adjust_interrogative_moras(accent_phrase: &AccentPhraseModel) -> Vec { + let moras = accent_phrase.moras(); + if *accent_phrase.is_interrogative() && !moras.is_empty() { + let last_mora = moras.last().unwrap(); + let last_mora_pitch = *last_mora.pitch(); + if last_mora_pitch != 0.0 { + let mut new_moras: Vec = Vec::with_capacity(moras.len() + 1); + new_moras.extend_from_slice(moras.as_slice()); + let interrogative_mora = make_interrogative_mora(last_mora); + new_moras.push(interrogative_mora); + return new_moras; + } + } + moras.clone() + } + + fn make_interrogative_mora(last_mora: &MoraModel) -> MoraModel { + const FIX_VOWEL_LENGTH: f32 = 0.15; + const ADJUST_PITCH: f32 = 0.3; + const MAX_PITCH: f32 = 6.5; + + let pitch = (*last_mora.pitch() + ADJUST_PITCH).min(MAX_PITCH); + + MoraModel::new( + mora_to_text(last_mora.vowel()), + None, + None, + last_mora.vowel().clone(), + FIX_VOWEL_LENGTH, + pitch, + ) + } + + fn to_wav(wave: &[f32], query: &AudioQueryModel) -> Vec { + let volume_scale = *query.volume_scale(); + let output_stereo = *query.output_stereo(); + let output_sampling_rate = *query.output_sampling_rate(); + + // TODO: 44.1kHzなどの対応 + + let num_channels: u16 = if output_stereo { 2 } else { 1 }; + let bit_depth: u16 = 16; + let repeat_count: u32 = + (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; + let block_size: u16 = bit_depth * num_channels / 8; + + let bytes_size = wave.len() as u32 * repeat_count * 2; + let wave_size = bytes_size + 44; + + let buf: Vec = Vec::with_capacity(wave_size as usize); + let mut cur = Cursor::new(buf); + + cur.write_all("RIFF".as_bytes()).unwrap(); + cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap(); + cur.write_all("WAVEfmt ".as_bytes()).unwrap(); + cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt header length + 
cur.write_all(&1_u16.to_le_bytes()).unwrap(); //linear PCM + cur.write_all(&num_channels.to_le_bytes()).unwrap(); + cur.write_all(&output_sampling_rate.to_le_bytes()).unwrap(); + + let block_rate = output_sampling_rate * block_size as u32; + + cur.write_all(&block_rate.to_le_bytes()).unwrap(); + cur.write_all(&block_size.to_le_bytes()).unwrap(); + cur.write_all(&bit_depth.to_le_bytes()).unwrap(); + cur.write_all("data".as_bytes()).unwrap(); + cur.write_all(&bytes_size.to_le_bytes()).unwrap(); + + for value in wave { + let v = (value * volume_scale).clamp(-1., 1.); + let data = (v * 0x7fff as f32) as i16; + for _ in 0..repeat_count { + cur.write_all(&data.to_le_bytes()).unwrap(); + } + } + + cur.into_inner() + } + } + + fn blocking_create_accent_phrases_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> Result> { + self.blocking_replace_mora_data(&parse_kana(kana)?, style_id) + } + + fn blocking_replace_mora_data( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let accent_phrases = self.blocking_replace_phoneme_length(accent_phrases, style_id)?; + self.blocking_replace_mora_pitch(&accent_phrases, style_id) + } + + fn blocking_replace_phoneme_length( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let (_, phoneme_data_list) = initial_process(accent_phrases); + + let (_, _, vowel_indexes_data) = split_mora(&phoneme_data_list); + + let phoneme_list_s: Array1 = phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); + let phoneme_length = self + .inference_core + .predict_duration(phoneme_list_s, style_id)?; + + let mut index = 0; + let new_accent_phrases = accent_phrases + .iter() + .map(|accent_phrase| { + AccentPhraseModel::new( + accent_phrase + .moras() + .iter() + .map(|mora| { + let new_mora = MoraModel::new( + mora.text().clone(), + mora.consonant().clone(), + mora.consonant().as_ref().map(|_| { + phoneme_length[vowel_indexes_data[index + 1] as usize - 1] + }), + mora.vowel().clone(), + phoneme_length[vowel_indexes_data[index + 1] as usize], + *mora.pitch(), + ); + index += 1; + new_mora + }) + .collect(), + *accent_phrase.accent(), + accent_phrase.pause_mora().as_ref().map(|pause_mora| { + let new_pause_mora = MoraModel::new( + pause_mora.text().clone(), + pause_mora.consonant().clone(), + *pause_mora.consonant_length(), + pause_mora.vowel().clone(), + phoneme_length[vowel_indexes_data[index + 1] as usize], + *pause_mora.pitch(), + ); + index += 1; + new_pause_mora + }), + *accent_phrase.is_interrogative(), + ) + }) + .collect(); + + Ok(new_accent_phrases) + } + + fn blocking_replace_mora_pitch( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let (_, phoneme_data_list) = initial_process(accent_phrases); + + let mut base_start_accent_list = vec![0]; + let mut base_end_accent_list = vec![0]; + let mut base_start_accent_phrase_list = vec![0]; + let mut base_end_accent_phrase_list = vec![0]; + for accent_phrase in accent_phrases { + let mut accent = usize::from(*accent_phrase.accent() != 1); + create_one_accent_list(&mut base_start_accent_list, accent_phrase, accent as i32); + + accent = *accent_phrase.accent() - 1; + create_one_accent_list(&mut base_end_accent_list, accent_phrase, accent as i32); + create_one_accent_list(&mut base_start_accent_phrase_list, accent_phrase, 0); + create_one_accent_list(&mut base_end_accent_phrase_list, accent_phrase, -1); + } + base_start_accent_list.push(0); + base_end_accent_list.push(0); + 
base_start_accent_phrase_list.push(0); + base_end_accent_phrase_list.push(0); + + let (consonant_phoneme_data_list, vowel_phoneme_data_list, vowel_indexes) = + split_mora(&phoneme_data_list); + + let consonant_phoneme_list: Vec = consonant_phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); + let vowel_phoneme_list: Vec = vowel_phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); + + let mut start_accent_list = Vec::with_capacity(vowel_indexes.len()); + let mut end_accent_list = Vec::with_capacity(vowel_indexes.len()); + let mut start_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); + let mut end_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); + + for vowel_index in vowel_indexes { + start_accent_list.push(base_start_accent_list[vowel_index as usize]); + end_accent_list.push(base_end_accent_list[vowel_index as usize]); + start_accent_phrase_list.push(base_start_accent_phrase_list[vowel_index as usize]); + end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]); + } + + let mut f0_list = self.inference_core.predict_intonation( + vowel_phoneme_list.into(), + consonant_phoneme_list.into(), + start_accent_list.into(), + end_accent_list.into(), + start_accent_phrase_list.into(), + end_accent_phrase_list.into(), + style_id, + )?; + + for i in 0..vowel_phoneme_data_list.len() { + if UNVOICED_MORA_PHONEME_LIST + .iter() + .any(|phoneme| *phoneme == vowel_phoneme_data_list[i].phoneme()) + { + f0_list[i] = 0.; + } + } + + let mut index = 0; + let new_accent_phrases = accent_phrases + .iter() + .map(|accent_phrase| { + AccentPhraseModel::new( + accent_phrase + .moras() + .iter() + .map(|mora| { + let new_mora = MoraModel::new( + mora.text().clone(), + mora.consonant().clone(), + *mora.consonant_length(), + mora.vowel().clone(), + *mora.vowel_length(), + f0_list[index + 1], + ); + index += 1; + new_mora + }) + .collect(), + *accent_phrase.accent(), + accent_phrase.pause_mora().as_ref().map(|pause_mora| { + let new_pause_mora = MoraModel::new( + pause_mora.text().clone(), + pause_mora.consonant().clone(), + *pause_mora.consonant_length(), + pause_mora.vowel().clone(), + *pause_mora.vowel_length(), + f0_list[index + 1], + ); + index += 1; + new_pause_mora + }), + *accent_phrase.is_interrogative(), + ) + }) + .collect(); + + return Ok(new_accent_phrases); + + const UNVOICED_MORA_PHONEME_LIST: &[&str] = &["A", "I", "U", "E", "O", "cl", "pau"]; + + fn create_one_accent_list( + accent_list: &mut Vec, + accent_phrase: &AccentPhraseModel, + point: i32, + ) { + let mut one_accent_list: Vec = Vec::new(); + + for (i, mora) in accent_phrase.moras().iter().enumerate() { + let value = (i as i32 == point + || (point < 0 && i == (accent_phrase.moras().len() as i32 + point) as usize)) + .into(); + one_accent_list.push(value); + if mora.consonant().is_some() { + one_accent_list.push(value); + } + } + if accent_phrase.pause_mora().is_some() { + one_accent_list.push(0); + } + accent_list.extend(one_accent_list) + } + } + + fn blocking_audio_query_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> Result { + let accent_phrases = self.blocking_create_accent_phrases_from_kana(kana, style_id)?; + Ok(AudioQueryModel::from_accent_phrases(accent_phrases).with_kana(Some(kana.to_owned()))) + } + + fn blocking_tts_from_kana( + &self, + kana: &str, style_id: StyleId, options: &TtsOptions, ) -> Result> { - let audio_query = &self.audio_query(text, style_id).await?; - self.synthesis(audio_query, 
style_id, &SynthesisOptions::from(options)) - .await + let audio_query = &self.blocking_audio_query_from_kana(kana, style_id)?; + self.blocking_synthesis(audio_query, style_id, &SynthesisOptions::from(options)) + } +} + +impl + Send + Sync + 'static> Inner { + fn blocking_create_accent_phrases( + &self, + text: &str, + style_id: StyleId, + ) -> Result> { + if text.is_empty() { + return Ok(Vec::new()); + } + + let utterance = Utterance::extract_full_context_label(self.open_jtalk.borrow(), text)?; + + let accent_phrases: Vec = utterance + .breath_groups() + .iter() + .enumerate() + .fold(Vec::new(), |mut accum_vec, (i, breath_group)| { + accum_vec.extend(breath_group.accent_phrases().iter().enumerate().map( + |(j, accent_phrase)| { + let moras = accent_phrase + .moras() + .iter() + .map(|mora| { + let mora_text = mora + .phonemes() + .iter() + .map(|phoneme| phoneme.phoneme().to_string()) + .collect::>() + .join(""); + + let (consonant, consonant_length) = + if let Some(consonant) = mora.consonant() { + (Some(consonant.phoneme().to_string()), Some(0.)) + } else { + (None, None) + }; + + MoraModel::new( + mora_to_text(mora_text), + consonant, + consonant_length, + mora.vowel().phoneme().into(), + 0., + 0., + ) + }) + .collect(); + + let pause_mora = if i != utterance.breath_groups().len() - 1 + && j == breath_group.accent_phrases().len() - 1 + { + Some(MoraModel::new( + "、".into(), + None, + None, + "pau".into(), + 0., + 0., + )) + } else { + None + }; + + AccentPhraseModel::new( + moras, + *accent_phrase.accent(), + pause_mora, + *accent_phrase.is_interrogative(), + ) + }, + )); + + accum_vec + }); + + self.blocking_replace_mora_data(&accent_phrases, style_id) + } + + fn blocking_audio_query(&self, text: &str, style_id: StyleId) -> Result { + let accent_phrases = self.blocking_create_accent_phrases(text, style_id)?; + Ok(AudioQueryModel::from_accent_phrases(accent_phrases)) + } + + fn blocking_tts(&self, text: &str, style_id: StyleId, options: &TtsOptions) -> Result> { + let audio_query = &self.blocking_audio_query(text, style_id)?; + self.blocking_synthesis(audio_query, style_id, &SynthesisOptions::from(options)) } } +const DEFAULT_SAMPLING_RATE: u32 = 24000; + #[cfg(windows)] fn list_windows_video_cards() { use std::{ffi::OsString, os::windows::ffi::OsStringExt as _}; @@ -472,6 +1022,99 @@ fn list_windows_video_cards() { } } +fn initial_process(accent_phrases: &[AccentPhraseModel]) -> (Vec, Vec) { + let flatten_moras = to_flatten_moras(accent_phrases); + + let mut phoneme_strings = vec!["pau".to_string()]; + for mora in flatten_moras.iter() { + if let Some(consonant) = mora.consonant() { + phoneme_strings.push(consonant.clone()) + } + phoneme_strings.push(mora.vowel().clone()); + } + phoneme_strings.push("pau".to_string()); + + let phoneme_data_list = to_phoneme_data_list(&phoneme_strings); + + return (flatten_moras, phoneme_data_list); + + fn to_flatten_moras(accent_phrases: &[AccentPhraseModel]) -> Vec { + let mut flatten_moras = Vec::new(); + + for accent_phrase in accent_phrases { + let moras = accent_phrase.moras(); + for mora in moras { + flatten_moras.push(mora.clone()); + } + if let Some(pause_mora) = accent_phrase.pause_mora() { + flatten_moras.push(pause_mora.clone()); + } + } + + flatten_moras + } + + fn to_phoneme_data_list>(phoneme_str_list: &[T]) -> Vec { + OjtPhoneme::convert( + phoneme_str_list + .iter() + .enumerate() + .map(|(i, s)| OjtPhoneme::new(s.as_ref().to_string(), i as f32, i as f32 + 1.)) + .collect::>() + .as_slice(), + ) + } +} + +fn split_mora(phoneme_list: 
&[OjtPhoneme]) -> (Vec, Vec, Vec) { + let mut vowel_indexes = Vec::new(); + for (i, phoneme) in phoneme_list.iter().enumerate() { + if MORA_PHONEME_LIST + .iter() + .any(|mora_phoneme| *mora_phoneme == phoneme.phoneme()) + { + vowel_indexes.push(i as i64); + } + } + + let vowel_phoneme_list = vowel_indexes + .iter() + .map(|vowel_index| phoneme_list[*vowel_index as usize].clone()) + .collect(); + + let mut consonant_phoneme_list = vec![OjtPhoneme::default()]; + for i in 0..(vowel_indexes.len() - 1) { + let prev = vowel_indexes[i]; + let next = vowel_indexes[i + 1]; + if next - prev == 1 { + consonant_phoneme_list.push(OjtPhoneme::default()); + } else { + consonant_phoneme_list.push(phoneme_list[next as usize - 1].clone()); + } + } + + return (consonant_phoneme_list, vowel_phoneme_list, vowel_indexes); + + const MORA_PHONEME_LIST: &[&str] = &[ + "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau", + ]; +} + +fn mora_to_text(mora: impl AsRef) -> String { + let last_char = mora.as_ref().chars().last().unwrap(); + let mora = if ['A', 'I', 'U', 'E', 'O'].contains(&last_char) { + format!( + "{}{}", + &mora.as_ref()[0..mora.as_ref().len() - 1], + last_char.to_lowercase() + ) + } else { + mora.as_ref().to_string() + }; + // もしカタカナに変換できなければ、引数で与えた文字列がそのまま返ってくる + engine::mora2text(&mora).to_string() +} + impl AudioQueryModel { fn from_accent_phrases(accent_phrases: Vec) -> Self { let kana = create_kana(&accent_phrases); @@ -492,17 +1135,19 @@ impl AudioQueryModel { #[cfg(test)] mod tests { + use std::sync::Arc; use super::*; use crate::{engine::MoraModel, macros::tests::assert_debug_fmt_eq}; use ::test_util::OPEN_JTALK_DIC_DIR; + use ndarray::{array, s, Array}; #[rstest] #[case(Ok(()))] #[tokio::test] async fn load_model_works(#[case] expected_result_at_initialized: Result<()>) { let syntesizer = Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, ..Default::default() @@ -525,7 +1170,7 @@ mod tests { #[tokio::test] async fn is_use_gpu_works() { let syntesizer = Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, ..Default::default() @@ -541,7 +1186,7 @@ mod tests { async fn is_loaded_model_by_style_id_works(#[case] style_id: u32, #[case] expected: bool) { let style_id = StyleId::new(style_id); let syntesizer = Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, ..Default::default() @@ -569,7 +1214,7 @@ mod tests { #[tokio::test] async fn predict_duration_works() { let syntesizer = Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, ..Default::default() @@ -583,14 +1228,12 @@ mod tests { .unwrap(); // 「こんにちは、音声合成の世界へようこそ」という文章を変換して得た phoneme_vector - let phoneme_vector = [ + let phoneme_vector = array![ 0, 23, 30, 4, 28, 21, 10, 21, 42, 7, 0, 30, 4, 35, 14, 14, 16, 30, 30, 35, 14, 14, 28, 30, 35, 14, 23, 7, 21, 14, 43, 30, 30, 23, 30, 35, 30, 0, ]; - let result = syntesizer - .predict_duration(&phoneme_vector, StyleId::new(1)) - .await; + let result = syntesizer.predict_duration(phoneme_vector.clone(), StyleId::new(1)); assert!(result.is_ok(), "{result:?}"); assert_eq!(result.unwrap().len(), phoneme_vector.len()); @@ -600,7 +1243,7 @@ mod tests { #[tokio::test] async fn predict_intonation_works() { let syntesizer = Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), 
&InitializeOptions { acceleration_mode: AccelerationMode::Cpu, ..Default::default() @@ -613,35 +1256,32 @@ mod tests { .unwrap(); // 「テスト」という文章に対応する入力 - let vowel_phoneme_vector = [0, 14, 6, 30, 0]; - let consonant_phoneme_vector = [-1, 37, 35, 37, -1]; - let start_accent_vector = [0, 1, 0, 0, 0]; - let end_accent_vector = [0, 1, 0, 0, 0]; - let start_accent_phrase_vector = [0, 1, 0, 0, 0]; - let end_accent_phrase_vector = [0, 0, 0, 1, 0]; - - let result = syntesizer - .predict_intonation( - vowel_phoneme_vector.len(), - &vowel_phoneme_vector, - &consonant_phoneme_vector, - &start_accent_vector, - &end_accent_vector, - &start_accent_phrase_vector, - &end_accent_phrase_vector, - StyleId::new(1), - ) - .await; + let vowel_phoneme_vector = array![0, 14, 6, 30, 0]; + let consonant_phoneme_vector = array![-1, 37, 35, 37, -1]; + let start_accent_vector = array![0, 1, 0, 0, 0]; + let end_accent_vector = array![0, 1, 0, 0, 0]; + let start_accent_phrase_vector = array![0, 1, 0, 0, 0]; + let end_accent_phrase_vector = array![0, 0, 0, 1, 0]; + + let result = syntesizer.predict_intonation( + vowel_phoneme_vector, + consonant_phoneme_vector, + start_accent_vector, + end_accent_vector, + start_accent_phrase_vector, + end_accent_phrase_vector, + StyleId::new(1), + ); assert!(result.is_ok(), "{result:?}"); - assert_eq!(result.unwrap().len(), vowel_phoneme_vector.len()); + assert_eq!(result.unwrap().len(), 5); } #[rstest] #[tokio::test] async fn decode_works() { let syntesizer = Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, ..Default::default() @@ -655,15 +1295,14 @@ mod tests { // 「テスト」という文章に対応する入力 const F0_LENGTH: usize = 69; - let mut f0 = [0.; F0_LENGTH]; - f0[9..24].fill(5.905218); - f0[37..60].fill(5.565851); + let mut f0 = ndarray::arr1(&[0.; F0_LENGTH]); + f0.slice_mut(s!(9..24)).fill(5.905218); + f0.slice_mut(s!(37..60)).fill(5.565851); - const PHONEME_SIZE: usize = 45; - let mut phoneme = [0.; PHONEME_SIZE * F0_LENGTH]; + let mut phoneme = Array::from_shape_simple_fn((F0_LENGTH, OjtPhoneme::NUM_PHONEME), || 0.); let mut set_one = |index, range| { for i in range { - phoneme[i * PHONEME_SIZE + index] = 1.; + phoneme[(i, index)] = 1.; } }; set_one(0, 0..9); @@ -675,9 +1314,7 @@ mod tests { set_one(30, 45..60); set_one(0, 60..69); - let result = syntesizer - .decode(F0_LENGTH, PHONEME_SIZE, &f0, &phoneme, StyleId::new(1)) - .await; + let result = syntesizer.decode(f0.view(), phoneme.view(), StyleId::new(1)); assert!(result.is_ok(), "{result:?}"); assert_eq!(result.unwrap().len(), F0_LENGTH * 256); @@ -862,6 +1499,65 @@ mod tests { } } + #[rstest] + #[tokio::test] + async fn accent_phrases_works_for_japanese_periods_and_commas() { + let syntesizer = Synthesizer::new( + Arc::new(OpenJtalk::new(OPEN_JTALK_DIC_DIR).unwrap()), + &InitializeOptions { + acceleration_mode: AccelerationMode::Cpu, + ..Default::default() + }, + ) + .unwrap(); + + let model = &VoiceModel::sample().await.unwrap(); + syntesizer.load_voice_model(model).await.unwrap(); + + let accent_phrases = syntesizer + .create_accent_phrases("同じ、文章、です。完全に、同一です。", StyleId::new(1)) + .await + .unwrap(); + assert_eq!(accent_phrases.len(), 5); + + // 入力テキストに「、」や「。」などの句読点が含まれていたときに + // AccentPhraseModel の pause_mora に期待する値をテスト + + assert!( + accent_phrases[0].pause_mora().is_some(), + "accent_phrases[0].pause_mora() is None" + ); + assert!( + accent_phrases[1].pause_mora().is_some(), + "accent_phrases[1].pause_mora() is None" + ); + assert!( + 
accent_phrases[2].pause_mora().is_some(), + "accent_phrases[2].pause_mora() is None" + ); + assert!( + accent_phrases[3].pause_mora().is_some(), + "accent_phrases[3].pause_mora() is None" + ); + assert!( + accent_phrases[4].pause_mora().is_none(), // 文末の句読点は削除される + "accent_phrases[4].pause_mora() is not None" + ); + + for accent_phrase in accent_phrases.iter().take(4) { + let pause_mora = accent_phrase.pause_mora().clone().unwrap(); + assert_eq!(pause_mora.text(), "、"); + assert_eq!(pause_mora.consonant(), &None); + assert_eq!(pause_mora.consonant_length(), &None); + assert_eq!(pause_mora.vowel(), "pau"); + assert_ne!( + pause_mora.vowel_length(), + &0.0, + "pause_mora.vowel_length() should not be 0.0" + ); + } + } + #[rstest] #[tokio::test] async fn mora_length_works() { diff --git a/crates/voicevox_core_c_api/Cargo.toml b/crates/voicevox_core_c_api/Cargo.toml index 0567a0c9e..dbf4df4f9 100644 --- a/crates/voicevox_core_c_api/Cargo.toml +++ b/crates/voicevox_core_c_api/Cargo.toml @@ -24,6 +24,7 @@ derive-getters.workspace = true futures.workspace = true itertools.workspace = true libc = "0.2.134" +ndarray.workspace = true once_cell.workspace = true serde_json.workspace = true thiserror.workspace = true @@ -52,7 +53,6 @@ easy-ext.workspace = true inventory = "0.3.4" libloading = "0.7.3" libtest-mimic = "0.6.0" -ndarray.workspace = true ndarray-stats = "0.5.1" regex.workspace = true serde.workspace = true diff --git a/crates/voicevox_core_c_api/src/compatible_engine.rs b/crates/voicevox_core_c_api/src/compatible_engine.rs index dd8ce8e94..bd7715386 100644 --- a/crates/voicevox_core_c_api/src/compatible_engine.rs +++ b/crates/voicevox_core_c_api/src/compatible_engine.rs @@ -1,9 +1,10 @@ -use std::{collections::BTreeMap, sync::Arc}; +use std::collections::BTreeMap; use super::*; use libc::c_int; -use voicevox_core::{OpenJtalk, StyleId, VoiceModel}; +use ndarray::ArrayView; +use voicevox_core::{StyleId, VoiceModel, __internal::interp::PerformInference as _}; macro_rules! ensure_initialized { ($synthesizer:expr $(,)?) 
=> { @@ -88,10 +89,10 @@ fn voice_model_set() -> &'static VoiceModelSet { &VOICE_MODEL_SET } -static SYNTHESIZER: Lazy>> = +static SYNTHESIZER: Lazy>>> = Lazy::new(|| Mutex::new(None)); -fn lock_synthesizer() -> MutexGuard<'static, Option> { +fn lock_synthesizer() -> MutexGuard<'static, Option>> { SYNTHESIZER.lock().unwrap() } @@ -106,7 +107,7 @@ fn set_message(message: &str) { pub extern "C" fn initialize(use_gpu: bool, cpu_num_threads: c_int, load_all_models: bool) -> bool { let result = RUNTIME.block_on(async { let synthesizer = voicevox_core::Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), &voicevox_core::InitializeOptions { acceleration_mode: if use_gpu { voicevox_core::AccelerationMode::Gpu @@ -190,20 +191,21 @@ pub extern "C" fn supported_devices() -> *const c_char { } #[no_mangle] -pub extern "C" fn yukarin_s_forward( +pub unsafe extern "C" fn yukarin_s_forward( length: i64, phoneme_list: *mut i64, speaker_id: *mut i64, output: *mut f32, ) -> bool { + let length = length as usize; let synthesizer = &*lock_synthesizer(); - let result = RUNTIME.block_on(ensure_initialized!(synthesizer).predict_duration( - unsafe { std::slice::from_raw_parts_mut(phoneme_list, length as usize) }, + let result = ensure_initialized!(synthesizer).predict_duration( + ArrayView::from_shape_ptr((length,), phoneme_list).into_owned(), StyleId::new(unsafe { *speaker_id as u32 }), - )); + ); match result { Ok(output_vec) => { - let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length as usize) }; + let output_slice = std::slice::from_raw_parts_mut(output, length); output_slice.clone_from_slice(&output_vec); true } @@ -215,7 +217,7 @@ pub extern "C" fn yukarin_s_forward( } #[no_mangle] -pub extern "C" fn yukarin_sa_forward( +pub unsafe extern "C" fn yukarin_sa_forward( length: i64, vowel_phoneme_list: *mut i64, consonant_phoneme_list: *mut i64, @@ -226,20 +228,20 @@ pub extern "C" fn yukarin_sa_forward( speaker_id: *mut i64, output: *mut f32, ) -> bool { + let length = length as usize; let synthesizer = &*lock_synthesizer(); - let result = RUNTIME.block_on(ensure_initialized!(synthesizer).predict_intonation( - length as usize, - unsafe { std::slice::from_raw_parts(vowel_phoneme_list, length as usize) }, - unsafe { std::slice::from_raw_parts(consonant_phoneme_list, length as usize) }, - unsafe { std::slice::from_raw_parts(start_accent_list, length as usize) }, - unsafe { std::slice::from_raw_parts(end_accent_list, length as usize) }, - unsafe { std::slice::from_raw_parts(start_accent_phrase_list, length as usize) }, - unsafe { std::slice::from_raw_parts(end_accent_phrase_list, length as usize) }, + let result = ensure_initialized!(synthesizer).predict_intonation( + ArrayView::from_shape_ptr((length,), vowel_phoneme_list).into_owned(), + ArrayView::from_shape_ptr((length,), consonant_phoneme_list).into_owned(), + ArrayView::from_shape_ptr((length,), start_accent_list).into_owned(), + ArrayView::from_shape_ptr((length,), end_accent_list).into_owned(), + ArrayView::from_shape_ptr((length,), start_accent_phrase_list).into_owned(), + ArrayView::from_shape_ptr((length,), end_accent_phrase_list).into_owned(), StyleId::new(unsafe { *speaker_id as u32 }), - )); + ); match result { Ok(output_vec) => { - let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length as usize) }; + let output_slice = std::slice::from_raw_parts_mut(output, length); output_slice.clone_from_slice(&output_vec); true } @@ -251,7 +253,7 @@ pub extern "C" fn yukarin_sa_forward( } #[no_mangle] -pub extern 
"C" fn decode_forward( +pub unsafe extern "C" fn decode_forward( length: i64, phoneme_size: i64, f0: *mut f32, @@ -262,16 +264,14 @@ pub extern "C" fn decode_forward( let length = length as usize; let phoneme_size = phoneme_size as usize; let synthesizer = &*lock_synthesizer(); - let result = RUNTIME.block_on(ensure_initialized!(synthesizer).decode( - length, - phoneme_size, - unsafe { std::slice::from_raw_parts(f0, length) }, - unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) }, + let result = ensure_initialized!(synthesizer).decode( + ArrayView::from_shape_ptr((length,), f0), + ArrayView::from_shape_ptr((length, phoneme_size), phoneme), StyleId::new(unsafe { *speaker_id as u32 }), - )); + ); match result { Ok(output_vec) => { - let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length * 256) }; + let output_slice = std::slice::from_raw_parts_mut(output, length * 256); output_slice.clone_from_slice(&output_vec); true } diff --git a/crates/voicevox_core_c_api/src/helpers.rs b/crates/voicevox_core_c_api/src/helpers.rs index 698e89b45..9c582251f 100644 --- a/crates/voicevox_core_c_api/src/helpers.rs +++ b/crates/voicevox_core_c_api/src/helpers.rs @@ -29,7 +29,7 @@ pub(crate) fn into_result_code_with_error(result: CApiResult<()>) -> VoicevoxRes match result { Ok(()) => VOICEVOX_RESULT_OK, Err(RustApi(err)) => match err.kind() { - NotLoadedOpenjtalkDict => VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR, + LoadOpenjtalkSystemDic => VOICEVOX_RESULT_LOAD_OPENJTALK_SYSTEM_DIC, GpuSupport => VOICEVOX_RESULT_GPU_SUPPORT_ERROR, OpenZipFile => VOICEVOX_RESULT_OPEN_ZIP_FILE_ERROR, ReadZipEntry => VOICEVOX_RESULT_READ_ZIP_ENTRY_ERROR, diff --git a/crates/voicevox_core_c_api/src/lib.rs b/crates/voicevox_core_c_api/src/lib.rs index 302089a95..0e24b6d12 100644 --- a/crates/voicevox_core_c_api/src/lib.rs +++ b/crates/voicevox_core_c_api/src/lib.rs @@ -319,7 +319,7 @@ pub extern "C" fn voicevox_voice_model_delete(model: Box) { /// 構築(_construction_)は ::voicevox_synthesizer_new で行い、破棄(_destruction_)は ::voicevox_synthesizer_delete で行う。 #[derive(Getters)] pub struct VoicevoxSynthesizer { - synthesizer: Synthesizer, + synthesizer: Synthesizer>, } /// ::VoicevoxSynthesizer を構築(_construct_)する。 diff --git a/crates/voicevox_core_c_api/src/result_code.rs b/crates/voicevox_core_c_api/src/result_code.rs index 65236ada4..d77ffc405 100644 --- a/crates/voicevox_core_c_api/src/result_code.rs +++ b/crates/voicevox_core_c_api/src/result_code.rs @@ -11,8 +11,8 @@ pub enum VoicevoxResultCode { // 出力フォーマットを変更すればRustでよく使われているUpperCamelにできるが、実際に出力されるコードとの差異をできるだけ少なくするため /// 成功 VOICEVOX_RESULT_OK = 0, - /// open_jtalk辞書ファイルが読み込まれていない - VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR = 1, + /// Open JTalkのシステム辞書を読むことができなかった + VOICEVOX_RESULT_LOAD_OPENJTALK_SYSTEM_DIC = 1, /// サポートされているデバイス情報取得に失敗した VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR = 3, /// GPUモードがサポートされていない @@ -60,8 +60,8 @@ pub enum VoicevoxResultCode { pub(crate) const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static CStr { use VoicevoxResultCode::*; match result_code { - VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR => { - cstr!("OpenJTalkの辞書が読み込まれていません") + VOICEVOX_RESULT_LOAD_OPENJTALK_SYSTEM_DIC => { + cstr!("Open JTalkのシステム辞書を読むことができませんでした") } VOICEVOX_RESULT_GPU_SUPPORT_ERROR => cstr!("GPU機能をサポートすることができません"), VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR => { diff --git a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml index 8f3fa4f3b..b8792c29c 
100644 --- a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml +++ b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml @@ -50,7 +50,7 @@ stderr = "" [global_info] result_messages.0 = "エラーが発生しませんでした" -result_messages.1 = "OpenJTalkの辞書が読み込まれていません" +result_messages.1 = "Open JTalkのシステム辞書を読むことができませんでした" result_messages.3 = "サポートされているデバイス情報取得中にエラーが発生しました" result_messages.4 = "GPU機能をサポートすることができません" result_messages.6 = "指定されたIDに対するスタイルが見つかりませんでした。音声モデルが読み込まれていないか、読み込みが解除されています" diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/LoadOpenjtalkSystemDicException.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/LoadOpenjtalkSystemDicException.java new file mode 100644 index 000000000..ec1b92871 --- /dev/null +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/LoadOpenjtalkSystemDicException.java @@ -0,0 +1,12 @@ +package jp.hiroshiba.voicevoxcore.exceptions; + +/** Open JTalkのシステム辞書を読むことができなかった。 */ +public class LoadOpenjtalkSystemDicException extends IllegalStateException { + public LoadOpenjtalkSystemDicException(String message) { + super(message); + } + + public LoadOpenjtalkSystemDicException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/NotLoadedOpenjtalkDictException.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/NotLoadedOpenjtalkDictException.java deleted file mode 100644 index 3bee93b08..000000000 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/NotLoadedOpenjtalkDictException.java +++ /dev/null @@ -1,12 +0,0 @@ -package jp.hiroshiba.voicevoxcore.exceptions; - -/** open_jtalk辞書ファイルが読み込まれていない。 */ -public class NotLoadedOpenjtalkDictException extends IllegalStateException { - public NotLoadedOpenjtalkDictException(String message) { - super(message); - } - - public NotLoadedOpenjtalkDictException(String message, Throwable cause) { - super(message, cause); - } -} diff --git a/crates/voicevox_core_java_api/src/common.rs b/crates/voicevox_core_java_api/src/common.rs index 138676a1d..a135387dc 100644 --- a/crates/voicevox_core_java_api/src/common.rs +++ b/crates/voicevox_core_java_api/src/common.rs @@ -122,7 +122,7 @@ where } let class = class!( - NotLoadedOpenjtalkDict, + LoadOpenjtalkSystemDic, GpuSupport, OpenZipFile, ReadZipEntry, diff --git a/crates/voicevox_core_java_api/src/synthesizer.rs b/crates/voicevox_core_java_api/src/synthesizer.rs index 9d3cb9f9f..bb48f3c01 100644 --- a/crates/voicevox_core_java_api/src/synthesizer.rs +++ b/crates/voicevox_core_java_api/src/synthesizer.rs @@ -9,6 +9,7 @@ use jni::{ JNIEnv, }; use std::sync::Arc; +use voicevox_core::OpenJtalk; #[no_mangle] unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsNew<'local>( @@ -64,7 +65,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsIsGpuMode ) -> jboolean { throw_if_err(env, false, |env| { let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); Ok(internal.is_gpu_mode()) @@ -78,7 +81,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsGetMetasJ ) -> jobject { throw_if_err(env, std::ptr::null_mut(), |env| { let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? 
.clone(); let metas_json = serde_json::to_string(&internal.metas()).expect("should not fail"); @@ -100,7 +105,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsLoadVoice .get_rust_field::<_, _, Arc>(&model, "handle")? .clone(); let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); RUNTIME.block_on(internal.load_voice_model(&model))?; Ok(()) @@ -117,7 +124,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsUnloadVoi let model_id: String = env.get_string(&model_id)?.into(); let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); internal.unload_voice_model(&voicevox_core::VoiceModelId::new(model_id))?; @@ -138,7 +147,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsIsLoadedV let model_id: String = env.get_string(&model_id)?.into(); let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let is_loaded = internal.is_loaded_voice_model(&voicevox_core::VoiceModelId::new(model_id)); @@ -162,7 +173,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAudioQuer let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let audio_query = RUNTIME.block_on( @@ -189,7 +202,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAudioQuer let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let audio_query = @@ -217,7 +232,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAccentPhr let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let accent_phrases = RUNTIME.block_on( @@ -244,7 +261,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAccentPhr let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let accent_phrases = RUNTIME.block_on( @@ -273,7 +292,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplaceMo let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let replaced_accent_phrases = RUNTIME.block_on( @@ -303,7 +324,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplacePh let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let replaced_accent_phrases = { @@ -334,7 +357,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplaceMo let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let replaced_accent_phrases = RUNTIME.block_on( @@ -363,7 +388,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsSynthesis let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? 
+ .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let wave = { @@ -397,7 +424,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsTtsFromKa let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let wave = { @@ -431,7 +460,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsTts<'loca let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let wave = { diff --git a/crates/voicevox_core_python_api/python/voicevox_core/__init__.py b/crates/voicevox_core_python_api/python/voicevox_core/__init__.py index fc09808bd..d1c0963ac 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/__init__.py +++ b/crates/voicevox_core_python_api/python/voicevox_core/__init__.py @@ -21,10 +21,10 @@ InferenceFailedError, InvalidModelDataError, InvalidWordError, + LoadOpenjtalkSystemDicError, LoadUserDictError, ModelAlreadyLoadedError, ModelNotFoundError, - NotLoadedOpenjtalkDictError, OpenJtalk, OpenZipFileError, ParseKanaError, @@ -52,11 +52,11 @@ "InferenceFailedError", "InvalidModelDataError", "InvalidWordError", + "LoadOpenjtalkSystemDicError", "LoadUserDictError", "ModelAlreadyLoadedError", "ModelNotFoundError", "Mora", - "NotLoadedOpenjtalkDictError", "OpenJtalk", "OpenZipFileError", "ParseKanaError", diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_rust.pyi b/crates/voicevox_core_python_api/python/voicevox_core/_rust.pyi index 5288fcbde..4df0ae9be 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/_rust.pyi +++ b/crates/voicevox_core_python_api/python/voicevox_core/_rust.pyi @@ -423,8 +423,8 @@ class UserDict: """ ... -class NotLoadedOpenjtalkDictError(Exception): - """open_jtalk辞書ファイルが読み込まれていない。""" +class LoadOpenjtalkSystemDicError(Exception): + """Open JTalkのシステム辞書を読むことができなかった。""" ... 
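
For reviewers who want to exercise the new surface by hand, a minimal sketch combining the two behavioural changes in this patch: the text-analyzer-less `Synthesizer<()>` with its synchronous, ndarray-based inference entry points (as used by the updated tests and `compatible_engine.rs`), and the error kind renamed from `NotLoadedOpenjtalkDict` to `LoadOpenjtalkSystemDic`. This is only a sketch, not an official example: it assumes a tokio runtime and `anyhow` for brevity, a hypothetical model path, that `VoiceModel::from_path` and `ErrorKind` are re-exported from the crate root, and that the `__internal::interp::PerformInference` trait (which the compatible engine imports) is acceptable to use in wrapper-level code.

use ndarray::array;
use voicevox_core::{
    AccelerationMode, ErrorKind, InitializeOptions, OpenJtalk, StyleId, Synthesizer, VoiceModel,
    __internal::interp::PerformInference as _,
};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // `()` now takes the place of the former `OpenJtalk::new_without_dic()`:
    // no text analysis, but the raw inference entry points remain usable.
    let synthesizer = Synthesizer::new(
        (),
        &InitializeOptions {
            acceleration_mode: AccelerationMode::Cpu,
            ..Default::default()
        },
    )?;
    let model = VoiceModel::from_path("path/to/model.vvm").await?; // hypothetical path
    synthesizer.load_voice_model(&model).await?;

    // `predict_duration` takes an `ndarray::Array1<i64>` instead of a slice and runs
    // synchronously; the output contains one duration per input phoneme ID.
    let phoneme_ids = array![0, 23, 30, 4, 28, 21, 0]; // illustrative phoneme IDs
    let durations = synthesizer.predict_duration(phoneme_ids, StyleId::new(1))?;
    assert_eq!(durations.len(), 7);

    // A failed system-dictionary load is now reported as `LoadOpenjtalkSystemDic`
    // (previously `NotLoadedOpenjtalkDict`), matching the renamed bindings above.
    if let Err(err) = OpenJtalk::new("path/to/missing/open_jtalk_dic") {
        match err.kind() {
            ErrorKind::LoadOpenjtalkSystemDic => {
                eprintln!("Open JTalk system dictionary could not be read: {err}");
            }
            _ => eprintln!("unexpected error: {err}"),
        }
    }
    Ok(())
}
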
diff --git a/crates/voicevox_core_python_api/src/convert.rs b/crates/voicevox_core_python_api/src/convert.rs index 7e8d437ad..3dc5116a7 100644 --- a/crates/voicevox_core_python_api/src/convert.rs +++ b/crates/voicevox_core_python_api/src/convert.rs @@ -15,8 +15,8 @@ use voicevox_core::{ use crate::{ ExtractFullContextLabelError, GetSupportedDevicesError, GpuSupportError, InferenceFailedError, - InvalidModelDataError, InvalidWordError, LoadUserDictError, ModelAlreadyLoadedError, - ModelNotFoundError, NotLoadedOpenjtalkDictError, OpenZipFileError, ParseKanaError, + InvalidModelDataError, InvalidWordError, LoadOpenjtalkSystemDicError, LoadUserDictError, + ModelAlreadyLoadedError, ModelNotFoundError, OpenZipFileError, ParseKanaError, ReadZipEntryError, SaveUserDictError, StyleAlreadyLoadedError, StyleNotFoundError, UseUserDictError, WordNotFoundError, }; @@ -158,7 +158,7 @@ pub impl voicevox_core::Result { self.map_err(|err| { let msg = err.to_string(); let top = match err.kind() { - ErrorKind::NotLoadedOpenjtalkDict => NotLoadedOpenjtalkDictError::new_err(msg), + ErrorKind::LoadOpenjtalkSystemDic => LoadOpenjtalkSystemDicError::new_err(msg), ErrorKind::GpuSupport => GpuSupportError::new_err(msg), ErrorKind::OpenZipFile => OpenZipFileError::new_err(msg), ErrorKind::ReadZipEntry => ReadZipEntryError::new_err(msg), diff --git a/crates/voicevox_core_python_api/src/lib.rs b/crates/voicevox_core_python_api/src/lib.rs index 1531b463e..f595474da 100644 --- a/crates/voicevox_core_python_api/src/lib.rs +++ b/crates/voicevox_core_python_api/src/lib.rs @@ -50,7 +50,7 @@ macro_rules! exceptions { } exceptions! { - NotLoadedOpenjtalkDictError: PyException; + LoadOpenjtalkSystemDicError: PyException; GpuSupportError: PyException; OpenZipFileError: PyException; ReadZipEntryError: PyException; @@ -140,7 +140,7 @@ impl OpenJtalk { #[pyclass] struct Synthesizer { - synthesizer: Closable, Self>, + synthesizer: Closable>>, Self>, } #[pymethods]