From 04b378752d9a00e7978fea5015d4a5c2e3e2e9c4 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 19 Nov 2023 00:15:48 +0900 Subject: [PATCH] =?UTF-8?q?`Synthesizer`=E3=81=AE=E6=A7=8B=E9=80=A0?= =?UTF-8?q?=E6=94=B9=E9=9D=A9=E3=82=92=E3=81=99=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/__internal.rs | 1 + .../src/__internal/doctest_fixtures.rs | 2 +- crates/voicevox_core/src/__internal/interp.rs | 46 + .../src/engine/acoustic_feature_extractor.rs | 10 +- crates/voicevox_core/src/engine/mod.rs | 3 +- crates/voicevox_core/src/engine/open_jtalk.rs | 75 +- .../src/engine/synthesis_engine.rs | 698 ----------- crates/voicevox_core/src/error.rs | 10 +- crates/voicevox_core/src/infer/status.rs | 15 +- crates/voicevox_core/src/inference_core.rs | 163 +-- crates/voicevox_core/src/synthesizer.rs | 1038 ++++++++++++++--- crates/voicevox_core_c_api/Cargo.toml | 2 +- .../src/compatible_engine.rs | 58 +- crates/voicevox_core_c_api/src/helpers.rs | 2 +- crates/voicevox_core_c_api/src/lib.rs | 2 +- crates/voicevox_core_c_api/src/result_code.rs | 8 +- .../tests/e2e/snapshots.toml | 2 +- .../LoadOpenjtalkSystemDicException.java | 12 + .../NotLoadedOpenjtalkDictException.java | 12 - crates/voicevox_core_java_api/src/common.rs | 2 +- .../voicevox_core_java_api/src/synthesizer.rs | 61 +- .../python/voicevox_core/__init__.py | 4 +- .../python/voicevox_core/_rust.pyi | 4 +- .../voicevox_core_python_api/src/convert.rs | 6 +- crates/voicevox_core_python_api/src/lib.rs | 4 +- 25 files changed, 1153 insertions(+), 1087 deletions(-) create mode 100644 crates/voicevox_core/src/__internal/interp.rs delete mode 100644 crates/voicevox_core/src/engine/synthesis_engine.rs create mode 100644 crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/LoadOpenjtalkSystemDicException.java delete mode 100644 crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/NotLoadedOpenjtalkDictException.java diff --git a/crates/voicevox_core/src/__internal.rs b/crates/voicevox_core/src/__internal.rs index ff9f5ce3c..b6affb0f2 100644 --- a/crates/voicevox_core/src/__internal.rs +++ b/crates/voicevox_core/src/__internal.rs @@ -1,4 +1,5 @@ pub mod doctest_fixtures; +pub mod interp; // VOICEVOX CORE内のラッパー向けの実装 // FIXME: 要議論: https://github.com/VOICEVOX/voicevox_core/issues/595 diff --git a/crates/voicevox_core/src/__internal/doctest_fixtures.rs b/crates/voicevox_core/src/__internal/doctest_fixtures.rs index 9df517720..1915d3d37 100644 --- a/crates/voicevox_core/src/__internal/doctest_fixtures.rs +++ b/crates/voicevox_core/src/__internal/doctest_fixtures.rs @@ -4,7 +4,7 @@ use crate::{AccelerationMode, InitializeOptions, OpenJtalk, Synthesizer, VoiceMo pub async fn synthesizer_with_sample_voice_model( open_jtalk_dic_dir: impl AsRef, -) -> anyhow::Result { +) -> anyhow::Result>> { let syntesizer = Synthesizer::new( Arc::new(OpenJtalk::new(open_jtalk_dic_dir).unwrap()), &InitializeOptions { diff --git a/crates/voicevox_core/src/__internal/interp.rs b/crates/voicevox_core/src/__internal/interp.rs new file mode 100644 index 000000000..4afb4242d --- /dev/null +++ b/crates/voicevox_core/src/__internal/interp.rs @@ -0,0 +1,46 @@ +use easy_ext::ext; +use ndarray::{Array1, ArrayView1, ArrayView2}; + +use crate::{StyleId, Synthesizer}; + +#[ext(PerformInference)] +impl Synthesizer<()> { + pub fn predict_duration( + &self, + phoneme_list: Array1, + style_id: StyleId, + ) -> crate::Result> { + 
self.predict_duration(phoneme_list, style_id) + } + + #[allow(clippy::too_many_arguments)] + pub fn predict_intonation( + &self, + vowel_phoneme_list: Array1, + consonant_phoneme_list: Array1, + start_accent_list: Array1, + end_accent_list: Array1, + start_accent_phrase_list: Array1, + end_accent_phrase_list: Array1, + style_id: StyleId, + ) -> crate::Result> { + self.predict_intonation( + vowel_phoneme_list, + consonant_phoneme_list, + start_accent_list, + end_accent_list, + start_accent_phrase_list, + end_accent_phrase_list, + style_id, + ) + } + + pub fn decode( + &self, + f0: ArrayView1<'_, f32>, + phoneme: ArrayView2<'_, f32>, + style_id: StyleId, + ) -> crate::Result> { + self.decode(f0, phoneme, style_id) + } +} diff --git a/crates/voicevox_core/src/engine/acoustic_feature_extractor.rs b/crates/voicevox_core/src/engine/acoustic_feature_extractor.rs index 05cdb2d31..5078f34b5 100644 --- a/crates/voicevox_core/src/engine/acoustic_feature_extractor.rs +++ b/crates/voicevox_core/src/engine/acoustic_feature_extractor.rs @@ -4,7 +4,7 @@ use once_cell::sync::Lazy; use std::collections::HashMap; #[rustfmt::skip] -const PHONEME_LIST: &[&str] = &[ +const PHONEME_LIST: [&str; 45] = [ "pau", "A", "E", @@ -70,9 +70,7 @@ pub struct OjtPhoneme { } impl OjtPhoneme { - pub fn num_phoneme() -> usize { - PHONEME_MAP.len() - } + pub(crate) const NUM_PHONEME: usize = PHONEME_LIST.len(); pub fn space_phoneme() -> String { "pau".into() @@ -134,8 +132,8 @@ mod tests { } #[rstest] - fn test_num_phoneme_works() { - assert_eq!(OjtPhoneme::num_phoneme(), 45); + fn test_phoneme_map_has_enough_elements() { + assert_eq!(OjtPhoneme::NUM_PHONEME, PHONEME_MAP.len()); } #[rstest] diff --git a/crates/voicevox_core/src/engine/mod.rs b/crates/voicevox_core/src/engine/mod.rs index 0e02839dc..a0a073bbf 100644 --- a/crates/voicevox_core/src/engine/mod.rs +++ b/crates/voicevox_core/src/engine/mod.rs @@ -4,7 +4,6 @@ mod kana_parser; mod model; mod mora_list; mod open_jtalk; -mod synthesis_engine; use super::*; @@ -12,5 +11,5 @@ pub use self::acoustic_feature_extractor::*; pub use self::full_context_label::*; pub use self::kana_parser::*; pub use self::model::*; +pub(crate) use self::mora_list::mora2text; pub use self::open_jtalk::OpenJtalk; -pub use self::synthesis_engine::*; diff --git a/crates/voicevox_core/src/engine/open_jtalk.rs b/crates/voicevox_core/src/engine/open_jtalk.rs index f74d4130d..327cb634c 100644 --- a/crates/voicevox_core/src/engine/open_jtalk.rs +++ b/crates/voicevox_core/src/engine/open_jtalk.rs @@ -1,8 +1,5 @@ use std::io::Write; -use std::{ - path::{Path, PathBuf}, - sync::Mutex, -}; +use std::{path::Path, sync::Mutex}; use anyhow::anyhow; use tempfile::NamedTempFile; @@ -22,7 +19,7 @@ pub(crate) struct OpenjtalkFunctionError { /// テキスト解析器としてのOpen JTalk。 pub struct OpenJtalk { resources: Mutex, - dict_dir: Option, + dict_dir: String, } struct Resources { @@ -35,24 +32,27 @@ struct Resources { unsafe impl Send for Resources {} impl OpenJtalk { - // FIXME: この関数は廃止し、`Synthesizer`は`Option`という形でこの構造体を持つ - pub fn new_without_dic() -> Self { - Self { - resources: Mutex::new(Resources { - mecab: ManagedResource::initialize(), - njd: ManagedResource::initialize(), - jpcommon: ManagedResource::initialize(), - }), - dict_dir: None, - } - } pub fn new(open_jtalk_dict_dir: impl AsRef) -> crate::result::Result { - let mut s = Self::new_without_dic(); - s.load(open_jtalk_dict_dir).map_err(|()| { - // FIXME: 「システム辞書を読もうとしたけど読めなかった」というエラーをちゃんと用意する - ErrorRepr::NotLoadedOpenjtalkDict - })?; - Ok(s) + let mut resources = 
Resources { + mecab: ManagedResource::initialize(), + njd: ManagedResource::initialize(), + jpcommon: ManagedResource::initialize(), + }; + let dict_dir = open_jtalk_dict_dir + .as_ref() + .to_str() + .unwrap_or_else(|| todo!("Rust APIでは`Utf8Path`で受けるようにする")) + .to_owned(); + + let result = resources.mecab.load(&dict_dir); + if !result { + return Err(ErrorRepr::LoadOpenjtalkSystemDic(dict_dir).into()); + } + + Ok(Self { + resources: resources.into(), + dict_dir, + }) } // 先に`load`を呼ぶ必要がある。 @@ -60,12 +60,6 @@ impl OpenJtalk { /// /// この関数を呼び出した後にユーザー辞書を変更した場合は、再度この関数を呼ぶ必要がある。 pub fn use_user_dict(&self, user_dict: &UserDict) -> crate::result::Result<()> { - let dict_dir = self - .dict_dir - .as_ref() - .and_then(|dict_dir| dict_dir.to_str()) - .ok_or(ErrorRepr::NotLoadedOpenjtalkDict)?; - // ユーザー辞書用のcsvを作成 let mut temp_csv = NamedTempFile::new().map_err(|e| ErrorRepr::UseUserDict(e.into()))?; temp_csv @@ -80,7 +74,7 @@ impl OpenJtalk { mecab_dict_index(&[ "mecab-dict-index", "-d", - dict_dir, + &self.dict_dir, "-u", temp_dict_path.to_str().unwrap(), "-f", @@ -93,7 +87,8 @@ impl OpenJtalk { let Resources { mecab, .. } = &mut *self.resources.lock().unwrap(); - let result = mecab.load_with_userdic(Path::new(dict_dir), Some(Path::new(&temp_dict_path))); + let result = + mecab.load_with_userdic(self.dict_dir.as_ref(), Some(Path::new(&temp_dict_path))); if !result { return Err(ErrorRepr::UseUserDict(anyhow!("辞書のコンパイルに失敗しました")).into()); @@ -150,26 +145,6 @@ impl OpenJtalk { }) } } - - fn load(&mut self, open_jtalk_dict_dir: impl AsRef) -> std::result::Result<(), ()> { - let result = self - .resources - .lock() - .unwrap() - .mecab - .load(open_jtalk_dict_dir.as_ref()); - if result { - self.dict_dir = Some(open_jtalk_dict_dir.as_ref().into()); - Ok(()) - } else { - self.dict_dir = None; - Err(()) - } - } - - pub fn dict_loaded(&self) -> bool { - self.dict_dir.is_some() - } } #[cfg(test)] diff --git a/crates/voicevox_core/src/engine/synthesis_engine.rs b/crates/voicevox_core/src/engine/synthesis_engine.rs deleted file mode 100644 index c70742f16..000000000 --- a/crates/voicevox_core/src/engine/synthesis_engine.rs +++ /dev/null @@ -1,698 +0,0 @@ -use derive_new::new; -use std::io::{Cursor, Write}; -use std::sync::Arc; - -use super::full_context_label::Utterance; -use super::open_jtalk::OpenJtalk; -use super::*; -use crate::infer::InferenceRuntime; -use crate::numerics::F32Ext as _; -use crate::InferenceCore; - -const UNVOICED_MORA_PHONEME_LIST: &[&str] = &["A", "I", "U", "E", "O", "cl", "pau"]; - -const MORA_PHONEME_LIST: &[&str] = &[ - "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau", -]; - -pub const DEFAULT_SAMPLING_RATE: u32 = 24000; - -#[derive(new)] -pub(crate) struct SynthesisEngine { - inference_core: InferenceCore, - open_jtalk: Arc, -} - -impl SynthesisEngine { - pub fn inference_core(&self) -> &InferenceCore { - &self.inference_core - } - - pub async fn create_accent_phrases( - &self, - text: &str, - style_id: StyleId, - ) -> Result> { - if text.is_empty() { - return Ok(Vec::new()); - } - - let utterance = Utterance::extract_full_context_label(&self.open_jtalk, text)?; - - let accent_phrases: Vec = utterance - .breath_groups() - .iter() - .enumerate() - .fold(Vec::new(), |mut accum_vec, (i, breath_group)| { - accum_vec.extend(breath_group.accent_phrases().iter().enumerate().map( - |(j, accent_phrase)| { - let moras = accent_phrase - .moras() - .iter() - .map(|mora| { - let mora_text = mora - .phonemes() - .iter() - .map(|phoneme| phoneme.phoneme().to_string()) - 
.collect::>() - .join(""); - - let (consonant, consonant_length) = - if let Some(consonant) = mora.consonant() { - (Some(consonant.phoneme().to_string()), Some(0.)) - } else { - (None, None) - }; - - MoraModel::new( - mora_to_text(mora_text), - consonant, - consonant_length, - mora.vowel().phoneme().into(), - 0., - 0., - ) - }) - .collect(); - - let pause_mora = if i != utterance.breath_groups().len() - 1 - && j == breath_group.accent_phrases().len() - 1 - { - Some(MoraModel::new( - "、".into(), - None, - None, - "pau".into(), - 0., - 0., - )) - } else { - None - }; - - AccentPhraseModel::new( - moras, - *accent_phrase.accent(), - pause_mora, - *accent_phrase.is_interrogative(), - ) - }, - )); - - accum_vec - }); - - self.replace_mora_data(&accent_phrases, style_id).await - } - - pub async fn replace_mora_data( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - let accent_phrases = self - .replace_phoneme_length(accent_phrases, style_id) - .await?; - self.replace_mora_pitch(&accent_phrases, style_id).await - } - - pub async fn replace_phoneme_length( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - let (_, phoneme_data_list) = Self::initial_process(accent_phrases); - - let (_, _, vowel_indexes_data) = split_mora(&phoneme_data_list); - - let phoneme_list_s: Vec = phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - let phoneme_length = self - .inference_core() - .predict_duration(&phoneme_list_s, style_id) - .await?; - - let mut index = 0; - let new_accent_phrases = accent_phrases - .iter() - .map(|accent_phrase| { - AccentPhraseModel::new( - accent_phrase - .moras() - .iter() - .map(|mora| { - let new_mora = MoraModel::new( - mora.text().clone(), - mora.consonant().clone(), - mora.consonant().as_ref().map(|_| { - phoneme_length[vowel_indexes_data[index + 1] as usize - 1] - }), - mora.vowel().clone(), - phoneme_length[vowel_indexes_data[index + 1] as usize], - *mora.pitch(), - ); - index += 1; - new_mora - }) - .collect(), - *accent_phrase.accent(), - accent_phrase.pause_mora().as_ref().map(|pause_mora| { - let new_pause_mora = MoraModel::new( - pause_mora.text().clone(), - pause_mora.consonant().clone(), - *pause_mora.consonant_length(), - pause_mora.vowel().clone(), - phoneme_length[vowel_indexes_data[index + 1] as usize], - *pause_mora.pitch(), - ); - index += 1; - new_pause_mora - }), - *accent_phrase.is_interrogative(), - ) - }) - .collect(); - - Ok(new_accent_phrases) - } - - pub async fn replace_mora_pitch( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - let (_, phoneme_data_list) = Self::initial_process(accent_phrases); - - let mut base_start_accent_list = vec![0]; - let mut base_end_accent_list = vec![0]; - let mut base_start_accent_phrase_list = vec![0]; - let mut base_end_accent_phrase_list = vec![0]; - for accent_phrase in accent_phrases { - let mut accent = usize::from(*accent_phrase.accent() != 1); - Self::create_one_accent_list(&mut base_start_accent_list, accent_phrase, accent as i32); - - accent = *accent_phrase.accent() - 1; - Self::create_one_accent_list(&mut base_end_accent_list, accent_phrase, accent as i32); - Self::create_one_accent_list(&mut base_start_accent_phrase_list, accent_phrase, 0); - Self::create_one_accent_list(&mut base_end_accent_phrase_list, accent_phrase, -1); - } - base_start_accent_list.push(0); - base_end_accent_list.push(0); - base_start_accent_phrase_list.push(0); - 
base_end_accent_phrase_list.push(0); - - let (consonant_phoneme_data_list, vowel_phoneme_data_list, vowel_indexes) = - split_mora(&phoneme_data_list); - - let consonant_phoneme_list: Vec = consonant_phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - let vowel_phoneme_list: Vec = vowel_phoneme_data_list - .iter() - .map(|phoneme_data| phoneme_data.phoneme_id()) - .collect(); - - let mut start_accent_list = Vec::with_capacity(vowel_indexes.len()); - let mut end_accent_list = Vec::with_capacity(vowel_indexes.len()); - let mut start_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); - let mut end_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); - - for vowel_index in vowel_indexes { - start_accent_list.push(base_start_accent_list[vowel_index as usize]); - end_accent_list.push(base_end_accent_list[vowel_index as usize]); - start_accent_phrase_list.push(base_start_accent_phrase_list[vowel_index as usize]); - end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]); - } - - let mut f0_list = self - .inference_core() - .predict_intonation( - vowel_phoneme_list.len(), - &vowel_phoneme_list, - &consonant_phoneme_list, - &start_accent_list, - &end_accent_list, - &start_accent_phrase_list, - &end_accent_phrase_list, - style_id, - ) - .await?; - - for i in 0..vowel_phoneme_data_list.len() { - if UNVOICED_MORA_PHONEME_LIST - .iter() - .any(|phoneme| *phoneme == vowel_phoneme_data_list[i].phoneme()) - { - f0_list[i] = 0.; - } - } - - let mut index = 0; - let new_accent_phrases = accent_phrases - .iter() - .map(|accent_phrase| { - AccentPhraseModel::new( - accent_phrase - .moras() - .iter() - .map(|mora| { - let new_mora = MoraModel::new( - mora.text().clone(), - mora.consonant().clone(), - *mora.consonant_length(), - mora.vowel().clone(), - *mora.vowel_length(), - f0_list[index + 1], - ); - index += 1; - new_mora - }) - .collect(), - *accent_phrase.accent(), - accent_phrase.pause_mora().as_ref().map(|pause_mora| { - let new_pause_mora = MoraModel::new( - pause_mora.text().clone(), - pause_mora.consonant().clone(), - *pause_mora.consonant_length(), - pause_mora.vowel().clone(), - *pause_mora.vowel_length(), - f0_list[index + 1], - ); - index += 1; - new_pause_mora - }), - *accent_phrase.is_interrogative(), - ) - }) - .collect(); - - Ok(new_accent_phrases) - } - - pub async fn synthesis( - &self, - query: &AudioQueryModel, - style_id: StyleId, - enable_interrogative_upspeak: bool, - ) -> Result> { - let speed_scale = *query.speed_scale(); - let pitch_scale = *query.pitch_scale(); - let intonation_scale = *query.intonation_scale(); - let pre_phoneme_length = *query.pre_phoneme_length(); - let post_phoneme_length = *query.post_phoneme_length(); - - let accent_phrases = if enable_interrogative_upspeak { - adjust_interrogative_accent_phrases(query.accent_phrases().as_slice()) - } else { - query.accent_phrases().clone() - }; - - let (flatten_moras, phoneme_data_list) = Self::initial_process(&accent_phrases); - - let mut phoneme_length_list = vec![pre_phoneme_length]; - let mut f0_list = vec![0.]; - let mut voiced_list = vec![false]; - { - let mut sum_of_f0_bigger_than_zero = 0.; - let mut count_of_f0_bigger_than_zero = 0; - - for mora in flatten_moras { - let consonant_length = *mora.consonant_length(); - let vowel_length = *mora.vowel_length(); - let pitch = *mora.pitch(); - - if let Some(consonant_length) = consonant_length { - phoneme_length_list.push(consonant_length); - } - phoneme_length_list.push(vowel_length); - - 
let f0_single = pitch * 2.0_f32.powf(pitch_scale); - f0_list.push(f0_single); - - let bigger_than_zero = f0_single > 0.; - voiced_list.push(bigger_than_zero); - - if bigger_than_zero { - sum_of_f0_bigger_than_zero += f0_single; - count_of_f0_bigger_than_zero += 1; - } - } - phoneme_length_list.push(post_phoneme_length); - f0_list.push(0.); - voiced_list.push(false); - let mean_f0 = sum_of_f0_bigger_than_zero / (count_of_f0_bigger_than_zero as f32); - - if !mean_f0.is_nan() { - for i in 0..f0_list.len() { - if voiced_list[i] { - f0_list[i] = (f0_list[i] - mean_f0) * intonation_scale + mean_f0; - } - } - } - } - - let (_, _, vowel_indexes) = split_mora(&phoneme_data_list); - - let mut phoneme: Vec> = Vec::new(); - let mut f0: Vec = Vec::new(); - { - const RATE: f32 = 24000. / 256.; - let mut sum_of_phoneme_length = 0; - let mut count_of_f0 = 0; - let mut vowel_indexes_index = 0; - - for (i, phoneme_length) in phoneme_length_list.iter().enumerate() { - // VOICEVOX ENGINEと挙動を合わせるため、四捨五入ではなく偶数丸めをする - // - // https://github.com/VOICEVOX/voicevox_engine/issues/552 - let phoneme_length = ((*phoneme_length * RATE).round_ties_even_() / speed_scale) - .round_ties_even_() as usize; - let phoneme_id = phoneme_data_list[i].phoneme_id(); - - for _ in 0..phoneme_length { - let mut phonemes_vec = vec![0.; OjtPhoneme::num_phoneme()]; - phonemes_vec[phoneme_id as usize] = 1.; - phoneme.push(phonemes_vec) - } - sum_of_phoneme_length += phoneme_length; - - if i as i64 == vowel_indexes[vowel_indexes_index] { - for _ in 0..sum_of_phoneme_length { - f0.push(f0_list[count_of_f0]); - } - count_of_f0 += 1; - sum_of_phoneme_length = 0; - vowel_indexes_index += 1; - } - } - } - - // 2次元のvectorを1次元に変換し、アドレスを連続させる - let flatten_phoneme = phoneme.into_iter().flatten().collect::>(); - - self.inference_core() - .decode( - f0.len(), - OjtPhoneme::num_phoneme(), - &f0, - &flatten_phoneme, - style_id, - ) - .await - } - - pub async fn synthesis_wave_format( - &self, - query: &AudioQueryModel, - style_id: StyleId, - enable_interrogative_upspeak: bool, - ) -> Result> { - let wave = self - .synthesis(query, style_id, enable_interrogative_upspeak) - .await?; - let volume_scale = *query.volume_scale(); - let output_stereo = *query.output_stereo(); - let output_sampling_rate = *query.output_sampling_rate(); - - // TODO: 44.1kHzなどの対応 - - let num_channels: u16 = if output_stereo { 2 } else { 1 }; - let bit_depth: u16 = 16; - let repeat_count: u32 = - (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; - let block_size: u16 = bit_depth * num_channels / 8; - - let bytes_size = wave.len() as u32 * repeat_count * 2; - let wave_size = bytes_size + 44; - - let buf: Vec = Vec::with_capacity(wave_size as usize); - let mut cur = Cursor::new(buf); - - cur.write_all("RIFF".as_bytes()).unwrap(); - cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap(); - cur.write_all("WAVEfmt ".as_bytes()).unwrap(); - cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt header length - cur.write_all(&1_u16.to_le_bytes()).unwrap(); //linear PCM - cur.write_all(&num_channels.to_le_bytes()).unwrap(); - cur.write_all(&output_sampling_rate.to_le_bytes()).unwrap(); - - let block_rate = output_sampling_rate * block_size as u32; - - cur.write_all(&block_rate.to_le_bytes()).unwrap(); - cur.write_all(&block_size.to_le_bytes()).unwrap(); - cur.write_all(&bit_depth.to_le_bytes()).unwrap(); - cur.write_all("data".as_bytes()).unwrap(); - cur.write_all(&bytes_size.to_le_bytes()).unwrap(); - - for value in wave { - let v = (value * volume_scale).clamp(-1., 
1.); - let data = (v * 0x7fff as f32) as i16; - for _ in 0..repeat_count { - cur.write_all(&data.to_le_bytes()).unwrap(); - } - } - - Ok(cur.into_inner()) - } - - pub fn is_openjtalk_dict_loaded(&self) -> bool { - self.open_jtalk.dict_loaded() - } - - fn initial_process(accent_phrases: &[AccentPhraseModel]) -> (Vec, Vec) { - let flatten_moras = to_flatten_moras(accent_phrases); - - let mut phoneme_strings = vec!["pau".to_string()]; - for mora in flatten_moras.iter() { - if let Some(consonant) = mora.consonant() { - phoneme_strings.push(consonant.clone()) - } - phoneme_strings.push(mora.vowel().clone()); - } - phoneme_strings.push("pau".to_string()); - - let phoneme_data_list = to_phoneme_data_list(&phoneme_strings); - - (flatten_moras, phoneme_data_list) - } - - fn create_one_accent_list( - accent_list: &mut Vec, - accent_phrase: &AccentPhraseModel, - point: i32, - ) { - let mut one_accent_list: Vec = Vec::new(); - - for (i, mora) in accent_phrase.moras().iter().enumerate() { - let value = (i as i32 == point - || (point < 0 && i == (accent_phrase.moras().len() as i32 + point) as usize)) - .into(); - one_accent_list.push(value); - if mora.consonant().is_some() { - one_accent_list.push(value); - } - } - if accent_phrase.pause_mora().is_some() { - one_accent_list.push(0); - } - accent_list.extend(one_accent_list) - } -} - -pub fn to_flatten_moras(accent_phrases: &[AccentPhraseModel]) -> Vec { - let mut flatten_moras = Vec::new(); - - for accent_phrase in accent_phrases { - let moras = accent_phrase.moras(); - for mora in moras { - flatten_moras.push(mora.clone()); - } - if let Some(pause_mora) = accent_phrase.pause_mora() { - flatten_moras.push(pause_mora.clone()); - } - } - - flatten_moras -} - -pub fn to_phoneme_data_list>(phoneme_str_list: &[T]) -> Vec { - OjtPhoneme::convert( - phoneme_str_list - .iter() - .enumerate() - .map(|(i, s)| OjtPhoneme::new(s.as_ref().to_string(), i as f32, i as f32 + 1.)) - .collect::>() - .as_slice(), - ) -} - -pub fn split_mora(phoneme_list: &[OjtPhoneme]) -> (Vec, Vec, Vec) { - let mut vowel_indexes = Vec::new(); - for (i, phoneme) in phoneme_list.iter().enumerate() { - if MORA_PHONEME_LIST - .iter() - .any(|mora_phoneme| *mora_phoneme == phoneme.phoneme()) - { - vowel_indexes.push(i as i64); - } - } - - let vowel_phoneme_list = vowel_indexes - .iter() - .map(|vowel_index| phoneme_list[*vowel_index as usize].clone()) - .collect(); - - let mut consonant_phoneme_list = vec![OjtPhoneme::default()]; - for i in 0..(vowel_indexes.len() - 1) { - let prev = vowel_indexes[i]; - let next = vowel_indexes[i + 1]; - if next - prev == 1 { - consonant_phoneme_list.push(OjtPhoneme::default()); - } else { - consonant_phoneme_list.push(phoneme_list[next as usize - 1].clone()); - } - } - - (consonant_phoneme_list, vowel_phoneme_list, vowel_indexes) -} - -fn mora_to_text(mora: impl AsRef) -> String { - let last_char = mora.as_ref().chars().last().unwrap(); - let mora = if ['A', 'I', 'U', 'E', 'O'].contains(&last_char) { - format!( - "{}{}", - &mora.as_ref()[0..mora.as_ref().len() - 1], - last_char.to_lowercase() - ) - } else { - mora.as_ref().to_string() - }; - // もしカタカナに変換できなければ、引数で与えた文字列がそのまま返ってくる - mora_list::mora2text(&mora).to_string() -} - -fn adjust_interrogative_accent_phrases( - accent_phrases: &[AccentPhraseModel], -) -> Vec { - accent_phrases - .iter() - .map(|accent_phrase| { - AccentPhraseModel::new( - adjust_interrogative_moras(accent_phrase), - *accent_phrase.accent(), - accent_phrase.pause_mora().clone(), - *accent_phrase.is_interrogative(), - ) - }) - 
.collect() -} - -fn adjust_interrogative_moras(accent_phrase: &AccentPhraseModel) -> Vec { - let moras = accent_phrase.moras(); - if *accent_phrase.is_interrogative() && !moras.is_empty() { - let last_mora = moras.last().unwrap(); - let last_mora_pitch = *last_mora.pitch(); - if last_mora_pitch != 0.0 { - let mut new_moras: Vec = Vec::with_capacity(moras.len() + 1); - new_moras.extend_from_slice(moras.as_slice()); - let interrogative_mora = make_interrogative_mora(last_mora); - new_moras.push(interrogative_mora); - return new_moras; - } - } - moras.clone() -} - -fn make_interrogative_mora(last_mora: &MoraModel) -> MoraModel { - const FIX_VOWEL_LENGTH: f32 = 0.15; - const ADJUST_PITCH: f32 = 0.3; - const MAX_PITCH: f32 = 6.5; - - let pitch = (*last_mora.pitch() + ADJUST_PITCH).min(MAX_PITCH); - - MoraModel::new( - mora_to_text(last_mora.vowel()), - None, - None, - last_mora.vowel().clone(), - FIX_VOWEL_LENGTH, - pitch, - ) -} - -#[cfg(test)] -mod tests { - use super::*; - use ::test_util::OPEN_JTALK_DIC_DIR; - use pretty_assertions::assert_eq; - - use crate::{synthesizer::InferenceRuntimeImpl, *}; - - #[rstest] - #[tokio::test] - async fn is_openjtalk_dict_loaded_works() { - let core = InferenceCore::::new(false, 0).unwrap(); - let synthesis_engine = - SynthesisEngine::new(core, OpenJtalk::new(OPEN_JTALK_DIC_DIR).unwrap().into()); - - assert_eq!(synthesis_engine.is_openjtalk_dict_loaded(), true); - } - - #[rstest] - #[tokio::test] - async fn create_accent_phrases_works() { - let core = InferenceCore::::new(false, 0).unwrap(); - - let model = &VoiceModel::sample().await.unwrap(); - core.load_model(model).await.unwrap(); - - let synthesis_engine = - SynthesisEngine::new(core, OpenJtalk::new(OPEN_JTALK_DIC_DIR).unwrap().into()); - - let accent_phrases = synthesis_engine - .create_accent_phrases("同じ、文章、です。完全に、同一です。", StyleId::new(1)) - .await - .unwrap(); - assert_eq!(accent_phrases.len(), 5); - - // 入力テキストに「、」や「。」などの句読点が含まれていたときに - // AccentPhraseModel の pause_mora に期待する値をテスト - - assert!( - accent_phrases[0].pause_mora().is_some(), - "accent_phrases[0].pause_mora() is None" - ); - assert!( - accent_phrases[1].pause_mora().is_some(), - "accent_phrases[1].pause_mora() is None" - ); - assert!( - accent_phrases[2].pause_mora().is_some(), - "accent_phrases[2].pause_mora() is None" - ); - assert!( - accent_phrases[3].pause_mora().is_some(), - "accent_phrases[3].pause_mora() is None" - ); - assert!( - accent_phrases[4].pause_mora().is_none(), // 文末の句読点は削除される - "accent_phrases[4].pause_mora() is not None" - ); - - for accent_phrase in accent_phrases.iter().take(4) { - let pause_mora = accent_phrase.pause_mora().clone().unwrap(); - assert_eq!(pause_mora.text(), "、"); - assert_eq!(pause_mora.consonant(), &None); - assert_eq!(pause_mora.consonant_length(), &None); - assert_eq!(pause_mora.vowel(), "pau"); - assert_ne!( - pause_mora.vowel_length(), - &0.0, - "pause_mora.vowel_length() should not be 0.0" - ); - } - } -} diff --git a/crates/voicevox_core/src/error.rs b/crates/voicevox_core/src/error.rs index 043b51991..decbddca1 100644 --- a/crates/voicevox_core/src/error.rs +++ b/crates/voicevox_core/src/error.rs @@ -28,7 +28,7 @@ impl Error { /// 対応する[`ErrorKind`]を返す。 pub fn kind(&self) -> ErrorKind { match &self.0 { - ErrorRepr::NotLoadedOpenjtalkDict => ErrorKind::NotLoadedOpenjtalkDict, + ErrorRepr::LoadOpenjtalkSystemDic(_) => ErrorKind::LoadOpenjtalkSystemDic, ErrorRepr::GpuSupport => ErrorKind::GpuSupport, ErrorRepr::LoadModel(LoadModelError { context, .. 
}) => match context { LoadModelErrorKind::OpenZipFile => ErrorKind::OpenZipFile, @@ -54,8 +54,8 @@ impl Error { #[derive(Error, Debug)] pub(crate) enum ErrorRepr { - #[error("OpenJTalkの辞書が読み込まれていません")] - NotLoadedOpenjtalkDict, + #[error("ディレクトリ`{_0}`をOpen JTalkのシステム辞書として読むことができませんでした")] + LoadOpenjtalkSystemDic(String), #[error("GPU機能をサポートすることができません")] GpuSupport, @@ -106,8 +106,8 @@ pub(crate) enum ErrorRepr { /// エラーの種類。 #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] pub enum ErrorKind { - /// open_jtalk辞書ファイルが読み込まれていない。 - NotLoadedOpenjtalkDict, + /// Open JTalkのシステム辞書を読むことができなかった。 + LoadOpenjtalkSystemDic, /// GPUモードがサポートされていない。 GpuSupport, /// ZIPファイルを開くことに失敗した。 diff --git a/crates/voicevox_core/src/infer/status.rs b/crates/voicevox_core/src/infer/status.rs index 7903cb8ff..887fa3900 100644 --- a/crates/voicevox_core/src/infer/status.rs +++ b/crates/voicevox_core/src/infer/status.rs @@ -86,14 +86,14 @@ impl Status { self.loaded_models.lock().unwrap().contains_style(style_id) } - pub fn validate_speaker_id(&self, style_id: StyleId) -> bool { - self.is_loaded_model_by_style_id(style_id) - } - + /// 推論を実行する。 + /// + /// CPU/GPU-bound操作であるため、async文脈ではスレッドに包むべきである。 + /// /// # Panics /// /// `self`が`model_id`を含んでいないとき、パニックする。 - pub(crate) async fn run_session( + pub(crate) fn run_session( &self, model_id: &VoiceModelId, input: I, @@ -103,10 +103,7 @@ impl Status { I::Signature: InferenceSignature, { let sess = self.loaded_models.lock().unwrap().get(model_id); - - tokio::task::spawn_blocking(move || sess.run(input)) - .await - .unwrap() + sess.run(input) } } diff --git a/crates/voicevox_core/src/inference_core.rs b/crates/voicevox_core/src/inference_core.rs index 875c9ba64..4c286827a 100644 --- a/crates/voicevox_core/src/inference_core.rs +++ b/crates/voicevox_core/src/inference_core.rs @@ -1,4 +1,5 @@ use enum_map::enum_map; +use ndarray::{Array1, ArrayView1, ArrayView2}; use crate::infer::{ domain::{ @@ -9,12 +10,14 @@ use crate::infer::{ status::Status, InferenceRuntime, InferenceSessionOptions, }; +use itertools::Itertools as _; use super::*; const PHONEME_LENGTH_MINIMAL: f32 = 0.01; pub(crate) struct InferenceCore { + use_gpu: bool, status: Status, } @@ -32,7 +35,7 @@ impl InferenceCore { | InferenceOperationImpl::PredictIntonation => light_session_options, InferenceOperationImpl::Decode => heavy_session_options, }); - Ok(Self { status }) + Ok(Self { use_gpu, status }) } else { Err(ErrorRepr::GpuSupport.into()) } @@ -50,6 +53,10 @@ impl InferenceCore { } } + pub(crate) fn is_use_gpu(&self) -> bool { + self.use_gpu + } + pub async fn load_model(&self, model: &VoiceModel) -> Result<()> { let model_bytes = &model.read_inference_models().await?; self.status.load_model(model, model_bytes).await @@ -70,29 +77,25 @@ impl InferenceCore { self.status.is_loaded_model_by_style_id(style_id) } - pub async fn predict_duration( + /// `predict_duration`を実行する。 + /// + /// CPU-bound操作であるため、async文脈ではスレッドに包むべきである。 + pub fn predict_duration( &self, - phoneme_vector: &[i64], + phoneme_list: Array1, style_id: StyleId, ) -> Result> { - if !self.status.validate_speaker_id(style_id) { - return Err(ErrorRepr::StyleNotFound { style_id }.into()); - } - let (model_id, model_inner_id) = self.status.ids_for(style_id)?; let PredictDurationOutput { phoneme_length: output, - } = self - .status - .run_session( - &model_id, - PredictDurationInput { - phoneme_list: ndarray::arr1(phoneme_vector), - speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), - }, - ) - .await?; + } = 
self.status.run_session( + &model_id, + PredictDurationInput { + phoneme_list, + speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), + }, + )?; let mut output = output.into_raw_vec(); for output_item in output.iter_mut() { @@ -104,58 +107,78 @@ impl InferenceCore { Ok(output) } + /// `predict_intonation`を実行する。 + /// + /// CPU-bound操作であるため、async文脈ではスレッドに包むべきである。 + /// + /// # Panics + /// + /// 長さが合わないとき、パニックする。 #[allow(clippy::too_many_arguments)] - pub async fn predict_intonation( + pub fn predict_intonation( &self, - length: usize, - vowel_phoneme_vector: &[i64], - consonant_phoneme_vector: &[i64], - start_accent_vector: &[i64], - end_accent_vector: &[i64], - start_accent_phrase_vector: &[i64], - end_accent_phrase_vector: &[i64], + vowel_phoneme_list: Array1, + consonant_phoneme_list: Array1, + start_accent_list: Array1, + end_accent_list: Array1, + start_accent_phrase_list: Array1, + end_accent_phrase_list: Array1, style_id: StyleId, ) -> Result> { - if !self.status.validate_speaker_id(style_id) { - return Err(ErrorRepr::StyleNotFound { style_id }.into()); - } - let (model_id, model_inner_id) = self.status.ids_for(style_id)?; - let PredictIntonationOutput { f0_list: output } = self - .status - .run_session( - &model_id, - PredictIntonationInput { - length: ndarray::arr0(length as i64), - vowel_phoneme_list: ndarray::arr1(vowel_phoneme_vector), - consonant_phoneme_list: ndarray::arr1(consonant_phoneme_vector), - start_accent_list: ndarray::arr1(start_accent_vector), - end_accent_list: ndarray::arr1(end_accent_vector), - start_accent_phrase_list: ndarray::arr1(start_accent_phrase_vector), - end_accent_phrase_list: ndarray::arr1(end_accent_phrase_vector), - speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), - }, - ) - .await?; + let Ok(&length) = [ + vowel_phoneme_list.len(), + consonant_phoneme_list.len(), + start_accent_list.len(), + end_accent_list.len(), + start_accent_phrase_list.len(), + end_accent_phrase_list.len(), + ] + .iter() + .unique() + .exactly_one() else { + panic!("different lengths"); + }; + + let PredictIntonationOutput { f0_list: output } = self.status.run_session( + &model_id, + PredictIntonationInput { + length: ndarray::arr0(length as i64), + vowel_phoneme_list, + consonant_phoneme_list, + start_accent_list, + end_accent_list, + start_accent_phrase_list, + end_accent_phrase_list, + speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), + }, + )?; Ok(output.into_raw_vec()) } - pub async fn decode( + /// `predict_intonation`を実行する。 + /// + /// CPU/GPU-bound操作であるため、async文脈ではスレッドに包むべきである。 + /// + /// # Panics + /// + /// `f0`と`phoneme`の長さが合わないとき、パニックする。 + pub fn decode( &self, - length: usize, - phoneme_size: usize, - f0: &[f32], - phoneme_vector: &[f32], + f0: ArrayView1<'_, f32>, + phoneme: ArrayView2<'_, f32>, style_id: StyleId, ) -> Result> { - if !self.status.validate_speaker_id(style_id) { - return Err(ErrorRepr::StyleNotFound { style_id }.into()); - } - let (model_id, model_inner_id) = self.status.ids_for(style_id)?; + let length = f0.len(); + let (phoneme_length, phoneme_size) = phoneme.dim(); + if phoneme_length != length { + panic!("different lengths"); + } + // 音が途切れてしまうのを避けるworkaround処理が入っている // TODO: 改善したらここのpadding処理を取り除く const PADDING_SIZE: f64 = 0.4; @@ -163,30 +186,28 @@ impl InferenceCore { let padding_size = ((PADDING_SIZE * DEFAULT_SAMPLING_RATE) / 256.0).round() as usize; let start_and_end_padding_size = 2 * padding_size; let length_with_padding = length + start_and_end_padding_size; - let f0_with_padding = 
Self::make_f0_with_padding(f0, length_with_padding, padding_size); + let f0_with_padding = + Self::make_f0_with_padding(f0.to_slice().unwrap(), length_with_padding, padding_size); let phoneme_with_padding = Self::make_phoneme_with_padding( - phoneme_vector, + phoneme.to_slice().unwrap(), phoneme_size, length_with_padding, padding_size, ); - let DecodeOutput { wave: output } = self - .status - .run_session( - &model_id, - DecodeInput { - f0: ndarray::arr1(&f0_with_padding) - .into_shape([length_with_padding, 1]) - .unwrap(), - phoneme: ndarray::arr1(&phoneme_with_padding) - .into_shape([length_with_padding, phoneme_size]) - .unwrap(), - speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), - }, - ) - .await?; + let DecodeOutput { wave: output } = self.status.run_session( + &model_id, + DecodeInput { + f0: ndarray::arr1(&f0_with_padding) + .into_shape([length_with_padding, 1]) + .unwrap(), + phoneme: ndarray::arr1(&phoneme_with_padding) + .into_shape([length_with_padding, phoneme_size]) + .unwrap(), + speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]), + }, + )?; Ok(Self::trim_padding_from_output( output.into_raw_vec(), diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 47ac0997a..63f02ed9d 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -1,11 +1,18 @@ -use std::sync::Arc; +use std::{ + borrow::Borrow, + io::{Cursor, Write as _}, + sync::Arc, +}; + +use ndarray::{Array1, ArrayView, ArrayView1, ArrayView2}; use crate::{ engine::{ - create_kana, parse_kana, AccentPhraseModel, OpenJtalk, SynthesisEngine, - DEFAULT_SAMPLING_RATE, + self, create_kana, parse_kana, AccentPhraseModel, MoraModel, OjtPhoneme, OpenJtalk, + Utterance, }, infer::runtimes::Onnxruntime, + numerics::F32Ext as _, }; use super::*; @@ -13,6 +20,7 @@ use super::*; /// [`Synthesizer::synthesis`]のオプション。 /// /// [`Synthesizer::synthesis`]: Synthesizer::synthesis +#[derive(Clone, Copy)] pub struct SynthesisOptions { pub enable_interrogative_upspeak: bool, } @@ -34,6 +42,7 @@ impl From<&TtsOptions> for SynthesisOptions { /// [`Synthesizer::tts`]のオプション。 /// /// [`Synthesizer::tts`]: Synthesizer::tts +#[derive(Clone, Copy)] pub struct TtsOptions { pub enable_interrogative_upspeak: bool, } @@ -76,12 +85,9 @@ pub struct InitializeOptions { pub(crate) type InferenceRuntimeImpl = Onnxruntime; /// 音声シンセサイザ。 -pub struct Synthesizer { - synthesis_engine: SynthesisEngine, - use_gpu: bool, -} +pub struct Synthesizer(Arc>); -impl Synthesizer { +impl Synthesizer { /// `Synthesizer`をコンストラクトする。 /// /// # Example @@ -108,7 +114,7 @@ impl Synthesizer { /// # Ok(()) /// # } /// ``` - pub fn new(open_jtalk: Arc, options: &InitializeOptions) -> Result { + pub fn new(open_jtalk: O, options: &InitializeOptions) -> Result { #[cfg(windows)] list_windows_video_cards(); let use_gpu = match options.acceleration_mode { @@ -128,53 +134,44 @@ impl Synthesizer { AccelerationMode::Gpu => true, }; - Ok(Self { - synthesis_engine: SynthesisEngine::new( - InferenceCore::new(use_gpu, options.cpu_num_threads)?, + Ok(Self( + Inner { + inference_core: InferenceCore::new(use_gpu, options.cpu_num_threads)?, open_jtalk, - ), - use_gpu, - }) + } + .into(), + )) } /// ハードウェアアクセラレーションがGPUモードか判定する。 pub fn is_gpu_mode(&self) -> bool { - self.use_gpu + self.0.inference_core.is_use_gpu() } /// 音声モデルを読み込む。 pub async fn load_voice_model(&self, model: &VoiceModel) -> Result<()> { - self.synthesis_engine - .inference_core() - .load_model(model) - .await?; + 
self.0.inference_core.load_model(model).await?; Ok(()) } /// 音声モデルの読み込みを解除する。 pub fn unload_voice_model(&self, voice_model_id: &VoiceModelId) -> Result<()> { - self.synthesis_engine - .inference_core() - .unload_model(voice_model_id) + self.0.inference_core.unload_model(voice_model_id) } /// 指定したIDの音声モデルが読み込まれているか判定する。 pub fn is_loaded_voice_model(&self, voice_model_id: &VoiceModelId) -> bool { - self.synthesis_engine - .inference_core() - .is_loaded_model(voice_model_id) + self.0.inference_core.is_loaded_model(voice_model_id) } #[doc(hidden)] pub fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { - self.synthesis_engine - .inference_core() - .is_model_loaded_by_style_id(style_id) + self.0.inference_core.is_model_loaded_by_style_id(style_id) } /// 今読み込んでいる音声モデルのメタ情報を返す。 pub fn metas(&self) -> VoiceModelMeta { - self.synthesis_engine.inference_core().metas() + self.0.inference_core.metas() } /// AudioQueryから音声合成を行う。 @@ -184,63 +181,52 @@ impl Synthesizer { style_id: StyleId, options: &SynthesisOptions, ) -> Result> { - self.synthesis_engine - .synthesis_wave_format(audio_query, style_id, options.enable_interrogative_upspeak) + let audio_query = audio_query.clone(); + let options = *options; + + self.spawn_blocking(move |inner| inner.blocking_synthesis(&audio_query, style_id, &options)) .await } - #[doc(hidden)] - pub async fn predict_duration( + pub(crate) fn predict_duration( &self, - phoneme_vector: &[i64], + phoneme_list: Array1, style_id: StyleId, ) -> Result> { - self.synthesis_engine - .inference_core() - .predict_duration(phoneme_vector, style_id) - .await + self.0 + .inference_core + .predict_duration(phoneme_list, style_id) } #[allow(clippy::too_many_arguments)] - #[doc(hidden)] - pub async fn predict_intonation( + pub(crate) fn predict_intonation( &self, - length: usize, - vowel_phoneme_vector: &[i64], - consonant_phoneme_vector: &[i64], - start_accent_vector: &[i64], - end_accent_vector: &[i64], - start_accent_phrase_vector: &[i64], - end_accent_phrase_vector: &[i64], + vowel_phoneme_list: Array1, + consonant_phoneme_list: Array1, + start_accent_list: Array1, + end_accent_list: Array1, + start_accent_phrase_list: Array1, + end_accent_phrase_list: Array1, style_id: StyleId, ) -> Result> { - self.synthesis_engine - .inference_core() - .predict_intonation( - length, - vowel_phoneme_vector, - consonant_phoneme_vector, - start_accent_vector, - end_accent_vector, - start_accent_phrase_vector, - end_accent_phrase_vector, - style_id, - ) - .await + self.0.inference_core.predict_intonation( + vowel_phoneme_list, + consonant_phoneme_list, + start_accent_list, + end_accent_list, + start_accent_phrase_list, + end_accent_phrase_list, + style_id, + ) } - #[doc(hidden)] - pub async fn decode( + + pub(crate) fn decode( &self, - length: usize, - phoneme_size: usize, - f0: &[f32], - phoneme_vector: &[f32], + f0: ArrayView1<'_, f32>, + phoneme: ArrayView2<'_, f32>, style_id: StyleId, ) -> Result> { - self.synthesis_engine - .inference_core() - .decode(length, phoneme_size, f0, phoneme_vector, style_id) - .await + self.0.inference_core.decode(f0, phoneme, style_id) } /// AquesTalk風記法からAccentPhrase (アクセント句)の配列を生成する。 @@ -271,12 +257,57 @@ impl Synthesizer { kana: &str, style_id: StyleId, ) -> Result> { - self.synthesis_engine - .replace_mora_data(&parse_kana(kana)?, style_id) - .await + let kana = kana.to_owned(); + + self.spawn_blocking(move |inner| { + inner.blocking_create_accent_phrases_from_kana(&kana, style_id) + }) + .await } - /// 日本語のテキストからAccentPhrase (アクセント句)の配列を生成する。 + /// 
AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。 + pub async fn replace_mora_data( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let accent_phrases = accent_phrases.to_owned(); + + self.spawn_blocking(move |inner| { + inner.blocking_replace_mora_data(&accent_phrases, style_id) + }) + .await + } + + /// AccentPhraseの配列の音素長を、特定の声で生成しなおす。 + pub async fn replace_phoneme_length( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let accent_phrases = accent_phrases.to_owned(); + + self.spawn_blocking(move |inner| { + inner.blocking_replace_phoneme_length(&accent_phrases, style_id) + }) + .await + } + + /// AccentPhraseの配列の音高を、特定の声で生成しなおす。 + pub async fn replace_mora_pitch( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let accent_phrases = accent_phrases.to_owned(); + + self.spawn_blocking(move |inner| { + inner.blocking_replace_mora_pitch(&accent_phrases, style_id) + }) + .await + } + + /// AquesTalk風記法から[AudioQuery]を生成する。 /// /// # Example /// @@ -292,60 +323,55 @@ impl Synthesizer { /// # /// use voicevox_core::StyleId; /// - /// let accent_phrases = synthesizer - /// .create_accent_phrases("こんにちは", StyleId::new(302)) + /// let audio_query = synthesizer + /// .audio_query_from_kana("コンニチワ'", StyleId::new(302)) /// .await?; /// # /// # Ok(()) /// # } /// ``` - pub async fn create_accent_phrases( + /// + /// [AudioQuery]: crate::AudioQueryModel + pub async fn audio_query_from_kana( &self, - text: &str, + kana: &str, style_id: StyleId, - ) -> Result> { - if !self.synthesis_engine.is_openjtalk_dict_loaded() { - return Err(ErrorRepr::NotLoadedOpenjtalkDict.into()); - } - self.synthesis_engine - .create_accent_phrases(text, style_id) - .await - } + ) -> Result { + let kana = kana.to_owned(); - /// AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。 - pub async fn replace_mora_data( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - self.synthesis_engine - .replace_mora_data(accent_phrases, style_id) + self.spawn_blocking(move |inner| inner.blocking_audio_query_from_kana(&kana, style_id)) .await } - /// AccentPhraseの配列の音素長を、特定の声で生成しなおす。 - pub async fn replace_phoneme_length( + /// AquesTalk風記法から音声合成を行う。 + pub async fn tts_from_kana( &self, - accent_phrases: &[AccentPhraseModel], + kana: &str, style_id: StyleId, - ) -> Result> { - self.synthesis_engine - .replace_phoneme_length(accent_phrases, style_id) + options: &TtsOptions, + ) -> Result> { + let kana = kana.to_owned(); + let options = *options; + + self.spawn_blocking(move |inner| inner.blocking_tts_from_kana(&kana, style_id, &options)) .await } - /// AccentPhraseの配列の音高を、特定の声で生成しなおす。 - pub async fn replace_mora_pitch( - &self, - accent_phrases: &[AccentPhraseModel], - style_id: StyleId, - ) -> Result> { - self.synthesis_engine - .replace_mora_pitch(accent_phrases, style_id) + async fn spawn_blocking(&self, f: F) -> Result + where + F: FnOnce(&Inner) -> Result + Send + 'static, + R: Send + 'static, + { + let inner = self.0.clone(); + + tokio::task::spawn_blocking(move || f(&inner)) .await + .unwrap() } +} - /// AquesTalk風記法から[AudioQuery]を生成する。 +impl + Send + Sync + 'static> Synthesizer { + /// 日本語のテキストからAccentPhrase (アクセント句)の配列を生成する。 /// /// # Example /// @@ -361,22 +387,22 @@ impl Synthesizer { /// # /// use voicevox_core::StyleId; /// - /// let audio_query = synthesizer - /// .audio_query_from_kana("コンニチワ'", StyleId::new(302)) + /// let accent_phrases = synthesizer + /// .create_accent_phrases("こんにちは", 
StyleId::new(302)) /// .await?; /// # /// # Ok(()) /// # } /// ``` - /// - /// [AudioQuery]: crate::AudioQueryModel - pub async fn audio_query_from_kana( + pub async fn create_accent_phrases( &self, - kana: &str, + text: &str, style_id: StyleId, - ) -> Result { - let accent_phrases = self.create_accent_phrases_from_kana(kana, style_id).await?; - Ok(AudioQueryModel::from_accent_phrases(accent_phrases).with_kana(Some(kana.to_owned()))) + ) -> Result> { + let text = text.to_owned(); + + self.spawn_blocking(move |inner| inner.blocking_create_accent_phrases(&text, style_id)) + .await } /// 日本語のテキストから[AudioQuery]を生成する。 @@ -405,35 +431,559 @@ impl Synthesizer { /// /// [AudioQuery]: crate::AudioQueryModel pub async fn audio_query(&self, text: &str, style_id: StyleId) -> Result { - let accent_phrases = self.create_accent_phrases(text, style_id).await?; - Ok(AudioQueryModel::from_accent_phrases(accent_phrases)) + let text = text.to_owned(); + + self.spawn_blocking(move |inner| inner.blocking_audio_query(&text, style_id)) + .await } - /// AquesTalk風記法から音声合成を行う。 - pub async fn tts_from_kana( + /// 日本語のテキストから音声合成を行う。 + pub async fn tts( &self, - kana: &str, + text: &str, style_id: StyleId, options: &TtsOptions, ) -> Result> { - let audio_query = &self.audio_query_from_kana(kana, style_id).await?; - self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) + let text = text.to_owned(); + let options = *options; + + self.spawn_blocking(move |inner| inner.blocking_tts(&text, style_id, &options)) .await } +} - /// 日本語のテキストから音声合成を行う。 - pub async fn tts( +struct Inner { + inference_core: InferenceCore, + open_jtalk: O, +} + +impl Inner { + fn blocking_synthesis( &self, - text: &str, + audio_query: &AudioQueryModel, + style_id: StyleId, + options: &SynthesisOptions, + ) -> Result> { + let speed_scale = *audio_query.speed_scale(); + let pitch_scale = *audio_query.pitch_scale(); + let intonation_scale = *audio_query.intonation_scale(); + let pre_phoneme_length = *audio_query.pre_phoneme_length(); + let post_phoneme_length = *audio_query.post_phoneme_length(); + + let accent_phrases = if options.enable_interrogative_upspeak { + adjust_interrogative_accent_phrases(audio_query.accent_phrases().as_slice()) + } else { + audio_query.accent_phrases().clone() + }; + + let (flatten_moras, phoneme_data_list) = initial_process(&accent_phrases); + + let mut phoneme_length_list = vec![pre_phoneme_length]; + let mut f0_list = vec![0.]; + let mut voiced_list = vec![false]; + { + let mut sum_of_f0_bigger_than_zero = 0.; + let mut count_of_f0_bigger_than_zero = 0; + + for mora in flatten_moras { + let consonant_length = *mora.consonant_length(); + let vowel_length = *mora.vowel_length(); + let pitch = *mora.pitch(); + + if let Some(consonant_length) = consonant_length { + phoneme_length_list.push(consonant_length); + } + phoneme_length_list.push(vowel_length); + + let f0_single = pitch * 2.0_f32.powf(pitch_scale); + f0_list.push(f0_single); + + let bigger_than_zero = f0_single > 0.; + voiced_list.push(bigger_than_zero); + + if bigger_than_zero { + sum_of_f0_bigger_than_zero += f0_single; + count_of_f0_bigger_than_zero += 1; + } + } + phoneme_length_list.push(post_phoneme_length); + f0_list.push(0.); + voiced_list.push(false); + let mean_f0 = sum_of_f0_bigger_than_zero / (count_of_f0_bigger_than_zero as f32); + + if !mean_f0.is_nan() { + for i in 0..f0_list.len() { + if voiced_list[i] { + f0_list[i] = (f0_list[i] - mean_f0) * intonation_scale + mean_f0; + } + } + } + } + + let (_, _, vowel_indexes) = 
split_mora(&phoneme_data_list); + + let mut phoneme: Vec<[f32; OjtPhoneme::NUM_PHONEME]> = Vec::new(); + let mut f0: Vec = Vec::new(); + { + const RATE: f32 = 24000. / 256.; + let mut sum_of_phoneme_length = 0; + let mut count_of_f0 = 0; + let mut vowel_indexes_index = 0; + + for (i, phoneme_length) in phoneme_length_list.iter().enumerate() { + // VOICEVOX ENGINEと挙動を合わせるため、四捨五入ではなく偶数丸めをする + // + // https://github.com/VOICEVOX/voicevox_engine/issues/552 + let phoneme_length = ((*phoneme_length * RATE).round_ties_even_() / speed_scale) + .round_ties_even_() as usize; + let phoneme_id = phoneme_data_list[i].phoneme_id(); + + for _ in 0..phoneme_length { + let mut phonemes_vec = [0.; OjtPhoneme::NUM_PHONEME]; + phonemes_vec[phoneme_id as usize] = 1.; + phoneme.push(phonemes_vec) + } + sum_of_phoneme_length += phoneme_length; + + if i as i64 == vowel_indexes[vowel_indexes_index] { + for _ in 0..sum_of_phoneme_length { + f0.push(f0_list[count_of_f0]); + } + count_of_f0 += 1; + sum_of_phoneme_length = 0; + vowel_indexes_index += 1; + } + } + } + + let phoneme = &phoneme.into_iter().flatten().collect::>(); + let phoneme = ArrayView::from_shape((f0.len(), OjtPhoneme::NUM_PHONEME), phoneme).unwrap(); + + let wave = &self + .inference_core + .decode(ndarray::aview1(&f0), phoneme, style_id)?; + + return Ok(to_wav(wave, audio_query)); + + fn adjust_interrogative_accent_phrases( + accent_phrases: &[AccentPhraseModel], + ) -> Vec { + accent_phrases + .iter() + .map(|accent_phrase| { + AccentPhraseModel::new( + adjust_interrogative_moras(accent_phrase), + *accent_phrase.accent(), + accent_phrase.pause_mora().clone(), + *accent_phrase.is_interrogative(), + ) + }) + .collect() + } + + fn adjust_interrogative_moras(accent_phrase: &AccentPhraseModel) -> Vec { + let moras = accent_phrase.moras(); + if *accent_phrase.is_interrogative() && !moras.is_empty() { + let last_mora = moras.last().unwrap(); + let last_mora_pitch = *last_mora.pitch(); + if last_mora_pitch != 0.0 { + let mut new_moras: Vec = Vec::with_capacity(moras.len() + 1); + new_moras.extend_from_slice(moras.as_slice()); + let interrogative_mora = make_interrogative_mora(last_mora); + new_moras.push(interrogative_mora); + return new_moras; + } + } + moras.clone() + } + + fn make_interrogative_mora(last_mora: &MoraModel) -> MoraModel { + const FIX_VOWEL_LENGTH: f32 = 0.15; + const ADJUST_PITCH: f32 = 0.3; + const MAX_PITCH: f32 = 6.5; + + let pitch = (*last_mora.pitch() + ADJUST_PITCH).min(MAX_PITCH); + + MoraModel::new( + mora_to_text(last_mora.vowel()), + None, + None, + last_mora.vowel().clone(), + FIX_VOWEL_LENGTH, + pitch, + ) + } + + fn to_wav(wave: &[f32], query: &AudioQueryModel) -> Vec { + let volume_scale = *query.volume_scale(); + let output_stereo = *query.output_stereo(); + let output_sampling_rate = *query.output_sampling_rate(); + + // TODO: 44.1kHzなどの対応 + + let num_channels: u16 = if output_stereo { 2 } else { 1 }; + let bit_depth: u16 = 16; + let repeat_count: u32 = + (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; + let block_size: u16 = bit_depth * num_channels / 8; + + let bytes_size = wave.len() as u32 * repeat_count * 2; + let wave_size = bytes_size + 44; + + let buf: Vec = Vec::with_capacity(wave_size as usize); + let mut cur = Cursor::new(buf); + + cur.write_all("RIFF".as_bytes()).unwrap(); + cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap(); + cur.write_all("WAVEfmt ".as_bytes()).unwrap(); + cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt header length + 
cur.write_all(&1_u16.to_le_bytes()).unwrap(); //linear PCM + cur.write_all(&num_channels.to_le_bytes()).unwrap(); + cur.write_all(&output_sampling_rate.to_le_bytes()).unwrap(); + + let block_rate = output_sampling_rate * block_size as u32; + + cur.write_all(&block_rate.to_le_bytes()).unwrap(); + cur.write_all(&block_size.to_le_bytes()).unwrap(); + cur.write_all(&bit_depth.to_le_bytes()).unwrap(); + cur.write_all("data".as_bytes()).unwrap(); + cur.write_all(&bytes_size.to_le_bytes()).unwrap(); + + for value in wave { + let v = (value * volume_scale).clamp(-1., 1.); + let data = (v * 0x7fff as f32) as i16; + for _ in 0..repeat_count { + cur.write_all(&data.to_le_bytes()).unwrap(); + } + } + + cur.into_inner() + } + } + + fn blocking_create_accent_phrases_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> Result> { + self.blocking_replace_mora_data(&parse_kana(kana)?, style_id) + } + + fn blocking_replace_mora_data( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let accent_phrases = self.blocking_replace_phoneme_length(accent_phrases, style_id)?; + self.blocking_replace_mora_pitch(&accent_phrases, style_id) + } + + fn blocking_replace_phoneme_length( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let (_, phoneme_data_list) = initial_process(accent_phrases); + + let (_, _, vowel_indexes_data) = split_mora(&phoneme_data_list); + + let phoneme_list_s: Array1 = phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); + let phoneme_length = self + .inference_core + .predict_duration(phoneme_list_s, style_id)?; + + let mut index = 0; + let new_accent_phrases = accent_phrases + .iter() + .map(|accent_phrase| { + AccentPhraseModel::new( + accent_phrase + .moras() + .iter() + .map(|mora| { + let new_mora = MoraModel::new( + mora.text().clone(), + mora.consonant().clone(), + mora.consonant().as_ref().map(|_| { + phoneme_length[vowel_indexes_data[index + 1] as usize - 1] + }), + mora.vowel().clone(), + phoneme_length[vowel_indexes_data[index + 1] as usize], + *mora.pitch(), + ); + index += 1; + new_mora + }) + .collect(), + *accent_phrase.accent(), + accent_phrase.pause_mora().as_ref().map(|pause_mora| { + let new_pause_mora = MoraModel::new( + pause_mora.text().clone(), + pause_mora.consonant().clone(), + *pause_mora.consonant_length(), + pause_mora.vowel().clone(), + phoneme_length[vowel_indexes_data[index + 1] as usize], + *pause_mora.pitch(), + ); + index += 1; + new_pause_mora + }), + *accent_phrase.is_interrogative(), + ) + }) + .collect(); + + Ok(new_accent_phrases) + } + + fn blocking_replace_mora_pitch( + &self, + accent_phrases: &[AccentPhraseModel], + style_id: StyleId, + ) -> Result> { + let (_, phoneme_data_list) = initial_process(accent_phrases); + + let mut base_start_accent_list = vec![0]; + let mut base_end_accent_list = vec![0]; + let mut base_start_accent_phrase_list = vec![0]; + let mut base_end_accent_phrase_list = vec![0]; + for accent_phrase in accent_phrases { + let mut accent = usize::from(*accent_phrase.accent() != 1); + create_one_accent_list(&mut base_start_accent_list, accent_phrase, accent as i32); + + accent = *accent_phrase.accent() - 1; + create_one_accent_list(&mut base_end_accent_list, accent_phrase, accent as i32); + create_one_accent_list(&mut base_start_accent_phrase_list, accent_phrase, 0); + create_one_accent_list(&mut base_end_accent_phrase_list, accent_phrase, -1); + } + base_start_accent_list.push(0); + base_end_accent_list.push(0); + 
base_start_accent_phrase_list.push(0); + base_end_accent_phrase_list.push(0); + + let (consonant_phoneme_data_list, vowel_phoneme_data_list, vowel_indexes) = + split_mora(&phoneme_data_list); + + let consonant_phoneme_list: Vec = consonant_phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); + let vowel_phoneme_list: Vec = vowel_phoneme_data_list + .iter() + .map(|phoneme_data| phoneme_data.phoneme_id()) + .collect(); + + let mut start_accent_list = Vec::with_capacity(vowel_indexes.len()); + let mut end_accent_list = Vec::with_capacity(vowel_indexes.len()); + let mut start_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); + let mut end_accent_phrase_list = Vec::with_capacity(vowel_indexes.len()); + + for vowel_index in vowel_indexes { + start_accent_list.push(base_start_accent_list[vowel_index as usize]); + end_accent_list.push(base_end_accent_list[vowel_index as usize]); + start_accent_phrase_list.push(base_start_accent_phrase_list[vowel_index as usize]); + end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]); + } + + let mut f0_list = self.inference_core.predict_intonation( + vowel_phoneme_list.into(), + consonant_phoneme_list.into(), + start_accent_list.into(), + end_accent_list.into(), + start_accent_phrase_list.into(), + end_accent_phrase_list.into(), + style_id, + )?; + + for i in 0..vowel_phoneme_data_list.len() { + if UNVOICED_MORA_PHONEME_LIST + .iter() + .any(|phoneme| *phoneme == vowel_phoneme_data_list[i].phoneme()) + { + f0_list[i] = 0.; + } + } + + let mut index = 0; + let new_accent_phrases = accent_phrases + .iter() + .map(|accent_phrase| { + AccentPhraseModel::new( + accent_phrase + .moras() + .iter() + .map(|mora| { + let new_mora = MoraModel::new( + mora.text().clone(), + mora.consonant().clone(), + *mora.consonant_length(), + mora.vowel().clone(), + *mora.vowel_length(), + f0_list[index + 1], + ); + index += 1; + new_mora + }) + .collect(), + *accent_phrase.accent(), + accent_phrase.pause_mora().as_ref().map(|pause_mora| { + let new_pause_mora = MoraModel::new( + pause_mora.text().clone(), + pause_mora.consonant().clone(), + *pause_mora.consonant_length(), + pause_mora.vowel().clone(), + *pause_mora.vowel_length(), + f0_list[index + 1], + ); + index += 1; + new_pause_mora + }), + *accent_phrase.is_interrogative(), + ) + }) + .collect(); + + return Ok(new_accent_phrases); + + const UNVOICED_MORA_PHONEME_LIST: &[&str] = &["A", "I", "U", "E", "O", "cl", "pau"]; + + fn create_one_accent_list( + accent_list: &mut Vec, + accent_phrase: &AccentPhraseModel, + point: i32, + ) { + let mut one_accent_list: Vec = Vec::new(); + + for (i, mora) in accent_phrase.moras().iter().enumerate() { + let value = (i as i32 == point + || (point < 0 && i == (accent_phrase.moras().len() as i32 + point) as usize)) + .into(); + one_accent_list.push(value); + if mora.consonant().is_some() { + one_accent_list.push(value); + } + } + if accent_phrase.pause_mora().is_some() { + one_accent_list.push(0); + } + accent_list.extend(one_accent_list) + } + } + + fn blocking_audio_query_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> Result { + let accent_phrases = self.blocking_create_accent_phrases_from_kana(kana, style_id)?; + Ok(AudioQueryModel::from_accent_phrases(accent_phrases).with_kana(Some(kana.to_owned()))) + } + + fn blocking_tts_from_kana( + &self, + kana: &str, style_id: StyleId, options: &TtsOptions, ) -> Result> { - let audio_query = &self.audio_query(text, style_id).await?; - self.synthesis(audio_query, 
style_id, &SynthesisOptions::from(options)) - .await + let audio_query = &self.blocking_audio_query_from_kana(kana, style_id)?; + self.blocking_synthesis(audio_query, style_id, &SynthesisOptions::from(options)) + } +} + +impl + Send + Sync + 'static> Inner { + fn blocking_create_accent_phrases( + &self, + text: &str, + style_id: StyleId, + ) -> Result> { + if text.is_empty() { + return Ok(Vec::new()); + } + + let utterance = Utterance::extract_full_context_label(self.open_jtalk.borrow(), text)?; + + let accent_phrases: Vec = utterance + .breath_groups() + .iter() + .enumerate() + .fold(Vec::new(), |mut accum_vec, (i, breath_group)| { + accum_vec.extend(breath_group.accent_phrases().iter().enumerate().map( + |(j, accent_phrase)| { + let moras = accent_phrase + .moras() + .iter() + .map(|mora| { + let mora_text = mora + .phonemes() + .iter() + .map(|phoneme| phoneme.phoneme().to_string()) + .collect::>() + .join(""); + + let (consonant, consonant_length) = + if let Some(consonant) = mora.consonant() { + (Some(consonant.phoneme().to_string()), Some(0.)) + } else { + (None, None) + }; + + MoraModel::new( + mora_to_text(mora_text), + consonant, + consonant_length, + mora.vowel().phoneme().into(), + 0., + 0., + ) + }) + .collect(); + + let pause_mora = if i != utterance.breath_groups().len() - 1 + && j == breath_group.accent_phrases().len() - 1 + { + Some(MoraModel::new( + "、".into(), + None, + None, + "pau".into(), + 0., + 0., + )) + } else { + None + }; + + AccentPhraseModel::new( + moras, + *accent_phrase.accent(), + pause_mora, + *accent_phrase.is_interrogative(), + ) + }, + )); + + accum_vec + }); + + self.blocking_replace_mora_data(&accent_phrases, style_id) + } + + fn blocking_audio_query(&self, text: &str, style_id: StyleId) -> Result { + let accent_phrases = self.blocking_create_accent_phrases(text, style_id)?; + Ok(AudioQueryModel::from_accent_phrases(accent_phrases)) + } + + fn blocking_tts(&self, text: &str, style_id: StyleId, options: &TtsOptions) -> Result> { + let audio_query = &self.blocking_audio_query(text, style_id)?; + self.blocking_synthesis(audio_query, style_id, &SynthesisOptions::from(options)) } } +const DEFAULT_SAMPLING_RATE: u32 = 24000; + #[cfg(windows)] fn list_windows_video_cards() { use std::{ffi::OsString, os::windows::ffi::OsStringExt as _}; @@ -472,6 +1022,99 @@ fn list_windows_video_cards() { } } +fn initial_process(accent_phrases: &[AccentPhraseModel]) -> (Vec, Vec) { + let flatten_moras = to_flatten_moras(accent_phrases); + + let mut phoneme_strings = vec!["pau".to_string()]; + for mora in flatten_moras.iter() { + if let Some(consonant) = mora.consonant() { + phoneme_strings.push(consonant.clone()) + } + phoneme_strings.push(mora.vowel().clone()); + } + phoneme_strings.push("pau".to_string()); + + let phoneme_data_list = to_phoneme_data_list(&phoneme_strings); + + return (flatten_moras, phoneme_data_list); + + fn to_flatten_moras(accent_phrases: &[AccentPhraseModel]) -> Vec { + let mut flatten_moras = Vec::new(); + + for accent_phrase in accent_phrases { + let moras = accent_phrase.moras(); + for mora in moras { + flatten_moras.push(mora.clone()); + } + if let Some(pause_mora) = accent_phrase.pause_mora() { + flatten_moras.push(pause_mora.clone()); + } + } + + flatten_moras + } + + fn to_phoneme_data_list>(phoneme_str_list: &[T]) -> Vec { + OjtPhoneme::convert( + phoneme_str_list + .iter() + .enumerate() + .map(|(i, s)| OjtPhoneme::new(s.as_ref().to_string(), i as f32, i as f32 + 1.)) + .collect::>() + .as_slice(), + ) + } +} + +fn split_mora(phoneme_list: 
&[OjtPhoneme]) -> (Vec, Vec, Vec) { + let mut vowel_indexes = Vec::new(); + for (i, phoneme) in phoneme_list.iter().enumerate() { + if MORA_PHONEME_LIST + .iter() + .any(|mora_phoneme| *mora_phoneme == phoneme.phoneme()) + { + vowel_indexes.push(i as i64); + } + } + + let vowel_phoneme_list = vowel_indexes + .iter() + .map(|vowel_index| phoneme_list[*vowel_index as usize].clone()) + .collect(); + + let mut consonant_phoneme_list = vec![OjtPhoneme::default()]; + for i in 0..(vowel_indexes.len() - 1) { + let prev = vowel_indexes[i]; + let next = vowel_indexes[i + 1]; + if next - prev == 1 { + consonant_phoneme_list.push(OjtPhoneme::default()); + } else { + consonant_phoneme_list.push(phoneme_list[next as usize - 1].clone()); + } + } + + return (consonant_phoneme_list, vowel_phoneme_list, vowel_indexes); + + const MORA_PHONEME_LIST: &[&str] = &[ + "a", "i", "u", "e", "o", "N", "A", "I", "U", "E", "O", "cl", "pau", + ]; +} + +fn mora_to_text(mora: impl AsRef) -> String { + let last_char = mora.as_ref().chars().last().unwrap(); + let mora = if ['A', 'I', 'U', 'E', 'O'].contains(&last_char) { + format!( + "{}{}", + &mora.as_ref()[0..mora.as_ref().len() - 1], + last_char.to_lowercase() + ) + } else { + mora.as_ref().to_string() + }; + // もしカタカナに変換できなければ、引数で与えた文字列がそのまま返ってくる + engine::mora2text(&mora).to_string() +} + impl AudioQueryModel { fn from_accent_phrases(accent_phrases: Vec) -> Self { let kana = create_kana(&accent_phrases); @@ -492,17 +1135,19 @@ impl AudioQueryModel { #[cfg(test)] mod tests { + use std::sync::Arc; use super::*; use crate::{engine::MoraModel, macros::tests::assert_debug_fmt_eq}; use ::test_util::OPEN_JTALK_DIC_DIR; + use ndarray::{array, s, Array}; #[rstest] #[case(Ok(()))] #[tokio::test] async fn load_model_works(#[case] expected_result_at_initialized: Result<()>) { let syntesizer = Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, ..Default::default() @@ -525,7 +1170,7 @@ mod tests { #[tokio::test] async fn is_use_gpu_works() { let syntesizer = Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, ..Default::default() @@ -541,7 +1186,7 @@ mod tests { async fn is_loaded_model_by_style_id_works(#[case] style_id: u32, #[case] expected: bool) { let style_id = StyleId::new(style_id); let syntesizer = Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, ..Default::default() @@ -569,7 +1214,7 @@ mod tests { #[tokio::test] async fn predict_duration_works() { let syntesizer = Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, ..Default::default() @@ -583,14 +1228,12 @@ mod tests { .unwrap(); // 「こんにちは、音声合成の世界へようこそ」という文章を変換して得た phoneme_vector - let phoneme_vector = [ + let phoneme_vector = array![ 0, 23, 30, 4, 28, 21, 10, 21, 42, 7, 0, 30, 4, 35, 14, 14, 16, 30, 30, 35, 14, 14, 28, 30, 35, 14, 23, 7, 21, 14, 43, 30, 30, 23, 30, 35, 30, 0, ]; - let result = syntesizer - .predict_duration(&phoneme_vector, StyleId::new(1)) - .await; + let result = syntesizer.predict_duration(phoneme_vector.clone(), StyleId::new(1)); assert!(result.is_ok(), "{result:?}"); assert_eq!(result.unwrap().len(), phoneme_vector.len()); @@ -600,7 +1243,7 @@ mod tests { #[tokio::test] async fn predict_intonation_works() { let syntesizer = Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), 
&InitializeOptions { acceleration_mode: AccelerationMode::Cpu, ..Default::default() @@ -613,35 +1256,32 @@ mod tests { .unwrap(); // 「テスト」という文章に対応する入力 - let vowel_phoneme_vector = [0, 14, 6, 30, 0]; - let consonant_phoneme_vector = [-1, 37, 35, 37, -1]; - let start_accent_vector = [0, 1, 0, 0, 0]; - let end_accent_vector = [0, 1, 0, 0, 0]; - let start_accent_phrase_vector = [0, 1, 0, 0, 0]; - let end_accent_phrase_vector = [0, 0, 0, 1, 0]; - - let result = syntesizer - .predict_intonation( - vowel_phoneme_vector.len(), - &vowel_phoneme_vector, - &consonant_phoneme_vector, - &start_accent_vector, - &end_accent_vector, - &start_accent_phrase_vector, - &end_accent_phrase_vector, - StyleId::new(1), - ) - .await; + let vowel_phoneme_vector = array![0, 14, 6, 30, 0]; + let consonant_phoneme_vector = array![-1, 37, 35, 37, -1]; + let start_accent_vector = array![0, 1, 0, 0, 0]; + let end_accent_vector = array![0, 1, 0, 0, 0]; + let start_accent_phrase_vector = array![0, 1, 0, 0, 0]; + let end_accent_phrase_vector = array![0, 0, 0, 1, 0]; + + let result = syntesizer.predict_intonation( + vowel_phoneme_vector, + consonant_phoneme_vector, + start_accent_vector, + end_accent_vector, + start_accent_phrase_vector, + end_accent_phrase_vector, + StyleId::new(1), + ); assert!(result.is_ok(), "{result:?}"); - assert_eq!(result.unwrap().len(), vowel_phoneme_vector.len()); + assert_eq!(result.unwrap().len(), 5); } #[rstest] #[tokio::test] async fn decode_works() { let syntesizer = Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, ..Default::default() @@ -655,15 +1295,14 @@ mod tests { // 「テスト」という文章に対応する入力 const F0_LENGTH: usize = 69; - let mut f0 = [0.; F0_LENGTH]; - f0[9..24].fill(5.905218); - f0[37..60].fill(5.565851); + let mut f0 = ndarray::arr1(&[0.; F0_LENGTH]); + f0.slice_mut(s!(9..24)).fill(5.905218); + f0.slice_mut(s!(37..60)).fill(5.565851); - const PHONEME_SIZE: usize = 45; - let mut phoneme = [0.; PHONEME_SIZE * F0_LENGTH]; + let mut phoneme = Array::from_shape_simple_fn((F0_LENGTH, OjtPhoneme::NUM_PHONEME), || 0.); let mut set_one = |index, range| { for i in range { - phoneme[i * PHONEME_SIZE + index] = 1.; + phoneme[(i, index)] = 1.; } }; set_one(0, 0..9); @@ -675,9 +1314,7 @@ mod tests { set_one(30, 45..60); set_one(0, 60..69); - let result = syntesizer - .decode(F0_LENGTH, PHONEME_SIZE, &f0, &phoneme, StyleId::new(1)) - .await; + let result = syntesizer.decode(f0.view(), phoneme.view(), StyleId::new(1)); assert!(result.is_ok(), "{result:?}"); assert_eq!(result.unwrap().len(), F0_LENGTH * 256); @@ -862,6 +1499,65 @@ mod tests { } } + #[rstest] + #[tokio::test] + async fn accent_phrases_works_for_japanese_periods_and_commas() { + let syntesizer = Synthesizer::new( + Arc::new(OpenJtalk::new(OPEN_JTALK_DIC_DIR).unwrap()), + &InitializeOptions { + acceleration_mode: AccelerationMode::Cpu, + ..Default::default() + }, + ) + .unwrap(); + + let model = &VoiceModel::sample().await.unwrap(); + syntesizer.load_voice_model(model).await.unwrap(); + + let accent_phrases = syntesizer + .create_accent_phrases("同じ、文章、です。完全に、同一です。", StyleId::new(1)) + .await + .unwrap(); + assert_eq!(accent_phrases.len(), 5); + + // 入力テキストに「、」や「。」などの句読点が含まれていたときに + // AccentPhraseModel の pause_mora に期待する値をテスト + + assert!( + accent_phrases[0].pause_mora().is_some(), + "accent_phrases[0].pause_mora() is None" + ); + assert!( + accent_phrases[1].pause_mora().is_some(), + "accent_phrases[1].pause_mora() is None" + ); + assert!( + 
accent_phrases[2].pause_mora().is_some(), + "accent_phrases[2].pause_mora() is None" + ); + assert!( + accent_phrases[3].pause_mora().is_some(), + "accent_phrases[3].pause_mora() is None" + ); + assert!( + accent_phrases[4].pause_mora().is_none(), // 文末の句読点は削除される + "accent_phrases[4].pause_mora() is not None" + ); + + for accent_phrase in accent_phrases.iter().take(4) { + let pause_mora = accent_phrase.pause_mora().clone().unwrap(); + assert_eq!(pause_mora.text(), "、"); + assert_eq!(pause_mora.consonant(), &None); + assert_eq!(pause_mora.consonant_length(), &None); + assert_eq!(pause_mora.vowel(), "pau"); + assert_ne!( + pause_mora.vowel_length(), + &0.0, + "pause_mora.vowel_length() should not be 0.0" + ); + } + } + #[rstest] #[tokio::test] async fn mora_length_works() { diff --git a/crates/voicevox_core_c_api/Cargo.toml b/crates/voicevox_core_c_api/Cargo.toml index 0567a0c9e..dbf4df4f9 100644 --- a/crates/voicevox_core_c_api/Cargo.toml +++ b/crates/voicevox_core_c_api/Cargo.toml @@ -24,6 +24,7 @@ derive-getters.workspace = true futures.workspace = true itertools.workspace = true libc = "0.2.134" +ndarray.workspace = true once_cell.workspace = true serde_json.workspace = true thiserror.workspace = true @@ -52,7 +53,6 @@ easy-ext.workspace = true inventory = "0.3.4" libloading = "0.7.3" libtest-mimic = "0.6.0" -ndarray.workspace = true ndarray-stats = "0.5.1" regex.workspace = true serde.workspace = true diff --git a/crates/voicevox_core_c_api/src/compatible_engine.rs b/crates/voicevox_core_c_api/src/compatible_engine.rs index dd8ce8e94..bd7715386 100644 --- a/crates/voicevox_core_c_api/src/compatible_engine.rs +++ b/crates/voicevox_core_c_api/src/compatible_engine.rs @@ -1,9 +1,10 @@ -use std::{collections::BTreeMap, sync::Arc}; +use std::collections::BTreeMap; use super::*; use libc::c_int; -use voicevox_core::{OpenJtalk, StyleId, VoiceModel}; +use ndarray::ArrayView; +use voicevox_core::{StyleId, VoiceModel, __internal::interp::PerformInference as _}; macro_rules! ensure_initialized { ($synthesizer:expr $(,)?) 
=> { @@ -88,10 +89,10 @@ fn voice_model_set() -> &'static VoiceModelSet { &VOICE_MODEL_SET } -static SYNTHESIZER: Lazy>> = +static SYNTHESIZER: Lazy>>> = Lazy::new(|| Mutex::new(None)); -fn lock_synthesizer() -> MutexGuard<'static, Option> { +fn lock_synthesizer() -> MutexGuard<'static, Option>> { SYNTHESIZER.lock().unwrap() } @@ -106,7 +107,7 @@ fn set_message(message: &str) { pub extern "C" fn initialize(use_gpu: bool, cpu_num_threads: c_int, load_all_models: bool) -> bool { let result = RUNTIME.block_on(async { let synthesizer = voicevox_core::Synthesizer::new( - Arc::new(OpenJtalk::new_without_dic()), + (), &voicevox_core::InitializeOptions { acceleration_mode: if use_gpu { voicevox_core::AccelerationMode::Gpu @@ -190,20 +191,21 @@ pub extern "C" fn supported_devices() -> *const c_char { } #[no_mangle] -pub extern "C" fn yukarin_s_forward( +pub unsafe extern "C" fn yukarin_s_forward( length: i64, phoneme_list: *mut i64, speaker_id: *mut i64, output: *mut f32, ) -> bool { + let length = length as usize; let synthesizer = &*lock_synthesizer(); - let result = RUNTIME.block_on(ensure_initialized!(synthesizer).predict_duration( - unsafe { std::slice::from_raw_parts_mut(phoneme_list, length as usize) }, + let result = ensure_initialized!(synthesizer).predict_duration( + ArrayView::from_shape_ptr((length,), phoneme_list).into_owned(), StyleId::new(unsafe { *speaker_id as u32 }), - )); + ); match result { Ok(output_vec) => { - let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length as usize) }; + let output_slice = std::slice::from_raw_parts_mut(output, length); output_slice.clone_from_slice(&output_vec); true } @@ -215,7 +217,7 @@ pub extern "C" fn yukarin_s_forward( } #[no_mangle] -pub extern "C" fn yukarin_sa_forward( +pub unsafe extern "C" fn yukarin_sa_forward( length: i64, vowel_phoneme_list: *mut i64, consonant_phoneme_list: *mut i64, @@ -226,20 +228,20 @@ pub extern "C" fn yukarin_sa_forward( speaker_id: *mut i64, output: *mut f32, ) -> bool { + let length = length as usize; let synthesizer = &*lock_synthesizer(); - let result = RUNTIME.block_on(ensure_initialized!(synthesizer).predict_intonation( - length as usize, - unsafe { std::slice::from_raw_parts(vowel_phoneme_list, length as usize) }, - unsafe { std::slice::from_raw_parts(consonant_phoneme_list, length as usize) }, - unsafe { std::slice::from_raw_parts(start_accent_list, length as usize) }, - unsafe { std::slice::from_raw_parts(end_accent_list, length as usize) }, - unsafe { std::slice::from_raw_parts(start_accent_phrase_list, length as usize) }, - unsafe { std::slice::from_raw_parts(end_accent_phrase_list, length as usize) }, + let result = ensure_initialized!(synthesizer).predict_intonation( + ArrayView::from_shape_ptr((length,), vowel_phoneme_list).into_owned(), + ArrayView::from_shape_ptr((length,), consonant_phoneme_list).into_owned(), + ArrayView::from_shape_ptr((length,), start_accent_list).into_owned(), + ArrayView::from_shape_ptr((length,), end_accent_list).into_owned(), + ArrayView::from_shape_ptr((length,), start_accent_phrase_list).into_owned(), + ArrayView::from_shape_ptr((length,), end_accent_phrase_list).into_owned(), StyleId::new(unsafe { *speaker_id as u32 }), - )); + ); match result { Ok(output_vec) => { - let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length as usize) }; + let output_slice = std::slice::from_raw_parts_mut(output, length); output_slice.clone_from_slice(&output_vec); true } @@ -251,7 +253,7 @@ pub extern "C" fn yukarin_sa_forward( } #[no_mangle] -pub extern 
"C" fn decode_forward( +pub unsafe extern "C" fn decode_forward( length: i64, phoneme_size: i64, f0: *mut f32, @@ -262,16 +264,14 @@ pub extern "C" fn decode_forward( let length = length as usize; let phoneme_size = phoneme_size as usize; let synthesizer = &*lock_synthesizer(); - let result = RUNTIME.block_on(ensure_initialized!(synthesizer).decode( - length, - phoneme_size, - unsafe { std::slice::from_raw_parts(f0, length) }, - unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) }, + let result = ensure_initialized!(synthesizer).decode( + ArrayView::from_shape_ptr((length,), f0), + ArrayView::from_shape_ptr((length, phoneme_size), phoneme), StyleId::new(unsafe { *speaker_id as u32 }), - )); + ); match result { Ok(output_vec) => { - let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length * 256) }; + let output_slice = std::slice::from_raw_parts_mut(output, length * 256); output_slice.clone_from_slice(&output_vec); true } diff --git a/crates/voicevox_core_c_api/src/helpers.rs b/crates/voicevox_core_c_api/src/helpers.rs index 698e89b45..9c582251f 100644 --- a/crates/voicevox_core_c_api/src/helpers.rs +++ b/crates/voicevox_core_c_api/src/helpers.rs @@ -29,7 +29,7 @@ pub(crate) fn into_result_code_with_error(result: CApiResult<()>) -> VoicevoxRes match result { Ok(()) => VOICEVOX_RESULT_OK, Err(RustApi(err)) => match err.kind() { - NotLoadedOpenjtalkDict => VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR, + LoadOpenjtalkSystemDic => VOICEVOX_RESULT_LOAD_OPENJTALK_SYSTEM_DIC, GpuSupport => VOICEVOX_RESULT_GPU_SUPPORT_ERROR, OpenZipFile => VOICEVOX_RESULT_OPEN_ZIP_FILE_ERROR, ReadZipEntry => VOICEVOX_RESULT_READ_ZIP_ENTRY_ERROR, diff --git a/crates/voicevox_core_c_api/src/lib.rs b/crates/voicevox_core_c_api/src/lib.rs index 302089a95..0e24b6d12 100644 --- a/crates/voicevox_core_c_api/src/lib.rs +++ b/crates/voicevox_core_c_api/src/lib.rs @@ -319,7 +319,7 @@ pub extern "C" fn voicevox_voice_model_delete(model: Box) { /// 構築(_construction_)は ::voicevox_synthesizer_new で行い、破棄(_destruction_)は ::voicevox_synthesizer_delete で行う。 #[derive(Getters)] pub struct VoicevoxSynthesizer { - synthesizer: Synthesizer, + synthesizer: Synthesizer>, } /// ::VoicevoxSynthesizer を構築(_construct_)する。 diff --git a/crates/voicevox_core_c_api/src/result_code.rs b/crates/voicevox_core_c_api/src/result_code.rs index 65236ada4..d77ffc405 100644 --- a/crates/voicevox_core_c_api/src/result_code.rs +++ b/crates/voicevox_core_c_api/src/result_code.rs @@ -11,8 +11,8 @@ pub enum VoicevoxResultCode { // 出力フォーマットを変更すればRustでよく使われているUpperCamelにできるが、実際に出力されるコードとの差異をできるだけ少なくするため /// 成功 VOICEVOX_RESULT_OK = 0, - /// open_jtalk辞書ファイルが読み込まれていない - VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR = 1, + /// Open JTalkのシステム辞書を読むことができなかった + VOICEVOX_RESULT_LOAD_OPENJTALK_SYSTEM_DIC = 1, /// サポートされているデバイス情報取得に失敗した VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR = 3, /// GPUモードがサポートされていない @@ -60,8 +60,8 @@ pub enum VoicevoxResultCode { pub(crate) const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static CStr { use VoicevoxResultCode::*; match result_code { - VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR => { - cstr!("OpenJTalkの辞書が読み込まれていません") + VOICEVOX_RESULT_LOAD_OPENJTALK_SYSTEM_DIC => { + cstr!("Open JTalkのシステム辞書を読むことができませんでした") } VOICEVOX_RESULT_GPU_SUPPORT_ERROR => cstr!("GPU機能をサポートすることができません"), VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR => { diff --git a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml index 8f3fa4f3b..b8792c29c 
100644 --- a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml +++ b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml @@ -50,7 +50,7 @@ stderr = "" [global_info] result_messages.0 = "エラーが発生しませんでした" -result_messages.1 = "OpenJTalkの辞書が読み込まれていません" +result_messages.1 = "Open JTalkのシステム辞書を読むことができませんでした" result_messages.3 = "サポートされているデバイス情報取得中にエラーが発生しました" result_messages.4 = "GPU機能をサポートすることができません" result_messages.6 = "指定されたIDに対するスタイルが見つかりませんでした。音声モデルが読み込まれていないか、読み込みが解除されています" diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/LoadOpenjtalkSystemDicException.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/LoadOpenjtalkSystemDicException.java new file mode 100644 index 000000000..ec1b92871 --- /dev/null +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/LoadOpenjtalkSystemDicException.java @@ -0,0 +1,12 @@ +package jp.hiroshiba.voicevoxcore.exceptions; + +/** Open JTalkのシステム辞書を読むことができなかった。 */ +public class LoadOpenjtalkSystemDicException extends IllegalStateException { + public LoadOpenjtalkSystemDicException(String message) { + super(message); + } + + public LoadOpenjtalkSystemDicException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/NotLoadedOpenjtalkDictException.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/NotLoadedOpenjtalkDictException.java deleted file mode 100644 index 3bee93b08..000000000 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/NotLoadedOpenjtalkDictException.java +++ /dev/null @@ -1,12 +0,0 @@ -package jp.hiroshiba.voicevoxcore.exceptions; - -/** open_jtalk辞書ファイルが読み込まれていない。 */ -public class NotLoadedOpenjtalkDictException extends IllegalStateException { - public NotLoadedOpenjtalkDictException(String message) { - super(message); - } - - public NotLoadedOpenjtalkDictException(String message, Throwable cause) { - super(message, cause); - } -} diff --git a/crates/voicevox_core_java_api/src/common.rs b/crates/voicevox_core_java_api/src/common.rs index 138676a1d..a135387dc 100644 --- a/crates/voicevox_core_java_api/src/common.rs +++ b/crates/voicevox_core_java_api/src/common.rs @@ -122,7 +122,7 @@ where } let class = class!( - NotLoadedOpenjtalkDict, + LoadOpenjtalkSystemDic, GpuSupport, OpenZipFile, ReadZipEntry, diff --git a/crates/voicevox_core_java_api/src/synthesizer.rs b/crates/voicevox_core_java_api/src/synthesizer.rs index 9d3cb9f9f..bb48f3c01 100644 --- a/crates/voicevox_core_java_api/src/synthesizer.rs +++ b/crates/voicevox_core_java_api/src/synthesizer.rs @@ -9,6 +9,7 @@ use jni::{ JNIEnv, }; use std::sync::Arc; +use voicevox_core::OpenJtalk; #[no_mangle] unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsNew<'local>( @@ -64,7 +65,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsIsGpuMode ) -> jboolean { throw_if_err(env, false, |env| { let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); Ok(internal.is_gpu_mode()) @@ -78,7 +81,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsGetMetasJ ) -> jobject { throw_if_err(env, std::ptr::null_mut(), |env| { let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? 
.clone(); let metas_json = serde_json::to_string(&internal.metas()).expect("should not fail"); @@ -100,7 +105,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsLoadVoice .get_rust_field::<_, _, Arc>(&model, "handle")? .clone(); let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); RUNTIME.block_on(internal.load_voice_model(&model))?; Ok(()) @@ -117,7 +124,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsUnloadVoi let model_id: String = env.get_string(&model_id)?.into(); let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); internal.unload_voice_model(&voicevox_core::VoiceModelId::new(model_id))?; @@ -138,7 +147,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsIsLoadedV let model_id: String = env.get_string(&model_id)?.into(); let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let is_loaded = internal.is_loaded_voice_model(&voicevox_core::VoiceModelId::new(model_id)); @@ -162,7 +173,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAudioQuer let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let audio_query = RUNTIME.block_on( @@ -189,7 +202,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAudioQuer let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let audio_query = @@ -217,7 +232,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAccentPhr let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let accent_phrases = RUNTIME.block_on( @@ -244,7 +261,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAccentPhr let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let accent_phrases = RUNTIME.block_on( @@ -273,7 +292,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplaceMo let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let replaced_accent_phrases = RUNTIME.block_on( @@ -303,7 +324,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplacePh let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let replaced_accent_phrases = { @@ -334,7 +357,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplaceMo let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let replaced_accent_phrases = RUNTIME.block_on( @@ -363,7 +388,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsSynthesis let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? 
+ .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let wave = { @@ -397,7 +424,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsTtsFromKa let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let wave = { @@ -431,7 +460,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsTts<'loca let style_id = style_id as u32; let internal = env - .get_rust_field::<_, _, Arc>(&this, "handle")? + .get_rust_field::<_, _, Arc>>>( + &this, "handle", + )? .clone(); let wave = { diff --git a/crates/voicevox_core_python_api/python/voicevox_core/__init__.py b/crates/voicevox_core_python_api/python/voicevox_core/__init__.py index fc09808bd..d1c0963ac 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/__init__.py +++ b/crates/voicevox_core_python_api/python/voicevox_core/__init__.py @@ -21,10 +21,10 @@ InferenceFailedError, InvalidModelDataError, InvalidWordError, + LoadOpenjtalkSystemDicError, LoadUserDictError, ModelAlreadyLoadedError, ModelNotFoundError, - NotLoadedOpenjtalkDictError, OpenJtalk, OpenZipFileError, ParseKanaError, @@ -52,11 +52,11 @@ "InferenceFailedError", "InvalidModelDataError", "InvalidWordError", + "LoadOpenjtalkSystemDicError", "LoadUserDictError", "ModelAlreadyLoadedError", "ModelNotFoundError", "Mora", - "NotLoadedOpenjtalkDictError", "OpenJtalk", "OpenZipFileError", "ParseKanaError", diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_rust.pyi b/crates/voicevox_core_python_api/python/voicevox_core/_rust.pyi index 5288fcbde..4df0ae9be 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/_rust.pyi +++ b/crates/voicevox_core_python_api/python/voicevox_core/_rust.pyi @@ -423,8 +423,8 @@ class UserDict: """ ... -class NotLoadedOpenjtalkDictError(Exception): - """open_jtalk辞書ファイルが読み込まれていない。""" +class LoadOpenjtalkSystemDicError(Exception): + """Open JTalkのシステム辞書を読むことができなかった。""" ... 
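
For reviewers who want to exercise the new surface by hand, a minimal sketch combining the two behavioural changes in this patch: the text-analyzer-less `Synthesizer<()>` with its synchronous, ndarray-based inference entry points (as used by the updated tests and `compatible_engine.rs`), and the error kind renamed from `NotLoadedOpenjtalkDict` to `LoadOpenjtalkSystemDic`. This is only a sketch, not an official example: it assumes a tokio runtime and `anyhow` for brevity, a hypothetical model path, that `VoiceModel::from_path` and `ErrorKind` are re-exported from the crate root, and that the `__internal::interp::PerformInference` trait (which the compatible engine imports) is acceptable to use in wrapper-level code.

use ndarray::array;
use voicevox_core::{
    AccelerationMode, ErrorKind, InitializeOptions, OpenJtalk, StyleId, Synthesizer, VoiceModel,
    __internal::interp::PerformInference as _,
};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // `()` now takes the place of the former `OpenJtalk::new_without_dic()`:
    // no text analysis, but the raw inference entry points remain usable.
    let synthesizer = Synthesizer::new(
        (),
        &InitializeOptions {
            acceleration_mode: AccelerationMode::Cpu,
            ..Default::default()
        },
    )?;
    let model = VoiceModel::from_path("path/to/model.vvm").await?; // hypothetical path
    synthesizer.load_voice_model(&model).await?;

    // `predict_duration` takes an `ndarray::Array1<i64>` instead of a slice and runs
    // synchronously; the output contains one duration per input phoneme ID.
    let phoneme_ids = array![0, 23, 30, 4, 28, 21, 0]; // illustrative phoneme IDs
    let durations = synthesizer.predict_duration(phoneme_ids, StyleId::new(1))?;
    assert_eq!(durations.len(), 7);

    // A failed system-dictionary load is now reported as `LoadOpenjtalkSystemDic`
    // (previously `NotLoadedOpenjtalkDict`), matching the renamed bindings above.
    if let Err(err) = OpenJtalk::new("path/to/missing/open_jtalk_dic") {
        match err.kind() {
            ErrorKind::LoadOpenjtalkSystemDic => {
                eprintln!("Open JTalk system dictionary could not be read: {err}");
            }
            _ => eprintln!("unexpected error: {err}"),
        }
    }
    Ok(())
}
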
diff --git a/crates/voicevox_core_python_api/src/convert.rs b/crates/voicevox_core_python_api/src/convert.rs index 7e8d437ad..3dc5116a7 100644 --- a/crates/voicevox_core_python_api/src/convert.rs +++ b/crates/voicevox_core_python_api/src/convert.rs @@ -15,8 +15,8 @@ use voicevox_core::{ use crate::{ ExtractFullContextLabelError, GetSupportedDevicesError, GpuSupportError, InferenceFailedError, - InvalidModelDataError, InvalidWordError, LoadUserDictError, ModelAlreadyLoadedError, - ModelNotFoundError, NotLoadedOpenjtalkDictError, OpenZipFileError, ParseKanaError, + InvalidModelDataError, InvalidWordError, LoadOpenjtalkSystemDicError, LoadUserDictError, + ModelAlreadyLoadedError, ModelNotFoundError, OpenZipFileError, ParseKanaError, ReadZipEntryError, SaveUserDictError, StyleAlreadyLoadedError, StyleNotFoundError, UseUserDictError, WordNotFoundError, }; @@ -158,7 +158,7 @@ pub impl voicevox_core::Result { self.map_err(|err| { let msg = err.to_string(); let top = match err.kind() { - ErrorKind::NotLoadedOpenjtalkDict => NotLoadedOpenjtalkDictError::new_err(msg), + ErrorKind::LoadOpenjtalkSystemDic => LoadOpenjtalkSystemDicError::new_err(msg), ErrorKind::GpuSupport => GpuSupportError::new_err(msg), ErrorKind::OpenZipFile => OpenZipFileError::new_err(msg), ErrorKind::ReadZipEntry => ReadZipEntryError::new_err(msg), diff --git a/crates/voicevox_core_python_api/src/lib.rs b/crates/voicevox_core_python_api/src/lib.rs index 1531b463e..f595474da 100644 --- a/crates/voicevox_core_python_api/src/lib.rs +++ b/crates/voicevox_core_python_api/src/lib.rs @@ -50,7 +50,7 @@ macro_rules! exceptions { } exceptions! { - NotLoadedOpenjtalkDictError: PyException; + LoadOpenjtalkSystemDicError: PyException; GpuSupportError: PyException; OpenZipFileError: PyException; ReadZipEntryError: PyException; @@ -140,7 +140,7 @@ impl OpenJtalk { #[pyclass] struct Synthesizer { - synthesizer: Closable, Self>, + synthesizer: Closable>>, Self>, } #[pymethods]