styleIdとsession.runに渡す数値が異なっているVVMでも音声合成できるようにする (#551)

Co-authored-by: Ryo Yamashita <qryxip@gmail.com>
VOICEVOX · Aug 3, 2023 · e0d32a5
1 parent f2b66ec
commit e0d32a5
Show file tree

Hide file tree

Showing 13 changed files with 157 additions and 86 deletions.
diff --git a/crates/voicevox_core/src/error.rs b/crates/voicevox_core/src/error.rs
@@ -40,11 +40,14 @@ pub enum Error {
         source: anyhow::Error,
     },
 
-    #[error("{},{filename}", base_error_message(VOICEVOX_VVM_MODEL_READ_ERROR))]
+    #[error(
+        "{}({path}):{source}",
+        base_error_message(VOICEVOX_VVM_MODEL_READ_ERROR)
+    )]
     VvmRead {
-        filename: String,
+        path: PathBuf,
         #[source]
-        source: Option<anyhow::Error>,
+        source: anyhow::Error,
     },
 
     #[error("{},{0}", base_error_message(VOICEVOX_RESULT_LOAD_METAS_ERROR))]
@@ -63,10 +66,10 @@ pub enum Error {
     InvalidStyleId { style_id: StyleId },
 
     #[error(
-        "{}: {model_index}",
-        base_error_message(VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR)
+        "{}: {model_id:?}",
+        base_error_message(VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR)
     )]
-    InvalidModelIndex { model_index: usize },
+    InvalidModelId { model_id: VoiceModelId },
 
     #[error("{}", base_error_message(VOICEVOX_RESULT_INFERENCE_ERROR))]
     InferenceFailed,

diff --git a/crates/voicevox_core/src/inference_core.rs b/crates/voicevox_core/src/inference_core.rs
@@ -71,15 +71,21 @@ impl InferenceCore {
             return Err(Error::InvalidStyleId { style_id });
         }
 
+        let (model_id, model_inner_id) = self
+            .status
+            .id_relations
+            .get(&style_id)
+            .ok_or(Error::InvalidStyleId { style_id })?;
+
         let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector));
-        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));
 
         let input_tensors: Vec<&mut dyn AnyArray> =
             vec![&mut phoneme_vector_array, &mut speaker_id_array];
 
         let mut output = self
             .status
-            .predict_duration_session_run(style_id, input_tensors)?;
+            .predict_duration_session_run(model_id, input_tensors)?;
 
         for output_item in output.iter_mut() {
             if *output_item < PHONEME_LENGTH_MINIMAL {
@@ -106,6 +112,12 @@ impl InferenceCore {
             return Err(Error::InvalidStyleId { style_id });
         }
 
+        let (model_id, model_inner_id) = self
+            .status
+            .id_relations
+            .get(&style_id)
+            .ok_or(Error::InvalidStyleId { style_id })?;
+
         let mut length_array = NdArray::new(ndarray::arr0(length as i64));
         let mut vowel_phoneme_vector_array = NdArray::new(ndarray::arr1(vowel_phoneme_vector));
         let mut consonant_phoneme_vector_array =
@@ -116,7 +128,7 @@ impl InferenceCore {
             NdArray::new(ndarray::arr1(start_accent_phrase_vector));
         let mut end_accent_phrase_vector_array =
             NdArray::new(ndarray::arr1(end_accent_phrase_vector));
-        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));
 
         let input_tensors: Vec<&mut dyn AnyArray> = vec![
             &mut length_array,
@@ -130,7 +142,7 @@ impl InferenceCore {
         ];
 
         self.status
-            .predict_intonation_session_run(style_id, input_tensors)
+            .predict_intonation_session_run(model_id, input_tensors)
     }
 
     pub async fn decode(
@@ -145,6 +157,12 @@ impl InferenceCore {
             return Err(Error::InvalidStyleId { style_id });
         }
 
+        let (model_id, model_inner_id) = self
+            .status
+            .id_relations
+            .get(&style_id)
+            .ok_or(Error::InvalidStyleId { style_id })?;
+
         // 音が途切れてしまうのを避けるworkaround処理が入っている
         // TODO: 改善したらここのpadding処理を取り除く
         const PADDING_SIZE: f64 = 0.4;
@@ -171,13 +189,13 @@ impl InferenceCore {
                 .into_shape([length_with_padding, phoneme_size])
                 .unwrap(),
         );
-        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));
 
         let input_tensors: Vec<&mut dyn AnyArray> =
             vec![&mut f0_array, &mut phoneme_array, &mut speaker_id_array];
 
         self.status
-            .decode_session_run(style_id, input_tensors)
+            .decode_session_run(model_id, input_tensors)
             .map(|output| Self::trim_padding_from_output(output, padding_size))
     }
 

diff --git a/crates/voicevox_core/src/manifest.rs b/crates/voicevox_core/src/manifest.rs
@@ -1,8 +1,10 @@
-use std::fmt::Display;
+use std::{collections::BTreeMap, fmt::Display};
 
 use derive_getters::Getters;
 use derive_new::new;
-use serde::Deserialize;
+use serde::{Deserialize, Serialize};
+
+use super::*;
 
 pub type RawManifestVersion = String;
 #[derive(Deserialize, Clone, Debug, PartialEq, new)]
@@ -20,11 +22,31 @@ impl Display for ManifestVersion {
     }
 }
 
+/// モデル内IDの実体
+pub type RawModelInnerId = u32;
+/// モデル内ID
+#[derive(PartialEq, Eq, Clone, Copy, Ord, PartialOrd, Deserialize, Serialize, new, Debug)]
+pub struct ModelInnerId(RawModelInnerId);
+
+impl ModelInnerId {
+    pub fn raw_id(self) -> RawModelInnerId {
+        self.0
+    }
+}
+
+impl Display for ModelInnerId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.raw_id())
+    }
+}
+
 #[derive(Deserialize, Getters, Clone)]
 pub struct Manifest {
     manifest_version: ManifestVersion,
     metas_filename: String,
     decode_filename: String,
     predict_duration_filename: String,
     predict_intonation_filename: String,
+    #[serde(default)]
+    style_id_to_model_inner_id: BTreeMap<StyleId, ModelInnerId>,
 }
diff --git a/crates/voicevox_core/src/result_code.rs b/crates/voicevox_core/src/result_code.rs
@@ -21,8 +21,8 @@ pub enum VoicevoxResultCode {
     VOICEVOX_RESULT_LOAD_METAS_ERROR = 5,
     /// 無効なstyle_idが指定された
     VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR = 6,
-    /// 無効なmodel_indexが指定された
-    VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR = 7,
+    /// 無効なmodel_idが指定された
+    VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR = 7,
     /// 推論に失敗した
     VOICEVOX_RESULT_INFERENCE_ERROR = 8,
     /// コンテキストラベル出力に失敗した
@@ -74,7 +74,7 @@ pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'stati
 
         VOICEVOX_RESULT_OK => "エラーが発生しませんでした\0",
         VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR => "無効なspeaker_idです\0",
-        VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR => "無効なmodel_indexです\0",
+        VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR => "無効なmodel_idです\0",
         VOICEVOX_RESULT_INFERENCE_ERROR => "推論に失敗しました\0",
         VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR => {
             "入力テキストからのフルコンテキストラベル抽出に失敗しました\0"

diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs
@@ -23,7 +23,7 @@ pub struct Status {
     merged_metas: VoiceModelMeta,
     light_session_options: SessionOptions, // 軽いモデルはこちらを使う
     heavy_session_options: SessionOptions, // 重いモデルはこちらを使う
-    id_relations: BTreeMap<StyleId, VoiceModelId>,
+    pub id_relations: BTreeMap<StyleId, (VoiceModelId, ModelInnerId)>, // FIXME: pubはやめたい
 }
 
 struct StatusModels {
@@ -113,7 +113,10 @@ impl Status {
 
         for speaker in model.metas().iter() {
             for style in speaker.styles().iter() {
-                self.id_relations.insert(*style.id(), model.id().clone());
+                self.id_relations.insert(
+                    *style.id(),
+                    (model.id().clone(), model.model_inner_id_for(*style.id())),
+                );
             }
         }
         self.set_metas();
@@ -141,7 +144,7 @@ impl Status {
             let remove_style_ids = self
                 .id_relations
                 .iter()
-                .filter(|&(_, loaded_model_id)| loaded_model_id == voice_model_id)
+                .filter(|&(_, (loaded_model_id, _))| loaded_model_id == voice_model_id)
                 .map(|(&style_id, _)| style_id)
                 .collect::<Vec<_>>();
 
@@ -228,61 +231,55 @@ impl Status {
 
     pub fn predict_duration_session_run(
         &self,
-        style_id: StyleId,
+        model_id: &VoiceModelId,
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model_id) = self.id_relations.get(&style_id) {
-            if let Some(model) = self.models.predict_duration.get(model_id) {
-                if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
-                    Ok(output_tensors[0].as_slice().unwrap().to_owned())
-                } else {
-                    Err(Error::InferenceFailed)
-                }
+        if let Some(model) = self.models.predict_duration.get(model_id) {
+            if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned())
             } else {
-                Err(Error::InvalidStyleId { style_id })
+                Err(Error::InferenceFailed)
             }
         } else {
-            Err(Error::InvalidStyleId { style_id })
+            Err(Error::InvalidModelId {
+                model_id: model_id.clone(),
+            })
         }
     }
 
     pub fn predict_intonation_session_run(
         &self,
-        style_id: StyleId,
+        model_id: &VoiceModelId,
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model_id) = self.id_relations.get(&style_id) {
-            if let Some(model) = self.models.predict_intonation.get(model_id) {
-                if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
-                    Ok(output_tensors[0].as_slice().unwrap().to_owned())
-                } else {
-                    Err(Error::InferenceFailed)
-                }
+        if let Some(model) = self.models.predict_intonation.get(model_id) {
+            if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned())
             } else {
-                Err(Error::InvalidStyleId { style_id })
+                Err(Error::InferenceFailed)
             }
         } else {
-            Err(Error::InvalidStyleId { style_id })
+            Err(Error::InvalidModelId {
+                model_id: model_id.clone(),
+            })
         }
     }
 
     pub fn decode_session_run(
         &self,
-        style_id: StyleId,
+        model_id: &VoiceModelId,
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model_id) = self.id_relations.get(&style_id) {
-            if let Some(model) = self.models.decode.get(model_id) {
-                if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
-                    Ok(output_tensors[0].as_slice().unwrap().to_owned())
-                } else {
-                    Err(Error::InferenceFailed)
-                }
+        if let Some(model) = self.models.decode.get(model_id) {
+            if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned())
             } else {
-                Err(Error::InvalidStyleId { style_id })
+                Err(Error::InferenceFailed)
             }
         } else {
-            Err(Error::InvalidStyleId { style_id })
+            Err(Error::InvalidModelId {
+                model_id: model_id.clone(),
+            })
         }
     }
 }

diff --git a/crates/voicevox_core/src/test_data/model_sources/load_model_works1/manifest.json b/crates/voicevox_core/src/test_data/model_sources/load_model_works1/manifest.json
@@ -3,5 +3,9 @@
   "metas_filename": "metas.json",
   "decode_filename": "decode.onnx",
   "predict_duration_filename": "predict_duration.onnx",
-  "predict_intonation_filename": "predict_intonation.onnx"
+  "predict_intonation_filename": "predict_intonation.onnx",
+  "style_id_to_model_inner_id": {
+    "302": 2,
+    "303": 3
+  }
 }
diff --git a/crates/voicevox_core/src/test_data/model_sources/load_model_works1/metas.json b/crates/voicevox_core/src/test_data/model_sources/load_model_works1/metas.json
@@ -26,11 +26,11 @@
     "styles": [
       {
         "name": "style3-1",
-        "id": 2
+        "id": 302
       },
       {
         "name": "style3-2",
-        "id": 3
+        "id": 303
       }
     ],
     "speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3",