split decoder into spectrogram and vocoder without changing API (VOICEVOX#851)

Yosshi999 · web-flow · commit 4547925818ae · 2024-10-13T04:39:35.000+09:00
この本文は @qryxip が記述している。 ストリーミング処理を見据え、decoderからvocoderを切り離す。ただしこのPRではAPIは変えない。 モデルとしては、`decode`を`generate_full_intermediate`と`render_audio_segment`に分離する。"audio"ではなく"wave"の方が適切かもしれないが、リリースするまでに考えることとする。 VOICEVOX#851 (review) Refs: Hiroshiba/vv_core_inference#28
diff --git a/crates/voicevox_core/src/infer/domains.rs b/crates/voicevox_core/src/infer/domains.rs
@@ -4,8 +4,9 @@ use educe::Educe;
 use serde::{Deserialize, Deserializer};
 
 pub(crate) use self::talk::{
-    DecodeInput, DecodeOutput, PredictDurationInput, PredictDurationOutput, PredictIntonationInput,
-    PredictIntonationOutput, TalkDomain, TalkOperation,
+    GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput,
+    PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput,
+    RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation,
 };
 
 #[derive(Educe)]
diff --git a/crates/voicevox_core/src/infer/domains/talk.rs b/crates/voicevox_core/src/infer/domains/talk.rs
@@ -41,10 +41,16 @@ pub(crate) enum TalkOperation {
     PredictIntonation,
 
     #[inference_operation(
-        type Input = DecodeInput;
-        type Output = DecodeOutput;
+        type Input = GenerateFullIntermediateInput;
+        type Output = GenerateFullIntermediateOutput;
     )]
-    Decode,
+    GenerateFullIntermediate,
+
+    #[inference_operation(
+        type Input = RenderAudioSegmentInput;
+        type Output = RenderAudioSegmentOutput;
+    )]
+    RenderAudioSegment,
 }
 
 #[derive(InferenceInputSignature)]
@@ -83,15 +89,28 @@ pub(crate) struct PredictIntonationOutput {
 
 #[derive(InferenceInputSignature)]
 #[inference_input_signature(
-    type Signature = Decode;
+    type Signature = GenerateFullIntermediate;
 )]
-pub(crate) struct DecodeInput {
+pub(crate) struct GenerateFullIntermediateInput {
     pub(crate) f0: Array2<f32>,
     pub(crate) phoneme: Array2<f32>,
     pub(crate) speaker_id: Array1<i64>,
 }
 
 #[derive(InferenceOutputSignature)]
-pub(crate) struct DecodeOutput {
+pub(crate) struct GenerateFullIntermediateOutput {
+    pub(crate) spec: Array2<f32>,
+}
+
+#[derive(InferenceInputSignature)]
+#[inference_input_signature(
+    type Signature = RenderAudioSegment;
+)]
+pub(crate) struct RenderAudioSegmentInput {
+    pub(crate) spec: Array2<f32>,
+}
+
+#[derive(InferenceOutputSignature)]
+pub(crate) struct RenderAudioSegmentOutput {
     pub(crate) wave: Array1<f32>,
 }
diff --git a/crates/voicevox_core/src/manifest.rs b/crates/voicevox_core/src/manifest.rs
@@ -91,8 +91,11 @@ pub(crate) struct TalkManifest {
     #[index_for_fields(TalkOperation::PredictIntonation)]
     pub(crate) predict_intonation_filename: Arc<str>,
 
-    #[index_for_fields(TalkOperation::Decode)]
-    pub(crate) decode_filename: Arc<str>,
+    #[index_for_fields(TalkOperation::GenerateFullIntermediate)]
+    pub(crate) generate_full_intermediate_filename: Arc<str>,
+
+    #[index_for_fields(TalkOperation::RenderAudioSegment)]
+    pub(crate) render_audio_segment_filename: Arc<str>,
 
     #[serde(default)]
     pub(crate) style_id_to_inner_voice_id: StyleIdToInnerVoiceId,
diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs
@@ -375,8 +375,9 @@ mod tests {
         let session_options = InferenceDomainMap {
             talk: enum_map! {
                 TalkOperation::PredictDuration
-                | TalkOperation::PredictIntonation => light_session_options,
-                TalkOperation::Decode => heavy_session_options,
+                | TalkOperation::PredictIntonation
+                | TalkOperation::GenerateFullIntermediate => light_session_options,
+                TalkOperation::RenderAudioSegment => heavy_session_options,
             },
         };
         let status = Status::new(
@@ -392,9 +393,13 @@ mod tests {
             light_session_options,
             status.session_options.talk[TalkOperation::PredictIntonation],
         );
+        assert_eq!(
+            light_session_options,
+            status.session_options.talk[TalkOperation::GenerateFullIntermediate],
+        );
         assert_eq!(
             heavy_session_options,
-            status.session_options.talk[TalkOperation::Decode],
+            status.session_options.talk[TalkOperation::RenderAudioSegment],
         );
 
         assert!(status.loaded_models.lock().unwrap().0.is_empty());
diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs
@@ -91,9 +91,10 @@ pub(crate) mod blocking {
         error::ErrorRepr,
         infer::{
             domains::{
-                DecodeInput, DecodeOutput, InferenceDomainMap, PredictDurationInput,
-                PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, TalkDomain,
-                TalkOperation,
+                GenerateFullIntermediateInput, GenerateFullIntermediateOutput, InferenceDomainMap,
+                PredictDurationInput, PredictDurationOutput, PredictIntonationInput,
+                PredictIntonationOutput, RenderAudioSegmentInput, RenderAudioSegmentOutput,
+                TalkDomain, TalkOperation,
             },
             InferenceRuntime as _, InferenceSessionOptions,
         },
@@ -204,8 +205,9 @@ pub(crate) mod blocking {
                 InferenceDomainMap {
                     talk: enum_map! {
                         TalkOperation::PredictDuration
-                        | TalkOperation::PredictIntonation => light_session_options,
-                        TalkOperation::Decode => heavy_session_options,
+                        | TalkOperation::PredictIntonation
+                        | TalkOperation::GenerateFullIntermediate => light_session_options,
+                        TalkOperation::RenderAudioSegment => heavy_session_options,
                     },
                 },
             );
@@ -935,9 +937,9 @@ pub(crate) mod blocking {
                 padding_size,
             );
 
-            let DecodeOutput { wave: output } = self.status.run_session(
+            let GenerateFullIntermediateOutput { spec } = self.status.run_session(
                 model_id,
-                DecodeInput {
+                GenerateFullIntermediateInput {
                     f0: ndarray::arr1(&f0_with_padding)
                         .into_shape([length_with_padding, 1])
                         .unwrap(),
@@ -948,6 +950,10 @@ pub(crate) mod blocking {
                 },
             )?;
 
+            let RenderAudioSegmentOutput { wave: output } = self
+                .status
+                .run_session(model_id, RenderAudioSegmentInput { spec })?;
+
             return Ok(trim_padding_from_output(
                 output.into_raw_vec(),
                 padding_size,
diff --git a/crates/voicevox_core/src/voice_model.rs b/crates/voicevox_core/src/voice_model.rs
@@ -145,8 +145,11 @@ impl<A: Async> Inner<A> {
                                         TalkOperation::PredictIntonation => {
                                             find_entry_index(&manifest.predict_intonation_filename)?
                                         }
-                                        TalkOperation::Decode => {
-                                            find_entry_index(&manifest.decode_filename)?
+                                        TalkOperation::GenerateFullIntermediate => {
+                                            find_entry_index(&manifest.generate_full_intermediate_filename)?
+                                        }
+                                        TalkOperation::RenderAudioSegment => {
+                                            find_entry_index(&manifest.render_audio_segment_filename)?
                                         }
                                     };
 
@@ -232,14 +235,20 @@ impl<A: Async> Inner<A> {
 
         let talk = OptionFuture::from(talk.map(
             |(entries, style_id_to_inner_voice_id)| async move {
-                let [predict_duration, predict_intonation, decode] = entries.into_array();
+                let [predict_duration, predict_intonation, predict_spectrogram, run_vocoder] =
+                    entries.into_array();
 
                 let predict_duration = read_file!(predict_duration);
                 let predict_intonation = read_file!(predict_intonation);
-                let decode = read_file!(decode);
-
-                let model_bytes =
-                    EnumMap::from_array([predict_duration, predict_intonation, decode]);
+                let predict_spectrogram = read_file!(predict_spectrogram);
+                let run_vocoder = read_file!(run_vocoder);
+
+                let model_bytes = EnumMap::from_array([
+                    predict_duration,
+                    predict_intonation,
+                    predict_spectrogram,
+                    run_vocoder,
+                ]);
 
                 Ok((style_id_to_inner_voice_id, model_bytes))
             },
diff --git a/crates/voicevox_core_macros/src/lib.rs b/crates/voicevox_core_macros/src/lib.rs
@@ -119,8 +119,11 @@ pub fn derive_inference_output_signature(
 ///     #[index_for_fields(TalkOperation::PredictIntonation)]
 ///     pub(crate) predict_intonation_filename: Arc<str>,
 ///
-///     #[index_for_fields(TalkOperation::Decode)]
-///     pub(crate) decode_filename: Arc<str>,
+///     #[index_for_fields(TalkOperation::GenerateFullIntermediate)]
+///     pub(crate) generate_full_intermediate_filename: Arc<str>,
+///
+///     #[index_for_fields(TalkOperation::RenderAudioSegment)]
+///     pub(crate) render_audio_segment_filename: Arc<str>,
 ///
 ///     // …
 /// }
diff --git a/model/sample.vvm/manifest.json b/model/sample.vvm/manifest.json
@@ -5,7 +5,8 @@
   "talk": {
     "predict_duration_filename": "predict_duration.onnx",
     "predict_intonation_filename": "predict_intonation.onnx",
-    "decode_filename": "decode.onnx",
+    "generate_full_intermediate_filename": "predict_spectrogram.onnx",
+    "render_audio_segment_filename": "vocoder.onnx",
     "style_id_to_inner_voice_id": {
       "302": 2,
       "303": 3
diff --git a/model/sample.vvm/predict_spectrogram.onnx b/model/sample.vvm/predict_spectrogram.onnx
diff --git a/model/sample.vvm/vocoder.onnx b/model/sample.vvm/vocoder.onnx