Skip to content

Commit 4547925

Browse files
authored
split decoder into spectrogram and vocoder without changing API (VOICEVOX#851)
この本文は @qryxip が記述している。ストリーミング処理を見据え、decoderからvocoderを切り離す。ただしこのPRではAPIは変えない。モデルとしては、`decode`を`generate_full_intermediate`と`render_audio_segment`に分離する。"audio"ではなく"wave"の方が適切かもしれないが、リリースするまでに考えることとする。 VOICEVOX#851 (review) Refs: Hiroshiba/vv_core_inference#28
1 parent 991fbc8 commit 4547925

File tree

10 files changed

+77
-30
lines changed

10 files changed

+77
-30
lines changed

crates/voicevox_core/src/infer/domains.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@ use educe::Educe;
44
use serde::{Deserialize, Deserializer};
55

66
pub(crate) use self::talk::{
7-
DecodeInput, DecodeOutput, PredictDurationInput, PredictDurationOutput, PredictIntonationInput,
8-
PredictIntonationOutput, TalkDomain, TalkOperation,
7+
GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput,
8+
PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput,
9+
RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation,
910
};
1011

1112
#[derive(Educe)]

crates/voicevox_core/src/infer/domains/talk.rs

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,16 @@ pub(crate) enum TalkOperation {
4141
PredictIntonation,
4242

4343
#[inference_operation(
44-
type Input = DecodeInput;
45-
type Output = DecodeOutput;
44+
type Input = GenerateFullIntermediateInput;
45+
type Output = GenerateFullIntermediateOutput;
4646
)]
47-
Decode,
47+
GenerateFullIntermediate,
48+
49+
#[inference_operation(
50+
type Input = RenderAudioSegmentInput;
51+
type Output = RenderAudioSegmentOutput;
52+
)]
53+
RenderAudioSegment,
4854
}
4955

5056
#[derive(InferenceInputSignature)]
@@ -83,15 +89,28 @@ pub(crate) struct PredictIntonationOutput {
8389

8490
#[derive(InferenceInputSignature)]
8591
#[inference_input_signature(
86-
type Signature = Decode;
92+
type Signature = GenerateFullIntermediate;
8793
)]
88-
pub(crate) struct DecodeInput {
94+
pub(crate) struct GenerateFullIntermediateInput {
8995
pub(crate) f0: Array2<f32>,
9096
pub(crate) phoneme: Array2<f32>,
9197
pub(crate) speaker_id: Array1<i64>,
9298
}
9399

94100
#[derive(InferenceOutputSignature)]
95-
pub(crate) struct DecodeOutput {
101+
pub(crate) struct GenerateFullIntermediateOutput {
102+
pub(crate) spec: Array2<f32>,
103+
}
104+
105+
#[derive(InferenceInputSignature)]
106+
#[inference_input_signature(
107+
type Signature = RenderAudioSegment;
108+
)]
109+
pub(crate) struct RenderAudioSegmentInput {
110+
pub(crate) spec: Array2<f32>,
111+
}
112+
113+
#[derive(InferenceOutputSignature)]
114+
pub(crate) struct RenderAudioSegmentOutput {
96115
pub(crate) wave: Array1<f32>,
97116
}

crates/voicevox_core/src/manifest.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,11 @@ pub(crate) struct TalkManifest {
9191
#[index_for_fields(TalkOperation::PredictIntonation)]
9292
pub(crate) predict_intonation_filename: Arc<str>,
9393

94-
#[index_for_fields(TalkOperation::Decode)]
95-
pub(crate) decode_filename: Arc<str>,
94+
#[index_for_fields(TalkOperation::GenerateFullIntermediate)]
95+
pub(crate) generate_full_intermediate_filename: Arc<str>,
96+
97+
#[index_for_fields(TalkOperation::RenderAudioSegment)]
98+
pub(crate) render_audio_segment_filename: Arc<str>,
9699

97100
#[serde(default)]
98101
pub(crate) style_id_to_inner_voice_id: StyleIdToInnerVoiceId,

crates/voicevox_core/src/status.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -375,8 +375,9 @@ mod tests {
375375
let session_options = InferenceDomainMap {
376376
talk: enum_map! {
377377
TalkOperation::PredictDuration
378-
| TalkOperation::PredictIntonation => light_session_options,
379-
TalkOperation::Decode => heavy_session_options,
378+
| TalkOperation::PredictIntonation
379+
| TalkOperation::GenerateFullIntermediate => light_session_options,
380+
TalkOperation::RenderAudioSegment => heavy_session_options,
380381
},
381382
};
382383
let status = Status::new(
@@ -392,9 +393,13 @@ mod tests {
392393
light_session_options,
393394
status.session_options.talk[TalkOperation::PredictIntonation],
394395
);
396+
assert_eq!(
397+
light_session_options,
398+
status.session_options.talk[TalkOperation::GenerateFullIntermediate],
399+
);
395400
assert_eq!(
396401
heavy_session_options,
397-
status.session_options.talk[TalkOperation::Decode],
402+
status.session_options.talk[TalkOperation::RenderAudioSegment],
398403
);
399404

400405
assert!(status.loaded_models.lock().unwrap().0.is_empty());

crates/voicevox_core/src/synthesizer.rs

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,10 @@ pub(crate) mod blocking {
9191
error::ErrorRepr,
9292
infer::{
9393
domains::{
94-
DecodeInput, DecodeOutput, InferenceDomainMap, PredictDurationInput,
95-
PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, TalkDomain,
96-
TalkOperation,
94+
GenerateFullIntermediateInput, GenerateFullIntermediateOutput, InferenceDomainMap,
95+
PredictDurationInput, PredictDurationOutput, PredictIntonationInput,
96+
PredictIntonationOutput, RenderAudioSegmentInput, RenderAudioSegmentOutput,
97+
TalkDomain, TalkOperation,
9798
},
9899
InferenceRuntime as _, InferenceSessionOptions,
99100
},
@@ -204,8 +205,9 @@ pub(crate) mod blocking {
204205
InferenceDomainMap {
205206
talk: enum_map! {
206207
TalkOperation::PredictDuration
207-
| TalkOperation::PredictIntonation => light_session_options,
208-
TalkOperation::Decode => heavy_session_options,
208+
| TalkOperation::PredictIntonation
209+
| TalkOperation::GenerateFullIntermediate => light_session_options,
210+
TalkOperation::RenderAudioSegment => heavy_session_options,
209211
},
210212
},
211213
);
@@ -935,9 +937,9 @@ pub(crate) mod blocking {
935937
padding_size,
936938
);
937939

938-
let DecodeOutput { wave: output } = self.status.run_session(
940+
let GenerateFullIntermediateOutput { spec } = self.status.run_session(
939941
model_id,
940-
DecodeInput {
942+
GenerateFullIntermediateInput {
941943
f0: ndarray::arr1(&f0_with_padding)
942944
.into_shape([length_with_padding, 1])
943945
.unwrap(),
@@ -948,6 +950,10 @@ pub(crate) mod blocking {
948950
},
949951
)?;
950952

953+
let RenderAudioSegmentOutput { wave: output } = self
954+
.status
955+
.run_session(model_id, RenderAudioSegmentInput { spec })?;
956+
951957
return Ok(trim_padding_from_output(
952958
output.into_raw_vec(),
953959
padding_size,

crates/voicevox_core/src/voice_model.rs

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,11 @@ impl<A: Async> Inner<A> {
145145
TalkOperation::PredictIntonation => {
146146
find_entry_index(&manifest.predict_intonation_filename)?
147147
}
148-
TalkOperation::Decode => {
149-
find_entry_index(&manifest.decode_filename)?
148+
TalkOperation::GenerateFullIntermediate => {
149+
find_entry_index(&manifest.generate_full_intermediate_filename)?
150+
}
151+
TalkOperation::RenderAudioSegment => {
152+
find_entry_index(&manifest.render_audio_segment_filename)?
150153
}
151154
};
152155

@@ -232,14 +235,20 @@ impl<A: Async> Inner<A> {
232235

233236
let talk = OptionFuture::from(talk.map(
234237
|(entries, style_id_to_inner_voice_id)| async move {
235-
let [predict_duration, predict_intonation, decode] = entries.into_array();
238+
let [predict_duration, predict_intonation, predict_spectrogram, run_vocoder] =
239+
entries.into_array();
236240

237241
let predict_duration = read_file!(predict_duration);
238242
let predict_intonation = read_file!(predict_intonation);
239-
let decode = read_file!(decode);
240-
241-
let model_bytes =
242-
EnumMap::from_array([predict_duration, predict_intonation, decode]);
243+
let predict_spectrogram = read_file!(predict_spectrogram);
244+
let run_vocoder = read_file!(run_vocoder);
245+
246+
let model_bytes = EnumMap::from_array([
247+
predict_duration,
248+
predict_intonation,
249+
predict_spectrogram,
250+
run_vocoder,
251+
]);
243252

244253
Ok((style_id_to_inner_voice_id, model_bytes))
245254
},

crates/voicevox_core_macros/src/lib.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,11 @@ pub fn derive_inference_output_signature(
119119
/// #[index_for_fields(TalkOperation::PredictIntonation)]
120120
/// pub(crate) predict_intonation_filename: Arc<str>,
121121
///
122-
/// #[index_for_fields(TalkOperation::Decode)]
123-
/// pub(crate) decode_filename: Arc<str>,
122+
/// #[index_for_fields(TalkOperation::GenerateFullIntermediate)]
123+
/// pub(crate) generate_full_intermediate_filename: Arc<str>,
124+
///
125+
/// #[index_for_fields(TalkOperation::RenderAudioSegment)]
126+
/// pub(crate) render_audio_segment_filename: Arc<str>,
124127
///
125128
/// // …
126129
/// }

model/sample.vvm/manifest.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
"talk": {
66
"predict_duration_filename": "predict_duration.onnx",
77
"predict_intonation_filename": "predict_intonation.onnx",
8-
"decode_filename": "decode.onnx",
8+
"generate_full_intermediate_filename": "predict_spectrogram.onnx",
9+
"render_audio_segment_filename": "vocoder.onnx",
910
"style_id_to_inner_voice_id": {
1011
"302": 2,
1112
"303": 3
1.35 MB
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)