Skip to content

Commit a5745c2

Browse files
authored
Streaming decoder for compatible engine (#875)
この本文は @qryxip が記述している。 ストリーミング処理を`compatible_engine`に実装する。testcaseとして一括変換・ストリーム変換の二つの生成結果を追加し、元の生成音声と比較して十分近いことを確かめた。 `render_audio_segment`には将来の互換性のため、未使用引数`int64_t margin_width`を入れる。 #875 (comment) またRust APIの出力サイズをチェックしてパニックする仕組みも入れる。これについては他の関数にも後で導入することとする。 #875 (comment) Refs: #866
1 parent c3c0580 commit a5745c2

File tree

6 files changed

+228
-1
lines changed

6 files changed

+228
-1
lines changed

crates/test_util/build.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,35 @@ fn generate_example_data_json(dist: &Path) -> anyhow::Result<()> {
202202
phoneme.to_vec()
203203
},
204204
},
205+
intermediate: typing::IntermediateExampleData {
206+
f0_length: 69,
207+
phoneme_size: 45,
208+
feature_dim: 80,
209+
margin_width: 14,
210+
f0_vector: {
211+
let mut f0 = [0.; 69];
212+
f0[9..24].fill(5.905218);
213+
f0[37..60].fill(5.565851);
214+
f0.to_vec()
215+
},
216+
phoneme_vector: {
217+
let mut phoneme = [0.; 45 * 69];
218+
let mut set_one = |index, range| {
219+
for i in range {
220+
phoneme[(i * 45 + index) as usize] = 1.;
221+
}
222+
};
223+
set_one(0, 0..9);
224+
set_one(37, 9..13);
225+
set_one(14, 13..24);
226+
set_one(35, 24..30);
227+
set_one(6, 30..37);
228+
set_one(37, 37..45);
229+
set_one(30, 45..60);
230+
set_one(0, 60..69);
231+
phoneme.to_vec()
232+
},
233+
},
205234
};
206235

207236
fs_err::write(

crates/test_util/compatible_engine.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,11 @@ bool yukarin_sa_forward(int64_t length, int64_t *vowel_phoneme_list,
2525
bool decode_forward(int64_t length, int64_t phoneme_size, float *f0,
2626
float *phoneme, int64_t *speaker_id, float *output);
2727

28+
/* Generates the full intermediate audio feature for the given f0 / phoneme input. */
/* `f0` holds `length` floats and `phoneme` holds `phoneme_size * length` floats. */
/* `output` must have room for `(length + 2 * 14) * 80` floats. */
/* Returns false on error (message presumably available via last_error_message() - confirm). */
bool generate_full_intermediate(int64_t length, int64_t phoneme_size,
29+
float *f0, float *phoneme, int64_t *speaker_id,
30+
float *output);
31+
32+
/* Renders audio from `length` rows of `feature_size` floats in `audio_feature`. */
/* `output` must have room for `length * 256` floats. */
/* `margin_width` is currently unused; it is accepted for future compatibility. */
/* Returns false on error (message presumably available via last_error_message() - confirm). */
bool render_audio_segment(int64_t length, int64_t margin_width, int64_t feature_size,
33+
float *audio_feature, int64_t *speaker_id, float *output);
34+
2835
const char *last_error_message();

crates/test_util/src/typing.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,22 @@ pub struct DecodeExampleData {
3131
pub phoneme_vector: Vec<f32>,
3232
}
3333

34+
/// Example input for the intermediate-feature (`generate_full_intermediate` /
/// `render_audio_segment`) test case of the compatible engine.
#[derive(Debug, Serialize, Deserialize)]
35+
pub struct IntermediateExampleData {
36+
pub f0_length: i64, // passed as `length` to `generate_full_intermediate`
37+
pub phoneme_size: i64, // passed as `phoneme_size` to `generate_full_intermediate`
38+
pub feature_dim: i64, // passed as `feature_size` to `render_audio_segment`
39+
pub margin_width: i64, // passed as `margin_width` to `render_audio_segment`
40+
pub f0_vector: Vec<f32>, // passed as `f0`; presumably `f0_length` elements — confirm against build.rs
41+
pub phoneme_vector: Vec<f32>, // passed as `phoneme`; presumably `phoneme_size * f0_length` elements — confirm
42+
}
43+
3444
#[derive(Debug, Serialize, Deserialize)]
3545
pub struct ExampleData {
3646
pub speaker_id: i64,
3747

3848
pub duration: DurationExampleData,
3949
pub intonation: IntonationExampleData,
4050
pub decode: DecodeExampleData,
51+
pub intermediate: IntermediateExampleData,
4152
}

crates/voicevox_core_c_api/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ easy-ext.workspace = true
3030
educe.workspace = true
3131
itertools.workspace = true
3232
libc.workspace = true
33+
ndarray.workspace = true
3334
parking_lot = { workspace = true, features = ["arc_lock"] }
3435
process_path.workspace = true
3536
ref-cast.workspace = true

crates/voicevox_core_c_api/src/compatible_engine.rs

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,111 @@ pub unsafe extern "C" fn decode_forward(
359359
}
360360
}
361361

362+
/// Runs the synthesizer's `generate_full_intermediate` on the given `f0` / `phoneme` inputs and
/// copies the resulting values into `output`.
///
/// Returns `true` on success. On error, the error message is stored with `set_message` and
/// `false` is returned.
///
/// # Panics
///
/// Panics if the number of elements returned by the synthesizer differs from
/// `(length + 2 * 14) * 80`. The pointer arguments are also checked with `assert_aligned`.
///
/// # Safety
363+
///
364+
/// - `f0` must be interpretable as Rust's `&[f32; length as usize]`.
365+
/// - `phoneme` must be interpretable as Rust's `&[f32; phoneme_size * length as usize]`.
366+
/// - `speaker_id` must be interpretable as Rust's `&[i64; 1]`.
367+
/// - `output` must be interpretable as Rust's `&mut [MaybeUninit<f32>; ((length + 2 * 14) * 80) as usize]`.
368+
#[unsafe(no_mangle)] // SAFETY: no other symbol with this name exists among the libraries that make up voicevox_core_c_api
369+
pub unsafe extern "C" fn generate_full_intermediate(
370+
length: i64,
371+
phoneme_size: i64,
372+
f0: *mut f32,
373+
phoneme: *mut f32,
374+
speaker_id: *mut i64,
375+
output: *mut f32,
376+
) -> bool {
377+
init_logger_once();
378+
assert_aligned(f0);
379+
assert_aligned(phoneme);
380+
assert_aligned(speaker_id);
381+
assert_aligned(output);
382+
let length = length as usize; // NOTE(review): `as` turns a negative `length` into a huge `usize`; callers presumably pass non-negative values — confirm
383+
let phoneme_size = phoneme_size as usize; // NOTE(review): same wrapping behavior as `length` above
384+
const MARGIN_WIDTH: usize = 14; // counted twice in the expected output length below
385+
const FEATURE_SIZE: usize = 80; // per-frame multiplier in the expected output length below
386+
let synthesizer = &*lock_synthesizer();
387+
let result = ensure_initialized!(synthesizer).generate_full_intermediate(
388+
length,
389+
phoneme_size,
390+
// SAFETY: The safety contract must be upheld by the caller.
391+
unsafe { std::slice::from_raw_parts(f0, length) },
392+
unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) },
393+
StyleId::new(unsafe { *speaker_id as u32 }),
394+
);
395+
match result {
396+
Ok(output_arr) => {
397+
let output_len = (length + 2 * MARGIN_WIDTH) * FEATURE_SIZE;
398+
if output_arr.len() != output_len { // guard: the raw copy below writes exactly `output_len` elements
399+
panic!("expected {}, got {}", output_len, output_arr.len());
400+
}
401+
let output_arr = output_arr.as_standard_layout(); // presumably an ndarray array; standard layout makes `as_ptr()` point at contiguous data — confirm
402+
// SAFETY: The safety contract must be upheld by the caller.
403+
unsafe {
404+
output_arr
405+
.as_ptr()
406+
.copy_to_nonoverlapping(output, output_len);
407+
}
408+
true
409+
}
410+
Err(err) => {
411+
set_message(&format!("{err}"));
412+
false
413+
}
414+
}
415+
}
416+
417+
/// Runs the synthesizer's `render_audio_segment` on a `[length, feature_size]` view of
/// `audio_feature` and copies the resulting values into `output`.
///
/// `_margin_width` is currently unused; it is accepted for future compatibility.
///
/// Returns `true` on success. On error, the error message is stored with `set_message` and
/// `false` is returned.
///
/// # Panics
///
/// Panics if the number of elements returned by the synthesizer differs from `length * 256`.
/// The pointer arguments are also checked with `assert_aligned`.
///
/// # Safety
418+
///
419+
/// - `audio_feature` must be interpretable as Rust's `&[f32; (length * feature_size) as usize]`.
420+
/// - `speaker_id` must be interpretable as Rust's `&[i64; 1]`.
421+
/// - `output` must be interpretable as Rust's `&mut [MaybeUninit<f32>; length as usize * 256]`.
422+
#[unsafe(no_mangle)] // SAFETY: no other symbol with this name exists among the libraries that make up voicevox_core_c_api
423+
pub unsafe extern "C" fn render_audio_segment(
424+
length: i64,
425+
_margin_width: i64,
426+
feature_size: i64,
427+
audio_feature: *mut f32,
428+
speaker_id: *mut i64,
429+
output: *mut f32,
430+
) -> bool {
431+
init_logger_once();
432+
assert_aligned(audio_feature);
433+
assert_aligned(speaker_id);
434+
assert_aligned(output);
435+
let length = length as usize; // NOTE(review): `as` turns a negative `length` into a huge `usize`; callers presumably pass non-negative values — confirm
436+
let feature_size = feature_size as usize; // NOTE(review): same wrapping behavior as `length` above
437+
let synthesizer = &*lock_synthesizer();
438+
let result = ensure_initialized!(synthesizer).render_audio_segment(
439+
// SAFETY: The safety contract must be upheld by the caller.
440+
unsafe {
441+
ndarray::ArrayView2::from_shape_ptr([length, feature_size], audio_feature).to_owned()
442+
},
443+
StyleId::new(unsafe { *speaker_id as u32 }),
444+
);
445+
match result {
446+
Ok(output_arr) => {
447+
let output_len = length * 256; // 256 output values expected per input row
448+
if output_arr.len() != output_len { // guard: the raw copy below writes exactly `output_len` elements
449+
panic!("expected {}, got {}", output_len, output_arr.len());
450+
}
451+
let output_arr = output_arr.as_standard_layout(); // presumably an ndarray array; standard layout makes `as_ptr()` point at contiguous data — confirm
452+
// SAFETY: The safety contract must be upheld by the caller.
453+
unsafe {
454+
output_arr
455+
.as_ptr()
456+
.copy_to_nonoverlapping(output, output_len);
457+
}
458+
true
459+
}
460+
Err(err) => {
461+
set_message(&format!("{err}"));
462+
false
463+
}
464+
}
465+
}
466+
362467
#[track_caller]
363468
fn assert_aligned(ptr: *mut impl Sized) {
364469
assert!(

crates/voicevox_core_c_api/tests/e2e/testcases/compatible_engine.rs

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// エンジンを起動してyukarin_s・yukarin_sa・decodeの推論を行う
22

3-
use std::ffi::CStr;
43
use std::sync::LazyLock;
4+
use std::{cmp::min, ffi::CStr};
55

66
use assert_cmd::assert::AssertResult;
77
use libloading::Library;
@@ -83,12 +83,86 @@ impl assert_cdylib::TestCase for TestCase {
8383
wave
8484
};
8585

86+
// 中間生成物を経由した場合の生成音声
87+
let wave2 = {
88+
let length_with_margin =
89+
EXAMPLE_DATA.intermediate.f0_length + 2 * EXAMPLE_DATA.intermediate.margin_width;
90+
let mut audio_feature =
91+
vec![0.; (length_with_margin * EXAMPLE_DATA.intermediate.feature_dim) as usize];
92+
let mut wave = vec![0.; 256 * length_with_margin as usize];
93+
assert!(lib.generate_full_intermediate(
94+
EXAMPLE_DATA.intermediate.f0_length,
95+
EXAMPLE_DATA.intermediate.phoneme_size,
96+
EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32,
97+
EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32,
98+
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
99+
audio_feature.as_mut_ptr(),
100+
));
101+
assert!(lib.render_audio_segment(
102+
length_with_margin,
103+
EXAMPLE_DATA.intermediate.margin_width,
104+
EXAMPLE_DATA.intermediate.feature_dim,
105+
audio_feature.as_ptr() as *mut f32,
106+
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
107+
wave.as_mut_ptr(),
108+
));
109+
wave[256 * EXAMPLE_DATA.intermediate.margin_width as usize
110+
..wave.len() - 256 * EXAMPLE_DATA.intermediate.margin_width as usize]
111+
.to_vec()
112+
};
113+
114+
// 中間生成物を経由し、さらにチャンクごとに変換した場合の生成音声
115+
let wave3 = {
116+
let length_with_margin =
117+
EXAMPLE_DATA.intermediate.f0_length + 2 * EXAMPLE_DATA.intermediate.margin_width;
118+
let mut audio_feature =
119+
vec![0.; (length_with_margin * EXAMPLE_DATA.intermediate.feature_dim) as usize];
120+
let mut wave = vec![0.; 256 * EXAMPLE_DATA.intermediate.f0_length as usize];
121+
assert!(lib.generate_full_intermediate(
122+
EXAMPLE_DATA.intermediate.f0_length,
123+
EXAMPLE_DATA.intermediate.phoneme_size,
124+
EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32,
125+
EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32,
126+
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
127+
audio_feature.as_mut_ptr(),
128+
));
129+
let full_length = EXAMPLE_DATA.intermediate.f0_length as usize;
130+
let pitch = EXAMPLE_DATA.intermediate.feature_dim as usize;
131+
for render_start in (0..full_length).step_by(10) {
132+
// render_start .. render_end の音声を取得する
133+
let render_end = min(render_start + 10, full_length);
134+
let slice_start = render_start;
135+
let slice_end = render_end + 2 * EXAMPLE_DATA.intermediate.margin_width as usize;
136+
let feature_segment = &audio_feature[slice_start * pitch..slice_end * pitch];
137+
let slice_length = slice_end - slice_start;
138+
let mut wave_segment_with_margin = vec![0.; 256 * slice_length];
139+
assert!(lib.render_audio_segment(
140+
slice_length as i64,
141+
EXAMPLE_DATA.intermediate.margin_width,
142+
pitch as i64,
143+
feature_segment.as_ptr() as *mut f32,
144+
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
145+
wave_segment_with_margin.as_mut_ptr(),
146+
));
147+
let wave_segment = &wave_segment_with_margin[256
148+
* EXAMPLE_DATA.intermediate.margin_width as usize
149+
..wave_segment_with_margin.len()
150+
- 256 * EXAMPLE_DATA.intermediate.margin_width as usize];
151+
wave[render_start * 256..render_end * 256].clone_from_slice(wave_segment);
152+
}
153+
wave
154+
};
155+
86156
std::assert_eq!(SNAPSHOTS.metas, metas_json);
87157

88158
float_assert::close_l1(&phoneme_length, &EXAMPLE_DATA.duration.result, 0.01);
89159
float_assert::close_l1(&intonation_list, &EXAMPLE_DATA.intonation.result, 0.01);
90160

91161
assert!(wave.iter().copied().all(f32::is_normal));
162+
assert!(wave2.iter().copied().all(f32::is_normal));
163+
assert!(wave3.iter().copied().all(f32::is_normal));
164+
float_assert::close_l1(&wave2, &wave, 0.001);
165+
float_assert::close_l1(&wave3, &wave, 0.001);
92166

93167
lib.finalize();
94168
Ok(())

0 commit comments

Comments
 (0)