We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 4ce31ca, commit f6a8c80 (Copy full SHA for f6a8c80)
cosyvoice/dataset/processor.py
@@ -177,11 +177,10 @@ def compute_fbank(data,
177
waveform = sample['speech']
178
feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
179
180
- # padding with replicate mode (align to speech_token len * token_mel_ratio)
181
- pad_len = sample["speech_token"].shape[0] * token_mel_ratio - feat.shape[0]
182
- if pad_len > 0:
183
- feat_to_pad = feat[-1:].repeat((pad_len, 1))
184
- feat = torch.cat([feat, feat_to_pad], dim=0)
+ # trim to align speech_token and speech_feat
+ token_len = min(feat.shape[0] // token_mel_ratio, sample["speech_token"].shape[0])
+ feat = feat[:token_mel_ratio * token_len]
+ sample["speech_token"] = sample["speech_token"][:token_len]
185
186
sample['speech_feat'] = feat
187
yield sample
0 commit comments