Commit 87a21bd

update the synthesis method in a user-friendly way

1 parent 4e2f826
File tree: synthesize.py · utils/tools.py

2 files changed: +29 −17 lines

synthesize.py

Lines changed: 26 additions & 14 deletions
@@ -1,6 +1,8 @@
 import re
 import argparse
 from string import punctuation
+import os
+import json
 
 import torch
 import yaml
@@ -81,7 +83,7 @@ def preprocess_english(text, preprocess_config):
     return np.array(sequence)
 
 
-def synthesize(model, step, configs, vocoder, batchs, control_values):
+def synthesize(model, step, configs, vocoder, batchs, control_values, tag):
     preprocess_config, model_config, train_config = configs
     pitch_control, energy_control, duration_control = control_values
 
@@ -102,6 +104,7 @@ def synthesize(model, step, configs, vocoder, batchs, control_values):
                 model_config,
                 preprocess_config,
                 train_config["path"]["result_path"],
+                tag,
             )
 
 
@@ -130,26 +133,26 @@ def synthesize(model, step, configs, vocoder, batchs, control_values):
     )
     parser.add_argument(
         "--speaker_id",
-        type=int,
-        default=0,
+        type=str,
+        default="p001",
         help="speaker ID for multi-speaker synthesis, for single-sentence mode only",
     )
     parser.add_argument(
         "--emotion_id",
-        type=int,
-        default=0,
+        type=str,
+        default="happy",
         help="emotion ID for multi-emotion synthesis, for single-sentence mode only",
     )
     parser.add_argument(
         "--arousal",
-        type=int,
-        default=3,
+        type=str,
+        default="3",
         help="arousal value for multi-emotion synthesis, for single-sentence mode only",
    )
     parser.add_argument(
         "--valence",
-        type=int,
-        default=3,
+        type=str,
+        default="3",
         help="valence value for multi-emotion synthesis, for single-sentence mode only",
     )
     parser.add_argument(
@@ -214,21 +217,30 @@ def synthesize(model, step, configs, vocoder, batchs, control_values):
         batch_size=8,
         collate_fn=dataset.collate_fn,
     )
+    tag = None
     if args.mode == "single":
         emotions = arousals = valences = None
         ids = raw_texts = [args.text[:100]]
-        speakers = np.array([args.speaker_id])
+        with open(os.path.join(preprocess_config["path"]["preprocessed_path"], "speakers.json")) as f:
+            speaker_map = json.load(f)
+        speakers = np.array([speaker_map[args.speaker_id]])
         if model_config["multi_emotion"]:
-            emotions = np.array([args.emotion_id])
-            arousals = np.array([args.arousal])
-            valences = np.array([args.valence])
+            with open(os.path.join(preprocess_config["path"]["preprocessed_path"], "emotions.json")) as f:
+                json_raw = json.load(f)
+                emotion_map = json_raw["emotion_dict"]
+                arousal_map = json_raw["arousal_dict"]
+                valence_map = json_raw["valence_dict"]
+            emotions = np.array([emotion_map[args.emotion_id]])
+            arousals = np.array([arousal_map[args.arousal]])
+            valences = np.array([valence_map[args.valence]])
         if preprocess_config["preprocessing"]["text"]["language"] == "kr":
             texts = np.array([preprocess_korean(args.text, preprocess_config)])
         elif preprocess_config["preprocessing"]["text"]["language"] == "en":
             texts = np.array([preprocess_english(args.text, preprocess_config)])
         text_lens = np.array([len(texts[0])])
         batchs = [(ids, raw_texts, speakers, emotions, arousals, valences, texts, text_lens, max(text_lens))]
+        tag = f"{args.speaker_id}_{args.emotion_id}"
 
     control_values = args.pitch_control, args.energy_control, args.duration_control
 
-    synthesize(model, args.restore_step, configs, vocoder, batchs, control_values)
+    synthesize(model, args.restore_step, configs, vocoder, batchs, control_values, tag)
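
Note: the new single-mode lookups assume preprocessing has already written speakers.json and emotions.json under preprocessed_path. A minimal sketch of the shapes this code expects (all keys and values below are hypothetical; the real maps depend on the dataset):

    # Hypothetical contents, for illustration only -- the real files are
    # produced by the preprocessing step and are dataset-dependent.
    speaker_map = {"p001": 0, "p002": 1}          # speakers.json
    emotions_json = {                             # emotions.json
        "emotion_dict": {"happy": 0, "sad": 1, "angry": 2},
        "arousal_dict": {"1": 0, "2": 1, "3": 2},
        "valence_dict": {"1": 0, "2": 1, "3": 2},
    }
    # With --speaker_id p001 --emotion_id happy --arousal 3 --valence 3,
    # the lookups above would yield speakers == [0], emotions == [0],
    # arousals == [2], valences == [2].

This is why the argparse types change from int to str: users pass readable IDs like "p001" or "happy", and the JSON maps translate them to the integer indices the model expects.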

utils/tools.py

Lines changed: 3 additions & 3 deletions
@@ -222,7 +222,7 @@ def synth_one_sample(targets, predictions, vocoder, model_config, preprocess_con
     return fig, wav_reconstruction, wav_prediction, basename
 
 
-def synth_samples(targets, predictions, vocoder, model_config, preprocess_config, path):
+def synth_samples(targets, predictions, vocoder, model_config, preprocess_config, path, tag=None):
 
     basenames = targets[0]
     for i in range(len(predictions[0])):
@@ -255,7 +255,7 @@ def synth_samples(targets, predictions, vocoder, model_config, preprocess_config
             stats,
             ["Synthetized Spectrogram"],
         )
-        plt.savefig(os.path.join(path, "{}.png".format(basename)))
+        plt.savefig(os.path.join(path, "{}{}.png".format(basename, f"_{tag}" if tag is not None else "")))
         plt.close()
 
     from .model import vocoder_infer
@@ -268,7 +268,7 @@ def synth_samples(targets, predictions, vocoder, model_config, preprocess_config
 
     sampling_rate = preprocess_config["preprocessing"]["audio"]["sampling_rate"]
     for wav, basename in zip(wav_predictions, basenames):
-        wavfile.write(os.path.join(path, "{}.wav".format(basename)), sampling_rate, wav)
+        wavfile.write(os.path.join(path, "{}{}.wav".format(basename, f"_{tag}" if tag is not None else "")), sampling_rate, wav)
 
 
 def plot_mel(data, stats, titles):
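
With tag set to f"{args.speaker_id}_{args.emotion_id}" in single mode (and left as None otherwise), the output files gain a speaker/emotion suffix, so runs with different speakers or emotions no longer overwrite each other. A standalone sketch of the naming logic, with illustrative values:

    basename = "sample"     # illustrative utterance basename
    tag = "p001_happy"      # f"{speaker_id}_{emotion_id}" from single mode
    suffix = f"_{tag}" if tag is not None else ""
    print("{}{}.wav".format(basename, suffix))  # -> sample_p001_happy.wav
    # With tag=None (the default), filenames stay "sample.wav" as before.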

0 commit comments