1
1
import re
2
2
import argparse
3
3
from string import punctuation
4
+ import os
5
+ import json
4
6
5
7
import torch
6
8
import yaml
@@ -81,7 +83,7 @@ def preprocess_english(text, preprocess_config):
81
83
return np .array (sequence )
82
84
83
85
84
- def synthesize (model , step , configs , vocoder , batchs , control_values ):
86
+ def synthesize (model , step , configs , vocoder , batchs , control_values , tag ):
85
87
preprocess_config , model_config , train_config = configs
86
88
pitch_control , energy_control , duration_control = control_values
87
89
@@ -102,6 +104,7 @@ def synthesize(model, step, configs, vocoder, batchs, control_values):
102
104
model_config ,
103
105
preprocess_config ,
104
106
train_config ["path" ]["result_path" ],
107
+ tag ,
105
108
)
106
109
107
110
@@ -130,26 +133,26 @@ def synthesize(model, step, configs, vocoder, batchs, control_values):
130
133
)
131
134
parser .add_argument (
132
135
"--speaker_id" ,
133
- type = int ,
134
- default = 0 ,
136
+ type = str ,
137
+ default = "p001" ,
135
138
help = "speaker ID for multi-speaker synthesis, for single-sentence mode only" ,
136
139
)
137
140
parser .add_argument (
138
141
"--emotion_id" ,
139
- type = int ,
140
- default = 0 ,
142
+ type = str ,
143
+ default = "happy" ,
141
144
help = "emotion ID for multi-emotion synthesis, for single-sentence mode only" ,
142
145
)
143
146
parser .add_argument (
144
147
"--arousal" ,
145
- type = int ,
146
- default = 3 ,
148
+ type = str ,
149
+ default = "3" ,
147
150
help = "arousal value for multi-emotion synthesis, for single-sentence mode only" ,
148
151
)
149
152
parser .add_argument (
150
153
"--valence" ,
151
- type = int ,
152
- default = 3 ,
154
+ type = str ,
155
+ default = "3" ,
153
156
help = "valence value for multi-emotion synthesis, for single-sentence mode only" ,
154
157
)
155
158
parser .add_argument (
@@ -214,21 +217,30 @@ def synthesize(model, step, configs, vocoder, batchs, control_values):
214
217
batch_size = 8 ,
215
218
collate_fn = dataset .collate_fn ,
216
219
)
220
+ tag = None
217
221
if args .mode == "single" :
218
222
emotions = arousals = valences = None
219
223
ids = raw_texts = [args .text [:100 ]]
220
- speakers = np .array ([args .speaker_id ])
224
+ with open (os .path .join (preprocess_config ["path" ]["preprocessed_path" ], "speakers.json" )) as f :
225
+ speaker_map = json .load (f )
226
+ speakers = np .array ([speaker_map [args .speaker_id ]])
221
227
if model_config ["multi_emotion" ]:
222
- emotions = np .array ([args .emotion_id ])
223
- arousals = np .array ([args .arousal ])
224
- valences = np .array ([args .valence ])
228
+ with open (os .path .join (preprocess_config ["path" ]["preprocessed_path" ], "emotions.json" )) as f :
229
+ json_raw = json .load (f )
230
+ emotion_map = json_raw ["emotion_dict" ]
231
+ arousal_map = json_raw ["arousal_dict" ]
232
+ valence_map = json_raw ["valence_dict" ]
233
+ emotions = np .array ([emotion_map [args .emotion_id ]])
234
+ arousals = np .array ([arousal_map [args .arousal ]])
235
+ valences = np .array ([valence_map [args .valence ]])
225
236
if preprocess_config ["preprocessing" ]["text" ]["language" ] == "kr" :
226
237
texts = np .array ([preprocess_korean (args .text , preprocess_config )])
227
238
elif preprocess_config ["preprocessing" ]["text" ]["language" ] == "en" :
228
239
texts = np .array ([preprocess_english (args .text , preprocess_config )])
229
240
text_lens = np .array ([len (texts [0 ])])
230
241
batchs = [(ids , raw_texts , speakers , emotions , arousals , valences , texts , text_lens , max (text_lens ))]
242
+ tag = f"{ args .speaker_id } _{ args .emotion_id } "
231
243
232
244
control_values = args .pitch_control , args .energy_control , args .duration_control
233
245
234
- synthesize (model , args .restore_step , configs , vocoder , batchs , control_values )
246
+ synthesize (model , args .restore_step , configs , vocoder , batchs , control_values , tag )
0 commit comments