# Audio input settings.
audio:
  chunk_size: 485100  # 44100 * 11, i.e. 11 x sample_rate
  num_channels: 2
  sample_rate: 44100
  min_mean_abs: 0.000

# Model hyperparameters.
# `sources` lists the same four stems as `training.instruments`.
model:
  sources: ['drums', 'bass', 'other', 'vocals']
  audio_channels: 2
  dims: [4, 32, 64, 128]
  # NOTE(review): presumably STFT parameters - confirm against the model code.
  nfft: 4096
  hop_size: 1024
  win_size: 4096
  normalized: true
  # Three-element lists; presumably one entry per frequency band - confirm.
  # The band_SR entries sum to 1.0.
  band_SR: [0.175, 0.392, 0.433]
  band_stride: [1, 4, 16]
  band_kernel: [3, 4, 16]
  conv_depths: [3, 2, 1]
  compress: 4
  conv_kernel: 3
  num_dplayer: 6
  expand: 1
  # `tran_*` keys: both dropouts are 0.0 and flash attention is switched off.
  tran_rotary_embedding_dim: 64
  tran_depth: 1
  tran_heads: 8
  tran_dim_head: 64
  tran_attn_dropout: 0.0
  tran_ff_dropout: 0.0
  tran_flash_attn: false

# Training loop, optimizer and loss-related settings.
training:
  batch_size: 5
  gradient_accumulation_steps: 1
  grad_clip: 0
  instruments: ['drums', 'bass', 'other', 'vocals']
  patience: 2
  reduce_factor: 0.95
  target_instrument: null
  num_epochs: 1000
  num_steps: 1000
  q: 0.95
  coarse_loss_clip: true
  ema_momentum: 0.999
  optimizer: adam
  lr: 5.0e-05
  # Alternative optimizer setup (disabled):
  # optimizer: prodigy
  # lr: 1.0
  normalize: false  # normalize the model input (use the same setting for inference!)
  other_fix: false  # needed when checking on the multisong dataset whether "other" is actually instrumental
  use_amp: true  # enable or disable mixed precision (float16) - usually it must be true

# Training-time augmentations.
# Bare fractional values on augmentation names (e.g. `pitch_shift: 0.1`) are
# presumably the probability of applying that augmentation - confirm against
# the data loader. The `*_min` / `*_max` keys next to them bound its range.
augmentations:
  enable: true  # enable or disable all augmentations (quick global switch)
  loudness: true  # randomly change loudness of each stem in the range (loudness_min; loudness_max)
  loudness_min: 0.5
  loudness_max: 1.5
  mixup: true  # mix several stems of the same type with some probability (only works for dataset types: 1, 2, 3)
  # Up to 3 additional stems of the same type
  # (1st with prob 0.2, 2nd with prob 0.02, 3rd with prob 0.002).
  # NOTE: `!!python/tuple` is a Python-specific tag; safe loaders such as
  # yaml.safe_load reject it, so this file needs a loader that supports it.
  mixup_probs: !!python/tuple
    - 0.2
    - 0.02
    - 0.002
  mixup_loudness_min: 0.5
  mixup_loudness_max: 1.5

  # apply mp3 compression to the mixture only (emulates downloading an mp3 from the internet)
  mp3_compression_on_mixture: 0.01
  mp3_compression_on_mixture_bitrate_min: 32
  mp3_compression_on_mixture_bitrate_max: 320
  mp3_compression_on_mixture_backend: "lameenc"

  # Augmentations under the `all` key (presumably applied to every stem - confirm).
  all:
    channel_shuffle: 0.5  # set 0 or lower to disable
    random_inverse: 0.01  # inverse track (better to keep the probability low)
    random_polarity: 0.5  # polarity change (multiply waveform by -1)

  # Per-stem augmentations, keyed by stem name.
  vocals:
    pitch_shift: 0.1
    pitch_shift_min_semitones: -5
    pitch_shift_max_semitones: 5
    seven_band_parametric_eq: 0.1
    seven_band_parametric_eq_min_gain_db: -9
    seven_band_parametric_eq_max_gain_db: 9
    tanh_distortion: 0.1
    tanh_distortion_min: 0.1
    tanh_distortion_max: 0.7
    time_stretch: 0.01
    time_stretch_min_rate: 0.8
    time_stretch_max_rate: 1.25
  bass:
    pitch_shift: 0.01
    pitch_shift_min_semitones: -2
    pitch_shift_max_semitones: 2
    seven_band_parametric_eq: 0.01
    seven_band_parametric_eq_min_gain_db: -3
    seven_band_parametric_eq_max_gain_db: 6
    tanh_distortion: 0.01
    tanh_distortion_min: 0.1
    tanh_distortion_max: 0.5
    time_stretch: 0.1
    time_stretch_min_rate: 0.9
    time_stretch_max_rate: 1.1
  drums:
    pitch_shift: 0.1
    pitch_shift_min_semitones: -5
    pitch_shift_max_semitones: 5
    seven_band_parametric_eq: 0.1
    seven_band_parametric_eq_min_gain_db: -9
    seven_band_parametric_eq_max_gain_db: 9
    tanh_distortion: 0.1
    tanh_distortion_min: 0.1
    tanh_distortion_max: 0.6
    time_stretch: 0.01
    time_stretch_min_rate: 0.8
    time_stretch_max_rate: 1.25
  other:
    pitch_shift: 0.1
    pitch_shift_min_semitones: -4
    pitch_shift_max_semitones: 4
    gaussian_noise: 0.1
    gaussian_noise_min_amplitude: 0.001
    gaussian_noise_max_amplitude: 0.015
    time_stretch: 0.01
    time_stretch_min_rate: 0.8
    time_stretch_max_rate: 1.25

# Inference settings.
# `normalize` has the same value as `training.normalize` (false); the training
# section asks for the two to be kept in sync.
inference:
  batch_size: 2
  dim_t: 256
  num_overlap: 2
  normalize: false

# Multi-resolution STFT loss settings.
# `fft_sizes`, `hop_sizes` and `win_lengths` each have three entries;
# presumably entry i of each list defines resolution i - confirm against the
# loss implementation.
loss_multistft:
  fft_sizes:
    - 1024
    - 2048
    - 4096
  hop_sizes:
    - 147
    - 256
    - 512
  win_lengths:
    - 1024
    - 2048
    - 4096
  window: "hann_window"
  scale: "mel"
  n_bins: 128
  sample_rate: 44100  # same value as audio.sample_rate
  perceptual_weighting: true
  # Loss term weights; the two terms with weight 0.0 are effectively disabled.
  w_sc: 1.0
  w_log_mag: 1.0
  w_lin_mag: 0.0
  w_phs: 0.0
  mag_distance: "L1"