Skip to content

Commit e9f7cb6

Browse files
committed
correction for cmvnw function
1 parent 2862414 commit e9f7cb6

File tree

4 files changed

+52
-54
lines changed

4 files changed

+52
-54
lines changed

docs/source/epilogue/test.rst

Lines changed: 28 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -10,34 +10,33 @@ The test example can be seen in ``test/test_package.py`` as below:
1010

1111
.. code-block:: python
1212
13-
import scipy.io.wavfile as wav
14-
import numpy as np
15-
import speechpy
16-
import os
17-
1813
file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav')
1914
fs, signal = wav.read(file_name)
2015
signal = signal[:,0]
2116
17+
# Example of stacking frames
18+
frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
19+
zero_padding=True)
20+
21+
# Example of extracting power spectrum
22+
power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
23+
print('power spectrum shape=', power_spectrum.shape)
24+
2225
############# Extract MFCC features #############
23-
mfcc = speechpy.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
24-
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
25-
mfcc_cmvn = speechpy.cmvnw(mfcc,win_size=301,variance_normalization=True)
26+
mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
27+
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
28+
mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True)
2629
print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)
2730
28-
mfcc_feature_cube = speechpy.extract_derivative_feature(mfcc)
31+
mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
2932
print('mfcc feature cube shape=', mfcc_feature_cube.shape)
3033
3134
############# Extract logenergy features #############
32-
logenergy = speechpy.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
33-
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
34-
logenergy_feature_cube = speechpy.extract_derivative_feature(logenergy)
35+
logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
36+
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
37+
logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
3538
print('logenergy features=', logenergy.shape)
3639
37-
# Example of staching frames
38-
signal = speechpy.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
39-
zero_padding=True)
40-
4140
-----------
4241
Test Local
4342
-----------
@@ -47,39 +46,33 @@ The local test example can be found in ``test/test_package.py`` as follows:
4746

4847
.. code-block:: python
4948
50-
import scipy.io.wavfile as wav
51-
import numpy as np
52-
import os
53-
import sys
54-
lib_path = os.path.abspath(os.path.join('..'))
55-
print(lib_path)
56-
sys.path.append(lib_path)
57-
import speechpy
58-
import os
59-
6049
file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav')
6150
fs, signal = wav.read(file_name)
6251
signal = signal[:,0]
6352
53+
# Example of stacking frames
54+
frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
55+
zero_padding=True)
56+
57+
# Example of extracting power spectrum
58+
power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
59+
print('power spectrum shape=', power_spectrum.shape)
60+
6461
############# Extract MFCC features #############
65-
mfcc = speechpy.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
62+
mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
6663
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
67-
mfcc_cmvn = speechpy.cmvnw(mfcc,win_size=301,variance_normalization=True)
64+
mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True)
6865
print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)
6966
70-
mfcc_feature_cube = speechpy.extract_derivative_feature(mfcc)
67+
mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
7168
print('mfcc feature cube shape=', mfcc_feature_cube.shape)
7269
7370
############# Extract logenergy features #############
74-
logenergy = speechpy.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
71+
logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
7572
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
76-
logenergy_feature_cube = speechpy.extract_derivative_feature(logenergy)
73+
logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
7774
print('logenergy features=', logenergy.shape)
7875
79-
# Example of staching frames
80-
signal = speechpy.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
81-
zero_padding=True)
82-
8376
8477
8578
To extract the features, the signal samples are first stacked into frames. The features are then computed for each frame in the stacked-frames collection.

speechpy/processing.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -77,39 +77,39 @@ def stack_frames(sig, sampling_frequency, frame_length=0.020, frame_stride=0.020
7777
return Extracted_Frames
7878

7979

80-
def fft_spectrum(frames, fft_length=512):
80+
def fft_spectrum(frames, fft_points=512):
8181
"""This function computes the one-dimensional n-point discrete Fourier Transform (DFT) of a real-valued
8282
array by means of an efficient algorithm called the Fast Fourier Transform (FFT). Please refer to
8383
https://docs.scipy.org/doc/numpy/reference/generated/numpy.fft.rfft.html for further details.
8484
8585
:param frames: The frame array in which each row is a frame.
86-
:param fft_length: The length of FFT. If fft_length is greater than frame_len, the frames will be zero-padded.
86+
:param fft_points: The length of FFT. If fft_points is greater than frame_len, the frames will be zero-padded.
8787
:param num_keep_coefficients: The number of coefficients that is kept.
8888
:returns: If frames is a num_frames x sample_per_frame matrix, output will be num_frames x (fft_points // 2 + 1), since only the non-negative frequency bins of the real-input FFT are returned.
8989
"""
90-
SPECTRUM_VECTOR = np.fft.rfft(frames, n=fft_length, axis=-1, norm=None)
90+
SPECTRUM_VECTOR = np.fft.rfft(frames, n=fft_points, axis=-1, norm=None)
9191
return np.absolute(SPECTRUM_VECTOR)
9292

9393

94-
def power_spectrum(frames, fft_length=512):
94+
def power_spectrum(frames, fft_points=512):
9595
"""Power spectrum of each frame.
9696
9797
:param frames: The frame array in which each row is a frame.
98-
:param fft_length: The length of FFT. If fft_length is greater than frame_len, the frames will be zero-padded.
98+
:param fft_points: The length of FFT. If fft_points is greater than frame_len, the frames will be zero-padded.
9999
:returns: If frames is a num_frames x sample_per_frame matrix, output will be num_frames x (fft_points // 2 + 1).
100100
"""
101-
return 1.0 / fft_length * np.square(fft_spectrum(frames, fft_length))
101+
return 1.0 / fft_points * np.square(fft_spectrum(frames, fft_points))
102102

103103

104-
def log_power_spectrum(frames, fft_length=512, normalize=True):
104+
def log_power_spectrum(frames, fft_points=512, normalize=True):
105105
"""Log power spectrum of each frame in frames.
106106
107107
:param frames: The frame array in which each row is a frame.
108-
:param fft_length: The length of FFT. If fft_length is greater than frame_len, the frames will be zero-padded.
108+
:param fft_points: The length of FFT. If fft_points is greater than frame_len, the frames will be zero-padded.
109109
:param normalize: If True, the log power spectrum will be normalized.
110110
:returns: If frames is a num_frames x sample_per_frame matrix, output will be num_frames x (fft_points // 2 + 1).
111111
"""
112-
power_spec = power_spectrum(frames, fft_length)
112+
power_spec = power_spectrum(frames, fft_points)
113113
power_spec[power_spec <= 1e-20] = 1e-20
114114
log_power_spec = 10 * np.log10(power_spec)
115115
if normalize:

tests/test_local.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
zero_padding=True)
1818

1919
# Example of extracting power spectrum
20-
frames = speechpy.processing.power_spectrum(frames, fft_length=512)
20+
power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
21+
print('power spectrum shape=', power_spectrum.shape)
2122

2223
############# Extract MFCC features #############
2324
mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,

tests/test_package.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,24 +7,28 @@
77
fs, signal = wav.read(file_name)
88
signal = signal[:,0]
99

10+
# Example of stacking frames
11+
frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
12+
zero_padding=True)
13+
14+
# Example of extracting power spectrum
15+
power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
16+
print('power spectrum shape=', power_spectrum.shape)
17+
1018
############# Extract MFCC features #############
11-
mfcc = speechpy.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
19+
mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
1220
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
13-
mfcc_cmvn = speechpy.cmvnw(mfcc,win_size=301,variance_normalization=True)
21+
mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True)
1422
print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)
1523

16-
mfcc_feature_cube = speechpy.extract_derivative_feature(mfcc)
24+
mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
1725
print('mfcc feature cube shape=', mfcc_feature_cube.shape)
1826

1927
############# Extract logenergy features #############
20-
logenergy = speechpy.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
28+
logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
2129
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
22-
logenergy_feature_cube = speechpy.extract_derivative_feature(logenergy)
30+
logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
2331
print('logenergy features=', logenergy.shape)
2432

25-
# Example of staching frames
26-
signal = speechpy.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
27-
zero_padding=True)
28-
2933

3034

0 commit comments

Comments
 (0)