Skip to content

Synthesis window in lowLatencySpeechEnhancement.ipynb #16

@jurihock

Description

@jurihock

Hi,

is there a reason why the synthesis window is not applied?

See also the attached sketch based on the lowLatencySpeechEnhancement.ipynb example.

fix = False

  • your version
  • clearly visible modulation in the OLA output
  • output gain != 1

fix = True

  • adequate hop size
  • unity gain
  • synthesis window after irfft

orig

fixed

from matplotlib.pyplot import *
from numpy import *
from numpy.fft import rfft, irfft

# Switch between the original behaviour (False) and the proposed fix (True)
fix = False

# FFT length used for every frame
fftSize = 1024

# Lengths of the asymmetric analysis / synthesis window pair
analysisWindowSize = fftSize
synthesisWindowSize = 128

# Hop size: a quarter of the synthesis window with the fix,
# three quarters of it without
if fix:
    asymmetricHopSize = synthesisWindowSize // 4
else:
    asymmetricHopSize = (synthesisWindowSize * 3) // 4

# Arguments handed to the asymmetric window builders
k = analysisWindowSize
m = synthesisWindowSize // 2
d = 0

# Symmetric reference setup; it reuses the asymmetric hop size so that
# both results can be compared directly
symmetricWindowSize = fftSize
symmetricHopSize = asymmetricHopSize

# Test signal: a single channel of ones, ten FFT lengths long
stereoSamples = ones((1, fftSize * 10))
numChannels, numSamples = stereoSamples.shape

def getAsymmetricAnalysisWindow(k, m, d):
    """Return the asymmetric analysis window of length ``k``.

    Layout of the returned array:

    * samples ``[0, d)``      -- zeros,
    * samples ``[d, k-m)``    -- rising half of a square-root Hann window,
    * samples ``[k-m, k)``    -- falling half of a (shorter) square-root
      Hann window of total length ``2*m``.
    """
    riseLength = k - m - d

    # hanning(2*n + 1)[:2*n] drops the trailing sample of the odd-length
    # symmetric window, leaving 2*n samples.
    longSqrtHann = sqrt(hanning(2 * riseLength + 1)[:2 * riseLength])
    shortSqrtHann = sqrt(hanning(2 * m + 1)[:2 * m])

    # zeros() already provides the d leading zero samples.
    result = zeros(k)
    result[d:k - m] = longSqrtHann[:riseLength]
    result[k - m:] = shortSqrtHann[-m:]

    return result

def getAsymmetricSynthesisWindow(k, m, d):
    """Return the asymmetric synthesis window of length ``k``.

    The window is zero everywhere except in its last ``2*m`` samples:

    * samples ``[k-2*m, k-m)`` -- rising half of a Hann window of length
      ``2*m`` divided by the rising square-root Hann flank that
      ``getAsymmetricAnalysisWindow(k, m, d)`` places at the same positions,
    * samples ``[k-m, k)``     -- falling half of a square-root Hann window
      of length ``2*m``.

    Edge cases handled explicitly:

    * ``k == 2*m + d``: numerator and denominator of the first rising sample
      are both zero; the sample is set to 0 instead of producing NaN.
    * ``m == 0``: an all-zero window is returned.
    """
    riseLength = k - m - d

    risingSqrtHannAnalysis = sqrt(hanning(2 * riseLength + 1)[:2 * riseLength])
    analysisFlank = risingSqrtHannAnalysis[k - 2 * m - d:riseLength]
    risingHann = hanning(2 * m + 1)[:m]

    # Divide only where the analysis flank is non-zero; the masked-out
    # positions keep the 0 they were initialised with.
    risingNormalizedHann = divide(
        risingHann,
        analysisFlank,
        out=zeros_like(risingHann),
        where=analysisFlank != 0,
    )
    fallingSqrtHann = sqrt(hanning(2 * m + 1)[:2 * m])

    # Positive indices are used on purpose: with m == 0 the negative forms
    # (-2*m, -m) would collapse to -0 and select the whole array.
    window = zeros(k)
    window[k - 2 * m:k - m] = risingNormalizedHann
    window[k - m:] = fallingSqrtHann[m:]

    return window

def performOnlineSpeechEnhancement(analysisWindow, synthesisWindow, hopSize):
    """Frame, transform and overlap-add the module-level test signal.

    Every frame of ``stereoSamples`` is multiplied by ``analysisWindow``,
    transformed with ``rfft`` and transformed straight back with ``irfft``;
    the spectrum is not modified in between. The reconstructed frames are
    overlap-added and scaled by a gain factor.

    Besides its arguments the function reads the module-level names
    ``stereoSamples``, ``numSamples`` and ``fix``. When ``fix`` is true the
    reconstructed frame is additionally multiplied by ``synthesisWindow``
    and the gain factor is derived from the product of both windows.

    Parameters
    ----------
    analysisWindow : 1-D array
        Window applied before the FFT; its length is the frame length.
    synthesisWindow : 1-D array
        Window applied after the inverse FFT (only when ``fix`` is true).
        It is multiplied element-wise with the reconstructed frame, so it
        has to be broadcastable against a frame of ``len(analysisWindow)``
        samples.
    hopSize : int
        Number of samples between the starts of consecutive frames.

    Returns
    -------
    tuple
        ``(inputSpectrogram, outputSpectrogram, targetEstimateSamplesOLA)``.
    """
    # Frame length follows the window that was actually passed in rather
    # than a module-level constant, so windows of any length slice correctly.
    frameSize = len(analysisWindow)

    # Setup variables to save speech enhancement results
    numFrequencies = len(rfft(zeros(frameSize)))
    numFrames = (numSamples - len(synthesisWindow)) // hopSize

    if fix:
        # Normalise by the summed product of analysis and synthesis window
        gainFactor = hopSize / sum(analysisWindow * synthesisWindow)
    else:
        gainFactor = hopSize / float(len(synthesisWindow)) * 2

    targetEstimateSamplesOLA = zeros_like(stereoSamples)
    # NOTE(review): the leading dimension is hard-coded to 2; a spectrum with
    # a single channel is broadcast into both slots on assignment. Kept as is
    # so the shapes of the returned spectrograms do not change.
    inputSpectrogram = zeros((2, numFrequencies, numFrames), 'complex64')
    outputSpectrogram = zeros((2, numFrequencies, numFrames), 'complex64')

    for frameIndex in range(numFrames):
        # compute FFT
        frameStart = frameIndex * hopSize
        frameEnd = frameStart + frameSize
        stereoSTFTFrame = rfft(stereoSamples[:, frameStart:frameEnd] * analysisWindow)
        inputSpectrogram[..., frameIndex] = stereoSTFTFrame
        outputSpectrogram[..., frameIndex] = stereoSTFTFrame

        # reconstruct time domain samples
        recStereoSTFTFrame = irfft(stereoSTFTFrame)

        if fix:
            # apply synthesis window as well
            recStereoSTFTFrame *= synthesisWindow

        # overlap-add to output samples
        targetEstimateSamplesOLA[:, frameStart:frameEnd] += recStereoSTFTFrame

    targetEstimateSamplesOLA *= gainFactor

    return inputSpectrogram, outputSpectrogram, targetEstimateSamplesOLA

# Build the asymmetric analysis / synthesis window pair from k, m and d
analysisWindow = getAsymmetricAnalysisWindow(k, m, d)
synthesisWindow = getAsymmetricSynthesisWindow(k, m, d)

# Symmetric reference: one square-root Hann window, used below for both
# the analysis and the synthesis argument
symmetricWindow = sqrt(hanning(symmetricWindowSize))

# Run the same analysis / overlap-add pass once per window configuration
symmetricResults = performOnlineSpeechEnhancement(symmetricWindow, symmetricWindow, symmetricHopSize)
asymmetricResults = performOnlineSpeechEnhancement(analysisWindow, synthesisWindow, asymmetricHopSize)

# Plot the overlap-add output (last element of each result tuple, last
# channel) of both runs in one figure titled after the `fix` setting
title('fixed' if fix else 'orig')
plot(symmetricResults[-1][-1], label='symmetric', color='b', alpha=0.5)
plot(asymmetricResults[-1][-1], label='asymmetric', color='r', alpha=0.5)
legend()
show()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions