1. 程式人生 > >語音波形,截斷的頻域輸出以及語譜圖製作

語音波形,截斷的頻域輸出以及語譜圖製作

語音視覺化

今天我想復現一下,文中語譜圖提取部分的程式碼這裡寫圖片描述 由於輸入的語音有單通道和雙通道之分,處理方式是單通道不變,雙通道只取一個通道的資訊。附上程式碼:

import wave as we
import numpy as np
import matplotlib.pyplot as plt

def wavread(path):
    """Read a wav file and return (samples, time_axis).

    path: path to the wav file.

    Returns a tuple (wave_data, time):
      wave_data -- 1-D numpy array of 16-bit samples from the first
                   channel (for stereo input the second channel is dropped)
      time      -- 1-D numpy array with the time (seconds) of each sample
    """
    wavfile = we.open(path, "rb")
    try:
        nchannels, samplewidth, framerate, nframes = wavfile.getparams()[:4]
        datawav = wavfile.readframes(nframes)
    finally:
        # Close the handle even if readframes raises.
        wavfile.close()

    # np.fromstring is deprecated for binary data; np.frombuffer is the
    # supported zero-copy equivalent.
    wave_data = np.frombuffer(datawav, dtype=np.short)

    # Reshape interleaved samples to (nframes, nchannels) so that after
    # the transpose each row is one channel.
    if nchannels in (1, 2):
        wave_data = wave_data.reshape(-1, nchannels)

    wave_data = wave_data.T
    time = np.arange(0, nframes) * (1.0 / framerate)
    return wave_data[0], time


# Demo: read "1.wav" and display its time-domain waveform.
samples, timeline = wavread("1.wav")
plt.plot(timeline, samples, color='blue')
plt.show()

得到如下的時域波形圖這裡寫圖片描述

然後對原始語音訊號處理,得到4k範圍內的頻率訊號。為了理解操作過程,對fft變換的結果進行了總結:這裡寫圖片描述

def fft_4K(path):
    """Return the magnitude spectrum of a wav file, truncated to 0-4 kHz.

    path: wav file path.

    Returns a tuple (freq, transformed):
      freq        -- list of frequency-axis values (Hz), all <= 4000
      transformed -- numpy array with the FFT magnitude at each freq
                     (real-valued; the original stored abs() values in a
                     complex array)
    """
    # --- read the samples (first channel only) ---
    wavfile = we.open(path, "rb")
    try:
        nchannels, samplewidth, framerate, nframes = wavfile.getparams()[:4]
        datawav = wavfile.readframes(nframes)
    finally:
        wavfile.close()

    # np.fromstring is deprecated for binary data; use np.frombuffer.
    wave_data = np.frombuffer(datawav, dtype=np.short)
    if nchannels in (1, 2):
        wave_data = wave_data.reshape(-1, nchannels)
    wave_data = wave_data.T

    # --- FFT and truncation to the 0-4 kHz band ---
    df = framerate / float(nframes - 1)       # frequency resolution per bin
    freq = [df * n for n in range(0, nframes)]
    transformed = np.fft.fft(wave_data[0])

    # Walk back from the Nyquist index until inside 4 kHz.  The original
    # stepped by 10 with no lower bound and could run past index 0 on
    # short inputs (negative indexing / IndexError); step by 1 and clamp.
    d = int(len(transformed) / 2)
    while d > 0 and freq[d] > 4000:
        d -= 1
    freq = freq[:d]
    transformed = np.abs(transformed[:d])     # vectorized magnitude

    return freq, transformed

得到的結果這裡寫圖片描述

之後,為了進一步得到語譜圖結果,採用如下程式碼,幀長為20ms,幀移為10ms。測試語音只保留3s內的資訊,顯示的頻率範圍是【0,7.5KHz】,之後的頻率範圍內的特徵值被捨棄。

import numpy, wave
import numpy, matplotlib.pyplot as plt

# target: compute a spectrogram from a wav file
# input: filename, wav file path, string
#        window_length_ms, analysis window length in milliseconds, int
#        window_shift_times, shift length as a fraction of the window, float
def getSpectrum(filename, window_length_ms, window_shift_times):
    """Compute a log-magnitude spectrogram of a wav file.

    filename           -- wav file path (string)
    window_length_ms   -- analysis window length in milliseconds (int)
    window_shift_times -- window shift as a fraction of the window (float)

    Returns a 2-D numpy array of shape (window_length//2, nframe):
    row = frequency bin (lower half of the FFT), column = frame index.
    NOTE: an all-zero window yields log(0) = -inf entries.
    NOTE(review): stereo files are NOT de-interleaved here, so for
    2-channel input the frames mix both channels -- confirm callers
    only pass mono files.
    """
    # --- read raw samples ---
    wav_file = wave.open(filename, 'r')
    try:
        # nchannels: channel count; sampwidth: bytes per sample;
        # framerate: sample rate (Hz); wav_length: number of frames.
        nchannels, sampwidth, framerate, wav_length = wav_file.getparams()[:4]
        str_data = wav_file.readframes(wav_length)
    finally:
        wav_file.close()
    # numpy.fromstring is deprecated for binary data; use frombuffer.
    wave_data = numpy.frombuffer(str_data, dtype=numpy.short)

    # --- frame the signal and take the log-magnitude FFT per frame ---
    # Floor division keeps every index integral on both Python 2 and
    # Python 3; the original relied on Python 2's integer '/', which
    # yields floats on Python 3 and breaks zeros()/slicing.
    window_length = framerate * window_length_ms // 1000  # ms -> samples
    window_shift = int(window_length * window_shift_times)
    nframe = (wav_length - (window_length - window_shift)) // window_shift
    half = window_length // 2
    spec = numpy.zeros((half, nframe))  # keep only the lower half-spectrum
    for i in range(nframe):             # range works on Py2 and Py3
        start = i * window_shift
        frame = wave_data[start:start + window_length]
        spec[:, i] = numpy.log(numpy.abs(numpy.fft.fft(frame)))[:half]
    return spec


# Main: compute the spectrogram of "1.wav" with a 20 ms window and a
# 50% shift, then display a cropped view of it.
spectrogram = getSpectrum('1.wav', 20, 0.5)
plt.imshow(spectrogram)
plt.xlim(0, 300)
plt.ylim(0, 150)
plt.show()

得到的語譜圖結果:這裡寫圖片描述