ASRT_SpeechRecognition/general_function/feature_computer.py

275 lines
8.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import numpy
import sigproc
from scipy.fftpack import dct
from scipy.ndimage import convolve1d
def get_wav_feature(wav_file):
'''
获取WAV文件最终输入的特征向量的函数
参数:
wav_file: WAV文件对象
返回值:
神经网络可直接用的输入层向量
'''
vector = numpy.matrix([])
#在这里添加代码
return vector
def mfcc(signal, samplerate, conf):
'''
Compute MFCC features from an audio signal.
从一个声音信号中计算MFCC特征向量
Args:
参数:
signal: the audio signal from which to compute features. Should be an
N*1 array
通过这个声音信号计算特征向量它应当是一个N*1的数组
samplerate: the samplerate of the signal we are working with.
要处理的信号的采样率
conf: feature configuration
特征的配置
Returns:
返回值:
A numpy array of size (NUMFRAMES by numcep) containing features. Each
row holds 1 feature vector, a numpy vector containing the signal
log-energy
返回一个包含特征向量的numpy数组每一列是一个特征向量每一个numpy向量包含信号的对数能量
'''
feat, energy = fbank(signal, samplerate, conf)
feat = numpy.log(feat)
feat = dct(feat, type=2, axis=1, norm='ortho')[:, :int(conf['numcep'])]
feat = lifter(feat, float(conf['ceplifter']))
return feat, numpy.log(energy)
def fbank(signal, samplerate, conf):
'''
Compute fbank features from an audio signal.
从一个声音信号中计算fbank特征向量
Args:
signal: the audio signal from which to compute features. Should be an
N*1 array
samplerate: the samplerate of the signal we are working with.
conf: feature configuration
Returns:
A numpy array of size (NUMFRAMES by nfilt) containing features, a numpy
vector containing the signal energy
'''
highfreq = int(conf['highfreq'])
if highfreq < 0:
highfreq = samplerate/2
signal = sigproc.preemphasis(signal, float(conf['preemph']))
frames = sigproc.framesig(signal, float(conf['winlen'])*samplerate,
float(conf['winstep'])*samplerate)
pspec = sigproc.powspec(frames, int(conf['nfft']))
# this stores the total energy in each frame
energy = numpy.sum(pspec, 1)
# if energy is zero, we get problems with log
energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)
filterbank = get_filterbanks(int(conf['nfilt']), int(conf['nfft']),
samplerate, int(conf['lowfreq']), highfreq)
# compute the filterbank energies
feat = numpy.dot(pspec, filterbank.T)
# if feat is zero, we get problems with log
feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat)
return feat, energy
def logfbank(signal, samplerate, conf):
'''
Compute log-fbank features from an audio signal.
Args:
signal: the audio signal from which to compute features. Should be an
N*1 array
samplerate: the samplerate of the signal we are working with.
conf: feature configuration
Returns:
A numpy array of size (NUMFRAMES by nfilt) containing features, a numpy
vector containing the signal log-energy
'''
feat, energy = fbank(signal, samplerate, conf)
return numpy.log(feat), numpy.log(energy)
def ssc(signal, samplerate, conf):
'''
Compute ssc features from an audio signal.
Args:
signal: the audio signal from which to compute features. Should be an
N*1 array
samplerate: the samplerate of the signal we are working with.
conf: feature configuration
Returns:
A numpy array of size (NUMFRAMES by nfilt) containing features, a numpy
vector containing the signal log-energy
'''
highfreq = int(conf['highfreq'])
if highfreq < 0:
highfreq = samplerate/2
signal = sigproc.preemphasis(signal, float(conf['preemph']))
frames = sigproc.framesig(signal, float(conf['winlen'])*samplerate,
float(conf['winstep'])*samplerate)
pspec = sigproc.powspec(frames, int(conf['nfft']))
# this stores the total energy in each frame
energy = numpy.sum(pspec, 1)
# if energy is zero, we get problems with log
energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)
filterbank = get_filterbanks(int(conf['nfilt']), int(conf['nfft']),
samplerate, int(conf['lowfreq']), highfreq)
# compute the filterbank energies
feat = numpy.dot(pspec, filterbank.T)
tiles = numpy.tile(numpy.linspace(1, samplerate/2, numpy.size(pspec, 1)),
(numpy.size(pspec, 0), 1))
return numpy.dot(pspec*tiles, filterbank.T) / feat, numpy.log(energy)
def hz2mel(rate):
'''
Convert a value in Hertz to Mels
Args:
rate: a value in Hz. This can also be a numpy array, conversion proceeds
element-wise.
Returns:
a value in Mels. If an array was passed in, an identical sized array is
returned.
'''
return 2595 * numpy.log10(1+rate/700.0)
def mel2hz(mel):
'''
Convert a value in Mels to Hertz
Args:
mel: a value in Mels. This can also be a numpy array, conversion
proceeds element-wise.
Returns:
a value in Hertz. If an array was passed in, an identical sized array is
returned.
'''
return 700*(10**(mel/2595.0)-1)
def get_filterbanks(nfilt=20, nfft=512, samplerate=16000, lowfreq=0,
highfreq=None):
'''
Compute a Mel-filterbank.
The filters are stored in the rows, the columns correspond to fft bins.
The filters are returned as an array of size nfilt * (nfft/2 + 1)
Args:
nfilt: the number of filters in the filterbank, default 20.
nfft: the FFT size. Default is 512.
samplerate: the samplerate of the signal we are working with. Affects
mel spacing.
lowfreq: lowest band edge of mel filters, default 0 Hz
highfreq: highest band edge of mel filters, default samplerate/2
Returns:
A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each
row holds 1 filter.
'''
highfreq = highfreq or samplerate/2
assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
# compute points evenly spaced in mels
lowmel = hz2mel(lowfreq)
highmel = hz2mel(highfreq)
melpoints = numpy.linspace(lowmel, highmel, nfilt+2)
# our points are in Hz, but we use fft bins, so we have to convert
# from Hz to fft bin number
bins = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate)
fbanks = numpy.zeros([nfilt, nfft/2+1])
for j in xrange(0, nfilt):
for i in xrange(int(bins[j]), int(bins[j+1])):
fbanks[j, i] = (i - bins[j])/(bins[j+1]-bins[j])
for i in xrange(int(bins[j+1]), int(bins[j+2])):
fbanks[j, i] = (bins[j+2]-i)/(bins[j+2]-bins[j+1])
return fbanks
def lifter(cepstra, liftering=22):
'''
Apply a cepstral lifter the the matrix of cepstra.
This has the effect of increasing the magnitude of the high frequency DCT
coeffs.
Args:
cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
liftering: the liftering coefficient to use. Default is 22. L <= 0
disables lifter.
Returns:
the lifted cepstra
'''
if liftering > 0:
_, ncoeff = numpy.shape(cepstra)
lift = 1+(liftering/2)*numpy.sin(numpy.pi
*numpy.arange(ncoeff)/liftering)
return lift*cepstra
else:
# values of liftering <= 0, do nothing
return cepstra
def deriv(features):
'''
Compute the first order derivative of the features
Args:
features: the input features
Returns:
the firs order derivative
'''
return convolve1d(features, [2, 1, 0, -1, -2], 0)
def delta(features):
'''
concatenate the first order derivative to the features
Args:
features: the input features
Returns:
the features concatenated with the first order derivative
'''
return numpy.concatenate((features, deriv(features)), 1)
def ddelta(features):
'''
concatenate the first and second order derivative to the features
Args:
features: the input features
Returns:
the features concatenated with the first and second order derivative
'''
deltafeat = deriv(features)
return numpy.concatenate((features, deltafeat, deriv(deltafeat)), 1)