add read words list and signal scale

commit 2217754bf0, parent ab74ee4bfc
@@ -2,3 +2,4 @@
*.model
[Mm]odel_speech/
*.wav
@@ -25,3 +25,7 @@ LSTM + CNN
A Markov model based on probabilistic graphs

## Log

Link: [progress log](https://github.com/nl8590687/ASRT_SpeechRecognition/blob/master/log.md)
@@ -50,14 +50,20 @@ def fbank(signal, samplerate, conf):
    Compute fbank features from an audio signal.

    Args:
        signal: the audio signal from which to compute features. Should be an
            N*1 array
        samplerate: the samplerate of the signal we are working with.
        conf: the feature configuration

    Returns:
        A numpy array of size (NUMFRAMES by nfilt) containing features, and a
        numpy vector containing the signal energy
    '''

    highfreq = int(conf['highfreq'])
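A minimal usage sketch of the fbank() interface shown above. Only the 'highfreq' key is confirmed by this hunk; the remaining configuration keys and the (commented-out) call are assumptions for illustration.

```python
import numpy as np

fs = 16000
signal = np.random.randn(fs)          # one second of synthetic audio

conf = {
    'highfreq': fs // 2,              # upper band edge in Hz (key seen in the diff)
    # further keys (window length, filter count, ...) are assumed to live here
}

# feat, energy = fbank(signal, fs, conf)
# feat.shape   -> (num_frames, nfilt)
# energy.shape -> (num_frames,)
```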
@@ -5,6 +5,7 @@ import os
import wave
import numpy as np
import matplotlib.pyplot as plt
import math

def read_wav_data(filename):
    '''
@@ -20,14 +21,27 @@ def read_wav_data(filename):
    wave_data = np.fromstring(str_data, dtype = np.short)  # convert the sound file data into an array
    wave_data.shape = -1, num_channel  # reshape by channel count: one column for mono, two columns for stereo
    wave_data = wave_data.T  # transpose the matrix
    time = np.arange(0, num_frame) * (1.0/framerate)  # compute the playback time of the sound, in seconds
    return wave_data, time
    wave_data = wave_data
    return wave_data, framerate

def wav_show(wave_data, time):  # display the sound waveform
    #wave_data, time = read_wave_data("C:\\Users\\nl\\Desktop\\A2_0.wav")
    #draw the wave
def wav_scale(energy):
    '''
    Normalize the energy of the speech signal
    '''
    sum=0
    for i in energy:
        sum=sum+i*i
    length=len(energy)
    print(length,sum)
    m=math.sqrt(length/sum)
    e=energy*m
    return e

def wav_show(wave_data, fs):  # display the sound waveform
    time = np.arange(0, len(wave_data)) * (1.0/fs)  # compute the playback time of the sound, in seconds
    # plot the sound waveform
    #plt.subplot(211)
    plt.plot(time, wave_data[0])
    plt.plot(time, wave_data)
    #plt.subplot(212)
    #plt.plot(time, wave_data[1], c = "g")
    plt.show()
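The wav_scale() helper above scales the signal by m = sqrt(len(energy) / sum(energy**2)), i.e. it normalises the average power to 1. Below is a vectorized sketch of the same computation; it is an assumed equivalent for illustration, not part of the commit.

```python
import numpy as np

def wav_scale_vectorized(energy):
    # scale so that the mean squared amplitude of the result is 1
    energy = np.asarray(energy, dtype=np.float64)
    return energy * np.sqrt(len(energy) / np.sum(energy ** 2))

x = np.array([1.0, -2.0, 3.0, -4.0])
y = wav_scale_vectorized(x)
print(np.mean(y ** 2))   # -> 1.0 (unit average power)
```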
@@ -53,11 +67,24 @@ def get_wav_symbol(filename):
    Read the speech symbols for every wav file in the given dataset
    Returns a dict holding the symbol set
    '''
    print('test')
    #if(__name__=='__main__'):
    txt_obj=open(filename,'r')  # open the file and read it
    txt_text=txt_obj.read()
    txt_lines=txt_text.split('\n')  # split the text into lines
    dic_symbol_list={}  # initialize the dictionary
    for i in txt_lines:
        if(i!=''):
            txt_l=i.split(' ')
            dic_symbol_list[txt_l[0]]=txt_l[1:]
    return dic_symbol_list

if(__name__=='__main__'):
    #dic=get_wav_symbol('E:\\语音数据集\\doc\\doc\\trans\\train.phone.txt')
    #print(dic)
    #dic=get_wav_list('E:\\语音数据集\\doc\\doc\\list\\train.wav.lst')
    #for i in dic:
        #print(i,dic[i])
    #wave_data, time = read_wav_data("C:\\Users\\nl\\Desktop\\A2_0.wav")
    #wav_show(wave_data,time)
    wave_data, fs = read_wav_data("A2_0.wav")
    wave_data[0]=wav_scale(wave_data[0])
    #print(fs)
    wav_show(wave_data[0],fs)
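A self-contained sketch of the parsing logic in get_wav_symbol(): each non-empty line is split on spaces, the first token becomes the key and the remaining tokens the symbol list. The sample text below is invented for illustration.

```python
sample = "A2_0 a1 a2 a3\nA2_1 b1 b2\n"   # invented transcript lines

dic_symbol_list = {}
for line in sample.split('\n'):
    if line != '':
        parts = line.split(' ')
        dic_symbol_list[parts[0]] = parts[1:]

print(dic_symbol_list)
# -> {'A2_0': ['a1', 'a2', 'a3'], 'A2_1': ['b1', 'b2']}
```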
@@ -0,0 +1,191 @@
'''@file sigproc.py
contains the signal processing functionality

The MIT License (MIT)

Copyright (c) 2013 James Lyons

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

This file includes routines for basic signal processing including framing and
computing power spectra.
Author: James Lyons 2012
'''

import math
import numpy
def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x, ))):
    '''
    Frame a signal into overlapping frames.

    Args:
        sig: the audio signal to frame.
        frame_len: length of each frame measured in samples.
        frame_step: number of samples after the start of the previous frame that
            the next frame should begin.
        winfunc: the analysis window to apply to each frame. By default no
            window is applied.

    Returns:
        an array of frames. Size is NUMFRAMES by frame_len.
    '''

    slen = len(sig)
    frame_len = int(round(frame_len))
    frame_step = int(round(frame_step))
    if slen <= frame_len:
        numframes = 1
    else:
        numframes = 1 + int(math.ceil((1.0*slen - frame_len)/frame_step))

    padlen = int((numframes-1)*frame_step + frame_len)

    zeros = numpy.zeros((padlen - slen,))
    padsignal = numpy.concatenate((sig, zeros))

    indices = (numpy.tile(numpy.arange(0, frame_len), (numframes, 1))
               + numpy.tile(numpy.arange(0, numframes*frame_step, frame_step),
                            (frame_len, 1)).T)
    indices = numpy.array(indices, dtype=numpy.int32)
    frames = padsignal[indices]
    win = numpy.tile(winfunc(frame_len), (numframes, 1))
    return frames*win
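A minimal usage sketch of framesig(), assuming 16 kHz audio framed with 25 ms windows every 10 ms; the signal and window choice are illustrative only.

```python
import numpy

sig = numpy.random.randn(16000)                    # one second at 16 kHz
frames = framesig(sig, frame_len=400, frame_step=160,
                  winfunc=lambda x: numpy.hamming(x))

# numframes = 1 + ceil((16000 - 400) / 160) = 99
print(frames.shape)   # -> (99, 400)
```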
def deframesig(frames, siglen, frame_len, frame_step,
               winfunc=lambda x: numpy.ones((x, ))):
    '''
    Performs the overlap-add procedure to undo the action of framesig.

    Args:
        frames: the array of frames.
        siglen: the length of the desired signal, use 0 if unknown. Output will
            be truncated to siglen samples.
        frame_len: length of each frame measured in samples.
        frame_step: number of samples after the start of the previous frame that
            the next frame should begin.
        winfunc: the analysis window to apply to each frame. By default no
            window is applied.

    Returns:
        a 1-D signal.
    '''

    frame_len = round(frame_len)
    frame_step = round(frame_step)
    numframes = numpy.shape(frames)[0]
    assert numpy.shape(frames)[1] == frame_len, '''"frames" matrix is wrong
        size, 2nd dim is not equal to frame_len'''

    indices = (numpy.tile(numpy.arange(0, frame_len), (numframes, 1))
               + numpy.tile(numpy.arange(0, numframes*frame_step, frame_step),
                            (frame_len, 1)).T)

    indices = numpy.array(indices, dtype=numpy.int32)
    padlen = (numframes-1)*frame_step + frame_len

    if siglen <= 0:
        siglen = padlen

    rec_signal = numpy.zeros((padlen, ))
    window_correction = numpy.zeros((padlen, ))
    win = winfunc(frame_len)

    for i in range(0, numframes):
        #add a little bit so it is never zero
        window_correction[indices[i, :]] = (window_correction[indices[i, :]]
                                            + win + 1e-15)

        rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]

    rec_signal = rec_signal/window_correction
    return rec_signal[0:siglen]
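A round-trip sketch: deframesig() reconstructs the signal from the frames produced by framesig() via overlap-add. With the default rectangular window the reconstruction matches the original up to the 1e-15 stabiliser added to the window correction; the sizes below are illustrative.

```python
import numpy

sig = numpy.random.randn(16000)
frames = framesig(sig, 400, 160)
rec = deframesig(frames, siglen=len(sig), frame_len=400, frame_step=160)

print(numpy.allclose(rec, sig))   # -> True
```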
def magspec(frames, nfft):
    '''
    Compute the magnitude spectrum of each frame in frames.

    If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    Args:
        frames: the array of frames. Each row is a frame.
        nfft: the FFT length to use. If NFFT > frame_len, the frames are
            zero-padded.

    Returns:
        If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will
        be the magnitude spectrum of the corresponding frame.
    '''

    complex_spec = numpy.fft.rfft(frames, nfft)
    return numpy.absolute(complex_spec)

def powspec(frames, nfft):
    '''
    Compute the power spectrum of each frame in frames.

    If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    Args:
        frames: the array of frames. Each row is a frame.
        nfft: the FFT length to use. If NFFT > frame_len, the frames are
            zero-padded.

    Returns:
        If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will
        be the power spectrum of the corresponding frame.
    '''
    return 1.0/nfft * numpy.square(magspec(frames, nfft))

def logpowspec(frames, nfft, norm=1):
    '''
    Compute the log power spectrum of each frame in frames.

    If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    Args:
        frames: the array of frames. Each row is a frame.
        nfft: the FFT length to use. If NFFT > frame_len, the frames are
            zero-padded.
        norm: If norm=1, the log power spectrum is normalised so that the max
            value (across all frames) is 0.

    Returns:
        If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will
        be the log power spectrum of the corresponding frame.
    '''
    ps = powspec(frames, nfft)
    ps[ps <= 1e-30] = 1e-30
    lps = 10*numpy.log10(ps)
    if norm:
        return lps - numpy.max(lps)
    else:
        return lps
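A sketch chaining the spectrum helpers. With nfft=512, numpy.fft.rfft yields 512/2 + 1 = 257 bins per frame, so each output is (num_frames, 257); the input frames here are random and purely illustrative.

```python
import numpy

frames = numpy.random.randn(99, 400)          # e.g. the output of framesig()

mag = magspec(frames, 512)                    # magnitude spectrum
pow_spec = powspec(frames, 512)               # power spectrum
log_ps = logpowspec(frames, 512, norm=1)      # log power, max normalised to 0 dB

print(mag.shape, pow_spec.shape, log_ps.max())   # -> (99, 257) (99, 257) 0.0
```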
def preemphasis(signal, coeff=0.95):
    '''
    Perform preemphasis on the input signal.

    Args:
        signal: The signal to filter.
        coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.

    Returns:
        the filtered signal.
    '''
    return numpy.append(signal[0], signal[1:]-coeff*signal[:-1])
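Pre-emphasis computes y[0] = x[0] and y[n] = x[n] - coeff*x[n-1], which boosts high frequencies before spectral analysis. A toy example with invented numbers:

```python
import numpy

x = numpy.array([1.0, 2.0, 3.0, 4.0])
print(preemphasis(x, coeff=0.95))   # -> [1.   1.05 1.1  1.15]
```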
log.md
@@ -8,5 +8,7 @@
If there are any problems, the team should write them down here directly

## Log
### 2017-08-28
Started work on the speech signal processing functionality
### 2017-08-22
Planning to try an implementation with Keras, based on LSTM/CNN
main.py
@@ -36,7 +36,7 @@ class ModelSpeech():  # speech model class
        return _model

    def TrainModel(self,datas,epoch = 2,save_step=5000,filename='model_speech/LSTM_CNN_model'):  # train the model
        print('test')
        pass

    def LoadModel(self,filename='model_speech/LSTM_CNN_model'):  # load model parameters
        self._model.load_weights(filename)

@@ -45,7 +45,7 @@ class ModelSpeech():  # speech model class
        self._model.save_weights(filename+'.model')

    def TestModel(self):  # test and evaluate the model
        print('test')
        pass

    @property
    def model(self):  # return the keras model