From 2217754bf0ad1ef6a13367425954d8a4fe8d7bfc Mon Sep 17 00:00:00 2001 From: nl8590687 <3210346136@qq.com> Date: Tue, 29 Aug 2017 00:06:08 +0800 Subject: [PATCH] add read words list and signal scale --- .gitignore | 3 +- README.md | 4 + general_function/feature_computer.py | 6 + general_function/file_wav.py | 47 +++++-- general_function/sigproc.py | 191 +++++++++++++++++++++++++++ log.md | 2 + main.py | 4 +- 7 files changed, 244 insertions(+), 13 deletions(-) create mode 100644 general_function/sigproc.py diff --git a/.gitignore b/.gitignore index a5cf589..c49d806 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ ## Ignore some files and folders for copyright and other reasons. *.model -[Mm]odel_speech/ \ No newline at end of file +[Mm]odel_speech/ +*.wav \ No newline at end of file diff --git a/README.md b/README.md index ca13d7c..2551416 100644 --- a/README.md +++ b/README.md @@ -25,3 +25,7 @@ LSTM + CNN 基于概率图的马尔可夫模型 +## Log +日志 + +链接:[进展日志](https://github.com/nl8590687/ASRT_SpeechRecognition/blob/master/log.md) \ No newline at end of file diff --git a/general_function/feature_computer.py b/general_function/feature_computer.py index b7ac0ac..2db9952 100644 --- a/general_function/feature_computer.py +++ b/general_function/feature_computer.py @@ -50,14 +50,20 @@ def fbank(signal, samplerate, conf): Compute fbank features from an audio signal. 从一个声音信号中计算fbank特征向量 Args: + 参数: signal: the audio signal from which to compute features. Should be an N*1 array + 要计算特征的声音信号,一个N*1维的数组 samplerate: the samplerate of the signal we are working with. + 要处理信号的采样率 conf: feature configuration + 特征的配置 Returns: + 返回值: A numpy array of size (NUMFRAMES by nfilt) containing features, a numpy vector containing the signal energy + 返回一个包含特征向量的numpy数组,一个包含信号能量的numpy向量 ''' highfreq = int(conf['highfreq']) diff --git a/general_function/file_wav.py b/general_function/file_wav.py index f6fe5a5..8b8a7b5 100644 --- a/general_function/file_wav.py +++ b/general_function/file_wav.py @@ -5,6 +5,7 @@ import os import wave import numpy as np import matplotlib.pyplot as plt +import math def read_wav_data(filename): ''' @@ -20,14 +21,27 @@ def read_wav_data(filename): wave_data = np.fromstring(str_data, dtype = np.short) # 将声音文件数据转换为数组矩阵形式 wave_data.shape = -1, num_channel # 按照声道数将数组整形,单声道时候是一列数组,双声道时候是两列的矩阵 wave_data = wave_data.T # 将矩阵转置 - time = np.arange(0, num_frame) * (1.0/framerate) # 计算声音的播放时间,单位为秒 - return wave_data, time + wave_data = wave_data + return wave_data, framerate -def wav_show(wave_data, time): # 显示出来声音波形 - #wave_data, time = read_wave_data("C:\\Users\\nl\\Desktop\\A2_0.wav") - #draw the wave +def wav_scale(energy): + ''' + 语音信号能量归一化 + ''' + sum=0 + for i in energy: + sum=sum+i*i + length=len(energy) + print(length,sum) + m=math.sqrt(length/sum) + e=energy*m + return e + +def wav_show(wave_data, fs): # 显示出来声音波形 + time = np.arange(0, len(wave_data)) * (1.0/fs) # 计算声音的播放时间,单位为秒 + # 画声音波形 #plt.subplot(211) - plt.plot(time, wave_data[0]) + plt.plot(time, wave_data) #plt.subplot(212) #plt.plot(time, wave_data[1], c = "g") plt.show() @@ -53,11 +67,24 @@ def get_wav_symbol(filename): 读取指定数据集中,所有wav文件对应的语音符号 返回一个存储符号集的字典类型值 ''' - print('test') -#if(__name__=='__main__'): + txt_obj=open(filename,'r') # 打开文件并读入 + txt_text=txt_obj.read() + txt_lines=txt_text.split('\n') # 文本分割 + dic_symbol_list={} # 初始化字典 + for i in txt_lines: + if(i!=''): + txt_l=i.split(' ') + dic_symbol_list[txt_l[0]]=txt_l[1:] + return dic_symbol_list + +if(__name__=='__main__'): + #dic=get_wav_symbol('E:\\语音数据集\\doc\\doc\\trans\\train.phone.txt') + #print(dic) #dic=get_wav_list('E:\\语音数据集\\doc\\doc\\list\\train.wav.lst') #for i in dic: #print(i,dic[i]) - #wave_data, time = read_wav_data("C:\\Users\\nl\\Desktop\\A2_0.wav") - #wav_show(wave_data,time) + wave_data, fs = read_wav_data("A2_0.wav") + wave_data[0]=wav_scale(wave_data[0]) + #print(fs) + wav_show(wave_data[0],fs) diff --git a/general_function/sigproc.py b/general_function/sigproc.py new file mode 100644 index 0000000..2147640 --- /dev/null +++ b/general_function/sigproc.py @@ -0,0 +1,191 @@ +'''@file sigproc.py +contains the signal processing functionality + +The MIT License (MIT) + +Copyright (c) 2013 James Lyons + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +This file includes routines for basic signal processing including framing and +computing power spectra. +Author: James Lyons 2012 +''' + +import math +import numpy + +def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x, ))): + ''' + Frame a signal into overlapping frames. + + Args: + sig: the audio signal to frame. + frame_len: length of each frame measured in samples. + frame_step: number of samples after the start of the previous frame that + the next frame should begin. + winfunc: the analysis window to apply to each frame. By default no + window is applied. + + Returns: + an array of frames. Size is NUMFRAMES by frame_len. + ''' + + slen = len(sig) + frame_len = int(round(frame_len)) + frame_step = int(round(frame_step)) + if slen <= frame_len: + numframes = 1 + else: + numframes = 1 + int(math.ceil((1.0*slen - frame_len)/frame_step)) + + padlen = int((numframes-1)*frame_step + frame_len) + + zeros = numpy.zeros((padlen - slen,)) + padsignal = numpy.concatenate((sig, zeros)) + + indices = (numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + + numpy.tile(numpy.arange(0, numframes*frame_step, frame_step), + (frame_len, 1)).T) + indices = numpy.array(indices, dtype=numpy.int32) + frames = padsignal[indices] + win = numpy.tile(winfunc(frame_len), (numframes, 1)) + return frames*win + +def deframesig(frames, siglen, frame_len, frame_step, + winfunc=lambda x: numpy.ones((x, ))): + ''' + Does overlap-add procedure to undo the action of framesig. + + Args: + frames the: array of frames. + siglen the: length of the desired signal, use 0 if unknown. Output will + be truncated to siglen samples. + frame_len: length of each frame measured in samples. + frame_step: number of samples after the start of the previous frame that + the next frame should begin. + winfunc: the analysis window to apply to each frame. By default no + window is applied. + + Returns: + a 1-D signal. + ''' + + frame_len = round(frame_len) + frame_step = round(frame_step) + numframes = numpy.shape(frames)[0] + assert numpy.shape(frames)[1] == frame_len, '''"frames" matrix is wrong + size, 2nd dim is not equal to frame_len''' + + indices = (numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + + numpy.tile(numpy.arange(0, numframes*frame_step, frame_step), + (frame_len, 1)).T) + + indices = numpy.array(indices, dtype=numpy.int32) + padlen = (numframes-1)*frame_step + frame_len + + if siglen <= 0: + siglen = padlen + + rec_signal = numpy.zeros((padlen, )) + window_correction = numpy.zeros((padlen, )) + win = winfunc(frame_len) + + for i in range(0, numframes): + #add a little bit so it is never zero + window_correction[indices[i, :]] = (window_correction[indices[i, :]] + + win + 1e-15) + + rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :] + + rec_signal = rec_signal/window_correction + return rec_signal[0:siglen] + +def magspec(frames, nfft): + ''' + Compute the magnitude spectrum of each frame in frames. + + If frames is an NxD matrix, output will be NxNFFT. + + Args: + frames: the array of frames. Each row is a frame. + nfft: the FFT length to use. If NFFT > frame_len, the frames are + zero-padded. + + Returns: + If frames is an NxD matrix, output will be NxNFFT. Each row will be the + magnitude spectrum of the corresponding frame. + ''' + + complex_spec = numpy.fft.rfft(frames, nfft) + return numpy.absolute(complex_spec) + +def powspec(frames, nfft): + ''' + Compute the power spectrum of each frame in frames. + + If frames is an NxD matrix, output will be NxNFFT. + + Args: + frames: the array of frames. Each row is a frame. + nfft: the FFT length to use. If NFFT > frame_len, the frames are + zero-padded. + + Returns: + If frames is an NxD matrix, output will be NxNFFT. Each row will be the + power spectrum of the corresponding frame. + ''' + return 1.0/nfft * numpy.square(magspec(frames, nfft)) + +def logpowspec(frames, nfft, norm=1): + ''' + Compute the log power spectrum of each frame in frames. + + If frames is an NxD matrix, output will be NxNFFT. + + Args: + frames: the array of frames. Each row is a frame. + nfft: the FFT length to use. If NFFT > frame_len, the frames are + zero-padded. + norm: If norm=1, the log power spectrum is normalised so that the max + value (across all frames) is 1. + + Returns: + If frames is an NxD matrix, output will be NxNFFT. Each row will be the + log power spectrum of the corresponding frame. + ''' + ps = powspec(frames, nfft) + ps[ps <= 1e-30] = 1e-30 + lps = 10*numpy.log10(ps) + if norm: + return lps - numpy.max(lps) + else: + return lps + +def preemphasis(signal, coeff=0.95): + ''' + perform preemphasis on the input signal. + + Args: + signal: The signal to filter. + coeff: The preemphasis coefficient. 0 is no filter, default is 0.95. + + Returns: + the filtered signal. + ''' + return numpy.append(signal[0], signal[1:]-coeff*signal[:-1]) diff --git a/log.md b/log.md index 4f362ae..3a11cd2 100644 --- a/log.md +++ b/log.md @@ -8,5 +8,7 @@ 如果有什么问题,团队内部需要在这里直接写出来 ## Log +### 2017-08-28 +开始准备制作语音信号处理方面的功能 ### 2017-08-22 准备使用Keras基于LSTM/CNN尝试实现 \ No newline at end of file diff --git a/main.py b/main.py index f29dc2c..533f55a 100644 --- a/main.py +++ b/main.py @@ -36,7 +36,7 @@ class ModelSpeech(): # 语音模型类 return _model def TrainModel(self,datas,epoch = 2,save_step=5000,filename='model_speech/LSTM_CNN_model'): # 训练模型 - print('test') + pass def LoadModel(self,filename='model_speech/LSTM_CNN_model'): # 加载模型参数 self._model.load_weights(filename) @@ -45,7 +45,7 @@ class ModelSpeech(): # 语音模型类 self._model.save_weights(filename+'.model') def TestModel(self): # 测试检验模型效果 - print('test') + pass @property def model(self): # 返回keras model