add read words list and signal scale

2017-08-29 00:06:08 +08:00 · 2017-08-29 00:06:08 +08:00 · 2217754bf0
parent ab74ee4bfc
commit 2217754bf0
7 changed files with 244 additions and 13 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,3 +2,4 @@

 *.model
 [Mm]odel_speech/
+*.wav
--- a/README.md
+++ b/README.md
@ -25,3 +25,7 @@ LSTM + CNN

 基于概率图的马尔可夫模型

+## Log
+日志
+
+链接：[进展日志](https://github.com/nl8590687/ASRT_SpeechRecognition/blob/master/log.md)
--- a/general_function/feature_computer.py
+++ b/general_function/feature_computer.py
@ -50,14 +50,20 @@ def fbank(signal, samplerate, conf):
    Compute fbank features from an audio signal.
 	从一个声音信号中计算fbank特征向量
    Args:
+	参数：
        signal: the audio signal from which to compute features. Should be an
            N*1 array
+			要计算特征的声音信号，一个N*1维的数组
        samplerate: the samplerate of the signal we are working with.
+			要处理信号的采样率
        conf: feature configuration
+			特征的配置

    Returns:
+	返回值：
        A numpy array of size (NUMFRAMES by nfilt) containing features, a numpy
        vector containing the signal energy
+		返回一个包含特征向量的numpy数组，一个包含信号能量的numpy向量
    '''

    highfreq = int(conf['highfreq'])
--- a/general_function/file_wav.py
+++ b/general_function/file_wav.py
@ -5,6 +5,7 @@ import os
 import wave
 import numpy as np
 import matplotlib.pyplot as plt  
+import math

 def read_wav_data(filename):
 	'''
@ -20,14 +21,27 @@ def read_wav_data(filename):
 	wave_data = np.fromstring(str_data, dtype = np.short) # 将声音文件数据转换为数组矩阵形式
 	wave_data.shape = -1, num_channel # 按照声道数将数组整形，单声道时候是一列数组，双声道时候是两列的矩阵
 	wave_data = wave_data.T # 将矩阵转置
-	time = np.arange(0, num_frame) * (1.0/framerate)  # 计算声音的播放时间，单位为秒
-	return wave_data, time  
+	wave_data = wave_data 
+	return wave_data, framerate  
 	
-def wav_show(wave_data, time): # 显示出来声音波形
-	#wave_data, time = read_wave_data("C:\\Users\\nl\\Desktop\\A2_0.wav")     
-	#draw the wave  
+def wav_scale(energy):
+	'''
+	Normalize a speech signal to unit RMS (root-mean-square) amplitude.
+
+	Args:
+		energy: 1-D sequence or numpy array of samples (any numeric dtype).
+
+	Returns:
+		A float64 numpy array equal to energy * sqrt(N / sum(energy**2)),
+		where N is the number of samples, so its mean squared value is 1.
+		An empty or all-zero input has no defined scale and is returned
+		unscaled (as a float64 copy).
+	'''
+	# Work in float64: squaring np.short samples (the dtype read_wav_data
+	# produces) in their own dtype overflows for |sample| > 181.
+	samples=np.asarray(energy, dtype=np.float64)
+	total=float(np.sum(np.square(samples)))
+	if(total==0.0):
+		# Silence or empty input: avoid dividing by zero.
+		return samples.copy()
+	return samples*math.sqrt(len(samples)/total)
+	
+def wav_show(wave_data, fs): # Plot the waveform of an audio signal
+	# wave_data: samples to plot; the caller in this file passes a single
+	#            channel (one row of the array returned by read_wav_data).
+	# fs: value used to convert sample indices to seconds; the caller passes
+	#     the frame rate returned by read_wav_data.
+	# Opens a matplotlib window via plt.show(); returns nothing.
+	time = np.arange(0, len(wave_data)) * (1.0/fs)  # Time of each sample, in seconds
+	# Draw the waveform
 	#plt.subplot(211)  
-	plt.plot(time, wave_data[0])  
+	plt.plot(time, wave_data)  
 	#plt.subplot(212)  
 	#plt.plot(time, wave_data[1], c = "g")  
 	plt.show()  
@ -53,11 +67,24 @@ def get_wav_symbol(filename):
 	读取指定数据集中，所有wav文件对应的语音符号
 	返回一个存储符号集的字典类型值
 	'''
-	print('test')
-#if(__name__=='__main__'):
+	txt_obj=open(filename,'r') # 打开文件并读入
+	txt_text=txt_obj.read()
+	txt_lines=txt_text.split('\n') # 文本分割
+	dic_symbol_list={} # 初始化字典
+	for i in txt_lines:
+		if(i!=''):
+			txt_l=i.split(' ')
+			dic_symbol_list[txt_l[0]]=txt_l[1:]
+	return dic_symbol_list
+	
+if(__name__=='__main__'):
+	# Manual smoke test: read a wav file, normalize its first channel and
+	# plot the result.
+	#dic=get_wav_symbol('E:\\语音数据集\\doc\\doc\\trans\\train.phone.txt')
+	#print(dic)
 	#dic=get_wav_list('E:\\语音数据集\\doc\\doc\\list\\train.wav.lst')
 	#for i in dic:
 		#print(i,dic[i])
-	#wave_data, time = read_wav_data("C:\\Users\\nl\\Desktop\\A2_0.wav")  
-	#wav_show(wave_data,time)
+	wave_data, fs = read_wav_data("A2_0.wav")  
+	# Keep the normalized signal in its own float array: wave_data is an
+	# int16 (np.short) array, so writing the scaled values back into
+	# wave_data[0] would truncate them to integers.
+	scaled=wav_scale(wave_data[0])
+	#print(fs)
+	wav_show(scaled,fs)
 	
--- a/general_function/sigproc.py
+++ b/general_function/sigproc.py
@ -0,0 +1,191 @@
+'''@file sigproc.py
+contains the signal processing functionality
+
+The MIT License (MIT)
+
+Copyright (c) 2013 James Lyons
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+This file includes routines for basic signal processing including framing and
+computing power spectra.
+Author: James Lyons 2012
+'''
+
+import math
+import numpy
+
+def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x, ))):
+    '''
+    Split a signal into (possibly overlapping) frames.
+
+    The signal is zero-padded at the end so that the last frame is complete.
+
+    Args:
+        sig: the audio signal to frame.
+        frame_len: length of each frame measured in samples (rounded to the
+            nearest integer).
+        frame_step: number of samples between the starts of consecutive
+            frames (rounded to the nearest integer).
+        winfunc: callable that takes the frame length and returns the
+            analysis window applied to every frame. The default is a window
+            of ones, i.e. no windowing.
+
+    Returns:
+        an array of frames. Size is NUMFRAMES by frame_len.
+    '''
+
+    n_samples = len(sig)
+    frame_len = int(round(frame_len))
+    frame_step = int(round(frame_step))
+
+    # A signal no longer than one frame yields exactly one frame; otherwise
+    # add as many steps as are needed to cover the remaining samples.
+    numframes = 1
+    if n_samples > frame_len:
+        numframes += int(math.ceil((1.0*n_samples - frame_len)/frame_step))
+
+    # Zero-pad the tail up to the end of the last frame.
+    padlen = int((numframes-1)*frame_step + frame_len)
+    padded = numpy.concatenate((sig, numpy.zeros((padlen - n_samples,))))
+
+    # indices[i, j] is the sample position of element j of frame i:
+    # the frame start (column vector) plus the in-frame offset (row vector).
+    starts = numpy.arange(0, numframes*frame_step, frame_step)
+    offsets = numpy.arange(0, frame_len)
+    indices = numpy.array(starts[:, numpy.newaxis] + offsets[numpy.newaxis, :],
+                          dtype=numpy.int32)
+
+    window = numpy.tile(winfunc(frame_len), (numframes, 1))
+    return padded[indices]*window
+
+def deframesig(frames, siglen, frame_len, frame_step,
+               winfunc=lambda x: numpy.ones((x, ))):
+    '''
+    Does overlap-add procedure to undo the action of framesig.
+
+    Args:
+        frames: the array of frames. Each row is a frame.
+        siglen: the length of the desired signal, use 0 if unknown. Output
+            will be truncated to siglen samples.
+        frame_len: length of each frame measured in samples.
+        frame_step: number of samples after the start of the previous frame that
+            the next frame should begin.
+        winfunc: the analysis window to apply to each frame. By default no
+            window is applied.
+
+    Returns:
+        a 1-D signal.
+
+    Raises:
+        AssertionError: if the second dimension of frames is not frame_len.
+    '''
+
+    # Convert to plain ints exactly as framesig does; these values are used
+    # as array sizes and index ranges below.
+    frame_len = int(round(frame_len))
+    frame_step = int(round(frame_step))
+    numframes = numpy.shape(frames)[0]
+    assert numpy.shape(frames)[1] == frame_len, '''"frames" matrix is wrong
+        size, 2nd dim is not equal to frame_len'''
+
+    # indices[i, j] is the position in the output of element j of frame i.
+    indices = (numpy.tile(numpy.arange(0, frame_len), (numframes, 1))
+               + numpy.tile(numpy.arange(0, numframes*frame_step, frame_step),
+                            (frame_len, 1)).T)
+
+    indices = numpy.array(indices, dtype=numpy.int32)
+    padlen = (numframes-1)*frame_step + frame_len
+
+    if siglen <= 0:
+        siglen = padlen
+
+    rec_signal = numpy.zeros((padlen, ))
+    window_correction = numpy.zeros((padlen, ))
+    win = winfunc(frame_len)
+
+    for i in range(0, numframes):
+        #add a little bit so it is never zero
+        window_correction[indices[i, :]] = (window_correction[indices[i, :]]
+                                            + win + 1e-15)
+
+        # Overlap-add: accumulate each frame at its position in the signal.
+        rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
+
+    # Divide by the accumulated window weight so that samples covered by
+    # several frames are averaged rather than summed.
+    rec_signal = rec_signal/window_correction
+    return rec_signal[0:siglen]
+
+def magspec(frames, nfft):
+    '''
+    Compute the magnitude spectrum of each frame in frames.
+
+    The spectrum is the absolute value of the real-input FFT
+    (numpy.fft.rfft) taken along the last axis.
+
+    Args:
+        frames: the array of frames. Each row is a frame.
+        nfft: the FFT length to use. If NFFT > frame_len, the frames are
+            zero-padded.
+
+    Returns:
+        If frames is an NxD matrix, output will be Nx(NFFT//2 + 1). Each row
+        will be the magnitude spectrum of the corresponding frame.
+    '''
+
+    return numpy.absolute(numpy.fft.rfft(frames, nfft))
+
+def powspec(frames, nfft):
+    '''
+    Compute the power spectrum of each frame in frames.
+
+    The power spectrum is the squared magnitude spectrum (see magspec)
+    scaled by 1/nfft.
+
+    Args:
+        frames: the array of frames. Each row is a frame.
+        nfft: the FFT length to use. If NFFT > frame_len, the frames are
+            zero-padded.
+
+    Returns:
+        An array with the same shape as magspec(frames, nfft). Each row will
+        be the power spectrum of the corresponding frame.
+    '''
+    magnitude = magspec(frames, nfft)
+    return 1.0/nfft * numpy.square(magnitude)
+
+def logpowspec(frames, nfft, norm=1):
+    '''
+    Compute the log power spectrum (in dB) of each frame in frames.
+
+    Args:
+        frames: the array of frames. Each row is a frame.
+        nfft: the FFT length to use. If NFFT > frame_len, the frames are
+            zero-padded.
+        norm: if truthy, the maximum over all frames is subtracted from the
+            result, so the largest value of the returned spectrum is 0.
+
+    Returns:
+        An array with the same shape as powspec(frames, nfft). Each row will
+        be 10*log10 of the power spectrum of the corresponding frame.
+    '''
+    power = powspec(frames, nfft)
+    # Floor the power at 1e-30 so the logarithm never receives zero.
+    power[power <= 1e-30] = 1e-30
+    log_power = 10*numpy.log10(power)
+    if not norm:
+        return log_power
+    return log_power - numpy.max(log_power)
+
+def preemphasis(signal, coeff=0.95):
+    '''
+    Apply a first-order pre-emphasis filter to the input signal.
+
+    The first sample is kept as is; every later sample n becomes
+    signal[n] - coeff*signal[n-1].
+
+    Args:
+        signal: The signal to filter.
+        coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
+
+    Returns:
+        the filtered signal.
+    '''
+    first_sample = signal[0]
+    filtered_tail = signal[1:] - coeff*signal[:-1]
+    return numpy.append(first_sample, filtered_tail)
--- a/log.md
+++ b/log.md
@ -8,5 +8,7 @@
 如果有什么问题，团队内部需要在这里直接写出来

 ## Log
+### 2017-08-28
+开始准备制作语音信号处理方面的功能
 ### 2017-08-22
 准备使用Keras基于LSTM/CNN尝试实现
--- a/main.py
+++ b/main.py
@ -36,7 +36,7 @@ class ModelSpeech(): # 语音模型类
 		return _model

 	def TrainModel(self,datas,epoch = 2,save_step=5000,filename='model_speech/LSTM_CNN_model'): # 训练模型
-		print('test')
+		pass

 	def LoadModel(self,filename='model_speech/LSTM_CNN_model'): # 加载模型参数
 		self._model.load_weights(filename)
@ -45,7 +45,7 @@ class ModelSpeech(): # 语音模型类
 		self._model.save_weights(filename+'.model')

 	def TestModel(self): # 测试检验模型效果
-		print('test')
+		pass

 	@property
 	def model(self): # 返回keras model