基本完成除添加模型之外的其他部分，不过尚未测试

2017-09-08 20:18:46 +08:00 · 2017-09-08 20:18:46 +08:00 · 4ca877a046
parent c621c0aeb9
commit 4ca877a046
4 changed files with 35 additions and 95 deletions
--- a/README.md
+++ b/README.md
@ -1,43 +1 @@
-# ASRT_SpeechRecognition
-基于深度学习的语音识别系统
-
-## Introduction
-简介
-
-可以尝试使用Keras进行制作
-
-本项目将使用TensorFlow基于递归神经网络和卷积神经网络进行制作。
-
-This project will use TensorFlow based on RNN and CNN to implement. 
-
-本项目尚未完成，想要Fork的同学请手慢。
-
-## Model
-模型
-
-### Speech Model
-语音模型
-
-LSTM + CNN
-
-### Language Model
-语言模型
-
-基于概率图的马尔可夫模型
-
-## Python Import
-Python的依赖库
-
-* python_speech_features
-* TensorFlow
-* Keras
-* Numpy
-* wave
-* matplotlib
-* math
-* Scipy
-
-## Log
-日志
-
-链接：[进展日志](https://github.com/nl8590687/ASRT_SpeechRecognition/blob/master/log.md)
+# ASRT_SpeechRecognition
基于深度学习的语音识别系统

## Introduction 简介

本项目使用Keras、TensorFlow基于长短时记忆神经网络和卷积神经网络以及CTC进行制作。

This project is implemented with Keras and TensorFlow, based on LSTM, CNN and CTC.

本项目尚未完成，想要Fork的同学请手慢。

## Model 模型

### Speech Model 语音模型

CNN + LSTM + CTC

### Language Model 语言模型

基于概率图的马尔可夫模型

## Python Import
Python的依赖库

* python_speech_features
* TensorFlow
* Keras
* Numpy
* wave
* matplotlib
* math
* Scipy

## Log
日志

链接：[进展日志](https://github.com/nl8590687/ASRT_SpeechRecognition/blob/master/log.md)
--- a/log.md
+++ b/log.md
@ -1,18 +1 @@
-# ASRT_SpeechRecognition
-基于深度学习的语音识别系统
-
-## Introduction
-
-这里是更新记录日志文件
-
-如果有什么问题，团队内部需要在这里直接写出来
-
-## Log
-### 2017-08-31
-数据处理部分的代码基本完成，现在准备撸模型
-### 2017-08-29
-准备使用现有的包[python_speech_features](https://github.com/jameslyons/python_speech_features)来实现特征的提取，以及求一阶二阶差分。
-### 2017-08-28
-开始准备制作语音信号处理方面的功能
-### 2017-08-22
-准备使用Keras基于LSTM/CNN尝试实现
+# ASRT_SpeechRecognition
基于深度学习的语音识别系统

## Introduction

这里是更新记录日志文件

如果有什么问题，团队内部需要在这里直接写出来

## Log
### 2017-09-08
基本完成除了添加模型之外的其他部分代码
### 2017-08-31
数据处理部分的代码基本完成，现在准备撸模型
### 2017-08-29
准备使用现有的包[python_speech_features](https://github.com/jameslyons/python_speech_features)来实现特征的提取，以及求一阶二阶差分。
### 2017-08-28
开始准备制作语音信号处理方面的功能
### 2017-08-22
准备使用Keras基于LSTM/CNN尝试实现
--- a/main.py
+++ b/main.py
@ -15,11 +15,11 @@ from keras import backend as K
 from readdata import DataSpeech

 class ModelSpeech(): # 语音模型类
-	def __init__(self,MS_EMBED_SIZE = 64,BATCH_SIZE = 32):
+	def __init__(self,MS_OUTPUT_SIZE = 1283,BATCH_SIZE = 32):
 		'''
 		初始化
 		'''
-		self.MS_EMBED_SIZE = MS_EMBED_SIZE # LSTM 的大小
+		self.MS_OUTPUT_SIZE = MS_OUTPUT_SIZE # 神经网络最终输出的每一个字符向量维度的大小
 		self.BATCH_SIZE = BATCH_SIZE # 一次训练的batch
 		self._model = self.CreateModel() 

@ -29,10 +29,13 @@ class ModelSpeech(): # 语音模型类
 		输入层：39维的特征值序列，一条语音数据的最大长度设为1500（大约15s）
 		隐藏层一：1024个神经元的卷积层
 		隐藏层二：池化层，池化窗口大小为2
-		隐藏层三：Dropout层，需要断开的神经元的比例为0.3，防止过拟合
+		隐藏层三：Dropout层，需要断开的神经元的比例为0.2，防止过拟合
 		隐藏层四：循环层、LSTM层
-		隐藏层五：Dropout层，需要断开的神经元的比例为0.3，防止过拟合
-		输出层：全连接层，神经元数量为1279，使用softmax作为激活函数，使用CTC的loss作为损失函数
+		隐藏层五：Dropout层，需要断开的神经元的比例为0.2，防止过拟合
+		隐藏层六：全连接层，神经元数量为self.MS_OUTPUT_SIZE，使用softmax作为激活函数，
+		输出层：lambda层，即CTC层，使用CTC的loss作为损失函数，实现多输出
+		
+		当前未完成，针对多输出的CTC层尚未添加
 		'''
 		# 每一帧使用13维mfcc特征及其13维一阶差分和13维二阶差分表示，最大信号序列长度为1500
 		layer_input = Input((1500,39))
@ -42,24 +45,13 @@ class ModelSpeech(): # 语音模型类
 		layer_h3 = Dropout(0.2)(layer_h2) # 随机中断部分神经网络连接，防止过拟合
 		layer_h4 = LSTM(256, activation='relu', use_bias=True)(layer_h3) # LSTM层
 		layer_h5 = Dropout(0.2)(layer_h4) # 随机中断部分神经网络连接，防止过拟合
-		layer_h6 = Dense(1279, activation="softmax")(layer_h5) # 全连接层
+		layer_h6 = Dense(self.MS_OUTPUT_SIZE, activation="softmax")(layer_h5) # 全连接层
 		
 		#labels = Input(name='the_labels', shape=[60], dtype='float32')
-		layer_out = Lambda(ctc_lambda_func,output_shape=(1279,), name='ctc')(layer_h6) # CTC
+		layer_out = Lambda(ctc_lambda_func,output_shape=(self.MS_OUTPUT_SIZE, ), name='ctc')(layer_h6) # CTC
 		_model = Model(inputs = layer_input, outputs = layer_out)
 		
-		#_model = Sequential()
 		
-		#_model.add(Conv1D(256, 5,input_shape=(1500,39), use_bias=True, padding="valid"))
-		#_model.add(MaxPooling1D(pool_size=2, strides=None, padding="valid"))
-		#_model.add(Dropout(0.3)) # 随机中断部分神经网络连接
-		
-		#_model.add(LSTM(256, activation='relu', use_bias=True))
-		#_model.add(Dropout(0.3)) # 随机中断部分神经网络连接
-		
-		#_model.add(Dense(1279, activation="softmax"))
-       ##_model.add(Lambda(ctc_lambda_func,output_shape=(1,),name='ctc'))
-       
 		#_model.compile(optimizer="sgd", loss='categorical_crossentropy',metrics=["accuracy"])
 		_model.compile(optimizer="sgd", loss='ctc',metrics=["accuracy"])
 		return _model
@ -68,7 +60,7 @@ class ModelSpeech(): # 语音模型类
 		#labels, y_pred, input_length, label_length = args
 		y_pred = args[:,2:,:]
 		#y_pred = y_pred[:, 2:, :]
-		return K.ctc_decode(y_pred,1279)
+		return K.ctc_decode(y_pred,self.MS_OUTPUT_SIZE)
 		#return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
 	
 	def TrainModel(self,datapath,epoch = 2,save_step=1000,filename='model_speech/LSTM_CNN_model'):
@ -84,11 +76,14 @@ class ModelSpeech(): # 语音模型类
 		data.LoadDataList('train')
 		num_data=DataSpeech.GetDataNum() # 获取数据的数量
 		for epoch in range(epoch): # 迭代轮数
+			print('[running] train epoch %d .' % epoch)
 			n_step = 0 # 迭代数据数
 			while True:
 				try:
+					print('[message] epoch %d . Have train datas %d+'%(epoch, n_step*save_step))
 					# data_genetator是一个生成器函数
-					self._model.fit_generator(data.data_genetator, save_step, nb_worker=2)
+					yielddatas = data.data_genetator(self.BATCH_SIZE)
+					self._model.fit_generator(yielddatas, save_step, nb_worker=2)
 					n_step += 1
 				except StopIteration:
 					print('[error] generator error. please check data format.')
@ -109,17 +104,28 @@ class ModelSpeech(): # 语音模型类
 		'''
 		self._model.save_weights(filename+comment+'.model')

-	def TestModel(self):
+	def TestModel(self, datapath, str_dataset='dev'):
 		'''
 		测试检验模型效果
 		'''
-		pass
+		data=DataSpeech(datapath)
+		data.LoadDataList(str_dataset) 
+		num_data = DataSpeech.GetDataNum() # 获取数据的数量
+		try:
+			gen = data.data_genetator(num_data)
+			for i in range(1):
+				X, y = gen
+			r = self._model.test_on_batch(X, y)
+			print(r)
+		except StopIteration:
+			print('[Error] Model Test Error. please check data format.')

 	def Predict(self,x):
 		'''
 		预测结果
 		'''
-		r = predict_on_batch(x)
+		r = self._model.predict_on_batch(x)
+		print(r)
 		return r
 		pass
 		
@ -132,5 +138,8 @@ class ModelSpeech(): # 语音模型类


 if(__name__=='__main__'):
-	pass
+	datapath = 'E:\\语音数据集'
+	ms = ModelSpeech()
+	ms.TrainModel(datapath)
+	ms.TestModel(datapath)
 	
--- a/readdata.py
+++ b/readdata.py
@ -173,16 +173,6 @@ class DataSpeech():
 		return v
 	
 if(__name__=='__main__'):
-	#wave_data, fs = read_wav_data("general_function\\A2_0.wav")  
-	#print(wave_data)
-	#(fs,wave_data)=wav.read('E:\\国创项目工程\代码\\ASRT_SpeechRecognition\\general_function\\A2_0.wav')
-	#wav_show(wave_data[0],fs)
-	#mfcc_feat = mfcc(wave_data[0],fs) # 计算MFCC特征
-	#print(mfcc_feat[0:3,:])
-	#d_mfcc_feat_1 = delta(mfcc_feat, 2)
-	#print(d_mfcc_feat_1[0,:])
-	#d_mfcc_feat_2 = delta(d_mfcc_feat_1, 2)
-	#print(d_mfcc_feat_2[0,:])
 	#path='E:\\语音数据集'
 	#l=DataSpeech(path)
 	#l.LoadDataList('train')