Add CTC loss and fix bugs

This commit is contained in:
nl8590687 2018-03-11 21:21:12 +08:00
parent 2ec4b937c9
commit 6517062b94
4 changed files with 99 additions and 23 deletions

log.md

@ -1 +1,31 @@
# ASRT_SpeechRecognition
A deep-learning-based speech recognition system
## Introduction
This is the changelog file.
If there are any problems, they can be written down here directly.
## Log
### 2018-03-11
Added the CTC layer to the neural network and defined the CTC_loss loss function, but right now there are some serious bugs that keep the model from compiling, and I still cannot find the cause... (T_T)
#### Errors
ValueError: Shapes (?, ?) and (?,) must have the same rank
ValueError: Shapes (?, ?) and (?,) are not compatible
ValueError: Shape (?, ?) must have rank 1
#### --------------------------------
Any experts passing by who know how to fix this? Please help out. ヾ(o▽`o)ノ°° Thanks!
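For reference, these are the rank errors that `K.ctc_batch_cost` raises when `input_length` and `label_length` are not `(batch_size, 1)`-shaped tensors. Below is a minimal sketch of the usual Keras wiring for CTC (following the pattern of the official Keras OCR example); `input_data`, `y_pred` and `MAX_LABEL_LEN` are placeholder names, not identifiers from this repository:

```python
from keras import backend as K
from keras.layers import Input, Lambda
from keras.models import Model

def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    # input_length / label_length must be (batch_size, 1) integer tensors,
    # one entry per utterance; wrong ranks trigger the errors above
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

# y_pred: softmax output of the acoustic model, shape (batch, time_steps, MS_OUTPUT_SIZE)
# input_data: the Input layer that feeds the acoustic model (placeholder name)
labels = Input(name='the_labels', shape=[MAX_LABEL_LEN], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(
    [y_pred, labels, input_length, label_length])

model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
# the Lambda layer already returns the CTC loss, so the compiled loss just passes it through
model.compile(optimizer='sgd', loss={'ctc': lambda y_true, y_pred: y_pred})
```

The label sequences and both length arrays are then fed as regular inputs during training, with a dummy all-zero target for the `ctc` output.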
### 2017-09-08
Basically finished all of the code except for adding the model.
### 2017-08-31
The data processing code is basically done; now getting ready to build the model.
### 2017-08-29
Planning to use the existing package [python_speech_features](https://github.com/jameslyons/python_speech_features) to implement feature extraction and to compute the first- and second-order differences.
### 2017-08-28
Started working on the speech signal processing functionality.
### 2017-08-22
Planning to attempt an implementation with Keras based on LSTM/CNN.

main.py

@ -13,12 +13,14 @@ from keras.layers import Conv1D,LSTM,MaxPooling1D, Lambda, TimeDistributed, Acti
from keras import backend as K
from readdata import DataSpeech
from neural_network import ctc_layer
from neural_network.ctc_layer import ctc_layer
from neural_network.ctc_loss import ctc_batch_loss
class ModelSpeech(): # speech model class
def __init__(self,MS_OUTPUT_SIZE = 1283,BATCH_SIZE = 32):
'''
Initialization
The default size of the pinyin output representation is 1283, i.e. 1282 pinyin classes plus 1 blank
'''
self.MS_OUTPUT_SIZE = MS_OUTPUT_SIZE # dimension of each character vector in the network's final output
self.BATCH_SIZE = BATCH_SIZE # batch size for one training step
@ -47,8 +49,10 @@ class ModelSpeech(): # 语音模型类
layer_h4 = LSTM(256, activation='relu', use_bias=True, return_sequences=True)(layer_h3) # LSTM layer
layer_h5 = Dropout(0.2)(layer_h4) # randomly drop some connections to prevent overfitting
layer_h6 = Dense(self.MS_OUTPUT_SIZE, activation="softmax")(layer_h5) # fully connected layer
#layer_h6 = Dense(1283, activation="softmax")(layer_h5) # fully connected layer
layer_out = ctc_layer()(layer_h6) # CTC layer, not implemented yet!
layer_out = ctc_layer(self.MS_OUTPUT_SIZE, self.BATCH_SIZE)(layer_h6) # CTC layer, may have bugs
#layer_out = ctc_layer(1283, 32)(layer_h6) # CTC layer, may have bugs
#labels = Input(name='the_labels', shape=[60], dtype='float32')
#layer_out = Lambda(ctc_lambda_func,output_shape=(self.MS_OUTPUT_SIZE, ), name='ctc')(layer_h6) # CTC
@ -68,16 +72,18 @@ class ModelSpeech(): # 语音模型类
_model.compile(optimizer="sgd", loss='categorical_crossentropy',metrics=["accuracy"])
#_model.compile(optimizer="sgd", loss='ctc',metrics=["accuracy"])
#_model.compile(optimizer="sgd", loss='categorical_crossentropy',metrics=["accuracy"])
_model.compile(optimizer = "sgd", loss = ctc_batch_loss, metrics = ["accuracy"])
return _model
'''
def ctc_lambda_func(args):
#labels, y_pred, input_length, label_length = args
y_pred = args[:,2:,:]
#y_pred = y_pred[:, 2:, :]
return K.ctc_decode(y_pred,self.MS_OUTPUT_SIZE)
#return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
'''
def TrainModel(self,datapath,epoch = 2,save_step=1000,filename='model_speech/LSTM_CNN_model'):
'''
@ -156,6 +162,6 @@ class ModelSpeech(): # 语音模型类
if(__name__=='__main__'):
datapath = 'E:\\语音数据集' # path to the speech dataset
ms = ModelSpeech()
ms.TrainModel(datapath)
ms.TestModel(datapath)
#ms.TrainModel(datapath)
#ms.TestModel(datapath)
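A note on the `_model.compile(optimizer = "sgd", loss = ctc_batch_loss, ...)` line above: a loss function handed to `model.compile` is only ever called as `loss(y_true, y_pred)`, so the per-utterance frame and label lengths that `K.ctc_batch_cost` needs cannot be supplied through it. With the four-input wiring sketched in log.md those lengths travel as ordinary model inputs, and training uses a dummy target, roughly like this (the variable names are placeholders for whatever `DataSpeech` produces):

```python
import numpy as np

# X: padded feature arrays, y: padded label sequences,
# x_len / y_len: per-utterance frame and label counts, each shaped (batch, 1)
dummy_targets = np.zeros((X.shape[0],))  # ignored: the 'ctc' output is already the loss
model.fit([X, y, x_len, y_len], dummy_targets, batch_size=32, epochs=2)
```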

neural_network/ctc_layer.py

@ -21,34 +21,61 @@ import tensorflow as tf
# inherits from the Layer base class
class ctc_layer(Layer):
'''
Implementation of the CTC layer; for details please refer to the paper... as well as the CTC implementation in tensorflow,
and add it to the keras network as a custom layer
This class implements the CTC layer; for details please refer to the paper...
tensorflow and keras contain some CTC implementations,
which are added to the network built with keras via this custom layer
Arguments
output_dim: size of the output sequence tensor at each time step
There may currently be bugs
'''
def __init__(self, input_dim, output_dim, **kwargs):
def __init__(self, output_dim, batch_size, **kwargs):
'''
This is the initialization of the neural network's CTC layer
'''
#if 'input_shape' not in kwargs and 'input_dim' in kwargs:
# kwargs['input_shape'] = (kwargs.pop('input_dim'), kwargs.pop('input_dim'),)
super(ctc_layer, self).__init__(**kwargs)
self.input_dim = input_dim
#self.input_dim = input_dim
#self.input_spec = [InputSpec(dtype=(None,,output_dim),ndim=3, axes={None: input_dim})]
self.output_dim = output_dim
self.batch_size = batch_size
#self.input_spec = InputSpec(min_ndim=3)
#super(ctc_layer, self).build(input_shape) # Be sure to call this somewhere!
pass
def build(self, input_shape):
assert len(input_shape) >= 2
input_dim = input_shape[-1]
# Create a trainable weight variable for this layer.
self.kernel = self.add_weight(name='kernel',
shape=('''input_shape[0],''' self.output_dim, -1),
shape=(input_dim, self.output_dim),
initializer='uniform',
trainable=True)
super(MyLayer, self).build(input_shape) # Be sure to call this somewhere!
#super(ctc_layer, self).build(input_shape) # Be sure to call this somewhere!
#self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
self.input_spec = [InputSpec(min_ndim=3)] # , axes={1: 748, -1: self.output_dim}
self.built = True
def call(self, x, mask=None):
decoded_dense, log_prob = K.ctc_decode(x,self.input_dim)
decoded_sequence = K.ctc_label_dense_to_sparse(decoded_dense, decoded_dense.shape[0])
return decoded_sequence
output = K.dot(x, self.kernel)
#output.shape[0] = self.batch_size
decoded_dense, log_prob = K.ctc_decode(output,tf.Variable((output.shape[1],output.shape[2]),dtype=tf.int64))
#decoded_dense, log_prob = K.ctc_decode(output,output.shape[1])
#decoded_sequence = K.ctc_label_dense_to_sparse(decoded_dense, len(decoded_dense))
#return decoded_sequence
return decoded_dense
def get_config(self):
config = {
'output_dim': self.output_dim
}
base_config = super(ctc_layer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
pass
def compute_output_shape(self, input_shape):
return (input_shape[0], input_shape[1], self.output_dim)
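A note on `K.ctc_decode`, which both the old and the new `call` bodies rely on: its second argument is a length vector with one frame count per sample (shape `(samples,)`), not the number of output classes and not a `tf.Variable` holding a shape tuple; its output is also not differentiable, so decoding is normally done at prediction time rather than inside a trainable layer. A minimal sketch, assuming `y_pred` is the model's softmax output as a numpy array:

```python
import numpy as np
from keras import backend as K

def decode_batch(y_pred):
    # y_pred: softmax output, shape (batch, time_steps, num_classes)
    batch_size, time_steps = y_pred.shape[0], y_pred.shape[1]
    # one frame count per utterance; here every utterance uses the full time axis
    input_length = np.full((batch_size,), time_steps, dtype=np.int32)
    decoded, log_prob = K.ctc_decode(y_pred, input_length, greedy=True)
    # decoded[0] is a dense tensor of label indices, padded with -1
    return K.eval(decoded[0])
```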

neural_network/ctc_loss.py

@ -0,0 +1,13 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from keras.backend.tensorflow_backend import ctc_batch_cost
import tensorflow as tf
def ctc_batch_loss(y_true, y_pred):
'''
CTC loss function
There is currently a bug here
'''
loss = ctc_batch_cost(y_true, y_pred, tf.Variable((748,1283),dtype=tf.int64), tf.Variable((64,1283),dtype=tf.int64))
return tf.Variable(loss,dtype=tf.int64)
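The hard-coded `tf.Variable((748,1283), dtype=tf.int64)` constants above are the most likely source of the shape errors recorded in log.md: `K.ctc_batch_cost` expects `input_length` and `label_length` as `(batch_size, 1)` integer tensors with one entry per utterance, and the returned loss has to stay a float tensor rather than being wrapped in an int64 variable. A hedged sketch, under the simplifying assumption that every utterance spans the full time axis and all label rows share one padded length:

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from keras import backend as K
import tensorflow as tf

def ctc_batch_loss(y_true, y_pred):
    '''
    CTC loss with the (y_true, y_pred) signature that model.compile expects.
    Real code should feed the true per-utterance lengths instead of the
    full-length assumption used here.
    '''
    batch_size = tf.shape(y_pred)[0]
    input_length = tf.fill([batch_size, 1], tf.shape(y_pred)[1])  # frames per utterance
    label_length = tf.fill([batch_size, 1], tf.shape(y_true)[1])  # labels per utterance
    return K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
```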