diff --git a/log.md b/log.md index 3ac4b51..16e7b3d 100644 --- a/log.md +++ b/log.md @@ -1 +1,31 @@ -# ASRT_SpeechRecognition 基于深度学习的语音识别系统 ## Introduction 这里是更新记录日志文件 如果有什么问题,团队内部需要在这里直接写出来 ## Log ### 2017-09-08 基本完成除了添加模型之外的其他部分代码 ### 2017-08-31 数据处理部分的代码基本完成,现在准备撸模型 ### 2017-08-29 准备使用现有的包[python_speech_features](https://github.com/jameslyons/python_speech_features)来实现特征的提取,以及求一阶二阶差分。 ### 2017-08-28 开始准备制作语音信号处理方面的功能 ### 2017-08-22 准备使用Keras基于LSTM/CNN尝试实现 \ No newline at end of file +# ASRT_SpeechRecognition +基于深度学习的语音识别系统 + +## Introduction + +这里是更新记录日志文件 + +如果有什么问题,可以在这里直接写出来 + +## Log +### 2018-03-11 +添加了神经网络的CTC层和定义了CTC_loss损失函数,但是现在有些严重的bug,使得模型无法正常编译,一直找不到问题所在......(T_T) +#### 报错 +ValueError: Shapes (?, ?) and (?,) must have the same rank + +ValueError: Shapes (?, ?) and (?,) are not compatible + +ValueError: Shape (?, ?) must have rank 1 +#### -------------------------------- +各位走过路过的大神有会的吗?请帮帮忙吧,ヾ(o′▽`o)ノ°°谢谢啦 +### 2017-09-08 +基本完成除了添加模型之外的其他部分代码 +### 2017-08-31 +数据处理部分的代码基本完成,现在准备撸模型 +### 2017-08-29 +准备使用现有的包[python_speech_features](https://github.com/jameslyons/python_speech_features)来实现特征的提取,以及求一阶二阶差分。 +### 2017-08-28 +开始准备制作语音信号处理方面的功能 +### 2017-08-22 +准备使用Keras基于LSTM/CNN尝试实现 + diff --git a/main.py b/main.py index 00d80ed..645d856 100644 --- a/main.py +++ b/main.py @@ -13,12 +13,14 @@ from keras.layers import Conv1D,LSTM,MaxPooling1D, Lambda, TimeDistributed, Acti from keras import backend as K from readdata import DataSpeech -from neural_network import ctc_layer +from neural_network.ctc_layer import ctc_layer +from neural_network.ctc_loss import ctc_batch_loss class ModelSpeech(): # 语音模型类 def __init__(self,MS_OUTPUT_SIZE = 1283,BATCH_SIZE = 32): ''' 初始化 + 默认输出的拼音的表示大小是1283,即1282个拼音+1个空白块 ''' self.MS_OUTPUT_SIZE = MS_OUTPUT_SIZE # 神经网络最终输出的每一个字符向量维度的大小 self.BATCH_SIZE = BATCH_SIZE # 一次训练的batch @@ -47,8 +49,10 @@ class ModelSpeech(): # 语音模型类 layer_h4 = LSTM(256, activation='relu', use_bias=True, return_sequences=True)(layer_h3) # LSTM层 layer_h5 = Dropout(0.2)(layer_h4) # 随机中断部分神经网络连接,防止过拟合 layer_h6 = Dense(self.MS_OUTPUT_SIZE, activation="softmax")(layer_h5) # 全连接层 + #layer_h6 = Dense(1283, activation="softmax")(layer_h5) # 全连接层 - layer_out = ctc_layer()(layer_h6) # CTC层 尚未实现! + layer_out = ctc_layer(self.MS_OUTPUT_SIZE, self.BATCH_SIZE)(layer_h6) # CTC层 可能有bug + #layer_out = ctc_layer(1283, 32)(layer_h6) # CTC层 可能有bug #labels = Input(name='the_labels', shape=[60], dtype='float32') #layer_out = Lambda(ctc_lambda_func,output_shape=(self.MS_OUTPUT_SIZE, ), name='ctc')(layer_h6) # CTC @@ -68,16 +72,18 @@ class ModelSpeech(): # 语音模型类 - _model.compile(optimizer="sgd", loss='categorical_crossentropy',metrics=["accuracy"]) - #_model.compile(optimizer="sgd", loss='ctc',metrics=["accuracy"]) + #_model.compile(optimizer="sgd", loss='categorical_crossentropy',metrics=["accuracy"]) + _model.compile(optimizer = "sgd", loss = ctc_batch_loss, metrics = ["accuracy"]) return _model + ''' def ctc_lambda_func(args): #labels, y_pred, input_length, label_length = args y_pred = args[:,2:,:] #y_pred = y_pred[:, 2:, :] return K.ctc_decode(y_pred,self.MS_OUTPUT_SIZE) #return K.ctc_batch_cost(labels, y_pred, input_length, label_length) + ''' def TrainModel(self,datapath,epoch = 2,save_step=1000,filename='model_speech/LSTM_CNN_model'): ''' @@ -156,6 +162,6 @@ class ModelSpeech(): # 语音模型类 if(__name__=='__main__'): datapath = 'E:\\语音数据集' ms = ModelSpeech() - ms.TrainModel(datapath) - ms.TestModel(datapath) + #ms.TrainModel(datapath) + #ms.TestModel(datapath) diff --git a/neural_network/ctc_layer.py b/neural_network/ctc_layer.py index ebc3103..b229821 100644 --- a/neural_network/ctc_layer.py +++ b/neural_network/ctc_layer.py @@ -21,34 +21,61 @@ import tensorflow as tf # 继承父类Layer class ctc_layer(Layer): ''' - 对CTC层的实现,具体需要再去参考下论文...以及tensorflow中ctc的实现, - 并将其通过自定义层加入到keras的神经网络层中 + 本类是对CTC层的实现,具体请去参考下论文... + tensorflow中和keras中有ctc的一些实现, + 并将其通过自定义层加入到keras创建的神经网络层中 + + 参数: + output_dim: 每一条时间序列中,输出的序列张量的尺寸 + + + 目前可能有bug ''' - def __init__(self, input_dim, output_dim, **kwargs): + def __init__(self, output_dim, batch_size, **kwargs): + ''' + 这里是神经网络CTC层的初始化模块 + ''' + #if 'input_shape' not in kwargs and 'input_dim' in kwargs: + # kwargs['input_shape'] = (kwargs.pop('input_dim'), kwargs.pop('input_dim'),) super(ctc_layer, self).__init__(**kwargs) - self.input_dim = input_dim + #self.input_dim = input_dim + #self.input_spec = [InputSpec(dtype=(None,,output_dim),ndim=3, axes={None: input_dim})] self.output_dim = output_dim + self.batch_size = batch_size #self.input_spec = InputSpec(min_ndim=3) + #super(ctc_layer, self).build(input_shape) # Be sure to call this somewhere! pass def build(self, input_shape): + assert len(input_shape) >= 2 + input_dim = input_shape[-1] # Create a trainable weight variable for this layer. - self.kernel = self.add_weight(name='kernel', - shape=('''input_shape[0],''' self.output_dim, -1), - initializer='uniform', - trainable=True) - super(MyLayer, self).build(input_shape) # Be sure to call this somewhere! + self.kernel = self.add_weight(name='kernel', + shape=(input_dim, self.output_dim), + initializer='uniform', + trainable=True) + #super(ctc_layer, self).build(input_shape) # Be sure to call this somewhere! + #self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim}) + self.input_spec = [InputSpec(min_ndim=3)] # , axes={1: 748, -1: self.output_dim} + self.built = True def call(self, x, mask=None): - decoded_dense, log_prob = K.ctc_decode(x,self.input_dim) - decoded_sequence = K.ctc_label_dense_to_sparse(decoded_dense, decoded_dense.shape[0]) - return decoded_sequence - + output = K.dot(x, self.kernel) + #output.shape[0] = self.batch_size + decoded_dense, log_prob = K.ctc_decode(output,tf.Variable((output.shape[1],output.shape[2]),dtype=tf.int64)) + #decoded_dense, log_prob = K.ctc_decode(output,output.shape[1]) + #decoded_sequence = K.ctc_label_dense_to_sparse(decoded_dense, len(decoded_dense)) + #return decoded_sequence + return decoded_dense def get_config(self): - - pass - + config = { + 'output_dim': self.output_dim + } + base_config = super(ctc_layer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + def compute_output_shape(self, input_shape): + return (input_shape[0], input_shape[1], self.output_dim) \ No newline at end of file diff --git a/neural_network/ctc_loss.py b/neural_network/ctc_loss.py new file mode 100644 index 0000000..d994ada --- /dev/null +++ b/neural_network/ctc_loss.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from keras.backend.tensorflow_backend import ctc_batch_cost +import tensorflow as tf + +def ctc_batch_loss(y_true, y_pred): + ''' + CTC的loss函数 + 这里目前有bug + ''' + loss = ctc_batch_cost(y_true, y_pred, tf.Variable((748,1283),dtype=tf.int64), tf.Variable((64,1283),dtype=tf.int64)) + return tf.Variable(loss,dtype=tf.int64) \ No newline at end of file