Add CTC loss and fix bugs

This commit is contained in:
nl8590687 2018-03-11 21:21:12 +08:00
parent 2ec4b937c9
commit 6517062b94
4 changed files with 99 additions and 23 deletions

log.md

@ -1 +1,31 @@
# ASRT_SpeechRecognition
A deep-learning-based speech recognition system
## Introduction
This is the changelog file.
If there are any problems, they can be written down here directly.
## Log
### 2018-03-11
Added the CTC layer to the neural network and defined the CTC_loss loss function, but right now there are some serious bugs that keep the model from compiling, and I still cannot find the cause... (T_T)
#### Errors
ValueError: Shapes (?, ?) and (?,) must have the same rank
ValueError: Shapes (?, ?) and (?,) are not compatible
ValueError: Shape (?, ?) must have rank 1
#### --------------------------------
Any experts passing by who know how to fix this? Please help out. ヾ(o▽`o)ノ°° Thanks!
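For reference, these are the rank errors that `K.ctc_batch_cost` raises when `input_length` and `label_length` are not `(batch_size, 1)`-shaped tensors. Below is a minimal sketch of the usual Keras wiring for CTC (following the pattern of the official Keras OCR example); `input_data`, `y_pred` and `MAX_LABEL_LEN` are placeholder names, not identifiers from this repository:

```python
from keras import backend as K
from keras.layers import Input, Lambda
from keras.models import Model

def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    # input_length / label_length must be (batch_size, 1) integer tensors,
    # one entry per utterance; wrong ranks trigger the errors above
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

# y_pred: softmax output of the acoustic model, shape (batch, time_steps, MS_OUTPUT_SIZE)
# input_data: the Input layer that feeds the acoustic model (placeholder name)
labels = Input(name='the_labels', shape=[MAX_LABEL_LEN], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(
    [y_pred, labels, input_length, label_length])

model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
# the Lambda layer already returns the CTC loss, so the compiled loss just passes it through
model.compile(optimizer='sgd', loss={'ctc': lambda y_true, y_pred: y_pred})
```

The label sequences and both length arrays are then fed as regular inputs during training, with a dummy all-zero target for the `ctc` output.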
### 2017-09-08
Basically finished all of the code except for adding the model.
### 2017-08-31
The data processing code is basically done; now getting ready to build the model.
### 2017-08-29
Planning to use the existing package [python_speech_features](https://github.com/jameslyons/python_speech_features) to implement feature extraction and to compute the first- and second-order differences.
### 2017-08-28
Started working on the speech signal processing functionality.
### 2017-08-22
Planning to attempt an implementation with Keras based on LSTM/CNN.

main.py

@ -13,12 +13,14 @@ from keras.layers import Conv1D,LSTM,MaxPooling1D, Lambda, TimeDistributed, Acti
from keras import backend as K
from readdata import DataSpeech
from neural_network import ctc_layer
from neural_network.ctc_layer import ctc_layer
from neural_network.ctc_loss import ctc_batch_loss
class ModelSpeech(): # speech model class
def __init__(self,MS_OUTPUT_SIZE = 1283,BATCH_SIZE = 32):
'''
Initialization
The default size of the pinyin output representation is 1283, i.e. 1282 pinyin classes plus 1 blank
'''
self.MS_OUTPUT_SIZE = MS_OUTPUT_SIZE # dimension of each character vector in the network's final output
self.BATCH_SIZE = BATCH_SIZE # batch size for one training step
@ -47,8 +49,10 @@ class ModelSpeech(): # 语音模型类
layer_h4 = LSTM(256, activation='relu', use_bias=True, return_sequences=True)(layer_h3) # LSTM layer
layer_h5 = Dropout(0.2)(layer_h4) # randomly drop some connections to prevent overfitting
layer_h6 = Dense(self.MS_OUTPUT_SIZE, activation="softmax")(layer_h5) # fully connected layer
#layer_h6 = Dense(1283, activation="softmax")(layer_h5) # fully connected layer
layer_out = ctc_layer()(layer_h6) # CTC layer, not implemented yet!
layer_out = ctc_layer(self.MS_OUTPUT_SIZE, self.BATCH_SIZE)(layer_h6) # CTC layer, may have bugs
#layer_out = ctc_layer(1283, 32)(layer_h6) # CTC layer, may have bugs
#labels = Input(name='the_labels', shape=[60], dtype='float32')
#layer_out = Lambda(ctc_lambda_func,output_shape=(self.MS_OUTPUT_SIZE, ), name='ctc')(layer_h6) # CTC
@ -68,16 +72,18 @@ class ModelSpeech(): # 语音模型类
_model.compile(optimizer="sgd", loss='categorical_crossentropy',metrics=["accuracy"])
#_model.compile(optimizer="sgd", loss='ctc',metrics=["accuracy"])
#_model.compile(optimizer="sgd", loss='categorical_crossentropy',metrics=["accuracy"])
_model.compile(optimizer = "sgd", loss = ctc_batch_loss, metrics = ["accuracy"])
return _model
'''
def ctc_lambda_func(args):
#labels, y_pred, input_length, label_length = args
y_pred = args[:,2:,:]
#y_pred = y_pred[:, 2:, :]
return K.ctc_decode(y_pred,self.MS_OUTPUT_SIZE)
#return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
'''
def TrainModel(self,datapath,epoch = 2,save_step=1000,filename='model_speech/LSTM_CNN_model'):
'''
@ -156,6 +162,6 @@ class ModelSpeech(): # 语音模型类
if(__name__=='__main__'):
datapath = 'E:\\语音数据集' # path to the speech dataset
ms = ModelSpeech()
ms.TrainModel(datapath)
ms.TestModel(datapath)
#ms.TrainModel(datapath)
#ms.TestModel(datapath)
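A note on the `_model.compile(optimizer = "sgd", loss = ctc_batch_loss, ...)` line above: a loss function handed to `model.compile` is only ever called as `loss(y_true, y_pred)`, so the per-utterance frame and label lengths that `K.ctc_batch_cost` needs cannot be supplied through it. With the four-input wiring sketched in log.md those lengths travel as ordinary model inputs, and training uses a dummy target, roughly like this (the variable names are placeholders for whatever `DataSpeech` produces):

```python
import numpy as np

# X: padded feature arrays, y: padded label sequences,
# x_len / y_len: per-utterance frame and label counts, each shaped (batch, 1)
dummy_targets = np.zeros((X.shape[0],))  # ignored: the 'ctc' output is already the loss
model.fit([X, y, x_len, y_len], dummy_targets, batch_size=32, epochs=2)
```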

neural_network/ctc_layer.py

@ -21,34 +21,61 @@ import tensorflow as tf
# inherits from the Layer base class
class ctc_layer(Layer):
'''
Implementation of the CTC layer; for details please refer to the paper... as well as the CTC implementation in tensorflow,
and add it to the keras network as a custom layer
This class implements the CTC layer; for details please refer to the paper...
tensorflow and keras contain some CTC implementations,
which are added to the network built with keras via this custom layer
Arguments
output_dim: size of the output sequence tensor at each time step
There may currently be bugs
'''
def __init__(self, input_dim, output_dim, **kwargs):
def __init__(self, output_dim, batch_size, **kwargs):
'''
This is the initialization of the neural network's CTC layer
'''
#if 'input_shape' not in kwargs and 'input_dim' in kwargs:
# kwargs['input_shape'] = (kwargs.pop('input_dim'), kwargs.pop('input_dim'),)
super(ctc_layer, self).__init__(**kwargs)
self.input_dim = input_dim
#self.input_dim = input_dim
#self.input_spec = [InputSpec(dtype=(None,,output_dim),ndim=3, axes={None: input_dim})]
self.output_dim = output_dim
self.batch_size = batch_size
#self.input_spec = InputSpec(min_ndim=3)
#super(ctc_layer, self).build(input_shape) # Be sure to call this somewhere!
pass
def build(self, input_shape):
assert len(input_shape) >= 2
input_dim = input_shape[-1]
# Create a trainable weight variable for this layer.
self.kernel = self.add_weight(name='kernel',
shape=('''input_shape[0],''' self.output_dim, -1),
shape=(input_dim, self.output_dim),
initializer='uniform',
trainable=True)
super(MyLayer, self).build(input_shape) # Be sure to call this somewhere!
#super(ctc_layer, self).build(input_shape) # Be sure to call this somewhere!
#self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
self.input_spec = [InputSpec(min_ndim=3)] # , axes={1: 748, -1: self.output_dim}
self.built = True
def call(self, x, mask=None):
decoded_dense, log_prob = K.ctc_decode(x,self.input_dim)
decoded_sequence = K.ctc_label_dense_to_sparse(decoded_dense, decoded_dense.shape[0])
return decoded_sequence
output = K.dot(x, self.kernel)
#output.shape[0] = self.batch_size
decoded_dense, log_prob = K.ctc_decode(output,tf.Variable((output.shape[1],output.shape[2]),dtype=tf.int64))
#decoded_dense, log_prob = K.ctc_decode(output,output.shape[1])
#decoded_sequence = K.ctc_label_dense_to_sparse(decoded_dense, len(decoded_dense))
#return decoded_sequence
return decoded_dense
def get_config(self):
config = {
'output_dim': self.output_dim
}
base_config = super(ctc_layer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
pass
def compute_output_shape(self, input_shape):
return (input_shape[0], input_shape[1], self.output_dim)
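A note on `K.ctc_decode`, which both the old and the new `call` bodies rely on: its second argument is a length vector with one frame count per sample (shape `(samples,)`), not the number of output classes and not a `tf.Variable` holding a shape tuple; its output is also not differentiable, so decoding is normally done at prediction time rather than inside a trainable layer. A minimal sketch, assuming `y_pred` is the model's softmax output as a numpy array:

```python
import numpy as np
from keras import backend as K

def decode_batch(y_pred):
    # y_pred: softmax output, shape (batch, time_steps, num_classes)
    batch_size, time_steps = y_pred.shape[0], y_pred.shape[1]
    # one frame count per utterance; here every utterance uses the full time axis
    input_length = np.full((batch_size,), time_steps, dtype=np.int32)
    decoded, log_prob = K.ctc_decode(y_pred, input_length, greedy=True)
    # decoded[0] is a dense tensor of label indices, padded with -1
    return K.eval(decoded[0])
```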

neural_network/ctc_loss.py

@ -0,0 +1,13 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from keras.backend.tensorflow_backend import ctc_batch_cost
import tensorflow as tf
def ctc_batch_loss(y_true, y_pred):
'''
CTC loss function
There is currently a bug here
'''
loss = ctc_batch_cost(y_true, y_pred, tf.Variable((748,1283),dtype=tf.int64), tf.Variable((64,1283),dtype=tf.int64))
return tf.Variable(loss,dtype=tf.int64)
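The hard-coded `tf.Variable((748,1283), dtype=tf.int64)` constants above are the most likely source of the shape errors recorded in log.md: `K.ctc_batch_cost` expects `input_length` and `label_length` as `(batch_size, 1)` integer tensors with one entry per utterance, and the returned loss has to stay a float tensor rather than being wrapped in an int64 variable. A hedged sketch, under the simplifying assumption that every utterance spans the full time axis and all label rows share one padded length:

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from keras import backend as K
import tensorflow as tf

def ctc_batch_loss(y_true, y_pred):
    '''
    CTC loss with the (y_true, y_pred) signature that model.compile expects.
    Real code should feed the true per-utterance lengths instead of the
    full-length assumption used here.
    '''
    batch_size = tf.shape(y_pred)[0]
    input_length = tf.fill([batch_size, 1], tf.shape(y_pred)[1])  # frames per utterance
    label_length = tf.fill([batch_size, 1], tf.shape(y_true)[1])  # labels per utterance
    return K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
```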