Add CTC loss and fix bugs
This commit is contained in:
parent
2ec4b937c9
commit
6517062b94
32
log.md
32
log.md
|
@ -1 +1,31 @@
|
|||
# ASRT_SpeechRecognition
基于深度学习的语音识别系统
## Introduction
这里是更新记录日志文件
如果有什么问题,团队内部需要在这里直接写出来
## Log
### 2017-09-08
基本完成除了添加模型之外的其他部分代码
### 2017-08-31
数据处理部分的代码基本完成,现在准备撸模型
### 2017-08-29
准备使用现有的包[python_speech_features](https://github.com/jameslyons/python_speech_features)来实现特征的提取,以及求一阶二阶差分。
### 2017-08-28
开始准备制作语音信号处理方面的功能
### 2017-08-22
准备使用Keras基于LSTM/CNN尝试实现
|
||||
# ASRT_SpeechRecognition
|
||||
基于深度学习的语音识别系统
|
||||
|
||||
## Introduction
|
||||
|
||||
这里是更新记录日志文件
|
||||
|
||||
如果有什么问题,可以在这里直接写出来
|
||||
|
||||
## Log
|
||||
### 2018-03-11
|
||||
添加了神经网络的CTC层,并定义了CTC_loss损失函数。但是目前存在一些严重的bug,导致模型无法正常编译,一直找不到问题所在......(T_T)
|
||||
#### 报错
|
||||
ValueError: Shapes (?, ?) and (?,) must have the same rank
|
||||
|
||||
ValueError: Shapes (?, ?) and (?,) are not compatible
|
||||
|
||||
ValueError: Shape (?, ?) must have rank 1
|
||||
#### --------------------------------
|
||||
各位走过路过的大神有会的吗?请帮帮忙吧,ヾ(o′▽`o)ノ°°谢谢啦
|
||||
### 2017-09-08
|
||||
基本完成除了添加模型之外的其他部分代码
|
||||
### 2017-08-31
|
||||
数据处理部分的代码基本完成,现在准备撸模型
|
||||
### 2017-08-29
|
||||
准备使用现有的包[python_speech_features](https://github.com/jameslyons/python_speech_features)来实现特征的提取,以及求一阶二阶差分。
|
||||
### 2017-08-28
|
||||
开始准备制作语音信号处理方面的功能
|
||||
### 2017-08-22
|
||||
准备使用Keras基于LSTM/CNN尝试实现
|
||||
|
||||
|
|
18
main.py
18
main.py
|
@ -13,12 +13,14 @@ from keras.layers import Conv1D,LSTM,MaxPooling1D, Lambda, TimeDistributed, Acti
|
|||
from keras import backend as K
|
||||
|
||||
from readdata import DataSpeech
|
||||
from neural_network import ctc_layer
|
||||
from neural_network.ctc_layer import ctc_layer
|
||||
from neural_network.ctc_loss import ctc_batch_loss
|
||||
|
||||
class ModelSpeech(): # 语音模型类
|
||||
def __init__(self,MS_OUTPUT_SIZE = 1283,BATCH_SIZE = 32):
|
||||
'''
|
||||
初始化
|
||||
默认输出的拼音的表示大小是1283,即1282个拼音+1个空白块
|
||||
'''
|
||||
self.MS_OUTPUT_SIZE = MS_OUTPUT_SIZE # 神经网络最终输出的每一个字符向量维度的大小
|
||||
self.BATCH_SIZE = BATCH_SIZE # 一次训练的batch
|
||||
|
@ -47,8 +49,10 @@ class ModelSpeech(): # 语音模型类
|
|||
layer_h4 = LSTM(256, activation='relu', use_bias=True, return_sequences=True)(layer_h3) # LSTM层
|
||||
layer_h5 = Dropout(0.2)(layer_h4) # 随机中断部分神经网络连接,防止过拟合
|
||||
layer_h6 = Dense(self.MS_OUTPUT_SIZE, activation="softmax")(layer_h5) # 全连接层
|
||||
#layer_h6 = Dense(1283, activation="softmax")(layer_h5) # 全连接层
|
||||
|
||||
layer_out = ctc_layer()(layer_h6) # CTC层 尚未实现!
|
||||
layer_out = ctc_layer(self.MS_OUTPUT_SIZE, self.BATCH_SIZE)(layer_h6) # CTC层 可能有bug
|
||||
#layer_out = ctc_layer(1283, 32)(layer_h6) # CTC层 可能有bug
|
||||
|
||||
#labels = Input(name='the_labels', shape=[60], dtype='float32')
|
||||
#layer_out = Lambda(ctc_lambda_func,output_shape=(self.MS_OUTPUT_SIZE, ), name='ctc')(layer_h6) # CTC
|
||||
|
@ -68,16 +72,18 @@ class ModelSpeech(): # 语音模型类
|
|||
|
||||
|
||||
|
||||
_model.compile(optimizer="sgd", loss='categorical_crossentropy',metrics=["accuracy"])
|
||||
#_model.compile(optimizer="sgd", loss='ctc',metrics=["accuracy"])
|
||||
#_model.compile(optimizer="sgd", loss='categorical_crossentropy',metrics=["accuracy"])
|
||||
_model.compile(optimizer = "sgd", loss = ctc_batch_loss, metrics = ["accuracy"])
|
||||
return _model
|
||||
|
||||
'''
|
||||
def ctc_lambda_func(args):
|
||||
#labels, y_pred, input_length, label_length = args
|
||||
y_pred = args[:,2:,:]
|
||||
#y_pred = y_pred[:, 2:, :]
|
||||
return K.ctc_decode(y_pred,self.MS_OUTPUT_SIZE)
|
||||
#return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
|
||||
'''
|
||||
|
||||
def TrainModel(self,datapath,epoch = 2,save_step=1000,filename='model_speech/LSTM_CNN_model'):
|
||||
'''
|
||||
|
@ -156,6 +162,6 @@ class ModelSpeech(): # 语音模型类
|
|||
if(__name__=='__main__'):
|
||||
datapath = 'E:\\语音数据集'
|
||||
ms = ModelSpeech()
|
||||
ms.TrainModel(datapath)
|
||||
ms.TestModel(datapath)
|
||||
#ms.TrainModel(datapath)
|
||||
#ms.TestModel(datapath)
|
||||
|
||||
|
|
|
@ -21,34 +21,61 @@ import tensorflow as tf
|
|||
# 继承父类Layer
|
||||
class ctc_layer(Layer):
|
||||
'''
|
||||
对CTC层的实现,具体需要再去参考下论文...以及tensorflow中ctc的实现,
|
||||
并将其通过自定义层加入到keras的神经网络层中
|
||||
本类是对CTC层的实现,具体请去参考下论文...
|
||||
tensorflow中和keras中有ctc的一些实现,
|
||||
并将其通过自定义层加入到keras创建的神经网络层中
|
||||
|
||||
参数:
|
||||
output_dim: 每一条时间序列中,输出的序列张量的尺寸
|
||||
|
||||
|
||||
目前可能有bug
|
||||
'''
|
||||
def __init__(self, input_dim, output_dim, **kwargs):
|
||||
def __init__(self, output_dim, batch_size, **kwargs):
|
||||
'''
|
||||
这里是神经网络CTC层的初始化模块
|
||||
'''
|
||||
#if 'input_shape' not in kwargs and 'input_dim' in kwargs:
|
||||
# kwargs['input_shape'] = (kwargs.pop('input_dim'), kwargs.pop('input_dim'),)
|
||||
super(ctc_layer, self).__init__(**kwargs)
|
||||
self.input_dim = input_dim
|
||||
#self.input_dim = input_dim
|
||||
#self.input_spec = [InputSpec(dtype=(None,,output_dim),ndim=3, axes={None: input_dim})]
|
||||
self.output_dim = output_dim
|
||||
self.batch_size = batch_size
|
||||
#self.input_spec = InputSpec(min_ndim=3)
|
||||
#super(ctc_layer, self).build(input_shape) # Be sure to call this somewhere!
|
||||
pass
|
||||
|
||||
def build(self, input_shape):
|
||||
assert len(input_shape) >= 2
|
||||
input_dim = input_shape[-1]
|
||||
# Create a trainable weight variable for this layer.
|
||||
self.kernel = self.add_weight(name='kernel',
|
||||
shape=('''input_shape[0],''' self.output_dim, -1),
|
||||
initializer='uniform',
|
||||
trainable=True)
|
||||
super(MyLayer, self).build(input_shape) # Be sure to call this somewhere!
|
||||
self.kernel = self.add_weight(name='kernel',
|
||||
shape=(input_dim, self.output_dim),
|
||||
initializer='uniform',
|
||||
trainable=True)
|
||||
|
||||
#super(ctc_layer, self).build(input_shape) # Be sure to call this somewhere!
|
||||
#self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
|
||||
self.input_spec = [InputSpec(min_ndim=3)] # , axes={1: 748, -1: self.output_dim}
|
||||
self.built = True
|
||||
|
||||
def call(self, x, mask=None):
|
||||
decoded_dense, log_prob = K.ctc_decode(x,self.input_dim)
|
||||
decoded_sequence = K.ctc_label_dense_to_sparse(decoded_dense, decoded_dense.shape[0])
|
||||
return decoded_sequence
|
||||
|
||||
output = K.dot(x, self.kernel)
|
||||
#output.shape[0] = self.batch_size
|
||||
decoded_dense, log_prob = K.ctc_decode(output,tf.Variable((output.shape[1],output.shape[2]),dtype=tf.int64))
|
||||
#decoded_dense, log_prob = K.ctc_decode(output,output.shape[1])
|
||||
#decoded_sequence = K.ctc_label_dense_to_sparse(decoded_dense, len(decoded_dense))
|
||||
#return decoded_sequence
|
||||
return decoded_dense
|
||||
|
||||
def get_config(self):
|
||||
|
||||
pass
|
||||
|
||||
config = {
|
||||
'output_dim': self.output_dim
|
||||
}
|
||||
base_config = super(ctc_layer, self).get_config()
|
||||
return dict(list(base_config.items()) + list(config.items()))
|
||||
|
||||
def compute_output_shape(self, input_shape):
|
||||
return (input_shape[0], input_shape[1], self.output_dim)
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from keras.backend.tensorflow_backend import ctc_batch_cost
|
||||
import tensorflow as tf
|
||||
|
||||
def ctc_batch_loss(y_true, y_pred):
    '''
    CTC loss wrapper with the two-argument signature Keras expects of a
    ``loss=`` callable.

    Args:
        y_true: dense label tensor. ``ctc_batch_cost`` documents this as
            (samples, max_label_length) -- TODO confirm the data pipeline
            actually feeds labels in that layout.
        y_pred: network output. ``ctc_batch_cost`` documents this as
            (samples, time_steps, num_classes).

    Returns:
        The float loss tensor produced by ``ctc_batch_cost`` (documented as
        one value per sample), returned as-is so gradients can flow.
    '''
    # ctc_batch_cost wants one length per sample, shaped (samples, 1).
    # The previous code passed 2-element rank-1 Variables such as
    # (748, 1283), which produces the "must have the same rank" /
    # "must have rank 1" shape errors.
    batch_size = tf.shape(y_pred)[0]
    length_shape = tf.stack([batch_size, 1])

    # Use every time step of the prediction as the input length.
    input_length = tf.fill(length_shape, tf.shape(y_pred)[1])

    # NOTE(review): treats every label row as full-width, like the previous
    # fixed length of 64 did; if labels are padded, the true per-sample
    # lengths should be supplied instead -- confirm against the data reader.
    label_length = tf.fill(length_shape, tf.shape(y_true)[1])

    # Return the loss tensor directly: wrapping it in an int64 tf.Variable
    # detached it from the graph and discarded the fractional loss value.
    return ctc_batch_cost(y_true, y_pred, input_length, label_length)
|
Loading…
Reference in New Issue