diff --git a/main.py b/main.py index 13baa0d..00d80ed 100644 --- a/main.py +++ b/main.py @@ -9,10 +9,11 @@ import numpy as np from keras.models import Sequential, Model from keras.layers import Dense, Dropout, Input # , Flatten,LSTM,Convolution1D,MaxPooling1D,Merge -from keras.layers import Conv1D,LSTM,MaxPooling1D, Lambda #, Merge, Conv2D, MaxPooling2D,Conv1D +from keras.layers import Conv1D,LSTM,MaxPooling1D, Lambda, TimeDistributed, Activation #, Merge, Conv2D, MaxPooling2D,Conv1D from keras import backend as K from readdata import DataSpeech +from neural_network import ctc_layer class ModelSpeech(): # 语音模型类 def __init__(self,MS_OUTPUT_SIZE = 1283,BATCH_SIZE = 32): @@ -33,9 +34,9 @@ class ModelSpeech(): # 语音模型类 隐藏层四:循环层、LSTM层 隐藏层五:Dropout层,需要断开的神经元的比例为0.2,防止过拟合 隐藏层六:全连接层,神经元数量为self.MS_OUTPUT_SIZE,使用softmax作为激活函数, - 输出层:lambda层,即CTC层,使用CTC的loss作为损失函数,实现多输出 + 输出层:自定义层,即CTC层,使用CTC的loss作为损失函数,实现连接性时序多输出 - 当前未完成,针对多输出的CTC层尚未添加 + 当前未完成,针对多输出的CTC层尚未实现 ''' # 每一帧使用13维mfcc特征及其13维一阶差分和13维二阶差分表示,最大信号序列长度为1500 layer_input = Input((1500,39)) @@ -43,17 +44,32 @@ class ModelSpeech(): # 语音模型类 layer_h1 = Conv1D(256, 5, use_bias=True, padding="valid")(layer_input) # 卷积层 layer_h2 = MaxPooling1D(pool_size=2, strides=None, padding="valid")(layer_h1) # 池化层 layer_h3 = Dropout(0.2)(layer_h2) # 随机中断部分神经网络连接,防止过拟合 - layer_h4 = LSTM(256, activation='relu', use_bias=True)(layer_h3) # LSTM层 + layer_h4 = LSTM(256, activation='relu', use_bias=True, return_sequences=True)(layer_h3) # LSTM层 layer_h5 = Dropout(0.2)(layer_h4) # 随机中断部分神经网络连接,防止过拟合 layer_h6 = Dense(self.MS_OUTPUT_SIZE, activation="softmax")(layer_h5) # 全连接层 + layer_out = ctc_layer()(layer_h6) # CTC层 尚未实现! + #labels = Input(name='the_labels', shape=[60], dtype='float32') - layer_out = Lambda(ctc_lambda_func,output_shape=(self.MS_OUTPUT_SIZE, ), name='ctc')(layer_h6) # CTC + #layer_out = Lambda(ctc_lambda_func,output_shape=(self.MS_OUTPUT_SIZE, ), name='ctc')(layer_h6) # CTC + #layer_out = TimeDistributed(Dense(self.MS_OUTPUT_SIZE, activation="softmax"))(layer_h5) _model = Model(inputs = layer_input, outputs = layer_out) - #_model.compile(optimizer="sgd", loss='categorical_crossentropy',metrics=["accuracy"]) - _model.compile(optimizer="sgd", loss='ctc',metrics=["accuracy"]) + #_model = Sequential() + #_model.add(Conv1D(256, 5, use_bias=True, padding="valid", input_shape=(1500,39))) + #_model.add(MaxPooling1D(pool_size=2, strides=None, padding="valid")) + #_model.add(Dropout(0.2)) + #_model.add(LSTM(256, activation='relu', use_bias=True, return_sequences=True)) + #_model.add(Dropout(0.2)) + #_model.add(TimeDistributed(Dense(self.MS_OUTPUT_SIZE))) + #_model.add(Activation("softmax")) + + + + + _model.compile(optimizer="sgd", loss='categorical_crossentropy',metrics=["accuracy"]) + #_model.compile(optimizer="sgd", loss='ctc',metrics=["accuracy"]) return _model def ctc_lambda_func(args): diff --git a/neural_network/ctc_layer.py b/neural_network/ctc_layer.py new file mode 100644 index 0000000..ebc3103 --- /dev/null +++ b/neural_network/ctc_layer.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +本代码用来实现神经网络中的CTC层 +CTC层即:Connectionist Temporal Classification (连续型短时分类) +将这里实现的 +尚未完成 +''' + +from keras.layers.core import Layer +from keras.engine import InputSpec +from keras import backend as K + +try: + from keras import initializations +except ImportError: + from keras import initializers as initializations + +import tensorflow as tf + +# 继承父类Layer +class ctc_layer(Layer): + ''' + 对CTC层的实现,具体需要再去参考下论文...以及tensorflow中ctc的实现, + 并将其通过自定义层加入到keras的神经网络层中 + ''' + def __init__(self, input_dim, output_dim, **kwargs): + super(ctc_layer, self).__init__(**kwargs) + self.input_dim = input_dim + self.output_dim = output_dim + #self.input_spec = InputSpec(min_ndim=3) + pass + + def build(self, input_shape): + # Create a trainable weight variable for this layer. + self.kernel = self.add_weight(name='kernel', + shape=('''input_shape[0],''' self.output_dim, -1), + initializer='uniform', + trainable=True) + super(MyLayer, self).build(input_shape) # Be sure to call this somewhere! + + + def call(self, x, mask=None): + decoded_dense, log_prob = K.ctc_decode(x,self.input_dim) + decoded_sequence = K.ctc_label_dense_to_sparse(decoded_dense, decoded_dense.shape[0]) + return decoded_sequence + + + def get_config(self): + + pass + + + \ No newline at end of file