diff --git a/README.md b/README.md index f8c7a8d..f188ebd 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ This project uses keras, TensorFlow based on LSTM, CNN and CTC to implement. -本项目目前已经可以进行训练了,不过训练时loss一直高居不下。 +本项目目前已经可以正常进行训练了,现在的这几个神经网络模型正在准备评估哪一个模型的效果最好。 本项目运行请执行: ```shell diff --git a/SpeechModel.py b/SpeechModel.py index 53dc7e5..0d42b98 100644 --- a/SpeechModel.py +++ b/SpeechModel.py @@ -33,7 +33,7 @@ class ModelSpeech(): # 语音模型类 def __init__(self, datapath): ''' 初始化 - 默认输出的拼音的表示大小是1283,即1282个拼音+1个空白块 + 默认输出的拼音的表示大小是1417,即1416个拼音+1个空白块 ''' MS_OUTPUT_SIZE = 1417 self.MS_OUTPUT_SIZE = MS_OUTPUT_SIZE # 神经网络最终输出的每一个字符向量维度的大小 @@ -62,37 +62,37 @@ class ModelSpeech(): # 语音模型类 # 每一帧使用13维mfcc特征及其13维一阶差分和13维二阶差分表示,最大信号序列长度为1500 input_data = Input(name='the_input', shape=(self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH)) - layer_h1_c = Conv1D(filters=256, kernel_size=5, strides=1, use_bias=True, padding="valid")(input_data) # 卷积层 + layer_h1_c = Conv1D(filters=256, kernel_size=5, strides=1, use_bias=True, kernel_initializer='he_normal', padding="same")(input_data) # 卷积层 #layer_h1_a = Activation('relu', name='relu0')(layer_h1_c) layer_h1_a = LeakyReLU(alpha=0.3)(layer_h1_c) # 高级激活层 layer_h1 = MaxPooling1D(pool_size=2, strides=None, padding="valid")(layer_h1_a) # 池化层 layer_h2 = BatchNormalization()(layer_h1) - layer_h3_c = Conv1D(filters=256, kernel_size=5, strides=1, use_bias=True, padding="valid")(layer_h2) # 卷积层 + layer_h3_c = Conv1D(filters=256, kernel_size=5, strides=1, use_bias=True, kernel_initializer='he_normal', padding="same")(layer_h2) # 卷积层 layer_h3_a = LeakyReLU(alpha=0.3)(layer_h3_c) # 高级激活层 #layer_h3_a = Activation('relu', name='relu1')(layer_h3_c) layer_h3 = MaxPooling1D(pool_size=2, strides=None, padding="valid")(layer_h3_a) # 池化层 layer_h4 = Dropout(0.1)(layer_h3) # 随机中断部分神经网络连接,防止过拟合 - layer_h5 = Dense(256, use_bias=True, activation="softmax")(layer_h4) # 全连接层 - layer_h6 = Dense(256, use_bias=True, activation="softmax")(layer_h5) # 全连接层 + layer_h5 = Dense(256, use_bias=True, activation="relu", kernel_initializer='he_normal')(layer_h4) # 全连接层 + layer_h6 = Dense(256, use_bias=True, activation="relu", kernel_initializer='he_normal')(layer_h5) # 全连接层 #layer_h4 = Activation('softmax', name='softmax0')(layer_h4_d1) - layer_h7 = LSTM(256, activation='softmax', use_bias=True, return_sequences=True)(layer_h6) # LSTM层 - layer_h8 = LSTM(256, activation='softmax', use_bias=True, return_sequences=True)(layer_h7) # LSTM层 - layer_h9 = LSTM(256, activation='softmax', use_bias=True, return_sequences=True)(layer_h8) # LSTM层 - layer_h10 = LSTM(256, activation='softmax', use_bias=True, return_sequences=True)(layer_h9) # LSTM层 + layer_h7 = LSTM(256, activation='tanh', use_bias=True, return_sequences=True, kernel_initializer='he_normal')(layer_h6) # LSTM层 + layer_h8 = LSTM(256, activation='tanh', use_bias=True, return_sequences=True, kernel_initializer='he_normal')(layer_h7) # LSTM层 + layer_h9 = LSTM(256, activation='tanh', use_bias=True, return_sequences=True, kernel_initializer='he_normal')(layer_h8) # LSTM层 + layer_h10 = LSTM(256, activation='tanh', use_bias=True, return_sequences=True, kernel_initializer='he_normal')(layer_h9) # LSTM层 #layer_h10 = Activation('softmax', name='softmax1')(layer_h9) layer_h10_dropout = Dropout(0.1)(layer_h10) # 随机中断部分神经网络连接,防止过拟合 - layer_h11 = Dense(512, use_bias=True, activation="softmax")(layer_h10_dropout) # 全连接层 - layer_h12 = Dense(self.MS_OUTPUT_SIZE, use_bias=True, activation="softmax")(layer_h11) # 全连接层 + layer_h11 = Dense(512, use_bias=True, activation="relu", kernel_initializer='he_normal')(layer_h10_dropout) # 全连接层 + layer_h12 = Dense(self.MS_OUTPUT_SIZE, use_bias=True, kernel_initializer='he_normal')(layer_h11) # 全连接层 #layer_h6 = Dense(1283, activation="softmax")(layer_h5) # 全连接层 - y_pred = Activation('softmax', name='softmax2')(layer_h12) + y_pred = Activation('softmax', name='softmax')(layer_h12) model_data = Model(inputs = input_data, outputs = y_pred) #model_data.summary()
diff --git a/log.md b/log.md index 25bab72..51f1852 100644 --- a/log.md +++ b/log.md @@ -8,6 +8,8 @@ 如果有什么问题,可以在这里直接写出来 ## Log +### 2018-04-08 +经过连续几天的不懈努力,loss终于可以下降了。原因竟然是模型的权重参数初始化有问题,直接导致了梯度的消失,以至于难以训练,loss迟迟下不来,一直欠拟合。调参的第一坑... ### 2018-04-05 将之前的模型做了修改,并且,想用图像的方式试试效果。现在对于loss下不来acc上不去这个问题很头大。 ### 2018-03-30 diff --git a/readdata.py b/readdata.py index 1931c8b..2e23ec6 100644 --- a/readdata.py +++ b/readdata.py @@ -197,7 +197,7 @@ class DataSpeech(): labels = [] for i in range(0,batch_size): #input_length.append([1500]) - labels.append([1e-12]) # 最终的ctc loss结果,0代表着没有ctc上的loss + labels.append([0]) # 最终的ctc loss结果,0代表着没有ctc上的loss @@ -217,7 +217,7 @@ class DataSpeech(): #input_length.append(data_input.shape[1] // 4 - 2) #print(data_input.shape[0],len(data_input)) - input_length.append(data_input.shape[0] // 4 - 3) + input_length.append(data_input.shape[0] // 4) #print(data_input, data_labels) #print('data_input长度:',len(data_input)) diff --git a/readdata2.py b/readdata2.py index b67c9b3..da67b76 100644 --- a/readdata2.py +++ b/readdata2.py @@ -193,7 +193,7 @@ class DataSpeech(): for i in range(batch_size): data_input, data_labels = self.GetData((ran_num + i) % self.DataNum) # 从随机数开始连续向后取一定数量数据 - input_length.append(data_input.shape[0] // 4 - 2) + input_length.append(data_input.shape[0] // 4) #print(data_input, data_labels) #print('data_input长度:',len(data_input))