try again 20180407_1

2018-04-07 16:17:55 +08:00 · 2018-04-07 16:17:55 +08:00 · 8b59183c00
parent 00061ab74e
commit 8b59183c00
2 changed files with 355 additions and 1 deletions
--- a/SpeechModel4.py
+++ b/SpeechModel4.py
@ -427,7 +427,7 @@ if(__name__=='__main__'):
 	
 	ms = ModelSpeech(datapath)
 	
-	ms.LoadModel(modelpath + 'speech_model_e_0_step_30.model')
+	ms.LoadModel(modelpath + '1/speech_model_e_0_step_80.model')
 	#ms.TrainModel(datapath, epoch = 2, batch_size = 8, save_step = 10)
 	#ms.TestModel(datapath, str_dataset='dev', data_count = 32)
 	r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\wav\\test\\D4\\D4_750.wav')
--- a/SpeechModel5.py
+++ b/SpeechModel5.py
@ -0,0 +1,354 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+@author: nl8590687
+"""
+import platform as plat
+import os
+
+from general_function.file_wav import *
+import numpy as np
+
+# LSTM_CNN
+import keras as kr
+import numpy as np
+
+from keras.models import Sequential, Model
+from keras.layers import Dense, Dropout, Input # , Flatten,LSTM,Convolution1D,MaxPooling1D,Merge
+from keras.layers import Conv1D,LSTM,MaxPooling1D, Lambda, TimeDistributed, Activation #, Merge, Conv2D, MaxPooling2D,Conv1D
+from keras.layers.normalization import BatchNormalization
+from keras.layers.merge import add, concatenate
+
+from keras.layers.advanced_activations import LeakyReLU
+
+from keras import backend as K
+from keras.optimizers import SGD, Adadelta
+
+from readdata import DataSpeech
+from neural_network.ctc_layer import ctc_layer
+from neural_network.ctc_loss import ctc_batch_loss
+
+#from keras.backend.tensorflow_backend import ctc_batch_cost
+
+class ModelSpeech(): # 语音模型类
+	def __init__(self, datapath):
+		'''
+		Set the model hyper-parameters, build the network and create the data reader.
+		
+		datapath: dataset location, handed unchanged to DataSpeech
+		'''
+		# These attributes are read by CreateModel, so they must be assigned before it runs.
+		self.AUDIO_LENGTH = 1600 # first dimension of the 'the_input' tensor
+		self.AUDIO_FEATURE_LENGTH = 200 # second dimension of the 'the_input' tensor
+		self.label_max_string_length = 64 # width of the 'the_labels' input
+		self.MS_OUTPUT_SIZE = 1417 # size of each output vector produced by the final Dense layer
+		
+		self._model = self.CreateModel()
+		self.data = DataSpeech(datapath)
+		
+	def CreateModel(self):
+		'''
+		Build and compile the CNN / LSTM / CTC network with the Keras functional API.
+		
+		Layers, in order:
+			input 'the_input' of shape (AUDIO_LENGTH, AUDIO_FEATURE_LENGTH)
+			Conv1D(256, kernel 5, 'same') -> LeakyReLU(0.3) -> MaxPooling1D(2)
+			BatchNormalization
+			Conv1D(256, kernel 5, 'same') -> LeakyReLU(0.3) -> MaxPooling1D(2)
+			Dropout(0.1)
+			Dense(256, softmax) x 2
+			two parallel LSTM(256) layers whose outputs are added
+			two parallel LSTM(256) layers whose outputs are concatenated
+			Dense(MS_OUTPUT_SIZE, softmax) followed by a softmax Activation -> y_pred
+			Lambda layer 'ctc' that computes the CTC loss from y_pred and the label inputs
+		
+		Side effects: prints the model summary and a status message, and stores a backend
+		function mapping the input to y_pred in self.test_func.
+		
+		Returns the compiled training model; its output is the 'ctc' loss layer, not y_pred.
+		'''
+		# Input tensor: one feature matrix per utterance.
+		input_data = Input(name='the_input', shape=(self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH))
+		
+		layer_h1_c = Conv1D(filters=256, kernel_size=5, strides=1, use_bias=True, padding="same")(input_data) # convolution layer
+		#layer_h1_a = Activation('relu', name='relu0')(layer_h1_c)
+		layer_h1_a = LeakyReLU(alpha=0.3)(layer_h1_c) # advanced activation layer
+		layer_h1 = MaxPooling1D(pool_size=2, strides=None, padding="valid")(layer_h1_a) # pooling layer
+		
+		layer_h2 = BatchNormalization()(layer_h1)
+		
+		layer_h3_c = Conv1D(filters=256, kernel_size=5, strides=1, use_bias=True, padding="same")(layer_h2) # convolution layer
+		layer_h3_a = LeakyReLU(alpha=0.3)(layer_h3_c) # advanced activation layer
+		#layer_h3_a = Activation('relu', name='relu1')(layer_h3_c)
+		layer_h3 = MaxPooling1D(pool_size=2, strides=None, padding="valid")(layer_h3_a) # pooling layer
+		
+		layer_h4 = Dropout(0.1)(layer_h3) # randomly drop some connections to reduce overfitting
+		
+		layer_h5 = Dense(256, use_bias=True, activation="softmax")(layer_h4) # fully connected layer
+		layer_h6 = Dense(256, use_bias=True, activation="softmax")(layer_h5) # fully connected layer
+		#layer_h4 = Activation('softmax', name='softmax0')(layer_h4_d1)
+		
+		# Two LSTM branches over the same input, merged by element-wise addition.
+		layer_h7a = LSTM(256, activation='softmax', use_bias=True, return_sequences=True)(layer_h6) # LSTM layer
+		layer_h7b = LSTM(256, activation='softmax', use_bias=True, return_sequences=True)(layer_h6) # LSTM layer
+		
+		layer_h7_merged = add([layer_h7a, layer_h7b])
+		
+		# Two more LSTM branches, this time merged by concatenation.
+		layer_h8a = LSTM(256, activation='softmax', use_bias=True, return_sequences=True)(layer_h7_merged) # LSTM layer
+		layer_h8b = LSTM(256, activation='softmax', use_bias=True, return_sequences=True)(layer_h7_merged) # LSTM layer
+		
+		layer_h8 = concatenate([layer_h8a, layer_h8b])
+		#layer_h10 = Activation('softmax', name='softmax1')(layer_h9)
+		
+		#layer_h10_dropout = Dropout(0.1)(layer_h10) # randomly drop some connections to reduce overfitting
+		
+		#layer_h11 = Dense(512, use_bias=True, activation="softmax")(layer_h8) # fully connected layer
+		layer_h12 = Dense(self.MS_OUTPUT_SIZE, use_bias=True, activation="softmax")(layer_h8) # fully connected layer
+		#layer_h6 = Dense(1283, activation="softmax")(layer_h5) # fully connected layer
+		
+		# NOTE(review): layer_h12 already uses a softmax activation, so softmax is applied twice here - confirm this is intended.
+		y_pred = Activation('softmax', name='softmax2')(layer_h12)
+		# NOTE(review): model_data is not referenced again in this method.
+		model_data = Model(inputs = input_data, outputs = y_pred)
+		#model_data.summary()
+		
+		
+		#labels = Input(name='the_labels', shape=[60], dtype='float32')
+		
+		# Extra inputs that the CTC loss needs in addition to the predictions.
+		labels = Input(name='the_labels', shape=[self.label_max_string_length], dtype='float32')
+		input_length = Input(name='input_length', shape=[1], dtype='int64')
+		label_length = Input(name='label_length', shape=[1], dtype='int64')
+		# Keras doesn't currently support loss funcs with extra parameters
+		# so CTC loss is implemented in a lambda layer
+		
+		#layer_out = Lambda(ctc_lambda_func,output_shape=(self.MS_OUTPUT_SIZE, ), name='ctc')([y_pred, labels, input_length, label_length])#(layer_h6) # CTC
+		loss_out = Lambda(self.ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
+		
+		#y_out = Activation('softmax', name='softmax3')(loss_out)
+		model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
+		
+		model.summary()
+		
+		# clipnorm seems to speeds up convergence
+		#sgd = SGD(lr=0.0001, decay=1e-8, momentum=0.9, nesterov=True, clipnorm=5)
+		ada_d = Adadelta(lr = 0.01, rho = 0.95, epsilon = 1e-06)
+		
+		#model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = sgd, metrics=['accuracy'])
+		# The 'ctc' layer output already is the loss, so the compiled loss function just passes it through.
+		model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = ada_d, metrics=['accuracy'])
+		
+		
+		# captures output of softmax so we can decode the output during visualization
+		self.test_func = K.function([input_data], [y_pred])
+		
+		print('[*提示] 创建模型成功，模型编译成功')
+		return model
+		
+	def ctc_lambda_func(self, args):
+		'''
+		Body of the 'ctc' Lambda layer: return the CTC batch cost.
+		
+		args: the four tensors (y_pred, labels, input_length, label_length), in that order
+		'''
+		softmax_out, label_ids, pred_lengths, label_lengths = args
+		# Drop the last two entries along the final axis before computing the cost.
+		trimmed = softmax_out[:, :, :-2]
+		return K.ctc_batch_cost(label_ids, trimmed, pred_lengths, label_lengths)
+	
+	
+	
+	def TrainModel(self, datapath, epoch = 2, batch_size = 32, save_step = 1000, filename = 'model_speech/speech_model'):
+		'''
+		Train the model on the 'train' data list, saving the weights after every round.
+		
+		Parameters:
+			datapath: not used in this method; the DataSpeech instance created in __init__ is used
+			epoch: number of training epochs
+			batch_size: batch size handed to the data generator
+			save_step: number of generator steps per fit_generator call; weights are saved after each call
+			filename: path prefix of the saved weight files, without the file extension
+		'''
+		data = self.data
+		data.LoadDataList('train')
+		num_data = data.GetDataNum() # number of items in the training list
+		# A separate loop variable keeps the `epoch` parameter from being overwritten.
+		for epoch_index in range(epoch):
+			print('[running] train epoch %d .' % epoch_index)
+			n_step = 0 # fit_generator rounds completed in this epoch
+			while (n_step * save_step < num_data):
+				try:
+					print('[message] epoch %d . Have train datas %d+'%(epoch_index, n_step*save_step))
+					# data_genetator is a generator function
+					yielddatas = data.data_genetator(batch_size, self.AUDIO_LENGTH)
+					self._model.fit_generator(yielddatas, save_step)
+					n_step += 1
+				except StopIteration:
+					print('[error] generator error. please check data format.')
+					break
+				
+				# Pass `filename` through so that a caller-supplied save path is honoured.
+				self.SaveModel(filename = filename, comment = '_e_'+str(epoch_index)+'_step_'+str(n_step * save_step))
+				
+				
+	def LoadModel(self, filename = 'model_speech/speech_model_e_0_step_1.model'):
+		'''
+		Load model weights from the file `filename` and print a confirmation.
+		'''
+		network = self._model
+		network.load_weights(filename)
+		print('*[提示] 已加载模型')
+
+	def SaveModel(self, filename = 'model_speech/speech_model', comment = ''):
+		'''
+		Save the model weights to the path `filename + comment + '.model'`.
+		'''
+		weights_path = filename + comment + '.model'
+		self._model.save_weights(weights_path)
+
+	def TestModel(self, datapath, str_dataset='dev', data_count = 32):
+		'''
+		Evaluate the model on one batch drawn from the given data list and print the result.
+		
+		Parameters:
+			datapath: not used in this method; the DataSpeech instance created in __init__ is used
+			str_dataset: name of the data list to load
+			data_count: number of items to evaluate; a value <= 0 or larger than the
+				list size means the whole list is used
+		'''
+		data = self.data
+		data.LoadDataList(str_dataset)
+		num_data = data.GetDataNum() # number of items in the loaded list
+		if(data_count <= 0 or data_count > num_data):
+			data_count = num_data
+		
+		try:
+			# Pass the audio length as TrainModel does, so batches match the model's input length.
+			gen = data.data_genetator(data_count, self.AUDIO_LENGTH)
+			r = self._model.evaluate_generator(generator = gen, steps = 1, max_queue_size = data_count, workers = 1, use_multiprocessing = False)
+			print(r)
+		except StopIteration:
+			print('[Error] Model Test Error. please check data format.')
+
+	def Predict(self,x):
+		'''
+		Run the model on the single batch `x`, print the result and return it.
+		'''
+		batch_result = self._model.predict_on_batch(x)
+		print(batch_result)
+		return batch_result
+		
+	def decode_batch(self, test_func, word_batch):
+		'''
+		Greedy-decode a batch: take the arg-max label per step, collapse repeats, convert to text.
+		
+		Parameters:
+			test_func: callable taking [batch] and returning a list whose first item is the
+				network output for that batch
+			word_batch: the input batch handed to test_func
+		
+		Returns a list with one decoded item per sample in the batch.
+		
+		NOTE(review): labels_to_text is not defined in the visible part of this file -
+		confirm where it comes from before relying on this method.
+		'''
+		import itertools # imported locally: no module-level import of itertools is visible in this file
+		
+		out = test_func([word_batch])[0]
+		ret = []
+		for j in range(out.shape[0]):
+			# Skip the first two rows of this sample's output, then pick the best label per row.
+			out_best = list(np.argmax(out[j, 2:], 1))
+			# Collapse runs of identical consecutive labels into one.
+			out_best = [k for k, _ in itertools.groupby(out_best)]
+			outstr = labels_to_text(out_best)
+			ret.append(outstr)
+		return ret
+	
+	def show_edit_distance(self, num):
+		'''
+		Print the mean and the length-normalized mean edit distance over `num` decoded samples.
+		
+		num: number of samples to evaluate
+		
+		NOTE(review): this method reads self.text_img_gen and uses an `editdistance` module;
+		neither is set up or imported in the visible part of this file - confirm before calling.
+		'''
+		num_left = num
+		mean_norm_ed = 0.0
+		mean_ed = 0.0
+		while num_left > 0:
+			word_batch = next(self.text_img_gen)[0]
+			num_proc = min(word_batch['the_input'].shape[0], num_left)
+			# decode_batch is a method of this class, so it must be called through self.
+			decoded_res = self.decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
+			for j in range(num_proc):
+				edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
+				mean_ed += float(edit_dist)
+				mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
+			num_left -= num_proc
+		mean_norm_ed = mean_norm_ed / num
+		mean_ed = mean_ed / num
+		print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
+				% (num, mean_ed, mean_norm_ed))
+	
+	def RecognizeSpeech(self, wavsignal, fs):
+		'''
+		Intended entry point for speech recognition on one wav signal.
+		The original author notes that this function still has bugs.
+		
+		Parameters:
+			wavsignal: the wav signal, passed to DataSpeech.GetFrequencyFeature
+			fs: second argument passed to GetFrequencyFeature (presumably the sample rate - TODO confirm)
+		
+		Returns the list [labels, pred], where labels holds the first symbol of the symbol list
+		and pred is the result of self._model.predict.
+		
+		NOTE(review): self._model is the training model whose output is the 'ctc' Lambda layer,
+		so pred is that layer's output rather than a decoded symbol sequence.
+		'''
+		
+		# NOTE(review): a new DataSpeech with a hard-coded path is created here instead of using self.data.
+		#data = self.data
+		data = DataSpeech('E:\\语音数据集')
+		data.LoadDataList('dev')
+		# Extract the input features
+		#data_input = data.GetMfccFeature(wavsignal, fs)
+		data_input = data.GetFrequencyFeature(wavsignal, fs)
+		
+		arr_zero = np.zeros((1, 200), dtype=np.int16) # a row vector of zeros; only used by the disabled padding code below
+		
+		#import matplotlib.pyplot as plt
+		#plt.subplot(111)
+		#plt.imshow(data_input, cmap=plt.get_cmap('gray'))
+		#plt.show()
+		
+		# Disabled: pad the input with zero rows up to a length of 1600.
+		#while(len(data_input)<1600):
+		#	data_input = np.row_stack((data_input,arr_zero))
+		#print(len(data_input))
+		
+		list_symbol = data.list_symbol # the symbol (pinyin) list
+		
+		# A single placeholder label: the first symbol in the list.
+		labels = [ list_symbol[0] ]
+		#while(len(labels) < 64):
+		#	labels.append('')
+			
+		labels_num = []
+		for i in labels:
+			labels_num.append(data.SymbolToNum(i))
+		
+		
+		
+		data_input = np.array(data_input, dtype=np.int16)
+		data_input = data_input.reshape(data_input.shape[0],data_input.shape[1])
+		
+		labels_num = np.array(labels_num, dtype=np.int16)
+		labels_num = labels_num.reshape(labels_num.shape[0])
+		
+		# Length after the two pool-by-2 layers, minus 3 (reason for the offset is not shown here - TODO confirm).
+		input_length = np.array([data_input.shape[0] // 4 - 3], dtype=np.int16)
+		input_length = np.array(input_length)
+		input_length = input_length.reshape(input_length.shape[0])
+		
+		label_length = np.array([labels_num.shape[0]], dtype=np.int16)
+		label_length = np.array(label_length)
+		label_length = label_length.reshape(label_length.shape[0])
+		
+		# NOTE(review): none of these four arrays has a leading batch dimension, and they have
+		# different shapes, so np.array(x) below does not yield a regular array - verify.
+		x = [data_input, labels_num, input_length, label_length]
+		#x = next(data.data_genetator(1, self.AUDIO_LENGTH))
+		#x = kr.utils.np_utils.to_categorical(x)
+		
+		print(x)
+		x=np.array(x)
+		
+		pred = self._model.predict(x=x)
+		#pred = self._model.predict_on_batch([data_input, labels_num, input_length, label_length])
+		return [labels,pred]
+		
+		pass
+		
+	def RecognizeSpeech_FromFile(self, filename):
+		'''
+		Read the wav file `filename` and return the result of RecognizeSpeech on its contents.
+		'''
+		wav_signal, wav_fs = read_wav_data(filename)
+		result = self.RecognizeSpeech(wav_signal, wav_fs)
+		return result
+	
+	@property
+	def model(self):
+		'''
+		Read-only access to the Keras model stored in self._model.
+		'''
+		keras_model = self._model
+		return keras_model
+
+
+if(__name__=='__main__'):
+	# Directory for saved weights; create it up front so that saving later cannot fail on a missing folder.
+	modelpath = 'model_speech'
+	if(not os.path.exists(modelpath)):
+		os.makedirs(modelpath)
+	
+	# The dataset location and the path separator depend on the host operating system.
+	system_type = plat.system()
+	if(system_type == 'Windows'):
+		datapath = 'E:\\语音数据集'
+		path_sep = '\\'
+	else:
+		if(system_type != 'Linux'):
+			print('*[Message] Unknown System\n')
+		datapath = 'dataset'
+		path_sep = '/'
+	modelpath = modelpath + path_sep
+	
+	ms = ModelSpeech(datapath)
+	
+	#ms.LoadModel(modelpath + 'speech_model_e_0_step_1.model')
+	ms.TrainModel(datapath, epoch = 2, batch_size = 8, save_step = 1)
+	#ms.TestModel(datapath, str_dataset='dev', data_count = 32)
+	#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\wav\\test\\D4\\D4_750.wav')
+	#print('*[提示] 语音识别结果：\n',r)