bug修复以及一些尝试

This commit is contained in:
nl8590687 2018-04-05 21:56:28 +08:00
parent 8b9d086ab2
commit 1398b068a3
2 changed files with 47 additions and 26 deletions

View File

@ -118,11 +118,11 @@ class ModelSpeech(): # 语音模型类
ada_d = Adadelta(lr = 0.01, rho = 0.95, epsilon = 1e-06)
#model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = sgd, metrics=['accuracy'])
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = ada_d, metrics=['accuracy'])
model.compile(loss={'ctc': lambda labels, y_pred: y_pred}, optimizer = ada_d, metrics=['accuracy'])
# captures output of softmax so we can decode the output during visualization
test_func = K.function([input_data], [y_pred])
self.test_func = K.function([input_data], [y_pred])
print('[*提示] 创建模型成功,模型编译成功')
return model
@ -210,6 +210,16 @@ class ModelSpeech(): # 语音模型类
return r
pass
def decode_batch(test_func, word_batch):
    """Decode every sample of a batch into its text form.

    ``test_func`` is called once with ``[word_batch]`` and the first element
    of its result is used as the output array. For each entry along the
    first axis, the first two positions of the next axis are skipped, the
    arg-max over axis 1 is taken, runs of consecutive equal indices are
    collapsed to one, and the remaining indices are passed to
    ``labels_to_text``.

    NOTE(review): presumably the output is a (batch, time, classes) softmax
    array and ``labels_to_text`` maps class indices to a string -- confirm
    against the model definition and that helper.

    Args:
        test_func: callable taking a one-element list holding the batch and
            returning a sequence whose first item is the output array.
        word_batch: batch of inputs forwarded unchanged to ``test_func``.

    Returns:
        A list with one ``labels_to_text`` result per batch entry.
    """
    softmax_out = test_func([word_batch])[0]

    def _decode_one(index):
        # Best class per position, ignoring the first two positions.
        frame_labels = list(np.argmax(softmax_out[index, 2:], axis=1))
        # Keep a single representative for each run of repeated indices.
        collapsed = [label for label, _ in itertools.groupby(frame_labels)]
        return labels_to_text(collapsed)

    return [_decode_one(index) for index in range(softmax_out.shape[0])]
def RecognizeSpeech(self, wavsignal, fs):
'''
最终做语音识别用的函数识别一个wav序列的语音
@ -220,24 +230,25 @@ class ModelSpeech(): # 语音模型类
data = DataSpeech('E:\\语音数据集')
data.LoadDataList('dev')
# 获取输入特征
data_input = data.GetMfccFeature(wavsignal, fs)
#data_input = data.GetMfccFeature(wavsignal, fs)
data_input = data.GetFrequencyFeature(wavsignal, fs)
arr_zero = np.zeros((1, 39), dtype=np.int16) #一个全是0的行向量
arr_zero = np.zeros((1, 200), dtype=np.int16) #一个全是0的行向量
import matplotlib.pyplot as plt
plt.subplot(111)
plt.imshow(data_input.T, cmap=plt.get_cmap('gray'))
plt.show()
#import matplotlib.pyplot as plt
#plt.subplot(111)
#plt.imshow(data_input, cmap=plt.get_cmap('gray'))
#plt.show()
while(len(data_input)<1600): #长度不够时补全到1600
data_input = np.row_stack((data_input,arr_zero))
#while(len(data_input)<1600): #长度不够时补全到1600
# data_input = np.row_stack((data_input,arr_zero))
#print(len(data_input))
list_symbol = data.list_symbol # 获取拼音列表
labels = [ list_symbol[0] ]
while(len(labels) < 64):
labels.append('')
#while(len(labels) < 64):
# labels.append('')
labels_num = []
for i in labels:
@ -245,18 +256,28 @@ class ModelSpeech(): # 语音模型类
#data_input = np.array([data_input], dtype=np.int16)
#labels_num = np.array([labels_num], dtype=np.int16)
data_input = np.array(data_input, dtype=np.int16)
data_input = data_input.reshape(data_input.shape[0],data_input.shape[1])
labels_num = np.array(labels_num, dtype=np.int16)
labels_num = labels_num.reshape(labels_num.shape[0])
input_length = np.array([data_input.shape[0] // 4 - 3], dtype=np.int16)
label_length = np.array([64], dtype=np.int16)
x = data_input, labels_num, input_length, label_length
input_length = np.array(input_length)
input_length = input_length.reshape(input_length.shape[0])
label_length = np.array([labels_num.shape[0]], dtype=np.int16)
label_length = np.array(label_length)
label_length = label_length.reshape(label_length.shape[0])
x = [data_input, labels_num, input_length, label_length]
#x = next(data.data_genetator(1, self.AUDIO_LENGTH))
#x = kr.utils.np_utils.to_categorical(x)
print(x)
x=np.array(x)
pred = self._model.predict(x[0], batch_size = None)
pred = self._model.predict(x=x)
#pred = self._model.predict_on_batch([data_input, labels_num, input_length, label_length])
return [labels,pred]
@ -286,7 +307,7 @@ if(__name__=='__main__'):
if(not os.path.exists(modelpath)): # 判断保存模型的目录是否存在
os.makedirs(path) # 如果不存在,就新建一个,避免之后保存模型的时候炸掉
os.makedirs(modelpath) # 如果不存在,就新建一个,避免之后保存模型的时候炸掉
system_type = plat.system() # 由于不同的系统的文件路径表示不一样,需要进行判断
if(system_type == 'Windows'):

View File

@ -14,7 +14,7 @@ from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Reshape # , Flatten,LSTM,Convolution1D,MaxPooling1D,Merge
from keras.layers import Conv1D,LSTM,MaxPooling1D, Lambda, TimeDistributed, Activation,Conv2D, MaxPooling2D #, Merge,Conv1D
from keras import backend as K
from keras.optimizers import SGD
from keras.optimizers import SGD, Adadelta
from readdata2 import DataSpeech
from neural_network.ctc_layer import ctc_layer
@ -77,9 +77,6 @@ class ModelSpeech(): # 语音模型类
#model_data.summary()
#layer_out = ctc_layer(64, self.BATCH_SIZE)(layer_h6) # CTC层 可能有bug
#layer_out = ctc_layer(1283, 32)(layer_h6) # CTC层 可能有bug
#labels = Input(name='the_labels', shape=[60], dtype='float32')
labels = Input(name='the_labels', shape=[self.label_max_string_length], dtype='float32')
@ -91,15 +88,18 @@ class ModelSpeech(): # 语音模型类
#layer_out = Lambda(ctc_lambda_func,output_shape=(self.MS_OUTPUT_SIZE, ), name='ctc')([y_pred, labels, input_length, label_length])#(layer_h6) # CTC
loss_out = Lambda(self.ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
# clipnorm seems to speeds up convergence
sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
model.summary()
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd, metrics=["accuracy"])
# clipnorm seems to speeds up convergence
sgd = SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
#ada_d = Adadelta(lr = 0.0001, rho = 0.95, epsilon = 1e-06)
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd, metrics=["accuracy"])
#model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = ada_d, metrics=['accuracy'])
# captures output of softmax so we can decode the output during visualization
@ -198,7 +198,7 @@ if(__name__=='__main__'):
if(not os.path.exists(modelpath)): # 判断保存模型的目录是否存在
os.makedirs(path) # 如果不存在,就新建一个,避免之后保存模型的时候炸掉
os.makedirs(modelpath) # 如果不存在,就新建一个,避免之后保存模型的时候炸掉
system_type = plat.system() # 由于不同的系统的文件路径表示不一样,需要进行判断
if(system_type == 'Windows'):