2018-04-21 20:50:53 +08:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
"""
@author: nl8590687
Language model for speech recognition.
A language model based on a Markov model.
"""
|
2018-04-25 19:50:32 +08:00
|
|
|
|
import platform as plat
|
2018-04-21 20:50:53 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ModelLanguage():  # Language model class
    """Convert pinyin syllable sequences to text with a Markov-chain model.

    The model data is held in four attributes that ``LoadModel`` fills in:

    * ``dict_pinyin`` - pinyin syllable -> list of candidate characters
    * ``model1``      - character -> occurrence count (stored as a string)
    * ``model2``      - two-character string -> occurrence count (string)
    * ``pinyin``      - "syllable syllable" pair -> occurrence count (string)
    """

    def __init__(self, modelpath):
        """Remember the model directory and pick the path separator.

        Args:
            modelpath: Directory that holds the language model files. A
                trailing separator is appended when it is missing. An empty
                string is left untouched, so file names are then resolved
                relative to the current working directory.
        """
        self.modelpath = modelpath

        # File paths are written differently on different systems.
        system_type = plat.system()
        if system_type == 'Windows':
            self.slash = '\\'
        elif system_type == 'Linux':
            self.slash = '/'
        else:
            print('*[Message] Unknown System\n')
            self.slash = '/'

        # Append a separator to the directory path. The emptiness guard
        # avoids indexing into an empty string.
        if self.modelpath and not self.modelpath.endswith(self.slash):
            self.modelpath = self.modelpath + self.slash

    def LoadModel(self):
        """Load every model file and return the three core tables.

        ``dict.txt`` is opened by its bare name (relative to the working
        directory); the other files are looked up inside ``modelpath``.

        Returns:
            The tuple ``(dict_pinyin, model1, model2)``.
        """
        self.dict_pinyin = self.GetSymbolDict('dict.txt')
        self.model1 = self.GetLanguageModel(self.modelpath + 'language_model1.txt')
        self.model2 = self.GetLanguageModel(self.modelpath + 'language_model2.txt')
        self.pinyin = self.GetPinyin(self.modelpath + 'dic_pinyin.txt')
        return (self.dict_pinyin, self.model1, self.model2)

    def SpeechToText(self, list_syllable):
        """Turn recognised pinyin symbols into the final text.

        The syllables are cut into runs: a run keeps growing while each pair
        of neighbouring syllables is a key of ``self.pinyin``. Every run is
        decoded on its own and the best candidates are concatenated.

        Args:
            list_syllable: Sequence of pinyin syllable strings.

        Returns:
            The decoded text, or ``''`` when no syllables were given.
        """
        length = len(list_syllable)
        if length == 0:  # no pinyin was passed in
            return ''

        pieces = []
        # Start with the first syllable of the list.
        str_tmp = [list_syllable[0]]
        for i in range(length - 1):
            # Look at each pair of consecutive syllables in turn.
            str_split = list_syllable[i] + ' ' + list_syllable[i + 1]
            if str_split in self.pinyin:
                # Known transition: the second syllable joins the run.
                str_tmp.append(list_syllable[i + 1])
            else:
                # Unknown transition: decode the run collected so far and
                # start a new run at syllable i + 1.
                pieces.append(self._best_candidate(str_tmp))
                str_tmp = [list_syllable[i + 1]]

        # Decode whatever is left in the final run.
        pieces.append(self._best_candidate(str_tmp))
        return ''.join(pieces)

    def _best_candidate(self, syllables):
        """Return the top-ranked text for one run, or '' if nothing decodes."""
        candidates = self.decode(syllables, 0.0000)
        return candidates[0][0] if candidates else ''

    def decode(self, list_syllable, yuzhi=0.0001):
        """Convert pinyin to text using a Markov chain.

        Args:
            list_syllable: Sequence of pinyin syllable strings.
            yuzhi: Pruning threshold. A candidate that is ``i + 1`` characters
                long is kept only if its probability is at least
                ``yuzhi ** i``.

        Returns:
            A list of ``[text, probability]`` lists ordered by descending
            probability. Decoding stops at the first syllable that is not in
            ``self.dict_pinyin``; the candidates built up to that point are
            returned.
        """
        list_words = []
        for i, syllable in enumerate(list_syllable):
            if syllable not in self.dict_pinyin:
                break
            # All characters that this syllable can stand for.
            ls = self.dict_pinyin[syllable]

            if i == 0:
                # Initial state of the Markov model: every candidate for the
                # first character starts with probability 1.0.
                list_words = [[word, 1.0] for word in ls]
                continue

            threshold = pow(yuzhi, i)
            list_words_2 = []
            for text, prob in list_words:
                # Try to extend each existing phrase with every character the
                # next syllable may correspond to.
                for word in ls:
                    candidate = text + word
                    tmp_words = candidate[-2:]  # last two characters
                    if tmp_words not in self.model2:
                        # Transition not in the table: probability zero.
                        continue
                    # A bigram whose leading character has no usable unigram
                    # count cannot be scored, so the candidate is dropped
                    # instead of raising KeyError / ZeroDivisionError.
                    unigram = float(self.model1.get(tmp_words[0], 0))
                    if unigram == 0:
                        continue
                    # Core step: multiply the running probability by the
                    # transition probability, which simplifies to
                    # count(previous + current) / count(previous).
                    new_prob = prob * float(self.model2[tmp_words]) / unigram
                    if new_prob >= threshold:
                        # Keep candidates at or above the threshold.
                        list_words_2.append([candidate, new_prob])
            list_words = list_words_2

        # Stable sort: among equal probabilities the earlier candidate wins.
        list_words.sort(key=lambda item: item[1], reverse=True)
        return list_words

    @staticmethod
    def _read_lines(filename):
        """Read a UTF-8 text file and return its non-empty lines."""
        with open(filename, 'r', encoding='UTF-8') as file_obj:
            text = file_obj.read()
        return [line for line in text.split('\n') if line != '']

    def GetSymbolDict(self, dictfilename):
        """Read the pinyin-to-character dictionary file.

        Each line is ``pinyin<TAB>characters``; every character of the second
        field becomes one candidate. Lines without a tab are skipped.

        Returns:
            Dict mapping each pinyin to its list of characters.
        """
        dic_symbol = {}
        for line in self._read_lines(dictfilename):
            fields = line.split('\t')
            if len(fields) < 2:
                continue
            dic_symbol[fields[0]] = list(fields[1])
        return dic_symbol

    def GetLanguageModel(self, modelLanFilename):
        """Read a language model file.

        Each line is ``key<TAB>count``. Lines without a tab are skipped.

        Returns:
            Dict mapping each key to its count, kept as a string.
        """
        dic_model = {}
        for line in self._read_lines(modelLanFilename):
            fields = line.split('\t')
            if len(fields) < 2:
                continue
            dic_model[fields[0]] = fields[1]
        return dic_model

    def GetPinyin(self, filename):
        """Read the pinyin transition file.

        Each line is ``pinyin<TAB>count``. Only the first occurrence of a
        pinyin key is considered, and it is kept only when its count is
        greater than 1. Lines without a tab are skipped.

        Returns:
            Dict mapping each kept pinyin key to its count as a string.
        """
        dic = {}
        for line in self._read_lines(filename):
            pinyin_split = line.split('\t')
            if len(pinyin_split) < 2:
                continue
            list_pinyin = pinyin_split[0]
            if list_pinyin not in dic and int(pinyin_split[1]) > 1:
                dic[list_pinyin] = pinyin_split[1]
        return dic
|
2018-04-21 20:50:53 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Small manual demo: load the model files and convert one pinyin
    # sequence to text.
    language_model = ModelLanguage('model_language')
    language_model.LoadModel()

    # Other sample inputs that can be swapped in for manual experiments:
    #   ['zhe4', 'zhen1', 'shi4', 'ji2', 'hao3', 'de5']
    #   ['jin1', 'tian1', 'shi4', 'xing1', 'qi1', 'san1']
    #   ['ni3', 'hao3', 'a1']
    #   ['wo3', 'dui4', 'shi4', 'mei2', 'cuo4', 'ni3', 'hao3']
    #   ['wo3', 'dui4', 'shi4', 'tian1', 'mei2', 'na5', 'li3', 'hai4']
    #   ['ba3', 'zhe4', 'xie1', 'zuo4', 'wan2', 'wo3', 'jiu4', 'qu4',
    #    'shui4', 'jiao4']
    #   ['wo3', 'qu4', 'a4', 'mei2', 'shi4', 'er2', 'la1']
    #   ['wo3', 'men5', 'qun2', 'li3', 'xiong1', 'di4', 'jian4', 'mei4',
    #    'dou1', 'zai4', 'shuo1']
    #   ['su1', 'an1', 'ni3', 'sui4', 'li4', 'yun4', 'sui2', 'cong2',
    #    'jiao4', 'ming2', 'tao2', 'qi3', 'yu2', 'peng2', 'ya4', 'yang4',
    #    'chao1', 'dao3', 'jiang1', 'li3', 'yuan2', 'kang1', 'zhua1', 'zou3']
    #   ['da4', 'jia1', 'hao3']
    syllables = ['kao3', 'yan2', 'yan1', 'yu3', 'ci2', 'hui4']

    # language_model.decode(syllables) can be called here instead of
    # SpeechToText.
    text = language_model.SpeechToText(syllables)
    print('语音转文字结果:\n', text)
|
2018-04-21 20:50:53 +08:00
|
|
|
|
|