#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2016-2099 Ailemon.net
#
# This file is part of ASRT Speech Recognition Tool.
#
# ASRT is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# ASRT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ASRT. If not, see <https://www.gnu.org/licenses/>.
# ============================================================================
"""
|
|
|
|
|
@author: nl8590687
|
|
|
|
|
ASRT语音识别的语言模型
|
|
|
|
|
|
|
|
|
|
基于N-Gram的语言模型
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import os
from utils.ops import get_symbol_dict, get_language_model
class ModelLanguage:
    """
    N-Gram language model dedicated to ASRT.

    Holds a pinyin-to-character dictionary plus unigram and bigram
    frequency tables, and decodes pinyin sequences into Chinese text
    with a beam search over bigram transition probabilities.
    """

    def __init__(self, model_path: str):
        # Directory holding the n-gram frequency table files.
        self.model_path = model_path
        # pinyin symbol -> list of candidate Chinese characters
        self.dict_pinyin = dict()
        # unigram table: single character -> frequency count
        self.model1 = dict()
        # bigram table: two-character string -> frequency count
        self.model2 = dict()

    def load_model(self):
        """
        Load the N-Gram language model into memory.

        Reads the pinyin dictionary from 'dict.txt' (relative to the
        working directory — presumably by project convention; verify
        against callers) and the unigram/bigram tables from
        ``model_path``.

        Returns:
            tuple: (dict_pinyin, model1, model2)
        """
        self.dict_pinyin = get_symbol_dict('dict.txt')
        self.model1 = get_language_model(os.path.join(self.model_path, 'language_model1.txt'))
        self.model2 = get_language_model(os.path.join(self.model_path, 'language_model2.txt'))
        return (self.dict_pinyin, self.model1, self.model2)

    def pinyin_to_text(self, list_pinyin: list, beam_size: int = 100) -> str:
        """
        Convert a whole pinyin sequence to text in a single pass.

        Feeds each syllable through the streaming decoder; whenever the
        beam dies (no bigram continues it), the best hypothesis so far
        is committed to the output and decoding restarts from scratch.

        Args:
            list_pinyin: sequence of pinyin symbols, e.g. ['ni3', 'hao3']
            beam_size: maximum number of hypotheses kept per step
        Returns:
            str: the decoded Chinese text
        """
        pieces = list()
        beam = list()
        for pinyin_item in list_pinyin:
            decoded = self.pinyin_stream_decode(beam, pinyin_item, beam_size)
            if not decoded and beam:
                # Beam died on this syllable: flush the best hypothesis,
                # then try this syllable again as the start of a new run.
                pieces.append(beam[0][0])
                decoded = self.pinyin_stream_decode([], pinyin_item, beam_size)
                if decoded:
                    pieces.append(decoded[0][0])
                decoded = []
            beam = decoded
        # Flush whatever hypothesis survived to the end of the input.
        if beam:
            pieces.append(beam[0][0])
        return ''.join(pieces)

    def pinyin_stream_decode(self, temple_result: list,
                             item_pinyin: str,
                             beam_size: int = 100) -> list:
        """
        Streaming pinyin decode: extend the beam by one syllable and
        return the intermediate result.

        Args:
            temple_result: current beam, a list of [text, probability]
                pairs ([] starts a fresh beam)
            item_pinyin: the next pinyin symbol to decode
            beam_size: maximum number of hypotheses to keep
        Returns:
            list: updated beam of [text, probability] pairs, sorted by
            probability descending (the seed beam is unsorted); [] when
            the pinyin is unknown or no bigram continues the beam
        """
        # Unknown pinyin: nothing to decode.
        if item_pinyin not in self.dict_pinyin:
            return []

        # All characters this pinyin may map to.
        candidate_words = self.dict_pinyin[item_pinyin]

        # First syllable: seed every candidate with probability 1.0.
        if not temple_result:
            return [[word, 1.0] for word in candidate_words]

        # Extend every kept hypothesis with every candidate character.
        extended = list()
        for hypothesis in temple_result:
            prefix, prefix_prob = hypothesis[0], hypothesis[1]
            for candidate in candidate_words:
                # Two-character window: last emitted char + candidate.
                bigram = prefix[-1] + candidate
                # Skip transitions the bigram table has never seen.
                if bigram not in self.model2:
                    continue
                # Transition probability = count(bigram) / count(unigram).
                bigram_count = float(self.model2[bigram])
                unigram_count = float(self.model1[bigram[-2]])
                probability = prefix_prob * bigram_count / unigram_count
                extended.append([prefix + candidate, probability])

        # Keep only the most probable hypotheses.
        extended.sort(key=lambda hyp: hyp[1], reverse=True)
        if len(extended) > beam_size:
            return extended[0:beam_size]
        return extended

if __name__ == '__main__':
    # Manual smoke test: load the model and decode a fixed pinyin sequence.
    language_model = ModelLanguage('model_language')
    language_model.load_model()

    pinyin_sequence = ['zhe4', 'zhen1', 'shi4', 'ji2', 'hao3', 'de5']
    decoded_text = language_model.pinyin_to_text(pinyin_sequence)
    print('语音转文字结果:\n', decoded_text)