ASRT_SpeechRecognition/utils/ops.py

182 lines
5.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2016-2099 Ailemon.net
#
# This file is part of ASRT Speech Recognition Tool.
#
# ASRT is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# ASRT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ASRT. If not, see <https://www.gnu.org/licenses/>.
# ============================================================================
"""
@author: nl8590687
一些常用操作函数的定义
"""
import wave
import difflib
import matplotlib.pyplot as plt
import numpy as np
def read_wav_data(filename: str) -> tuple:
    """
    Read a WAV file and return its samples as a channel-major array.

    The raw frame bytes are always interpreted as ``np.short`` samples,
    whatever sample width the file header reports.

    Args:
        filename: Path of the WAV file to read.

    Returns:
        A tuple ``(wave_data, framerate, num_channel, num_sample_width)``.
        ``wave_data`` is a writable 2-D array with one row per channel;
        the remaining items are the frame rate, the channel count and the
        sample width in bytes as reported by the file header.
    """
    # The context manager closes the stream even when a read raises.
    with wave.open(filename, "rb") as wav:
        num_frame = wav.getnframes()
        num_channel = wav.getnchannels()
        framerate = wav.getframerate()
        num_sample_width = wav.getsampwidth()
        str_data = wav.readframes(num_frame)  # read every frame at once

    # np.frombuffer replaces the deprecated binary mode of np.fromstring.
    # It returns a read-only view over the bytes, so copy to keep the
    # result writable as it was before.
    wave_data = np.frombuffer(str_data, dtype=np.short).copy()
    # Frames are interleaved: one row per frame, one column per channel;
    # transposing yields one row per channel.
    wave_data = wave_data.reshape(-1, num_channel).T
    return wave_data, framerate, num_channel, num_sample_width
def read_wav_bytes(filename: str) -> tuple:
    """
    Read a WAV file and return its raw, undecoded frame bytes.

    Args:
        filename: Path of the WAV file to read.

    Returns:
        A tuple ``(str_data, framerate, num_channel, num_sample_width)``.
        ``str_data`` is the ``bytes`` object holding every frame of the
        file; the remaining items are the frame rate, the channel count
        and the sample width in bytes as reported by the file header.
    """
    # The context manager closes the stream even when a read raises.
    with wave.open(filename, "rb") as wav:
        num_frame = wav.getnframes()
        num_channel = wav.getnchannels()
        framerate = wav.getframerate()
        num_sample_width = wav.getsampwidth()
        str_data = wav.readframes(num_frame)  # read every frame at once
    return str_data, framerate, num_channel, num_sample_width
def get_edit_distance(str1, str2) -> int:
    """
    Compute an edit distance between two sequences (str or list).

    The cost is accumulated from the opcodes produced by
    ``difflib.SequenceMatcher``: a replacement costs the length of the
    longer of the two spans, an insertion costs the number of inserted
    items and a deletion costs the number of removed items.

    NOTE(review): the result is only as small as difflib's edit script;
    whether it always equals the true Levenshtein distance should be
    confirmed against the difflib documentation.
    """
    matcher = difflib.SequenceMatcher(None, str1, str2)
    # For 'insert' the first span is empty and for 'delete' the second one
    # is, so max() of both span lengths gives the per-opcode cost for every
    # non-'equal' tag.
    return sum(
        max(a_end - a_start, b_end - b_start)
        for opcode, a_start, a_end, b_start, b_end in matcher.get_opcodes()
        if opcode != 'equal'
    )
def ctc_decode_delete_tail_blank(ctc_decode_list):
    """
    Drop the blank padding left at the end of a CTC-decoded sequence.

    The sequence is cut at the first element equal to ``-1``; everything
    before it is returned. When no ``-1`` is present the whole sequence
    is returned as a slice.
    """
    for position, token in enumerate(ctc_decode_list):
        if token != -1:
            continue
        # First blank marker found: keep only what precedes it.
        return ctc_decode_list[:position]
    return ctc_decode_list[:]
def visual_1D(points_list, frequency=1):
    """
    Plot 1-D data as a line chart.

    The x coordinate of each point is its index divided by ``frequency``.
    """
    num_points = len(points_list)
    # Index positions 0 .. num_points-1, scaled by the given frequency.
    x_axis = np.linspace(0, num_points - 1, num_points) / frequency

    # A figure holding a single subplot.
    figure, axes = plt.subplots(1)
    axes.plot(x_axis, points_list)
    figure.show()
def visual_2D(img):
    """
    Display 2-D data as an image with a colorbar.
    """
    plt.subplot(1, 1, 1)  # single subplot filling the figure
    plt.imshow(img)
    # cax and ax are left at their None defaults; only the bar is shrunk.
    plt.colorbar(shrink=0.5)
    plt.show()
def decode_wav_bytes(samples_data: bytes, channels: int = 1, byte_width: int = 2) -> np.ndarray:
    """
    Decode a byte stream of WAV sample points into a numpy array.

    Args:
        samples_data: Raw interleaved sample bytes.
        channels: Number of interleaved channels in ``samples_data``.
        byte_width: Bytes per sample; 2 (16-bit) and 4 (32-bit) are
            supported.

    Returns:
        A writable 2-D integer array with one row per channel.

    Raises:
        ValueError: If ``byte_width`` is neither 2 nor 4.
    """
    if byte_width == 2:
        numpy_type = np.int16
    elif byte_width == 4:
        # The former ``np.int`` alias has been removed from NumPy and was a
        # 64-bit type on common platforms; 4-byte samples need a 32-bit
        # integer.
        numpy_type = np.int32
    else:
        raise ValueError(f'error: unsurpport byte width `{byte_width}`')

    # np.frombuffer replaces the deprecated binary mode of np.fromstring.
    # It returns a read-only view over the bytes, so copy to keep the
    # result writable as it was before.
    wave_data = np.frombuffer(samples_data, dtype=numpy_type).copy()
    # Samples are interleaved: one row per frame, one column per channel;
    # transposing yields one row per channel.
    return wave_data.reshape(-1, channels).T
def get_symbol_dict(dict_filename):
    """
    Read the pinyin-to-character dictionary file.

    Each non-empty line is expected to hold a pinyin key, a tab, and the
    characters for that key. Columns after the second one are ignored.
    Lines without a tab are skipped, in the same way as
    ``get_language_model`` treats them.

    Args:
        dict_filename: Path of the UTF-8 encoded dictionary file.

    Returns:
        A dict mapping each pinyin key to the list of single characters
        found in the second column of its line.
    """
    dic_symbol = {}
    # The context manager closes the file even when reading raises.
    with open(dict_filename, 'r', encoding='UTF-8') as txt_obj:
        for raw_line in txt_obj:
            line = raw_line.rstrip('\n')
            if line == '':
                continue
            txt_l = line.split('\t')
            if len(txt_l) == 1:
                # No tab on this line, hence no character column to read.
                continue
            dic_symbol[txt_l[0]] = list(txt_l[1])
    return dic_symbol
def get_language_model(model_language_filename):
    """
    Read a language model file.

    Each non-empty line is expected to hold a key, a tab, and a value.
    Columns after the second one are ignored and lines without a tab are
    skipped.

    Args:
        model_language_filename: Path of the UTF-8 encoded model file.

    Returns:
        A dict mapping the first column of each line to its second column,
        both kept as strings.
    """
    dic_model = {}
    # The context manager closes the file even when reading raises.
    with open(model_language_filename, 'r', encoding='UTF-8') as txt_obj:
        for raw_line in txt_obj:
            line = raw_line.rstrip('\n')
            if line == '':
                continue
            txt_l = line.split('\t')
            if len(txt_l) == 1:
                # No tab on this line, hence no value column to read.
                continue
            dic_model[txt_l[0]] = txt_l[1]
    return dic_model
def ctc_decode_stream(tokens):
    """
    Extract the next non-blank token from a CTC token stream.

    Runs of identical consecutive tokens are collapsed and ``-1`` entries
    (blanks) are skipped.

    Args:
        tokens: Sequence of token ids in which ``-1`` marks a blank.

    Returns:
        A tuple ``(token, rest)``. ``token`` is the first non-blank token
        and ``rest`` is the part of ``tokens`` after the run it belongs
        to. When that run reaches the end of ``tokens``, ``rest`` is an
        empty list. When no non-blank token exists, ``(-1, [])`` is
        returned.
    """
    i = 0
    while i < len(tokens):
        # Advance to the last element of the current run of equal tokens.
        while i + 1 < len(tokens) and tokens[i] == tokens[i + 1]:
            i += 1
        if i + 1 == len(tokens) and tokens[i] != -1:
            # The run ends the stream: emit this run's token. Returning
            # tokens[0] here would report a leading blank instead and
            # silently drop the token (e.g. for [-1, 5, 5]).
            return tokens[i], []
        if tokens[i] != -1:
            return tokens[i], tokens[i + 1:]
        i += 1
    return -1, []