229 lines
8.8 KiB
Python
229 lines
8.8 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
#
|
||
# Copyright 2016-2099 Ailemon.net
|
||
#
|
||
# This file is part of ASRT Speech Recognition Tool.
|
||
#
|
||
# ASRT is free software: you can redistribute it and/or modify
|
||
# it under the terms of the GNU General Public License as published by
|
||
# the Free Software Foundation, either version 3 of the License, or
|
||
# (at your option) any later version.
|
||
# ASRT is distributed in the hope that it will be useful,
|
||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
# GNU General Public License for more details.
|
||
#
|
||
# You should have received a copy of the GNU General Public License
|
||
# along with ASRT. If not, see <https://www.gnu.org/licenses/>.
|
||
# ============================================================================
|
||
|
||
"""
|
||
@author: nl8590687
|
||
ASRT语音识别内置声学特征提取模块,定义了几个常用的声学特征类
|
||
"""
|
||
|
||
import random
|
||
import numpy as np
|
||
from scipy.fftpack import fft
|
||
from .base import mfcc, delta, logfbank
|
||
|
||
|
||
class SpeechFeatureMeta:
    """
    Base class for all acoustic feature extraction classes in ASRT.

    Subclasses are expected to override `run()`; the base implementation
    only raises.
    """

    def __init__(self, framesamplerate=16000):
        # Stored as-is; no validation is performed here.
        self.framesamplerate = framesamplerate

    def run(self, wavsignal, fs=16000):
        """
        Placeholder for the feature extraction entry point.

        :param wavsignal: input audio signal (unused in the base class).
        :param fs: sample rate argument (unused in the base class).
        :raises NotImplementedError: always, because the base class has no implementation.
        """
        message = '[ASRT] `run()` method is not implemented.'
        raise NotImplementedError(message)
|
||
|
||
|
||
class MFCC(SpeechFeatureMeta):
    """
    Built-in MFCC acoustic feature extraction class of ASRT.

    Compute MFCC features from an audio signal.

    :param framesamplerate: the sample rate of the signal we are working with, in Hz.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstrum to return, default 13
    :param nfilt: the number of filters in the filterbank, default 26.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    """

    def __init__(self, framesamplerate=16000,
                 winlen=0.025,
                 winstep=0.01,
                 numcep=13,
                 nfilt=26,
                 preemph=0.97):
        # The base class stores `framesamplerate`; the rest is kept here and
        # forwarded to `mfcc()` on every `run()` call.
        super().__init__(framesamplerate)
        self.winlen = winlen
        self.winstep = winstep
        self.numcep = numcep
        self.nfilt = nfilt
        self.preemph = preemph

    def run(self, wavsignal, fs=16000):
        """
        Compute MFCC features: static coefficients plus first- and second-order deltas.

        Only `wavsignal[0]` is processed.

        NOTE(review): `fs` is accepted but not used; the sample rate passed to
        `mfcc()` is `self.framesamplerate`. Confirm this is intended.

        :param wavsignal: indexable audio data; converted to a float64 array.
        :param fs: unused, see the note above.
        :returns: A numpy array of size (NUMFRAMES by numcep * 3) containing features. Each row holds 1 feature vector.
        """
        signal = np.array(wavsignal, dtype=np.float64)
        first_channel = signal[0]

        # Static MFCC coefficients.
        static = mfcc(first_channel,
                      samplerate=self.framesamplerate,
                      winlen=self.winlen,
                      winstep=self.winstep,
                      numcep=self.numcep,
                      nfilt=self.nfilt,
                      preemph=self.preemph)
        # Deltas of the static features, then deltas of those deltas.
        first_order = delta(static, 2)
        second_order = delta(first_order, 2)

        # Side-by-side: [static | first-order delta | second-order delta].
        return np.column_stack((static, first_order, second_order))
|
||
|
||
|
||
class Logfbank(SpeechFeatureMeta):
    """
    Built-in logfbank acoustic feature extraction class of ASRT.

    :param framesamplerate: sample rate in Hz, stored by the base class.
    :param nfilt: value forwarded as `nfilt` to `logfbank()`, default 26.
    """

    def __init__(self, framesamplerate=16000, nfilt=26):
        super().__init__(framesamplerate)
        self.nfilt = nfilt

    def run(self, wavsignal, fs=16000):
        """
        Compute features by calling `logfbank()` on the input signal.

        NOTE(review): the whole converted array is handed to `logfbank()`,
        whereas `MFCC.run` passes only `wavsignal[0]`. Confirm which input
        layout `logfbank()` expects.

        :param wavsignal: audio data; converted to a float64 array.
        :param fs: sample rate, passed positionally to `logfbank()`.
        :returns: whatever `logfbank()` returns for this input.
        """
        samples = np.array(wavsignal, dtype=np.float64)
        return logfbank(samples, fs, nfilt=self.nfilt)
|
||
|
||
|
||
class Spectrogram(SpeechFeatureMeta):
    """
    Built-in spectrogram acoustic feature extraction class of ASRT.

    Each output row is ``log(|FFT(frame * hamming)| + 1)`` restricted to the
    first half of the spectrum, where frames are 400-sample slices of
    ``wavsignal[0]`` taken every 160 samples (25 ms / 10 ms at 16 kHz).

    :param framesamplerate: sample rate in Hz, stored by the base class.
    :param timewindow: window length in milliseconds. Stored, but `run()`
        currently always uses a 25 ms window.
    :param timeshift: frame shift in milliseconds. Stored, but `run()`
        currently always uses a 10 ms shift.
    """

    def __init__(self, framesamplerate=16000, timewindow=25, timeshift=10):
        self.time_window = timewindow
        # Window length in samples; 400 for the supported 16 kHz / 25 ms setup.
        self.window_length = int(framesamplerate / 1000 * self.time_window)
        self.timeshift = timeshift

        # The Hamming window is fixed at 400 points because `run()` only
        # accepts 16 kHz audio. To support other sample rates later, derive
        # it from `self.window_length` instead of the literal 400.
        self.x = np.linspace(0, 400 - 1, 400, dtype=np.int64)
        self.w = 0.54 - 0.46 * np.cos(2 * np.pi * (self.x) / (400 - 1))
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        """
        Compute the log-magnitude spectrogram of the first channel.

        The number of frames is ``(num_samples - 400) // 160 + 1``, computed
        with integer arithmetic so that float rounding can never drop a valid
        frame. Audio shorter than one window yields an empty ``(0, 200)`` array.

        :param wavsignal: 2-D audio data; only ``wavsignal[0]`` is used.
        :param fs: sample rate in Hz; must be 16000.
        :returns: float64 array of shape (num_frames, 200).
        :raises ValueError: if `fs` is not 16000.
        """
        if fs != 16000:
            raise ValueError(
                f"[Error] ASRT currently only supports wav audio files with a sampling rate of 16000 Hz, but this "
                f"audio is {fs} Hz.")

        # Framing parameters: 25 ms window, 10 ms shift.
        time_window = 25  # milliseconds
        time_shift = 10  # milliseconds
        window_length = int(fs / 1000 * time_window)  # 400 samples at 16 kHz
        hop_length = int(fs / 1000 * time_shift)  # 160 samples at 16 kHz
        half_length = window_length // 2  # the magnitude spectrum of real input is symmetric

        wav_arr = np.array(wavsignal)
        num_samples = len(wavsignal[0])

        # Exact integer frame count, clamped so too-short audio gives zero frames
        # instead of a negative dimension or a truncated final frame.
        num_frames = max(0, (num_samples - window_length) // hop_length + 1)
        data_input = np.zeros((num_frames, half_length), dtype=np.float64)

        for i in range(num_frames):
            p_start = i * hop_length
            frame = wav_arr[0, p_start:p_start + window_length] * self.w  # apply the Hamming window
            data_input[i] = np.abs(fft(frame))[:half_length]

        return np.log(data_input + 1)
|
||
|
||
|
||
class SpecAugment(SpeechFeatureMeta):
    """
    Re-implementation of Google's SpecAugment data augmentation, applied on
    top of the basic spectrogram feature.

    `run()` first computes the same log-magnitude spectrogram as the
    spectrogram feature (400-sample Hamming-windowed frames every 160 samples
    at 16 kHz), then randomly zeroes a band of frames, a band of frequency
    bins, both, or nothing.

    :param framesamplerate: sample rate in Hz, stored by the base class.
    :param timewindow: window length in milliseconds. Stored, but `run()`
        currently always uses a 25 ms window.
    :param timeshift: frame shift in milliseconds. Stored, but `run()`
        currently always uses a 10 ms shift.
    """

    def __init__(self, framesamplerate=16000, timewindow=25, timeshift=10):
        self.time_window = timewindow
        # Window length in samples; 400 for the supported 16 kHz / 25 ms setup.
        self.window_length = int(framesamplerate / 1000 * self.time_window)
        self.timeshift = timeshift

        # The Hamming window is fixed at 400 points because `run()` only
        # accepts 16 kHz audio. To support other sample rates later, derive
        # it from `self.window_length` instead of the literal 400.
        self.x = np.linspace(0, 400 - 1, 400, dtype=np.int64)
        self.w = 0.54 - 0.46 * np.cos(2 * np.pi * (self.x) / (400 - 1))
        super().__init__(framesamplerate)

    def _log_spectrogram(self, wavsignal, fs):
        """Return the (num_frames, 200) log-magnitude spectrogram of ``wavsignal[0]``."""
        time_window = 25  # milliseconds
        time_shift = 10  # milliseconds
        window_length = int(fs / 1000 * time_window)  # 400 samples at 16 kHz
        hop_length = int(fs / 1000 * time_shift)  # 160 samples at 16 kHz
        half_length = window_length // 2  # the magnitude spectrum of real input is symmetric

        wav_arr = np.array(wavsignal)
        num_samples = len(wavsignal[0])

        # Exact integer frame count, clamped so too-short audio gives zero frames
        # instead of a negative dimension or a truncated final frame.
        num_frames = max(0, (num_samples - window_length) // hop_length + 1)
        features = np.zeros((num_frames, half_length), dtype=np.float64)

        for i in range(num_frames):
            p_start = i * hop_length
            frame = wav_arr[0, p_start:p_start + window_length] * self.w  # apply the Hamming window
            features[i] = np.abs(fft(frame))[:half_length]

        return np.log(features + 1)

    def run(self, wavsignal, fs=16000):
        """
        Compute the spectrogram and apply a randomly chosen SpecAugment mask.

        Masking policy (decided by one draw from 1..100):

        * 60 %: features returned unchanged;
        * 15 %: a band of consecutive frames (rows) is zeroed;
        * 15 %: a band of consecutive frequency bins (columns) is zeroed;
        * 10 %: both of the above bands are zeroed.

        Band widths are drawn from 1..100 and are clipped by the array bounds.
        Band starts are drawn from ``[0, size)`` so index 0 can be masked and a
        band never starts past the end of the axis.

        :param wavsignal: 2-D audio data; only ``wavsignal[0]`` is used.
        :param fs: sample rate in Hz; must be 16000.
        :returns: float64 array of shape (num_frames, 200). It is empty and
            unmasked when the audio is shorter than one window.
        :raises ValueError: if `fs` is not 16000.
        """
        if fs != 16000:
            raise ValueError(
                f"[Error] ASRT currently only supports wav audio files with a sampling rate of 16000 Hz, but this "
                f"audio is {fs} Hz.")

        data_input = self._log_spectrogram(wavsignal, fs)
        if data_input.shape[0] == 0:
            # Nothing to mask, and randrange() would reject an empty range.
            return data_input

        # All five values are always drawn, in this order, regardless of mode.
        mode = random.randint(1, 100)
        h_start = random.randrange(data_input.shape[0])
        h_width = random.randint(1, 100)

        v_start = random.randrange(data_input.shape[1])
        v_width = random.randint(1, 100)

        if mode <= 60:  # unmodified features, 60 %
            pass
        elif mode <= 75:  # mask a band of frames, 15 %
            data_input[h_start:h_start + h_width, :] = 0
        elif mode <= 90:  # mask a band of frequency bins, 15 %
            data_input[:, v_start:v_start + v_width] = 0
        else:  # both masks superimposed, 10 %
            data_input[h_start:h_start + h_width, :] = 0
            data_input[:, v_start:v_start + v_width] = 0

        return data_input
|