231 lines
7.2 KiB
Python
231 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
|
||
import os
|
||
import wave
|
||
import numpy as np
|
||
import matplotlib.pyplot as plt
|
||
import math
|
||
import time
|
||
|
||
from python_speech_features import mfcc
|
||
from python_speech_features import delta
|
||
from python_speech_features import logfbank
|
||
|
||
from scipy.fftpack import fft
|
||
|
||
def read_wav_data(filename):
|
||
'''
|
||
读取一个wav文件,返回声音信号的时域谱矩阵和播放时间
|
||
'''
|
||
wav = wave.open(filename,"rb") # 打开一个wav格式的声音文件流
|
||
num_frame = wav.getnframes() # 获取帧数
|
||
num_channel=wav.getnchannels() # 获取声道数
|
||
framerate=wav.getframerate() # 获取帧速率
|
||
num_sample_width=wav.getsampwidth() # 获取实例的比特宽度,即每一帧的字节数
|
||
str_data = wav.readframes(num_frame) # 读取全部的帧
|
||
wav.close() # 关闭流
|
||
wave_data = np.fromstring(str_data, dtype = np.short) # 将声音文件数据转换为数组矩阵形式
|
||
wave_data.shape = -1, num_channel # 按照声道数将数组整形,单声道时候是一列数组,双声道时候是两列的矩阵
|
||
wave_data = wave_data.T # 将矩阵转置
|
||
#wave_data = wave_data
|
||
return wave_data, framerate
|
||
|
||
def GetMfccFeature(wavsignal, fs):
|
||
# 获取输入特征
|
||
feat_mfcc=mfcc(wavsignal[0],fs)
|
||
feat_mfcc_d=delta(feat_mfcc,2)
|
||
feat_mfcc_dd=delta(feat_mfcc_d,2)
|
||
# 返回值分别是mfcc特征向量的矩阵及其一阶差分和二阶差分矩阵
|
||
wav_feature = np.column_stack((feat_mfcc, feat_mfcc_d, feat_mfcc_dd))
|
||
return wav_feature
|
||
|
||
def GetFrequencyFeature(wavsignal, fs):
|
||
# wav波形 加时间窗以及时移10ms
|
||
time_window = 25 # 单位ms
|
||
data_input = []
|
||
|
||
#print(int(len(wavsignal[0])/fs*1000 - time_window) // 10)
|
||
wav_length = len(wavsignal[0]) # 计算一条语音信号的原始长度
|
||
range0_end = int(len(wavsignal[0])/fs*1000 - time_window) // 10 # 计算循环终止的位置,也就是最终生成的窗数
|
||
for i in range(0, range0_end):
|
||
p_start = i * 160
|
||
p_end = p_start + 400
|
||
data_line = []
|
||
|
||
for j in range(p_start, p_end):
|
||
data_line.append(wavsignal[0][j])
|
||
#print('wavsignal[0][j]:\n',wavsignal[0][j])
|
||
#data_line = abs(fft(data_line)) / len(wavsignal[0])
|
||
data_line = fft(data_line) / wav_length
|
||
data_line2 = []
|
||
for fre_sig in data_line:
|
||
# 分别取出频率信号的实部和虚部作为语音信号的频率特征
|
||
# 直接使用复数的话,之后会被numpy将虚部丢弃,造成信息丢失
|
||
#print('fre_sig:\n',fre_sig)
|
||
data_line2.append(fre_sig.real)
|
||
data_line2.append(fre_sig.imag)
|
||
|
||
data_input.append(data_line2[0:len(data_line2)//2]) # 除以2是取一半数据,因为是对称的
|
||
#print('data_input:\n',data_input)
|
||
#print('data_line:\n',data_line)
|
||
#print(len(data_input),len(data_input[0]))
|
||
return data_input
|
||
|
||
def GetFrequencyFeature2(wavsignal, fs):
|
||
# wav波形 加时间窗以及时移10ms
|
||
time_window = 25 # 单位ms
|
||
window_length = fs / 1000 * time_window # 计算窗长度的公式,目前全部为400固定值
|
||
|
||
wav_arr = np.array(wavsignal)
|
||
#wav_length = len(wavsignal[0])
|
||
wav_length = wav_arr.shape[1]
|
||
|
||
range0_end = int(len(wavsignal[0])/fs*1000 - time_window) // 10 # 计算循环终止的位置,也就是最终生成的窗数
|
||
data_input = np.zeros((range0_end, 200), dtype = np.float) # 用于存放最终的频率特征数据
|
||
data_line = np.zeros((1, 400), dtype = np.float)
|
||
for i in range(0, range0_end):
|
||
p_start = i * 160
|
||
p_end = p_start + 400
|
||
|
||
data_line = wav_arr[0, p_start:p_end]
|
||
'''
|
||
x=np.linspace(0, 400 - 1, 400, dtype = np.int64)
|
||
w = 0.54 - 0.46 * np.cos(2 * np.pi * (x) / (400 - 1) ) # 汉明窗
|
||
data_line = data_line * w # 加窗
|
||
'''
|
||
data_line = np.abs(fft(data_line)) / wav_length
|
||
|
||
|
||
data_input[i]=data_line[0:200] # 设置为400除以2的值(即200)是取一半数据,因为是对称的
|
||
|
||
#print(data_input.shape)
|
||
return data_input
|
||
|
||
|
||
x=np.linspace(0, 400 - 1, 400, dtype = np.int64)
|
||
w = 0.54 - 0.46 * np.cos(2 * np.pi * (x) / (400 - 1) ) # 汉明窗
|
||
|
||
def GetFrequencyFeature3(wavsignal, fs):
|
||
# wav波形 加时间窗以及时移10ms
|
||
time_window = 25 # 单位ms
|
||
window_length = fs / 1000 * time_window # 计算窗长度的公式,目前全部为400固定值
|
||
|
||
wav_arr = np.array(wavsignal)
|
||
#wav_length = len(wavsignal[0])
|
||
wav_length = wav_arr.shape[1]
|
||
|
||
range0_end = int(len(wavsignal[0])/fs*1000 - time_window) // 10 # 计算循环终止的位置,也就是最终生成的窗数
|
||
data_input = np.zeros((range0_end, 200), dtype = np.float) # 用于存放最终的频率特征数据
|
||
data_line = np.zeros((1, 400), dtype = np.float)
|
||
|
||
for i in range(0, range0_end):
|
||
p_start = i * 160
|
||
p_end = p_start + 400
|
||
|
||
data_line = wav_arr[0, p_start:p_end]
|
||
|
||
data_line = data_line * w # 加窗
|
||
|
||
data_line = np.abs(fft(data_line)) / wav_length
|
||
|
||
|
||
data_input[i]=data_line[0:200] # 设置为400除以2的值(即200)是取一半数据,因为是对称的
|
||
|
||
#print(data_input.shape)
|
||
data_input = np.log(data_input + 1)
|
||
return data_input
|
||
|
||
def wav_scale(energy):
|
||
'''
|
||
语音信号能量归一化
|
||
'''
|
||
means = energy.mean() # 均值
|
||
var=energy.var() # 方差
|
||
e=(energy-means)/math.sqrt(var) # 归一化能量
|
||
return e
|
||
|
||
def wav_scale2(energy):
|
||
'''
|
||
语音信号能量归一化
|
||
'''
|
||
maxnum = max(energy)
|
||
e = energy / maxnum
|
||
return e
|
||
|
||
def wav_scale3(energy):
|
||
'''
|
||
语音信号能量归一化
|
||
'''
|
||
for i in range(len(energy)):
|
||
#if i == 1:
|
||
# #print('wavsignal[0]:\n {:.4f}'.format(energy[1]),energy[1] is int)
|
||
energy[i] = float(energy[i]) / 100.0
|
||
#if i == 1:
|
||
# #print('wavsignal[0]:\n {:.4f}'.format(energy[1]),energy[1] is int)
|
||
return energy
|
||
|
||
def wav_show(wave_data, fs): # 显示出来声音波形
|
||
time = np.arange(0, len(wave_data)) * (1.0/fs) # 计算声音的播放时间,单位为秒
|
||
# 画声音波形
|
||
#plt.subplot(211)
|
||
plt.plot(time, wave_data)
|
||
#plt.subplot(212)
|
||
#plt.plot(time, wave_data[1], c = "g")
|
||
plt.show()
|
||
|
||
|
||
def get_wav_list(filename):
|
||
'''
|
||
读取一个wav文件列表,返回一个存储该列表的字典类型值
|
||
ps:在数据中专门有几个文件用于存放用于训练、验证和测试的wav文件列表
|
||
'''
|
||
txt_obj=open(filename,'r') # 打开文件并读入
|
||
txt_text=txt_obj.read()
|
||
txt_lines=txt_text.split('\n') # 文本分割
|
||
dic_filelist={} # 初始化字典
|
||
list_wavmark=[] # 初始化wav列表
|
||
for i in txt_lines:
|
||
if(i!=''):
|
||
txt_l=i.split(' ')
|
||
dic_filelist[txt_l[0]] = txt_l[1]
|
||
list_wavmark.append(txt_l[0])
|
||
txt_obj.close()
|
||
return dic_filelist,list_wavmark
|
||
|
||
def get_wav_symbol(filename):
|
||
'''
|
||
读取指定数据集中,所有wav文件对应的语音符号
|
||
返回一个存储符号集的字典类型值
|
||
'''
|
||
txt_obj=open(filename,'r') # 打开文件并读入
|
||
txt_text=txt_obj.read()
|
||
txt_lines=txt_text.split('\n') # 文本分割
|
||
dic_symbol_list={} # 初始化字典
|
||
list_symbolmark=[] # 初始化symbol列表
|
||
for i in txt_lines:
|
||
if(i!=''):
|
||
txt_l=i.split(' ')
|
||
dic_symbol_list[txt_l[0]]=txt_l[1:]
|
||
list_symbolmark.append(txt_l[0])
|
||
txt_obj.close()
|
||
return dic_symbol_list,list_symbolmark
|
||
|
||
if(__name__=='__main__'):
|
||
|
||
wave_data, fs = read_wav_data("A2_0.wav")
|
||
|
||
wav_show(wave_data[0],fs)
|
||
t0=time.time()
|
||
freimg = GetFrequencyFeature3(wave_data,fs)
|
||
t1=time.time()
|
||
print('time cost:',t1-t0)
|
||
|
||
freimg = freimg.T
|
||
plt.subplot(111)
|
||
|
||
plt.imshow(freimg)
|
||
plt.colorbar(cax=None,ax=None,shrink=0.5)
|
||
|
||
plt.show()
|