diff --git a/README.md b/README.md index 839508e..45eff3e 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ CNN + LSTM/GRU + CTC * 关于下载已经训练好的模型的问题 -可以在Github本仓库下[releases](https://github.com/nl8590687/ASRT_SpeechRecognition/releases)里面的查看发布的各个版本软件的压缩包里获得完整源程序。 +可以在Github本仓库下[releases](https://github.com/nl8590687/ASRT_SpeechRecognition/releases)里面的查看发布的各个版本软件的压缩包里获得包含已经训练好模型参数的完整源程序。 ### Language Model 语言模型 diff --git a/README_EN.md b/README_EN.md index 7850e04..cf4a128 100644 --- a/README_EN.md +++ b/README_EN.md @@ -83,7 +83,7 @@ The maximum length of the input audio is 16 seconds, and the output is the corre * Questions about downloading trained models -The complete source program can be obtained from the archives of the various versions of the software released in the [releases](https://github.com/nl8590687/ASRT_SpeechRecognition/releases) page of Github. +The complete source program, including the trained model weights, can be obtained from the archives of the various versions of the software released on the [releases](https://github.com/nl8590687/ASRT_SpeechRecognition/releases) page of GitHub. 
### Language Model diff --git a/general_function/file_wav.py b/general_function/file_wav.py index 1f48242..48ca016 100644 --- a/general_function/file_wav.py +++ b/general_function/file_wav.py @@ -130,6 +130,39 @@ def GetFrequencyFeature3(wavsignal, fs): data_line = np.abs(fft(data_line)) / wav_length + data_input[i]=data_line[0:200] # 设置为400除以2的值(即200)是取一半数据,因为是对称的 + + #print(data_input.shape) + data_input = np.log(data_input + 1) + return data_input + +def GetFrequencyFeature4(wavsignal, fs): + ''' + 主要是用来修正3版的bug + ''' + # wav波形 加时间窗以及时移10ms + time_window = 25 # 单位ms + window_length = fs / 1000 * time_window # 计算窗长度的公式,目前全部为400固定值 + + wav_arr = np.array(wavsignal) + #wav_length = len(wavsignal[0]) + wav_length = wav_arr.shape[1] + + range0_end = int(len(wavsignal[0])/fs*1000 - time_window) // 10 + 1 # 计算循环终止的位置,也就是最终生成的窗数 + data_input = np.zeros((range0_end, 200), dtype = np.float) # 用于存放最终的频率特征数据 + data_line = np.zeros((1, 400), dtype = np.float) + + for i in range(0, range0_end): + p_start = i * 160 + p_end = p_start + 400 + + data_line = wav_arr[0, p_start:p_end] + + data_line = data_line * w # 加窗 + + data_line = np.abs(fft(data_line)) / wav_length + + data_input[i]=data_line[0:200] # 设置为400除以2的值(即200)是取一半数据,因为是对称的 #print(data_input.shape)