feat: 切换默认声学模型到m251bn

2022-03-27 21:47:12 +08:00 · 2022-03-27 21:47:12 +08:00 · 087f51f8b4
parent f19b20702b
commit 087f51f8b4
7 changed files with 36 additions and 28 deletions
--- a/README.md
+++ b/README.md
@ -101,7 +101,7 @@ $ python3 client_http.py

 请注意，开启API服务器之后，需要使用本ASRT项目对应的客户端软件来进行语音识别，详见Wiki文档[下载ASRT语音识别客户端SDK和Demo](https://wiki.ailemon.net/docs/asrt-doc/download)。

-如果要训练和使用非251版模型，请在代码中 `import speech_model_zoo` 的相应位置做修改。
+如果要训练和使用非251bn版模型，请在代码中 `import speech_model_zoo` 的相应位置做修改。

 使用docker直接部署ASRT：
 ```shell
--- a/README_EN.md
+++ b/README_EN.md
@ -97,7 +97,7 @@ To test whether it is successful or not that calls api service interface:
 $ python3 client_http.py
 ```

-If you want to train and use other model(not Model 251), make changes in the corresponding position of the `import speech_model_zoo` in the code files.
+If you want to train and use a model other than Model 251bn, change the corresponding `import speech_model_zoo` statements in the code files.

 If there is any problem during the execution of the program or during use, it can be promptly put forward in the issue, and I will reply as soon as possible.

--- a/asrserver.py
+++ b/asrserver.py
@ -26,7 +26,7 @@ ASRT语音识别API的HTTP服务器程序
 import http.server
 import socket
 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from speech_features import Spectrogram
 from LanguageModel2 import ModelLanguage

@ -35,13 +35,13 @@ AUDIO_FEATURE_LENGTH = 200
 CHANNELS = 1
 # 默认输出的拼音的表示大小是1428，即1427个拼音+1个空白块
 OUTPUT_SIZE = 1428
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
    input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
    output_size=OUTPUT_SIZE
    )
 feat = Spectrogram()
-ms = ModelSpeech(sm251, feat, max_label_length=64)
-ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)
+ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')

 ml = ModelLanguage('model_language')
 ml.LoadModel()
--- a/asrserver_http.py
+++ b/asrserver_http.py
@ -23,12 +23,13 @@
 ASRT语音识别基于HTTP协议的API服务器程序
 """

+import argparse
 import base64
 import json
 from flask import Flask, Response, request

 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from speech_features import Spectrogram
 from LanguageModel2 import ModelLanguage
 from utils.ops import decode_wav_bytes
@ -36,10 +37,15 @@ from utils.ops import decode_wav_bytes
 API_STATUS_CODE_OK = 200000 # OK
 API_STATUS_CODE_CLIENT_ERROR = 400000
 API_STATUS_CODE_CLIENT_ERROR_FORMAT = 400001 # 请求数据格式错误
-API_STATUS_CODE_CLIENT_ERROR_FORMAT = 400002 # 请求数据配置不支持
+API_STATUS_CODE_CLIENT_ERROR_CONFIG = 400002 # 请求数据配置不支持
 API_STATUS_CODE_SERVER_ERROR = 500000
 API_STATUS_CODE_SERVER_ERROR_RUNNING = 500001 # 服务器运行中出错

+parser = argparse.ArgumentParser(description='ASRT HTTP+Json RESTful API Service')
+parser.add_argument('--listen', default='0.0.0.0', type=str, help='the network to listen')
+parser.add_argument('--port', default='20001', type=str, help='the port to listen')
+args = parser.parse_args()
+
 app = Flask("ASRT API Service")

 AUDIO_LENGTH = 1600
@ -47,13 +53,13 @@ AUDIO_FEATURE_LENGTH = 200
 CHANNELS = 1
 # 默认输出的拼音的表示大小是1428，即1427个拼音+1个空白块
 OUTPUT_SIZE = 1428
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
    input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
    output_size=OUTPUT_SIZE
    )
 feat = Spectrogram()
-ms = ModelSpeech(sm251, feat, max_label_length=64)
-ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)
+ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')

 ml = ModelLanguage('model_language')
 ml.LoadModel()
@ -149,7 +155,7 @@ def recognition_post(level):
            json_data = AsrtApiResponse(API_STATUS_CODE_OK, 'all level')
            json_data.result = result
            buffer = json_data.to_json()
-            print('output:', buffer)
+            print('ASRT Result:', result,'output:', buffer)
            return Response(buffer, mimetype='application/json')
        else:
            request_data = request.get_json()
@ -165,6 +171,8 @@ def recognition_post(level):
        # request_data['samples'][-100:])
        json_data = AsrtApiResponse(API_STATUS_CODE_SERVER_ERROR, str(except_general))
        buffer = json_data.to_json()
+        #print("input:", request_data, "\n", "output:", buffer)
+        print("output:", buffer, "error:", except_general)
        return Response(buffer, mimetype='application/json')


@ -173,4 +181,4 @@ if __name__ == '__main__':
    #app.run(host='0.0.0.0', port=20001)
    # for production env
    import waitress
-    waitress.serve(app, host='0.0.0.0', port=20001)
+    waitress.serve(app, host=args.listen, port=args.port)
--- a/evaluate_speech_model.py
+++ b/evaluate_speech_model.py
@ -26,7 +26,7 @@
 import os

 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from data_loader import DataLoader
 from speech_features import Spectrogram

@ -37,14 +37,14 @@ AUDIO_FEATURE_LENGTH = 200
 CHANNELS = 1
 # 默认输出的拼音的表示大小是1428，即1427个拼音+1个空白块
 OUTPUT_SIZE = 1428
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
    input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
    output_size=OUTPUT_SIZE
    )
 feat = Spectrogram()
 evalue_data = DataLoader('dev')
-ms = ModelSpeech(sm251, feat, max_label_length=64)
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)

-ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
 ms.evaluate_model(data_loader=evalue_data, data_count=-1,
    out_report=True, show_ratio=True, show_per_step=100)
--- a/predict_speech_file.py
+++ b/predict_speech_file.py
@ -26,7 +26,7 @@
 import os

 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from speech_features import Spectrogram
 from LanguageModel2 import ModelLanguage

@ -37,14 +37,14 @@ AUDIO_FEATURE_LENGTH = 200
 CHANNELS = 1
 # 默认输出的拼音的表示大小是1428，即1427个拼音+1个空白块
 OUTPUT_SIZE = 1428
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
    input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
    output_size=OUTPUT_SIZE
    )
 feat = Spectrogram()
-ms = ModelSpeech(sm251, feat, max_label_length=64)
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)

-ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
 res = ms.recognize_speech_from_file('filename.wav')
 print('*[提示] 声学模型语音识别结果：\n', res)

--- a/train_speech_model.py
+++ b/train_speech_model.py
@ -28,9 +28,9 @@ import os
 from tensorflow.keras.optimizers import Adam

 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from data_loader import DataLoader
-from speech_features import Spectrogram
+from speech_features import SpecAugment

 os.environ["CUDA_VISIBLE_DEVICES"] = "0"

@ -39,16 +39,16 @@ AUDIO_FEATURE_LENGTH = 200
 CHANNELS = 1
 # 默认输出的拼音的表示大小是1428，即1427个拼音+1个空白块
 OUTPUT_SIZE = 1428
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
    input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
    output_size=OUTPUT_SIZE
    )
-feat = Spectrogram()
+feat = SpecAugment()
 train_data = DataLoader('train')
 opt = Adam(lr = 0.0001, beta_1 = 0.9, beta_2 = 0.999, decay = 0.0, epsilon = 10e-8)
-ms = ModelSpeech(sm251, feat, max_label_length=64)
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)

-#ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+#ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
 ms.train_model(optimizer=opt, data_loader=train_data,
    epochs=50, save_step=1, batch_size=16, last_epoch=0)
-ms.save_model('save_models/' + sm251.get_model_name())
+ms.save_model('save_models/' + sm251bn.get_model_name())