Sync from bytedesk-private: update

188 modules/python/vendors/FunASR/runtime/python/onnxruntime/README.md (vendored, new file)
@@ -0,0 +1,188 @@
# ONNXRuntime-python

## Install `funasr-onnx`

Install from pip:

```shell
pip install -U funasr-onnx
# For users in China, you can install via the SJTU mirror:
# pip install -U funasr-onnx -i https://mirror.sjtu.edu.cn/pypi/web/simple

# If you want to export a .onnx file, you also need modelscope and funasr:
pip install -U modelscope funasr
# For users in China, you can install via the SJTU mirror:
# pip install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
```

Or install from source:

```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/onnxruntime
pip install -e ./
# For users in China, you can install via the SJTU mirror:
# pip install -e ./ -i https://mirror.sjtu.edu.cn/pypi/web/simple
```

## Inference with runtime

### Speech Recognition

#### Paraformer

```python
from funasr_onnx import Paraformer
from pathlib import Path

model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1, quantize=True)

wav_path = ['{}/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav'.format(Path.home())]

result = model(wav_path)
print(result)
```

- `model_dir`: model name on modelscope, or a local path downloaded from modelscope. If a local path is given, it should contain `model.onnx`, `config.yaml`, and `am.mvn`
- `batch_size`: `1` (default), the batch size used during inference
- `device_id`: `-1` (default), infer on CPU. To infer on GPU, set it to the gpu_id (make sure you have installed onnxruntime-gpu)
- `quantize`: `False` (default), load `model.onnx` in `model_dir`. If set to `True`, load `model_quant.onnx` instead
- `intra_op_num_threads`: `4` (default), the number of threads used for intra-op parallelism on CPU

Input: wav file(s); supported formats: `str`, `np.ndarray`, `List[str]`

Output: `List[str]`: recognition result

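As a usage note, the input can also be an in-memory waveform. A minimal sketch (the `test.wav` path is a placeholder; the model expects 16 kHz mono audio, so resample first if needed):

```python
import soundfile
from funasr_onnx import Paraformer

model = Paraformer("damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")

# "test.wav" is a placeholder file name
speech, sample_rate = soundfile.read("test.wav")
result = model(speech)  # np.ndarray input, per the supported types above
print(result)
```
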
#### Paraformer-online

### Voice Activity Detection

#### FSMN-VAD

```python
from funasr_onnx import Fsmn_vad
from pathlib import Path

model_dir = "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
wav_path = '{}/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav'.format(Path.home())

model = Fsmn_vad(model_dir)

result = model(wav_path)
print(result)
```

- `model_dir`: model name on modelscope, or a local path downloaded from modelscope. If a local path is given, it should contain `model.onnx`, `config.yaml`, and `am.mvn`
- `batch_size`: `1` (default), the batch size used during inference
- `device_id`: `-1` (default), infer on CPU. To infer on GPU, set it to the gpu_id (make sure you have installed onnxruntime-gpu)
- `quantize`: `False` (default), load `model.onnx` in `model_dir`. If set to `True`, load `model_quant.onnx` instead
- `intra_op_num_threads`: `4` (default), the number of threads used for intra-op parallelism on CPU

Input: wav file(s); supported formats: `str`, `np.ndarray`, `List[str]`

Output: `List[str]`: recognition result

#### FSMN-VAD-online

```python
from funasr_onnx import Fsmn_vad_online
import soundfile
from pathlib import Path

model_dir = "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
wav_path = '{}/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav'.format(Path.home())

model = Fsmn_vad_online(model_dir)

## online vad
speech, sample_rate = soundfile.read(wav_path)
speech_length = speech.shape[0]
sample_offset = 0
step = 1600
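# Note (added for clarity): a step of 1600 samples is 100 ms per chunk at the
# model's 16 kHz sample rate.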
param_dict = {'in_cache': []}
for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
    if sample_offset + step >= speech_length - 1:
        step = speech_length - sample_offset
        is_final = True
    else:
        is_final = False
    param_dict['is_final'] = is_final
    segments_result = model(audio_in=speech[sample_offset: sample_offset + step],
                            param_dict=param_dict)
    if segments_result:
        print(segments_result)
```

- `model_dir`: model name on modelscope, or a local path downloaded from modelscope. If a local path is given, it should contain `model.onnx`, `config.yaml`, and `am.mvn`
- `batch_size`: `1` (default), the batch size used during inference
- `device_id`: `-1` (default), infer on CPU. To infer on GPU, set it to the gpu_id (make sure you have installed onnxruntime-gpu)
- `quantize`: `False` (default), load `model.onnx` in `model_dir`. If set to `True`, load `model_quant.onnx` instead
- `intra_op_num_threads`: `4` (default), the number of threads used for intra-op parallelism on CPU

Input: wav file(s); supported formats: `str`, `np.ndarray`, `List[str]`

Output: `List[str]`: recognition result

### Punctuation Restoration

#### CT-Transformer

```python
from funasr_onnx import CT_Transformer

model_dir = "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
model = CT_Transformer(model_dir)

text_in = "跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益"
result = model(text_in)
print(result[0])
```

- `model_dir`: model name on modelscope, or a local path downloaded from modelscope. If a local path is given, it should contain `model.onnx`, `config.yaml`, and `am.mvn`
- `device_id`: `-1` (default), infer on CPU. To infer on GPU, set it to the gpu_id (make sure you have installed onnxruntime-gpu)
- `quantize`: `False` (default), load `model.onnx` in `model_dir`. If set to `True`, load `model_quant.onnx` instead
- `intra_op_num_threads`: `4` (default), the number of threads used for intra-op parallelism on CPU

Input: `str`, raw text of the ASR result

Output: the punctuated text (`str`) and the corresponding punctuation ids (`List[int]`)

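Since the call returns both the punctuated text and the per-token punctuation ids (see `punc_bin.py` below), `result[0]` in the example above is the text. A minimal sketch, with a made-up short input sentence:

```python
from funasr_onnx import CT_Transformer

model = CT_Transformer("damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")

# hypothetical short input; any unpunctuated ASR output works
text, punc_ids = model("那今天的会就到这里吧大家明年见")
print(text)      # punctuated text
print(punc_ids)  # one punctuation id per token
```
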
#### CT-Transformer-online

```python
from funasr_onnx import CT_Transformer_VadRealtime

model_dir = "damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
model = CT_Transformer_VadRealtime(model_dir)

text_in = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"

vads = text_in.split("|")
rec_result_all = ""
param_dict = {"cache": []}
for vad in vads:
    result = model(vad, param_dict=param_dict)
    rec_result_all += result[0]

print(rec_result_all)
```

- `model_dir`: model name on modelscope, or a local path downloaded from modelscope. If a local path is given, it should contain `model.onnx`, `config.yaml`, and `am.mvn`
- `device_id`: `-1` (default), infer on CPU. To infer on GPU, set it to the gpu_id (make sure you have installed onnxruntime-gpu)
- `quantize`: `False` (default), load `model.onnx` in `model_dir`. If set to `True`, load `model_quant.onnx` instead
- `intra_op_num_threads`: `4` (default), the number of threads used for intra-op parallelism on CPU

Input: `str`, raw text of the ASR result

Output: the punctuated text (`str`) and the corresponding punctuation ids (`List[int]`)

## Performance benchmark

Please refer to the [benchmark](https://github.com/alibaba-damo-academy/FunASR/blob/main/runtime/docs/benchmark_onnx.md).

## Acknowledgements

1. This project is maintained by the [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
2. We partially borrowed from [SWHL](https://github.com/RapidAI/RapidASR) for onnxruntime (Paraformer model only).

0 modules/python/vendors/FunASR/runtime/python/onnxruntime/__init__.py (vendored, new file)

12 modules/python/vendors/FunASR/runtime/python/onnxruntime/demo_contextual_paraformer.py (vendored, new file)
@@ -0,0 +1,12 @@
from funasr_onnx import ContextualParaformer
from pathlib import Path

model_dir = "damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
model = ContextualParaformer(model_dir, batch_size=1)

wav_path = ["{}/.cache/modelscope/hub/{}/example/asr_example.wav".format(Path.home(), model_dir)]

hotwords = "你的热词 魔搭"

result = model(wav_path, hotwords)
print(result)

15 modules/python/vendors/FunASR/runtime/python/onnxruntime/demo_paraformer_offline.py (vendored, new file)
@@ -0,0 +1,15 @@
from funasr_onnx import Paraformer
from pathlib import Path

model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
# model_dir = "damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1, quantize=False)
# model = Paraformer(model_dir, batch_size=1, device_id=0)  # gpu

# when using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png"
# to get a figure of the alignment in addition to the timestamps
# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")

wav_path = ["{}/.cache/modelscope/hub/{}/example/asr_example.wav".format(Path.home(), model_dir)]

result = model(wav_path)
print(result)

31 modules/python/vendors/FunASR/runtime/python/onnxruntime/demo_paraformer_online.py (vendored, new file)
@@ -0,0 +1,31 @@
import soundfile
from funasr_onnx.paraformer_online_bin import Paraformer
from pathlib import Path

model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
wav_path = "{}/.cache/modelscope/hub/{}/example/asr_example.wav".format(Path.home(), model_dir)

chunk_size = [5, 10, 5]
model = Paraformer(
    model_dir, batch_size=1, quantize=True, chunk_size=chunk_size, intra_op_num_threads=4
)  # only supports batch_size = 1

## online asr
speech, sample_rate = soundfile.read(wav_path)
speech_length = speech.shape[0]
sample_offset = 0
step = chunk_size[1] * 960
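# Note (added for clarity): chunk_size is in 60 ms units (960 samples at 16 kHz),
# so [5, 10, 5] feeds 600 ms of new audio per step with 300 ms of left context
# and 300 ms of lookahead.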
param_dict = {"cache": dict()}
final_result = ""
for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
    if sample_offset + step >= speech_length - 1:
        step = speech_length - sample_offset
        is_final = True
    else:
        is_final = False
    param_dict["is_final"] = is_final
    rec_result = model(audio_in=speech[sample_offset : sample_offset + step], param_dict=param_dict)
    if len(rec_result) > 0:
        final_result += rec_result[0]["preds"][0]
        print(rec_result)
print(final_result)

9 modules/python/vendors/FunASR/runtime/python/onnxruntime/demo_punc_offline.py (vendored, new file)
@@ -0,0 +1,9 @@
from funasr_onnx import CT_Transformer

# model_dir = "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
model_dir = "damo/punc_ct-transformer_cn-en-common-vocab471067-large"
model = CT_Transformer(model_dir)

text_in = "跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益"
result = model(text_in)
print(result[0])

15 modules/python/vendors/FunASR/runtime/python/onnxruntime/demo_punc_online.py (vendored, new file)
@@ -0,0 +1,15 @@
from funasr_onnx import CT_Transformer_VadRealtime

model_dir = "damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
model = CT_Transformer_VadRealtime(model_dir)

text_in = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"

vads = text_in.split("|")
rec_result_all = ""
param_dict = {"cache": []}
for vad in vads:
    result = model(vad, param_dict=param_dict)
    rec_result_all += result[0]

print(rec_result_all)

12 modules/python/vendors/FunASR/runtime/python/onnxruntime/demo_seaco_paraformer.py (vendored, new file)
@@ -0,0 +1,12 @@
from funasr_onnx import SeacoParaformer
from pathlib import Path

model_dir = "iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = SeacoParaformer(model_dir, batch_size=1)

wav_path = ["{}/.cache/modelscope/hub/{}/example/asr_example.wav".format(Path.home(), model_dir)]

hotwords = "你的热词 魔搭"

result = model(wav_path, hotwords)
print(result)

19 modules/python/vendors/FunASR/runtime/python/onnxruntime/demo_sencevoice_small.py (vendored, new file)
@@ -0,0 +1,19 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

from pathlib import Path
from funasr_onnx import SenseVoiceSmall
from funasr_onnx.utils.postprocess_utils import rich_transcription_postprocess


model_dir = "iic/SenseVoiceSmall"

model = SenseVoiceSmall(model_dir, batch_size=10, quantize=False)

# inference
wav_or_scp = ["{}/.cache/modelscope/hub/{}/example/en.mp3".format(Path.home(), model_dir)]
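# Note (added for clarity): language="auto" lets the model detect the language;
# use_itn=True applies inverse text normalization (e.g. spoken numbers) to the output.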
res = model(wav_or_scp, language="auto", use_itn=True)
print([rich_transcription_postprocess(i) for i in res])

12 modules/python/vendors/FunASR/runtime/python/onnxruntime/demo_vad_offline.py (vendored, new file)
@@ -0,0 +1,12 @@
from funasr_onnx import Fsmn_vad
from pathlib import Path

model_dir = "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
wav_path = "{}/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav".format(
    Path.home()
)

model = Fsmn_vad(model_dir)

result = model(wav_path)
print(result)

31 modules/python/vendors/FunASR/runtime/python/onnxruntime/demo_vad_online.py (vendored, new file)
@@ -0,0 +1,31 @@
from funasr_onnx import Fsmn_vad_online
import soundfile
from pathlib import Path

model_dir = "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
wav_path = "{}/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav".format(
    Path.home()
)

model = Fsmn_vad_online(model_dir)


## online vad
speech, sample_rate = soundfile.read(wav_path)
speech_length = speech.shape[0]
sample_offset = 0
step = 1600
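# Note (added for clarity): 1600 samples per step = 100 ms chunks at 16 kHz.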
param_dict = {"in_cache": []}
for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
    if sample_offset + step >= speech_length - 1:
        step = speech_length - sample_offset
        is_final = True
    else:
        is_final = False
    param_dict["is_final"] = is_final
    segments_result = model(
        audio_in=speech[sample_offset : sample_offset + step], param_dict=param_dict
    )
    if segments_result:
        print(segments_result)

27 modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_client_http.py (vendored, new file)
@@ -0,0 +1,27 @@
import base64
import requests
import threading

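# Note (added for clarity): a simple stress-test client — POSTs the same
# base64-encoded wav to the ASR HTTP endpoint from 100 threads and prints
# each JSON response.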
with open("A2_0.wav", "rb") as f:
    test_wav_bytes = f.read()
url = "http://127.0.0.1:8888/api/asr"


def send_post(i, url, wav_bytes):
    r1 = requests.post(url, json={"wav_base64": str(base64.b64encode(wav_bytes), "utf-8")})
    print("thread:", i, r1.json())


for i in range(100):
    t = threading.Thread(
        target=send_post,
        args=(
            i,
            url,
            test_wav_bytes,
        ),
    )
    t.start()
    # t.join()
print("test finished")

7 modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_onnx/__init__.py (vendored, new file)
@@ -0,0 +1,7 @@
# -*- encoding: utf-8 -*-
from .paraformer_bin import Paraformer, ContextualParaformer, SeacoParaformer
from .vad_bin import Fsmn_vad
from .vad_bin import Fsmn_vad_online
from .punc_bin import CT_Transformer
from .punc_bin import CT_Transformer_VadRealtime
from .sensevoice_bin import SenseVoiceSmall

466 modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_onnx/paraformer_bin.py (vendored, new file)
@@ -0,0 +1,466 @@
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import os.path
from pathlib import Path
from typing import List, Union, Tuple
import json

import copy
import librosa
import numpy as np

from .utils.utils import (
    CharTokenizer,
    Hypothesis,
    ONNXRuntimeError,
    OrtInferSession,
    TokenIDConverter,
    get_logger,
    read_yaml,
)
from .utils.postprocess_utils import sentence_postprocess, sentence_postprocess_sentencepiece
from .utils.frontend import WavFrontend
from .utils.timestamp_utils import time_stamp_lfr6_onnx
from .utils.utils import pad_list

logging = get_logger()


class Paraformer:
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317
    """

    def __init__(
        self,
        model_dir: Union[str, Path] = None,
        batch_size: int = 1,
        device_id: Union[str, int] = "-1",
        plot_timestamp_to: str = "",
        quantize: bool = False,
        intra_op_num_threads: int = 4,
        cache_dir: str = None,
        **kwargs,
    ):
        if not Path(model_dir).exists():
            try:
                from modelscope.hub.snapshot_download import snapshot_download
            except ImportError:
                # raising a bare string is invalid in Python 3; raise a proper exception
                raise ImportError(
                    "You are exporting a model from modelscope, please install modelscope and try again:\n"
                    "pip3 install -U modelscope\n"
                    "For users in China:\n"
                    "pip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                )
            try:
                model_dir = snapshot_download(model_dir, cache_dir=cache_dir)
            except Exception:
                raise ValueError(
                    "model_dir must be a model name on modelscope or a local path "
                    "downloaded from modelscope, but is {}".format(model_dir)
                )

        model_file = os.path.join(model_dir, "model.onnx")
        if quantize:
            model_file = os.path.join(model_dir, "model_quant.onnx")
        if not os.path.exists(model_file):
            print(".onnx does not exist, begin to export onnx")
            try:
                from funasr import AutoModel
            except ImportError:
                raise ImportError(
                    "You are exporting onnx, please install funasr and try again:\n"
                    "pip3 install -U funasr\n"
                    "For users in China:\n"
                    "pip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                )

            model = AutoModel(model=model_dir)
            model_dir = model.export(type="onnx", quantize=quantize, **kwargs)

        config_file = os.path.join(model_dir, "config.yaml")
        cmvn_file = os.path.join(model_dir, "am.mvn")
        config = read_yaml(config_file)
        token_list = os.path.join(model_dir, "tokens.json")
        with open(token_list, "r", encoding="utf-8") as f:
            token_list = json.load(f)

        self.converter = TokenIDConverter(token_list)
        self.tokenizer = CharTokenizer()
        self.frontend = WavFrontend(cmvn_file=cmvn_file, **config["frontend_conf"])
        self.ort_infer = OrtInferSession(
            model_file, device_id, intra_op_num_threads=intra_op_num_threads
        )
        self.batch_size = batch_size
        self.plot_timestamp_to = plot_timestamp_to
        if "predictor_bias" in config["model_conf"].keys():
            self.pred_bias = config["model_conf"]["predictor_bias"]
        else:
            self.pred_bias = 0
        if "lang" in config:
            self.language = config["lang"]
        else:
            self.language = None

    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
        waveform_nums = len(waveform_list)
        asr_res = []
        for beg_idx in range(0, waveform_nums, self.batch_size):
            end_idx = min(waveform_nums, beg_idx + self.batch_size)
            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
            try:
                outputs = self.infer(feats, feats_len)
                am_scores, valid_token_lens = outputs[0], outputs[1]
                if len(outputs) == 4:
                    # for BiCifParaformer Inference
                    us_alphas, us_peaks = outputs[2], outputs[3]
                else:
                    us_alphas, us_peaks = None, None
            except ONNXRuntimeError:
                # logging.warning(traceback.format_exc())
                logging.warning("input wav is silence or noise")
                preds = [""]
            else:
                preds = self.decode(am_scores, valid_token_lens)
                if us_peaks is None:
                    for pred in preds:
                        if self.language == "en-bpe":
                            pred = sentence_postprocess_sentencepiece(pred)
                        else:
                            pred = sentence_postprocess(pred)
                        asr_res.append({"preds": pred})
                else:
                    for pred, us_peaks_ in zip(preds, us_peaks):
                        raw_tokens = pred
                        timestamp, timestamp_raw = time_stamp_lfr6_onnx(
                            us_peaks_, copy.copy(raw_tokens)
                        )
                        text_proc, timestamp_proc, _ = sentence_postprocess(
                            raw_tokens, timestamp_raw
                        )
                        # logging.warning(timestamp)
                        if len(self.plot_timestamp_to):
                            self.plot_wave_timestamp(
                                waveform_list[0], timestamp, self.plot_timestamp_to
                            )
                        asr_res.append(
                            {
                                "preds": text_proc,
                                "timestamp": timestamp_proc,
                                "raw_tokens": raw_tokens,
                            }
                        )
        return asr_res

    def plot_wave_timestamp(self, wav, text_timestamp, dest):
        # TODO: Plot the wav and timestamp results with matplotlib
        import matplotlib

        matplotlib.use("Agg")
        matplotlib.rc(
            "font", family="Alibaba PuHuiTi"
        )  # set it to a font that your system supports
        import matplotlib.pyplot as plt

        fig, ax1 = plt.subplots(figsize=(11, 3.5), dpi=320)
        ax2 = ax1.twinx()
        ax2.set_ylim([0, 2.0])
        # plot waveform
        ax1.set_ylim([-0.3, 0.3])
        time = np.arange(wav.shape[0]) / 16000
        ax1.plot(time, wav / wav.max() * 0.3, color="gray", alpha=0.4)
        # plot lines and text
        for char, start, end in text_timestamp:
            ax1.vlines(start, -0.3, 0.3, ls="--")
            ax1.vlines(end, -0.3, 0.3, ls="--")
            x_adj = 0.045 if char != "<sil>" else 0.12
            ax1.text((start + end) * 0.5 - x_adj, 0, char)
        # plt.legend()
        plotname = "{}/timestamp.png".format(dest)
        plt.savefig(plotname, bbox_inches="tight")

    def load_data(self, wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
        def load_wav(path: str) -> np.ndarray:
            waveform, _ = librosa.load(path, sr=fs)
            return waveform

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            return [load_wav(path) for path in wav_content]

        raise TypeError(f"The type of {wav_content} is not in [str, np.ndarray, list]")

    def extract_feat(self, waveform_list: List[np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
        feats, feats_len = [], []
        for waveform in waveform_list:
            speech, _ = self.frontend.fbank(waveform)
            feat, feat_len = self.frontend.lfr_cmvn(speech)
            feats.append(feat)
            feats_len.append(feat_len)

        feats = self.pad_feats(feats, np.max(feats_len))
        feats_len = np.array(feats_len).astype(np.int32)
        return feats, feats_len

    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, "constant", constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def infer(self, feats: np.ndarray, feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        outputs = self.ort_infer([feats, feats_len])
        return outputs

    def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
        return [
            self.decode_one(am_score, token_num)
            for am_score, token_num in zip(am_scores, token_nums)
        ]

    def decode_one(self, am_score: np.ndarray, valid_token_num: int) -> List[str]:
        yseq = am_score.argmax(axis=-1)
        score = am_score.max(axis=-1)
        score = np.sum(score, axis=-1)

        # pad with mask tokens to ensure compatibility with sos/eos tokens
        # asr_model.sos:1 asr_model.eos:2
        yseq = np.array([1] + yseq.tolist() + [2])
        hyp = Hypothesis(yseq=yseq, score=score)

        # remove sos/eos and get results
        last_pos = -1
        token_int = hyp.yseq[1:last_pos].tolist()

        # remove blank symbol id, which is assumed to be 0
        token_int = list(filter(lambda x: x not in (0, 2), token_int))

        # Change integer-ids to tokens
        token = self.converter.ids2tokens(token_int)
        token = token[: valid_token_num - self.pred_bias]
        # texts = sentence_postprocess(token)
        return token


class ContextualParaformer(Paraformer):
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317
    """

    def __init__(
        self,
        model_dir: Union[str, Path] = None,
        batch_size: int = 1,
        device_id: Union[str, int] = "-1",
        plot_timestamp_to: str = "",
        quantize: bool = False,
        intra_op_num_threads: int = 4,
        cache_dir: str = None,
        **kwargs,
    ):

        if not Path(model_dir).exists():
            try:
                from modelscope.hub.snapshot_download import snapshot_download
            except ImportError:
                # raising a bare string is invalid in Python 3; raise a proper exception
                raise ImportError(
                    "You are exporting a model from modelscope, please install modelscope and try again:\n"
                    "pip3 install -U modelscope\n"
                    "For users in China:\n"
                    "pip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                )
            try:
                model_dir = snapshot_download(model_dir, cache_dir=cache_dir)
            except Exception:
                raise ValueError(
                    "model_dir must be a model name on modelscope or a local path "
                    "downloaded from modelscope, but is {}".format(model_dir)
                )

        if quantize:
            model_bb_file = os.path.join(model_dir, "model_quant.onnx")
            model_eb_file = os.path.join(model_dir, "model_eb_quant.onnx")
        else:
            model_bb_file = os.path.join(model_dir, "model.onnx")
            model_eb_file = os.path.join(model_dir, "model_eb.onnx")

        if not (os.path.exists(model_eb_file) and os.path.exists(model_bb_file)):
            print(".onnx does not exist, begin to export onnx")
            try:
                from funasr import AutoModel
            except ImportError:
                raise ImportError(
                    "You are exporting onnx, please install funasr and try again:\n"
                    "pip3 install -U funasr\n"
                    "For users in China:\n"
                    "pip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                )

            model = AutoModel(model=model_dir)
            model_dir = model.export(type="onnx", quantize=quantize, **kwargs)

        config_file = os.path.join(model_dir, "config.yaml")
        cmvn_file = os.path.join(model_dir, "am.mvn")
        config = read_yaml(config_file)
        token_list = os.path.join(model_dir, "tokens.json")
        with open(token_list, "r", encoding="utf-8") as f:
            token_list = json.load(f)

        # revert token_list into vocab dict
        self.vocab = {}
        for i, token in enumerate(token_list):
            self.vocab[token] = i

        self.converter = TokenIDConverter(token_list)
        self.tokenizer = CharTokenizer()
        self.frontend = WavFrontend(cmvn_file=cmvn_file, **config["frontend_conf"])
        self.ort_infer_bb = OrtInferSession(
            model_bb_file, device_id, intra_op_num_threads=intra_op_num_threads
        )
        self.ort_infer_eb = OrtInferSession(
            model_eb_file, device_id, intra_op_num_threads=intra_op_num_threads
        )

        self.batch_size = batch_size
        self.plot_timestamp_to = plot_timestamp_to
        if "predictor_bias" in config["model_conf"].keys():
            self.pred_bias = config["model_conf"]["predictor_bias"]
        else:
            self.pred_bias = 0
        if "lang" in config:
            self.language = config["lang"]
        else:
            self.language = None

    def __call__(
        self, wav_content: Union[str, np.ndarray, List[str]], hotwords: str, **kwargs
    ) -> List:
        # def __call__(
        #     self, waveform_list: list, hotwords: str, **kwargs
        # ) -> List:
        # make hotword list
        hotwords, hotwords_length = self.proc_hotword(hotwords)
        [bias_embed] = self.eb_infer(hotwords, hotwords_length)
        # index from bias_embed
        bias_embed = bias_embed.transpose(1, 0, 2)
        _ind = np.arange(0, len(hotwords)).tolist()
        bias_embed = bias_embed[_ind, hotwords_length.tolist()]
        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
        waveform_nums = len(waveform_list)
        asr_res = []
        for beg_idx in range(0, waveform_nums, self.batch_size):
            end_idx = min(waveform_nums, beg_idx + self.batch_size)
            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
            bias_embed = np.expand_dims(bias_embed, axis=0)
            bias_embed = np.repeat(bias_embed, feats.shape[0], axis=0)
            try:
                outputs = self.bb_infer(feats, feats_len, bias_embed)
                am_scores, valid_token_lens = outputs[0], outputs[1]

                if len(outputs) == 4:
                    # for BiCifParaformer Inference
                    us_alphas, us_peaks = outputs[2], outputs[3]
                else:
                    us_alphas, us_peaks = None, None

            except ONNXRuntimeError:
                # logging.warning(traceback.format_exc())
                logging.warning("input wav is silence or noise")
                preds = [""]
            else:
                preds = self.decode(am_scores, valid_token_lens)
                if us_peaks is None:
                    for pred in preds:
                        if self.language == "en-bpe":
                            pred = sentence_postprocess_sentencepiece(pred)
                        else:
                            pred = sentence_postprocess(pred)
                        asr_res.append({"preds": pred})
                else:
                    for pred, us_peaks_ in zip(preds, us_peaks):
                        raw_tokens = pred
                        timestamp, timestamp_raw = time_stamp_lfr6_onnx(
                            us_peaks_, copy.copy(raw_tokens)
                        )
                        text_proc, timestamp_proc, _ = sentence_postprocess(
                            raw_tokens, timestamp_raw
                        )
                        # logging.warning(timestamp)
                        if len(self.plot_timestamp_to):
                            self.plot_wave_timestamp(
                                waveform_list[0], timestamp, self.plot_timestamp_to
                            )
                        asr_res.append(
                            {
                                "preds": text_proc,
                                "timestamp": timestamp_proc,
                                "raw_tokens": raw_tokens,
                            }
                        )
        return asr_res
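
    # Note (added for clarity): proc_hotword maps each space-separated hotword to
    # token ids via the vocab (OOV characters fall back to <unk>, id 8403), appends
    # a final <s> entry (id 1), and pads all entries to a fixed length of 10.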
    def proc_hotword(self, hotwords):
        hotwords = hotwords.split(" ")
        hotwords_length = [len(i) - 1 for i in hotwords]
        hotwords_length.append(0)
        hotwords_length = np.array(hotwords_length)

        # hotwords.append('<s>')
        def word_map(word):
            hotwords = []
            for c in word:
                if c not in self.vocab.keys():
                    hotwords.append(8403)
                    logging.warning(
                        "oov character {} found in hotword {}, replaced by <unk>".format(c, word)
                    )
                else:
                    hotwords.append(self.vocab[c])
            return np.array(hotwords)

        hotword_int = [word_map(i) for i in hotwords]

        hotword_int.append(np.array([1]))
        hotwords = pad_list(hotword_int, pad_value=0, max_len=10)

        return hotwords, hotwords_length

    def bb_infer(
        self, feats: np.ndarray, feats_len: np.ndarray, bias_embed
    ) -> Tuple[np.ndarray, np.ndarray]:
        outputs = self.ort_infer_bb([feats, feats_len, bias_embed])
        return outputs

    def eb_infer(self, hotwords, hotwords_length):
        outputs = self.ort_infer_eb([hotwords.astype(np.int32), hotwords_length.astype(np.int32)])
        return outputs

    def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
        return [
            self.decode_one(am_score, token_num)
            for am_score, token_num in zip(am_scores, token_nums)
        ]

    def decode_one(self, am_score: np.ndarray, valid_token_num: int) -> List[str]:
        yseq = am_score.argmax(axis=-1)
        score = am_score.max(axis=-1)
        score = np.sum(score, axis=-1)

        # pad with mask tokens to ensure compatibility with sos/eos tokens
        # asr_model.sos:1 asr_model.eos:2
        yseq = np.array([1] + yseq.tolist() + [2])
        hyp = Hypothesis(yseq=yseq, score=score)

        # remove sos/eos and get results
        last_pos = -1
        token_int = hyp.yseq[1:last_pos].tolist()

        # remove blank symbol id, which is assumed to be 0
        token_int = list(filter(lambda x: x not in (0, 2), token_int))

        # Change integer-ids to tokens
        token = self.converter.ids2tokens(token_int)
        token = token[: valid_token_num - self.pred_bias]
        # texts = sentence_postprocess(token)
        return token


class SeacoParaformer(ContextualParaformer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # no difference with contextual_paraformer in method of calling onnx models

330 modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_onnx/paraformer_online_bin.py (vendored, new file)
@@ -0,0 +1,330 @@
# -*- encoding: utf-8 -*-

import os.path
from pathlib import Path
from typing import List, Union, Tuple
import json
import copy
import librosa
import numpy as np

from .utils.utils import (
    CharTokenizer,
    Hypothesis,
    ONNXRuntimeError,
    OrtInferSession,
    TokenIDConverter,
    get_logger,
    read_yaml,
)
from .utils.postprocess_utils import sentence_postprocess
from .utils.frontend import WavFrontendOnline, SinusoidalPositionEncoderOnline

logging = get_logger()


class Paraformer:
    def __init__(
        self,
        model_dir: Union[str, Path] = None,
        batch_size: int = 1,
        chunk_size: List = [5, 10, 5],
        device_id: Union[str, int] = "-1",
        quantize: bool = False,
        intra_op_num_threads: int = 4,
        cache_dir: str = None,
        **kwargs,
    ):

        if not Path(model_dir).exists():
            try:
                from modelscope.hub.snapshot_download import snapshot_download
            except ImportError:
                # raising a bare string is invalid in Python 3; raise a proper exception
                raise ImportError(
                    "You are exporting a model from modelscope, please install modelscope and try again:\n"
                    "pip3 install -U modelscope\n"
                    "For users in China:\n"
                    "pip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                )
            try:
                model_dir = snapshot_download(model_dir, cache_dir=cache_dir)
            except Exception:
                raise ValueError(
                    "model_dir must be a model name on modelscope or a local path "
                    "downloaded from modelscope, but is {}".format(model_dir)
                )

        encoder_model_file = os.path.join(model_dir, "model.onnx")
        decoder_model_file = os.path.join(model_dir, "decoder.onnx")
        if quantize:
            encoder_model_file = os.path.join(model_dir, "model_quant.onnx")
            decoder_model_file = os.path.join(model_dir, "decoder_quant.onnx")
        if not os.path.exists(encoder_model_file) or not os.path.exists(decoder_model_file):
            print(".onnx does not exist, begin to export onnx")
            try:
                from funasr import AutoModel
            except ImportError:
                raise ImportError(
                    "You are exporting onnx, please install funasr and try again:\n"
                    "pip3 install -U funasr\n"
                    "For users in China:\n"
                    "pip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                )

            model = AutoModel(model=model_dir)
            model_dir = model.export(type="onnx", quantize=quantize, **kwargs)

        config_file = os.path.join(model_dir, "config.yaml")
        cmvn_file = os.path.join(model_dir, "am.mvn")
        config = read_yaml(config_file)
        token_list = os.path.join(model_dir, "tokens.json")
        with open(token_list, "r", encoding="utf-8") as f:
            token_list = json.load(f)

        self.converter = TokenIDConverter(token_list)
        self.tokenizer = CharTokenizer()
        self.frontend = WavFrontendOnline(cmvn_file=cmvn_file, **config["frontend_conf"])
        self.pe = SinusoidalPositionEncoderOnline()
        self.ort_encoder_infer = OrtInferSession(
            encoder_model_file, device_id, intra_op_num_threads=intra_op_num_threads
        )
        self.ort_decoder_infer = OrtInferSession(
            decoder_model_file, device_id, intra_op_num_threads=intra_op_num_threads
        )
        self.batch_size = batch_size
        self.chunk_size = chunk_size
        self.encoder_output_size = config["encoder_conf"]["output_size"]
        self.fsmn_layer = config["decoder_conf"]["num_blocks"]
        self.fsmn_lorder = config["decoder_conf"]["kernel_size"] - 1
        self.fsmn_dims = config["encoder_conf"]["output_size"]
        self.feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
        self.cif_threshold = config["predictor_conf"]["threshold"]
        self.tail_threshold = config["predictor_conf"]["tail_threshold"]

    def prepare_cache(self, cache: dict = {}, batch_size=1):
        if len(cache) > 0:
            return cache
        cache["start_idx"] = 0
        cache["cif_hidden"] = np.zeros((batch_size, 1, self.encoder_output_size)).astype(np.float32)
        cache["cif_alphas"] = np.zeros((batch_size, 1)).astype(np.float32)
        cache["chunk_size"] = self.chunk_size
        cache["last_chunk"] = False
        cache["feats"] = np.zeros(
            (batch_size, self.chunk_size[0] + self.chunk_size[2], self.feats_dims)
        ).astype(np.float32)
        cache["decoder_fsmn"] = []
        for i in range(self.fsmn_layer):
            fsmn_cache = np.zeros((batch_size, self.fsmn_dims, self.fsmn_lorder)).astype(np.float32)
            cache["decoder_fsmn"].append(fsmn_cache)
        return cache

    def add_overlap_chunk(self, feats: np.ndarray, cache: dict = {}):
        if len(cache) == 0:
            return feats
        # process last chunk
        overlap_feats = np.concatenate((cache["feats"], feats), axis=1)
        if cache["is_final"]:
            cache["feats"] = overlap_feats[:, -self.chunk_size[0] :, :]
            if not cache["last_chunk"]:
                padding_length = sum(self.chunk_size) - overlap_feats.shape[1]
                overlap_feats = np.pad(overlap_feats, ((0, 0), (0, padding_length), (0, 0)))
        else:
            cache["feats"] = overlap_feats[:, -(self.chunk_size[0] + self.chunk_size[2]) :, :]
        return overlap_feats

    def __call__(self, audio_in: np.ndarray, **kwargs):
        waveforms = np.expand_dims(audio_in, axis=0)
        param_dict = kwargs.get("param_dict", dict())
        is_final = param_dict.get("is_final", False)
        cache = param_dict.get("cache", dict())
        asr_res = []

        if waveforms.shape[1] < 16 * 60 and is_final and len(cache) > 0:
            cache["last_chunk"] = True
            feats = cache["feats"]
            feats_len = np.array([feats.shape[1]]).astype(np.int32)
            asr_res = self.infer(feats, feats_len, cache)
            return asr_res

        feats, feats_len = self.extract_feat(waveforms, is_final)
        if feats.shape[1] != 0:
            feats *= self.encoder_output_size**0.5
            cache = self.prepare_cache(cache)
            cache["is_final"] = is_final

            # fbank -> position encoding -> overlap chunk
            feats = self.pe.forward(feats, cache["start_idx"])
            cache["start_idx"] += feats.shape[1]
            if is_final:
                if feats.shape[1] + self.chunk_size[2] <= self.chunk_size[1]:
                    cache["last_chunk"] = True
                    feats = self.add_overlap_chunk(feats, cache)
                else:
                    # first chunk
                    feats_chunk1 = self.add_overlap_chunk(feats[:, : self.chunk_size[1], :], cache)
                    feats_len = np.array([feats_chunk1.shape[1]]).astype(np.int32)
                    asr_res_chunk1 = self.infer(feats_chunk1, feats_len, cache)

                    # last chunk
                    cache["last_chunk"] = True
                    feats_chunk2 = self.add_overlap_chunk(
                        feats[:, -(feats.shape[1] + self.chunk_size[2] - self.chunk_size[1]) :, :],
                        cache,
                    )
                    feats_len = np.array([feats_chunk2.shape[1]]).astype(np.int32)
                    asr_res_chunk2 = self.infer(feats_chunk2, feats_len, cache)

                    asr_res_chunk = asr_res_chunk1 + asr_res_chunk2
                    res = {}
                    for pred in asr_res_chunk:
                        for key, value in pred.items():
                            if key in res:
                                res[key][0] += value[0]
                                res[key][1].extend(value[1])
                            else:
                                res[key] = [value[0], value[1]]
                    return [res]
            else:
                feats = self.add_overlap_chunk(feats, cache)

            feats_len = np.array([feats.shape[1]]).astype(np.int32)
            asr_res = self.infer(feats, feats_len, cache)

        return asr_res

    def infer(self, feats: np.ndarray, feats_len: np.ndarray, cache):
        # encoder forward
        enc_input = [feats, feats_len]
        enc, enc_lens, cif_alphas = self.ort_encoder_infer(enc_input)

        # predictor forward
        acoustic_embeds, acoustic_embeds_len = self.cif_search(enc, cif_alphas, cache)

        # decoder forward
        asr_res = []
        if acoustic_embeds.shape[1] > 0:
            dec_input = [enc, enc_lens, acoustic_embeds, acoustic_embeds_len]
            dec_input.extend(cache["decoder_fsmn"])
            dec_output = self.ort_decoder_infer(dec_input)
            logits, sample_ids, cache["decoder_fsmn"] = dec_output[0], dec_output[1], dec_output[2:]
            cache["decoder_fsmn"] = [
                item[:, :, -self.fsmn_lorder :] for item in cache["decoder_fsmn"]
            ]

            preds = self.decode(logits, acoustic_embeds_len)
            for pred in preds:
                pred = sentence_postprocess(pred)
                asr_res.append({"preds": pred})

        return asr_res

    def load_data(self, wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
        def load_wav(path: str) -> np.ndarray:
            waveform, _ = librosa.load(path, sr=fs)
            return waveform

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            return [load_wav(path) for path in wav_content]

        raise TypeError(f"The type of {wav_content} is not in [str, np.ndarray, list]")

    def extract_feat(
        self, waveforms: np.ndarray, is_final: bool = False
    ) -> Tuple[np.ndarray, np.ndarray]:
        waveforms_lens = np.zeros(waveforms.shape[0]).astype(np.int32)
        for idx, waveform in enumerate(waveforms):
            waveforms_lens[idx] = waveform.shape[-1]

        feats, feats_len = self.frontend.extract_fbank(waveforms, waveforms_lens, is_final)
        return feats.astype(np.float32), feats_len.astype(np.int32)

    def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
        return [
            self.decode_one(am_score, token_num)
            for am_score, token_num in zip(am_scores, token_nums)
        ]

    def decode_one(self, am_score: np.ndarray, valid_token_num: int) -> List[str]:
        yseq = am_score.argmax(axis=-1)
        score = am_score.max(axis=-1)
        score = np.sum(score, axis=-1)

        # pad with mask tokens to ensure compatibility with sos/eos tokens
        # asr_model.sos:1 asr_model.eos:2
        yseq = np.array([1] + yseq.tolist() + [2])
        hyp = Hypothesis(yseq=yseq, score=score)

        # remove sos/eos and get results
        last_pos = -1
        token_int = hyp.yseq[1:last_pos].tolist()

        # remove blank symbol id, which is assumed to be 0
        token_int = list(filter(lambda x: x not in (0, 2), token_int))

        # Change integer-ids to tokens
        token = self.converter.ids2tokens(token_int)
        token = token[:valid_token_num]
        # texts = sentence_postprocess(token)
        return token
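
    # Note (added for clarity): cif_search implements Continuous Integrate-and-Fire
    # (CIF): each frame's predicted weight alpha is accumulated together with a
    # weighted sum of encoder states; whenever the accumulator crosses
    # cif_threshold a token embedding "fires", and the remainder is carried over
    # in the cache for the next chunk.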
    def cif_search(self, hidden, alphas, cache=None):
        batch_size, len_time, hidden_size = hidden.shape
        token_length = []
        list_fires = []
        list_frames = []
        cache_alphas = []
        cache_hiddens = []
        alphas[:, : self.chunk_size[0]] = 0.0
        alphas[:, sum(self.chunk_size[:2]) :] = 0.0
        if cache is not None and "cif_alphas" in cache and "cif_hidden" in cache:
            hidden = np.concatenate((cache["cif_hidden"], hidden), axis=1)
            alphas = np.concatenate((cache["cif_alphas"], alphas), axis=1)
        if cache is not None and "last_chunk" in cache and cache["last_chunk"]:
            tail_hidden = np.zeros((batch_size, 1, hidden_size)).astype(np.float32)
            tail_alphas = np.array([[self.tail_threshold]]).astype(np.float32)
            tail_alphas = np.tile(tail_alphas, (batch_size, 1))
            hidden = np.concatenate((hidden, tail_hidden), axis=1)
            alphas = np.concatenate((alphas, tail_alphas), axis=1)

        len_time = alphas.shape[1]
        for b in range(batch_size):
            integrate = 0.0
            frames = np.zeros(hidden_size).astype(np.float32)
            list_frame = []
            list_fire = []
            for t in range(len_time):
                alpha = alphas[b][t]
                if alpha + integrate < self.cif_threshold:
                    integrate += alpha
                    list_fire.append(integrate)
                    frames += alpha * hidden[b][t]
                else:
                    frames += (self.cif_threshold - integrate) * hidden[b][t]
                    list_frame.append(frames)
                    integrate += alpha
                    list_fire.append(integrate)
                    integrate -= self.cif_threshold
                    frames = integrate * hidden[b][t]

            cache_alphas.append(integrate)
            if integrate > 0.0:
                cache_hiddens.append(frames / integrate)
            else:
                cache_hiddens.append(frames)

            token_length.append(len(list_frame))
            list_fires.append(list_fire)
            list_frames.append(list_frame)

        max_token_len = max(token_length)
        list_ls = []
        for b in range(batch_size):
            pad_frames = np.zeros((max_token_len - token_length[b], hidden_size)).astype(np.float32)
            if token_length[b] == 0:
                list_ls.append(pad_frames)
            else:
                list_ls.append(np.concatenate((list_frames[b], pad_frames), axis=0))

        cache["cif_alphas"] = np.stack(cache_alphas, axis=0)
        cache["cif_alphas"] = np.expand_dims(cache["cif_alphas"], axis=0)
        cache["cif_hidden"] = np.stack(cache_hiddens, axis=0)
        cache["cif_hidden"] = np.expand_dims(cache["cif_hidden"], axis=0)

        return np.stack(list_ls, axis=0).astype(np.float32), np.stack(token_length, axis=0).astype(
            np.int32
        )

320 modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_onnx/punc_bin.py (vendored, new file)
@@ -0,0 +1,320 @@
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import os.path
from pathlib import Path
from typing import List, Union, Tuple
import numpy as np
import json
from .utils.utils import ONNXRuntimeError, OrtInferSession, get_logger, read_yaml
from .utils.utils import (
    TokenIDConverter,
    split_to_mini_sentence,
    code_mix_split_words,
    code_mix_split_words_jieba,
)

logging = get_logger()


class CT_Transformer:
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    CT-Transformer: Controllable time-delay transformer for real-time punctuation prediction and disfluency detection
    https://arxiv.org/pdf/2003.01309.pdf
    """

    def __init__(
        self,
        model_dir: Union[str, Path] = None,
        batch_size: int = 1,
        device_id: Union[str, int] = "-1",
        quantize: bool = False,
        intra_op_num_threads: int = 4,
        cache_dir: str = None,
        **kwargs
    ):

        if not Path(model_dir).exists():
            try:
                from modelscope.hub.snapshot_download import snapshot_download
            except ImportError:
                # raising a bare string is invalid in Python 3; raise a proper exception
                raise ImportError(
                    "You are exporting a model from modelscope, please install modelscope and try again:\n"
                    "pip3 install -U modelscope\n"
                    "For users in China:\n"
                    "pip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                )
            try:
                model_dir = snapshot_download(model_dir, cache_dir=cache_dir)
            except Exception:
                raise ValueError(
                    "model_dir must be a model name on modelscope or a local path "
                    "downloaded from modelscope, but is {}".format(model_dir)
                )

        model_file = os.path.join(model_dir, "model.onnx")
        if quantize:
            model_file = os.path.join(model_dir, "model_quant.onnx")
        if not os.path.exists(model_file):
            print(".onnx does not exist, begin to export onnx")
            try:
                from funasr import AutoModel
            except ImportError:
                raise ImportError(
                    "You are exporting onnx, please install funasr and try again:\n"
                    "pip3 install -U funasr\n"
                    "For users in China:\n"
                    "pip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                )

            model = AutoModel(model=model_dir)
            model_dir = model.export(type="onnx", quantize=quantize, **kwargs)

        config_file = os.path.join(model_dir, "config.yaml")
        config = read_yaml(config_file)
        token_list = os.path.join(model_dir, "tokens.json")
        with open(token_list, "r", encoding="utf-8") as f:
            token_list = json.load(f)

        self.converter = TokenIDConverter(token_list)
        self.ort_infer = OrtInferSession(
            model_file, device_id, intra_op_num_threads=intra_op_num_threads
        )
        self.batch_size = 1
        self.punc_list = config["model_conf"]["punc_list"]
        self.period = 0
        for i in range(len(self.punc_list)):
            if self.punc_list[i] == ",":
                self.punc_list[i] = "，"
            elif self.punc_list[i] == "?":
                self.punc_list[i] = "？"
            elif self.punc_list[i] == "。":
                self.period = i
        self.jieba_usr_dict_path = os.path.join(model_dir, "jieba_usr_dict")
        if os.path.exists(self.jieba_usr_dict_path):
            self.seg_jieba = True
            self.code_mix_split_words_jieba = code_mix_split_words_jieba(self.jieba_usr_dict_path)
        else:
            self.seg_jieba = False
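
    # Note (added for clarity): __call__ splits the input into mini-sentences of
    # split_size tokens, punctuates each one, and carries the tail after the last
    # sentence-final mark over to the next mini-sentence via cache_sent/cache_sent_id.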
def __call__(self, text: Union[list, str], split_size=20):
|
||||
if self.seg_jieba:
|
||||
split_text = self.code_mix_split_words_jieba(text)
|
||||
else:
|
||||
split_text = code_mix_split_words(text)
|
||||
split_text_id = self.converter.tokens2ids(split_text)
|
||||
mini_sentences = split_to_mini_sentence(split_text, split_size)
|
||||
mini_sentences_id = split_to_mini_sentence(split_text_id, split_size)
|
||||
assert len(mini_sentences) == len(mini_sentences_id)
|
||||
cache_sent = []
|
||||
cache_sent_id = []
|
||||
new_mini_sentence = ""
|
||||
new_mini_sentence_punc = []
|
||||
cache_pop_trigger_limit = 200
|
||||
for mini_sentence_i in range(len(mini_sentences)):
|
||||
mini_sentence = mini_sentences[mini_sentence_i]
|
||||
mini_sentence_id = mini_sentences_id[mini_sentence_i]
|
||||
mini_sentence = cache_sent + mini_sentence
|
||||
mini_sentence_id = np.array(cache_sent_id + mini_sentence_id, dtype="int32")
|
||||
data = {
|
||||
"text": mini_sentence_id[None, :],
|
||||
"text_lengths": np.array([len(mini_sentence_id)], dtype="int32"),
|
||||
}
|
||||
try:
|
||||
outputs = self.infer(data["text"], data["text_lengths"])
|
||||
y = outputs[0]
|
||||
punctuations = np.argmax(y, axis=-1)[0]
|
||||
assert punctuations.size == len(mini_sentence)
|
||||
except ONNXRuntimeError:
|
||||
logging.warning("error")
|
||||
|
||||
# Search for the last Period/QuestionMark as cache
|
||||
if mini_sentence_i < len(mini_sentences) - 1:
|
||||
sentenceEnd = -1
|
||||
last_comma_index = -1
|
||||
for i in range(len(punctuations) - 2, 1, -1):
|
||||
if (
|
||||
self.punc_list[punctuations[i]] == "。"
|
||||
or self.punc_list[punctuations[i]] == "?"
|
||||
):
|
||||
sentenceEnd = i
|
||||
break
|
||||
if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",":
|
||||
last_comma_index = i
|
||||
|
||||
if (
|
||||
sentenceEnd < 0
|
||||
and len(mini_sentence) > cache_pop_trigger_limit
|
||||
and last_comma_index >= 0
|
||||
):
|
||||
# The sentence is too long; cut it off at a comma.
|
||||
sentenceEnd = last_comma_index
|
||||
punctuations[sentenceEnd] = self.period
|
||||
cache_sent = mini_sentence[sentenceEnd + 1 :]
|
||||
cache_sent_id = mini_sentence_id[sentenceEnd + 1 :].tolist()
|
||||
mini_sentence = mini_sentence[0 : sentenceEnd + 1]
|
||||
punctuations = punctuations[0 : sentenceEnd + 1]
|
||||
|
||||
new_mini_sentence_punc += [int(x) for x in punctuations]
|
||||
words_with_punc = []
|
||||
for i in range(len(mini_sentence)):
|
||||
if i > 0:
|
||||
if (
|
||||
len(mini_sentence[i][0].encode()) == 1
|
||||
and len(mini_sentence[i - 1][0].encode()) == 1
|
||||
):
|
||||
mini_sentence[i] = " " + mini_sentence[i]
|
||||
words_with_punc.append(mini_sentence[i])
|
||||
if self.punc_list[punctuations[i]] != "_":
|
||||
words_with_punc.append(self.punc_list[punctuations[i]])
|
||||
new_mini_sentence += "".join(words_with_punc)
|
||||
# Add Period for the end of the sentence
|
||||
new_mini_sentence_out = new_mini_sentence
|
||||
new_mini_sentence_punc_out = new_mini_sentence_punc
|
||||
if mini_sentence_i == len(mini_sentences) - 1:
|
||||
if new_mini_sentence[-1] == "," or new_mini_sentence[-1] == "、":
|
||||
new_mini_sentence_out = new_mini_sentence[:-1] + "。"
|
||||
new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period]
|
||||
elif new_mini_sentence[-1] != "。" and new_mini_sentence[-1] != "?":
|
||||
new_mini_sentence_out = new_mini_sentence + "。"
|
||||
new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period]
|
||||
return new_mini_sentence_out, new_mini_sentence_punc_out
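# Usage sketch (illustrative; the ModelScope model name below is an
# assumption, substitute your own punctuation model directory):
#   model = CT_Transformer("damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")
#   text_with_punc, punc_ids = model("跨境河流是养育沿岸人民的生命之源")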
|
||||
|
||||
def infer(self, feats: np.ndarray, feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
|
||||
outputs = self.ort_infer([feats, feats_len])
|
||||
return outputs
|
||||
|
||||
|
||||
class CT_Transformer_VadRealtime(CT_Transformer):
|
||||
"""
|
||||
Author: Speech Lab of DAMO Academy, Alibaba Group
|
||||
CT-Transformer: Controllable time-delay transformer for real-time punctuation prediction and disfluency detection
|
||||
https://arxiv.org/pdf/2003.01309.pdf
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def __call__(self, text: str, param_dict: dict, split_size=20):
|
||||
cache_key = "cache"
|
||||
assert cache_key in param_dict
|
||||
cache = param_dict[cache_key]
|
||||
if cache is not None and len(cache) > 0:
|
||||
precache = "".join(cache)
|
||||
else:
|
||||
precache = ""
|
||||
cache = []
|
||||
full_text = precache + " " + text
|
||||
split_text = code_mix_split_words(full_text)
|
||||
split_text_id = self.converter.tokens2ids(split_text)
|
||||
mini_sentences = split_to_mini_sentence(split_text, split_size)
|
||||
mini_sentences_id = split_to_mini_sentence(split_text_id, split_size)
|
||||
new_mini_sentence_punc = []
|
||||
assert len(mini_sentences) == len(mini_sentences_id)
|
||||
|
||||
cache_sent = []
|
||||
cache_sent_id = np.array([], dtype="int32")
|
||||
sentence_punc_list = []
|
||||
sentence_words_list = []
|
||||
cache_pop_trigger_limit = 200
|
||||
skip_num = 0
|
||||
for mini_sentence_i in range(len(mini_sentences)):
|
||||
mini_sentence = mini_sentences[mini_sentence_i]
|
||||
mini_sentence_id = mini_sentences_id[mini_sentence_i]
|
||||
mini_sentence = cache_sent + mini_sentence
|
||||
mini_sentence_id = np.concatenate(
|
||||
(cache_sent_id, mini_sentence_id), axis=0, dtype="int32"
|
||||
)
|
||||
text_length = len(mini_sentence_id)
|
||||
vad_mask = self.vad_mask(text_length, len(cache))[None, None, :, :].astype(np.float32)
|
||||
data = {
|
||||
"input": mini_sentence_id[None, :],
|
||||
"text_lengths": np.array([text_length], dtype="int32"),
|
||||
"vad_mask": vad_mask,
|
||||
"sub_masks": vad_mask,
|
||||
}
|
||||
try:
|
||||
outputs = self.infer(
|
||||
data["input"], data["text_lengths"], data["vad_mask"], data["sub_masks"]
|
||||
)
|
||||
y = outputs[0]
|
||||
punctuations = np.argmax(y, axis=-1)[0]
|
||||
assert punctuations.size == len(mini_sentence)
|
||||
except ONNXRuntimeError:
|
||||
logging.warning("error")
|
||||
|
||||
# Search for the last Period/QuestionMark as cache
|
||||
if mini_sentence_i < len(mini_sentences) - 1:
|
||||
sentenceEnd = -1
|
||||
last_comma_index = -1
|
||||
for i in range(len(punctuations) - 2, 1, -1):
|
||||
if (
|
||||
self.punc_list[punctuations[i]] == "。"
|
||||
or self.punc_list[punctuations[i]] == "?"
|
||||
):
|
||||
sentenceEnd = i
|
||||
break
|
||||
if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",":
|
||||
last_comma_index = i
|
||||
|
||||
if (
|
||||
sentenceEnd < 0
|
||||
and len(mini_sentence) > cache_pop_trigger_limit
|
||||
and last_comma_index >= 0
|
||||
):
|
||||
# The sentence is too long; cut it off at a comma.
|
||||
sentenceEnd = last_comma_index
|
||||
punctuations[sentenceEnd] = self.period
|
||||
cache_sent = mini_sentence[sentenceEnd + 1 :]
|
||||
cache_sent_id = mini_sentence_id[sentenceEnd + 1 :]
|
||||
mini_sentence = mini_sentence[0 : sentenceEnd + 1]
|
||||
punctuations = punctuations[0 : sentenceEnd + 1]
|
||||
|
||||
punctuations_np = [int(x) for x in punctuations]
|
||||
new_mini_sentence_punc += punctuations_np
|
||||
sentence_punc_list += [self.punc_list[int(x)] for x in punctuations_np]
|
||||
sentence_words_list += mini_sentence
|
||||
|
||||
assert len(sentence_punc_list) == len(sentence_words_list)
|
||||
words_with_punc = []
|
||||
sentence_punc_list_out = []
|
||||
for i in range(0, len(sentence_words_list)):
|
||||
if i > 0:
|
||||
if (
|
||||
len(sentence_words_list[i][0].encode()) == 1
|
||||
and len(sentence_words_list[i - 1][-1].encode()) == 1
|
||||
):
|
||||
sentence_words_list[i] = " " + sentence_words_list[i]
|
||||
if skip_num < len(cache):
|
||||
skip_num += 1
|
||||
else:
|
||||
words_with_punc.append(sentence_words_list[i])
|
||||
if skip_num >= len(cache):
|
||||
sentence_punc_list_out.append(sentence_punc_list[i])
|
||||
if sentence_punc_list[i] != "_":
|
||||
words_with_punc.append(sentence_punc_list[i])
|
||||
sentence_out = "".join(words_with_punc)
|
||||
|
||||
sentenceEnd = -1
|
||||
for i in range(len(sentence_punc_list) - 2, 1, -1):
|
||||
if sentence_punc_list[i] == "。" or sentence_punc_list[i] == "?":
|
||||
sentenceEnd = i
|
||||
break
|
||||
cache_out = sentence_words_list[sentenceEnd + 1 :]
|
||||
if sentence_out[-1] in self.punc_list:
|
||||
sentence_out = sentence_out[:-1]
|
||||
sentence_punc_list_out[-1] = "_"
|
||||
param_dict[cache_key] = cache_out
|
||||
return sentence_out, sentence_punc_list_out, cache_out
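# Usage sketch (illustrative): the realtime variant threads a token cache
# across calls through param_dict, so each chunk is punctuated with the
# unfinished tail of the previous chunk as context:
#   param_dict = {"cache": []}
#   for chunk in text_chunks:  # text_chunks is a hypothetical iterable of str
#       sentence, punc_ids, _ = model(chunk, param_dict)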
|
||||
|
||||
def vad_mask(self, size, vad_pos, dtype=bool):
|
||||
"""Create mask for decoder self-attention.
|
||||
|
||||
:param int size: size of mask
|
||||
:param int vad_pos: index of vad index
|
||||
:param torch.dtype dtype: result dtype
|
||||
:rtype: torch.Tensor (B, Lmax, Lmax)
|
||||
"""
|
||||
ret = np.ones((size, size), dtype=dtype)
|
||||
if vad_pos <= 0 or vad_pos >= size:
|
||||
return ret
|
||||
sub_corner = np.zeros((vad_pos - 1, size - vad_pos), dtype=dtype)
|
||||
ret[0 : vad_pos - 1, vad_pos:] = sub_corner
|
||||
return ret
|
||||
|
||||
def infer(
|
||||
self, feats: np.ndarray, feats_len: np.ndarray, vad_mask: np.ndarray, sub_masks: np.ndarray
|
||||
) -> Tuple[np.ndarray, np.ndarray]:
|
||||
outputs = self.ort_infer([feats, feats_len, vad_mask, sub_masks])
|
||||
return outputs
|
||||
244
modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_onnx/sensevoice_bin.py
vendored
Normal file
@@ -0,0 +1,244 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- encoding: utf-8 -*-
|
||||
# Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
|
||||
# MIT License (https://opensource.org/licenses/MIT)
|
||||
|
||||
|
||||
import torch
|
||||
import os.path
|
||||
import librosa
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from typing import List, Union, Tuple
|
||||
|
||||
from .utils.utils import (
|
||||
CharTokenizer,
|
||||
Hypothesis,
|
||||
ONNXRuntimeError,
|
||||
OrtInferSession,
|
||||
TokenIDConverter,
|
||||
get_logger,
|
||||
read_yaml,
|
||||
)
|
||||
from .utils.sentencepiece_tokenizer import SentencepiecesTokenizer
|
||||
from .utils.frontend import WavFrontend
|
||||
|
||||
logging = get_logger()
|
||||
|
||||
|
||||
class SenseVoiceSmall:
|
||||
"""
|
||||
Author: Speech Lab of DAMO Academy, Alibaba Group
|
||||
Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
|
||||
https://arxiv.org/abs/2206.08317
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_dir: Union[str, Path] = None,
|
||||
batch_size: int = 1,
|
||||
device_id: Union[str, int] = "-1",
|
||||
plot_timestamp_to: str = "",
|
||||
quantize: bool = False,
|
||||
intra_op_num_threads: int = 4,
|
||||
cache_dir: str = None,
|
||||
**kwargs,
|
||||
):
|
||||
|
||||
if not Path(model_dir).exists():
|
||||
try:
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
except ImportError:
raise ImportError(
"You are downloading a model from ModelScope; please install modelscope and try again. To install modelscope, you could:\n"
"\npip3 install -U modelscope\n"
"For the users in China, you could install with the command:\n"
"\npip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple"
)
|
||||
try:
|
||||
model_dir = snapshot_download(model_dir, cache_dir=cache_dir)
|
||||
except Exception:
raise ValueError(
"model_dir must be a model name on ModelScope or a local path downloaded from ModelScope, but is {}".format(
model_dir
)
)
|
||||
|
||||
model_file = os.path.join(model_dir, "model.onnx")
|
||||
if quantize:
|
||||
model_file = os.path.join(model_dir, "model_quant.onnx")
|
||||
if not os.path.exists(model_file):
|
||||
print(".onnx does not exist, begin to export onnx")
|
||||
try:
|
||||
from funasr import AutoModel
|
||||
except ImportError:
raise ImportError(
"You are exporting ONNX; please install funasr and try again. To install funasr, you could:\n"
"\npip3 install -U funasr\n"
"For the users in China, you could install with the command:\n"
"\npip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple"
)
|
||||
|
||||
model = AutoModel(model=model_dir)
|
||||
model_dir = model.export(type="onnx", quantize=quantize, **kwargs)
|
||||
|
||||
config_file = os.path.join(model_dir, "config.yaml")
|
||||
cmvn_file = os.path.join(model_dir, "am.mvn")
|
||||
config = read_yaml(config_file)
|
||||
|
||||
self.tokenizer = SentencepiecesTokenizer(
|
||||
bpemodel=os.path.join(model_dir, "chn_jpn_yue_eng_ko_spectok.bpe.model")
|
||||
)
|
||||
config["frontend_conf"]["cmvn_file"] = cmvn_file
|
||||
self.frontend = WavFrontend(**config["frontend_conf"])
|
||||
self.ort_infer = OrtInferSession(
|
||||
model_file, device_id, intra_op_num_threads=intra_op_num_threads
|
||||
)
|
||||
self.batch_size = batch_size
|
||||
self.blank_id = 0
|
||||
self.lid_dict = {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13}
|
||||
self.lid_int_dict = {24884: 3, 24885: 4, 24888: 7, 24892: 11, 24896: 12, 24992: 13}
|
||||
self.textnorm_dict = {"withitn": 14, "woitn": 15}
|
||||
self.textnorm_int_dict = {25016: 14, 25017: 15}
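# These dicts map user-facing tags to the integer ids the ONNX model expects,
# e.g. language="zh" -> 3 and textnorm="withitn" -> 14 (presumably "with
# inverse text normalization") versus "woitn" -> 15 (raw output).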
|
||||
|
||||
def _get_lid(self, lid):
|
||||
if lid in list(self.lid_dict.keys()):
|
||||
return self.lid_dict[lid]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"The language {l} is not in {list(self.lid_dict.keys())}"
|
||||
)
|
||||
|
||||
def _get_tnid(self, tnid):
|
||||
if tnid in list(self.textnorm_dict.keys()):
|
||||
return self.textnorm_dict[tnid]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"The textnorm {tnid} is not in {list(self.textnorm_dict.keys())}"
|
||||
)
|
||||
|
||||
def read_tags(self, language_input, textnorm_input):
|
||||
# handle language
|
||||
if isinstance(language_input, list):
|
||||
language_list = []
|
||||
for l in language_input:
|
||||
language_list.append(self._get_lid(l))
|
||||
elif isinstance(language_input, str):
|
||||
# if is existing file
|
||||
if os.path.exists(language_input):
|
||||
with open(language_input, "r") as f:
language_file = f.readlines()
|
||||
language_list = [
|
||||
self._get_lid(l.strip())
|
||||
for l in language_file
|
||||
]
|
||||
else:
|
||||
language_list = [self._get_lid(language_input)]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unsupported type {type(language_input)} for language_input"
|
||||
)
|
||||
# handle textnorm
|
||||
if isinstance(textnorm_input, list):
|
||||
textnorm_list = []
|
||||
for tn in textnorm_input:
|
||||
textnorm_list.append(self._get_tnid(tn))
|
||||
elif isinstance(textnorm_input, str):
|
||||
# if is existing file
|
||||
if os.path.exists(textnorm_input):
|
||||
with open(textnorm_input, "r") as f:
textnorm_file = f.readlines()
|
||||
textnorm_list = [
|
||||
self._get_tnid(tn.strip())
|
||||
for tn in textnorm_file
|
||||
]
|
||||
else:
|
||||
textnorm_list = [self._get_tnid(textnorm_input)]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unsupported type {type(textnorm_input)} for textnorm_input"
|
||||
)
|
||||
return language_list, textnorm_list
|
||||
|
||||
def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs):
|
||||
language_input = kwargs.get("language", "auto")
|
||||
textnorm_input = kwargs.get("textnorm", "woitn")
|
||||
language_list, textnorm_list = self.read_tags(language_input, textnorm_input)
|
||||
|
||||
waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
|
||||
waveform_nums = len(waveform_list)
|
||||
|
||||
assert len(language_list) == 1 or len(language_list) == waveform_nums, \
|
||||
"length of parsed language list should be 1 or equal to the number of waveforms"
|
||||
assert len(textnorm_list) == 1 or len(textnorm_list) == waveform_nums, \
|
||||
"length of parsed textnorm list should be 1 or equal to the number of waveforms"
|
||||
|
||||
asr_res = []
|
||||
for beg_idx in range(0, waveform_nums, self.batch_size):
|
||||
end_idx = min(waveform_nums, beg_idx + self.batch_size)
|
||||
feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
|
||||
_language_list = language_list[beg_idx:end_idx]
|
||||
_textnorm_list = textnorm_list[beg_idx:end_idx]
|
||||
if not len(_language_list):
|
||||
_language_list = [language_list[0]]
|
||||
_textnorm_list = [textnorm_list[0]]
|
||||
B = feats.shape[0]
|
||||
if len(_language_list) == 1 and B != 1:
|
||||
_language_list = _language_list * B
|
||||
if len(_textnorm_list) == 1 and B != 1:
|
||||
_textnorm_list = _textnorm_list * B
|
||||
ctc_logits, encoder_out_lens = self.infer(
|
||||
feats,
|
||||
feats_len,
|
||||
np.array(_language_list, dtype=np.int32),
|
||||
np.array(_textnorm_list, dtype=np.int32),
|
||||
)
|
||||
for b in range(feats.shape[0]):
|
||||
# back to torch.Tensor
|
||||
if isinstance(ctc_logits, np.ndarray):
|
||||
ctc_logits = torch.from_numpy(ctc_logits).float()
|
||||
# currently supports batch_size = 1 only
|
||||
x = ctc_logits[b, : encoder_out_lens[b].item(), :]
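# CTC greedy decoding: take the best label per frame, collapse consecutive
# repeats, then drop blank tokens before detokenizing.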
|
||||
yseq = x.argmax(dim=-1)
|
||||
yseq = torch.unique_consecutive(yseq, dim=-1)
|
||||
|
||||
mask = yseq != self.blank_id
|
||||
token_int = yseq[mask].tolist()
|
||||
|
||||
asr_res.append(self.tokenizer.decode(token_int))
|
||||
|
||||
return asr_res
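# Usage sketch (illustrative; the model directory is a placeholder):
#   model = SenseVoiceSmall("<model_dir_or_modelscope_name>", batch_size=1)
#   res = model(["example.wav"], language="auto", textnorm="woitn")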
|
||||
|
||||
def load_data(self, wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
|
||||
def load_wav(path: str) -> np.ndarray:
|
||||
waveform, _ = librosa.load(path, sr=fs)
|
||||
return waveform
|
||||
|
||||
if isinstance(wav_content, np.ndarray):
|
||||
return [wav_content]
|
||||
|
||||
if isinstance(wav_content, str):
|
||||
return [load_wav(wav_content)]
|
||||
|
||||
if isinstance(wav_content, list):
|
||||
return [load_wav(path) for path in wav_content]
|
||||
|
||||
raise TypeError(f"The type of {wav_content} is not in [str, np.ndarray, list]")
|
||||
|
||||
def extract_feat(self, waveform_list: List[np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
|
||||
feats, feats_len = [], []
|
||||
for waveform in waveform_list:
|
||||
speech, _ = self.frontend.fbank(waveform)
|
||||
feat, feat_len = self.frontend.lfr_cmvn(speech)
|
||||
feats.append(feat)
|
||||
feats_len.append(feat_len)
|
||||
|
||||
feats = self.pad_feats(feats, np.max(feats_len))
|
||||
feats_len = np.array(feats_len).astype(np.int32)
|
||||
return feats, feats_len
|
||||
|
||||
@staticmethod
|
||||
def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
|
||||
def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
|
||||
pad_width = ((0, max_feat_len - cur_len), (0, 0))
|
||||
return np.pad(feat, pad_width, "constant", constant_values=0)
|
||||
|
||||
feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
|
||||
feats = np.array(feat_res).astype(np.float32)
|
||||
return feats
|
||||
|
||||
def infer(
|
||||
self,
|
||||
feats: np.ndarray,
|
||||
feats_len: np.ndarray,
|
||||
language: np.ndarray,
|
||||
textnorm: np.ndarray,
|
||||
) -> Tuple[np.ndarray, np.ndarray]:
|
||||
outputs = self.ort_infer([feats, feats_len, language, textnorm])
|
||||
return outputs
|
||||
0
modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_onnx/utils/__init__.py
vendored
Normal file
704
modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_onnx/utils/e2e_vad.py
vendored
Normal file
@@ -0,0 +1,704 @@
|
||||
# -*- encoding: utf-8 -*-
|
||||
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
# MIT License (https://opensource.org/licenses/MIT)
|
||||
|
||||
from enum import Enum
|
||||
from typing import List, Tuple, Dict, Any
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
|
||||
class VadStateMachine(Enum):
|
||||
kVadInStateStartPointNotDetected = 1
|
||||
kVadInStateInSpeechSegment = 2
|
||||
kVadInStateEndPointDetected = 3
|
||||
|
||||
|
||||
class FrameState(Enum):
|
||||
kFrameStateInvalid = -1
|
||||
kFrameStateSpeech = 1
|
||||
kFrameStateSil = 0
|
||||
|
||||
|
||||
# final voice/unvoice state per frame
|
||||
class AudioChangeState(Enum):
|
||||
kChangeStateSpeech2Speech = 0
|
||||
kChangeStateSpeech2Sil = 1
|
||||
kChangeStateSil2Sil = 2
|
||||
kChangeStateSil2Speech = 3
|
||||
kChangeStateNoBegin = 4
|
||||
kChangeStateInvalid = 5
|
||||
|
||||
|
||||
class VadDetectMode(Enum):
|
||||
kVadSingleUtteranceDetectMode = 0
|
||||
kVadMutipleUtteranceDetectMode = 1
|
||||
|
||||
|
||||
class VADXOptions:
|
||||
def __init__(
|
||||
self,
|
||||
sample_rate: int = 16000,
|
||||
detect_mode: int = VadDetectMode.kVadMutipleUtteranceDetectMode.value,
|
||||
snr_mode: int = 0,
|
||||
max_end_silence_time: int = 800,
|
||||
max_start_silence_time: int = 3000,
|
||||
do_start_point_detection: bool = True,
|
||||
do_end_point_detection: bool = True,
|
||||
window_size_ms: int = 200,
|
||||
sil_to_speech_time_thres: int = 150,
|
||||
speech_to_sil_time_thres: int = 150,
|
||||
speech_2_noise_ratio: float = 1.0,
|
||||
do_extend: int = 1,
|
||||
lookback_time_start_point: int = 200,
|
||||
lookahead_time_end_point: int = 100,
|
||||
max_single_segment_time: int = 60000,
|
||||
nn_eval_block_size: int = 8,
|
||||
dcd_block_size: int = 4,
|
||||
snr_thres: float = -100.0,
|
||||
noise_frame_num_used_for_snr: int = 100,
|
||||
decibel_thres: float = -100.0,
|
||||
speech_noise_thres: float = 0.6,
|
||||
fe_prior_thres: float = 1e-4,
|
||||
silence_pdf_num: int = 1,
|
||||
sil_pdf_ids: List[int] = [0],
|
||||
speech_noise_thresh_low: float = -0.1,
|
||||
speech_noise_thresh_high: float = 0.3,
|
||||
output_frame_probs: bool = False,
|
||||
frame_in_ms: int = 10,
|
||||
frame_length_ms: int = 25,
|
||||
):
|
||||
self.sample_rate = sample_rate
|
||||
self.detect_mode = detect_mode
|
||||
self.snr_mode = snr_mode
|
||||
self.max_end_silence_time = max_end_silence_time
|
||||
self.max_start_silence_time = max_start_silence_time
|
||||
self.do_start_point_detection = do_start_point_detection
|
||||
self.do_end_point_detection = do_end_point_detection
|
||||
self.window_size_ms = window_size_ms
|
||||
self.sil_to_speech_time_thres = sil_to_speech_time_thres
|
||||
self.speech_to_sil_time_thres = speech_to_sil_time_thres
|
||||
self.speech_2_noise_ratio = speech_2_noise_ratio
|
||||
self.do_extend = do_extend
|
||||
self.lookback_time_start_point = lookback_time_start_point
|
||||
self.lookahead_time_end_point = lookahead_time_end_point
|
||||
self.max_single_segment_time = max_single_segment_time
|
||||
self.nn_eval_block_size = nn_eval_block_size
|
||||
self.dcd_block_size = dcd_block_size
|
||||
self.snr_thres = snr_thres
|
||||
self.noise_frame_num_used_for_snr = noise_frame_num_used_for_snr
|
||||
self.decibel_thres = decibel_thres
|
||||
self.speech_noise_thres = speech_noise_thres
|
||||
self.fe_prior_thres = fe_prior_thres
|
||||
self.silence_pdf_num = silence_pdf_num
|
||||
self.sil_pdf_ids = sil_pdf_ids
|
||||
self.speech_noise_thresh_low = speech_noise_thresh_low
|
||||
self.speech_noise_thresh_high = speech_noise_thresh_high
|
||||
self.output_frame_probs = output_frame_probs
|
||||
self.frame_in_ms = frame_in_ms
|
||||
self.frame_length_ms = frame_length_ms
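# All *_time options above are in milliseconds; frame_in_ms is the frame
# shift (10 ms by default) and frame_length_ms the analysis window (25 ms).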
|
||||
|
||||
|
||||
class E2EVadSpeechBufWithDoa(object):
|
||||
def __init__(self):
|
||||
self.start_ms = 0
|
||||
self.end_ms = 0
|
||||
self.buffer = []
|
||||
self.contain_seg_start_point = False
|
||||
self.contain_seg_end_point = False
|
||||
self.doa = 0
|
||||
|
||||
def Reset(self):
|
||||
self.start_ms = 0
|
||||
self.end_ms = 0
|
||||
self.buffer = []
|
||||
self.contain_seg_start_point = False
|
||||
self.contain_seg_end_point = False
|
||||
self.doa = 0
|
||||
|
||||
|
||||
class E2EVadFrameProb(object):
|
||||
def __init__(self):
|
||||
self.noise_prob = 0.0
|
||||
self.speech_prob = 0.0
|
||||
self.score = 0.0
|
||||
self.frame_id = 0
|
||||
self.frm_state = 0
|
||||
|
||||
|
||||
class WindowDetector(object):
|
||||
def __init__(
|
||||
self,
|
||||
window_size_ms: int,
|
||||
sil_to_speech_time: int,
|
||||
speech_to_sil_time: int,
|
||||
frame_size_ms: int,
|
||||
):
|
||||
self.window_size_ms = window_size_ms
|
||||
self.sil_to_speech_time = sil_to_speech_time
|
||||
self.speech_to_sil_time = speech_to_sil_time
|
||||
self.frame_size_ms = frame_size_ms
|
||||
|
||||
self.win_size_frame = int(window_size_ms / frame_size_ms)
|
||||
self.win_sum = 0
|
||||
self.win_state = [0] * self.win_size_frame  # initialize the sliding window
|
||||
|
||||
self.cur_win_pos = 0
|
||||
self.pre_frame_state = FrameState.kFrameStateSil
|
||||
self.cur_frame_state = FrameState.kFrameStateSil
|
||||
self.sil_to_speech_frmcnt_thres = int(sil_to_speech_time / frame_size_ms)
|
||||
self.speech_to_sil_frmcnt_thres = int(speech_to_sil_time / frame_size_ms)
|
||||
|
||||
self.voice_last_frame_count = 0
|
||||
self.noise_last_frame_count = 0
|
||||
self.hydre_frame_count = 0
|
||||
|
||||
def Reset(self) -> None:
|
||||
self.cur_win_pos = 0
|
||||
self.win_sum = 0
|
||||
self.win_state = [0] * self.win_size_frame
|
||||
self.pre_frame_state = FrameState.kFrameStateSil
|
||||
self.cur_frame_state = FrameState.kFrameStateSil
|
||||
self.voice_last_frame_count = 0
|
||||
self.noise_last_frame_count = 0
|
||||
self.hydre_frame_count = 0
|
||||
|
||||
def GetWinSize(self) -> int:
|
||||
return int(self.win_size_frame)
|
||||
|
||||
def DetectOneFrame(self, frameState: FrameState, frame_count: int) -> AudioChangeState:
|
||||
cur_frame_state = FrameState.kFrameStateSil
|
||||
if frameState == FrameState.kFrameStateSpeech:
|
||||
cur_frame_state = 1
|
||||
elif frameState == FrameState.kFrameStateSil:
|
||||
cur_frame_state = 0
|
||||
else:
|
||||
return AudioChangeState.kChangeStateInvalid
|
||||
self.win_sum -= self.win_state[self.cur_win_pos]
|
||||
self.win_sum += cur_frame_state
|
||||
self.win_state[self.cur_win_pos] = cur_frame_state
|
||||
self.cur_win_pos = (self.cur_win_pos + 1) % self.win_size_frame
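# win_sum is a running count of speech frames in this circular window; the
# threshold checks below flip the state once enough frames agree.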
|
||||
|
||||
if (
|
||||
self.pre_frame_state == FrameState.kFrameStateSil
|
||||
and self.win_sum >= self.sil_to_speech_frmcnt_thres
|
||||
):
|
||||
self.pre_frame_state = FrameState.kFrameStateSpeech
|
||||
return AudioChangeState.kChangeStateSil2Speech
|
||||
|
||||
if (
|
||||
self.pre_frame_state == FrameState.kFrameStateSpeech
|
||||
and self.win_sum <= self.speech_to_sil_frmcnt_thres
|
||||
):
|
||||
self.pre_frame_state = FrameState.kFrameStateSil
|
||||
return AudioChangeState.kChangeStateSpeech2Sil
|
||||
|
||||
if self.pre_frame_state == FrameState.kFrameStateSil:
|
||||
return AudioChangeState.kChangeStateSil2Sil
|
||||
if self.pre_frame_state == FrameState.kFrameStateSpeech:
|
||||
return AudioChangeState.kChangeStateSpeech2Speech
|
||||
return AudioChangeState.kChangeStateInvalid
|
||||
|
||||
def FrameSizeMs(self) -> int:
|
||||
return int(self.frame_size_ms)
|
||||
|
||||
|
||||
class E2EVadModel:
|
||||
"""
|
||||
Author: Speech Lab of DAMO Academy, Alibaba Group
|
||||
Deep-FSMN for Large Vocabulary Continuous Speech Recognition
|
||||
https://arxiv.org/abs/1803.05030
|
||||
"""
|
||||
|
||||
def __init__(self, vad_post_args: Dict[str, Any]):
|
||||
super(E2EVadModel, self).__init__()
|
||||
self.vad_opts = VADXOptions(**vad_post_args)
|
||||
self.windows_detector = WindowDetector(
|
||||
self.vad_opts.window_size_ms,
|
||||
self.vad_opts.sil_to_speech_time_thres,
|
||||
self.vad_opts.speech_to_sil_time_thres,
|
||||
self.vad_opts.frame_in_ms,
|
||||
)
|
||||
# self.encoder = encoder
|
||||
# init variables
|
||||
self.is_final = False
|
||||
self.data_buf_start_frame = 0
|
||||
self.frm_cnt = 0
|
||||
self.latest_confirmed_speech_frame = 0
|
||||
self.lastest_confirmed_silence_frame = -1
|
||||
self.continous_silence_frame_count = 0
|
||||
self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
|
||||
self.confirmed_start_frame = -1
|
||||
self.confirmed_end_frame = -1
|
||||
self.number_end_time_detected = 0
|
||||
self.sil_frame = 0
|
||||
self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
|
||||
self.noise_average_decibel = -100.0
|
||||
self.pre_end_silence_detected = False
|
||||
self.next_seg = True
|
||||
|
||||
self.output_data_buf = []
|
||||
self.output_data_buf_offset = 0
|
||||
self.frame_probs = []
|
||||
self.max_end_sil_frame_cnt_thresh = (
|
||||
self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
|
||||
)
|
||||
self.speech_noise_thres = self.vad_opts.speech_noise_thres
|
||||
self.scores = None
|
||||
self.idx_pre_chunk = 0
|
||||
self.max_time_out = False
|
||||
self.decibel = []
|
||||
self.data_buf_size = 0
|
||||
self.data_buf_all_size = 0
|
||||
self.waveform = None
|
||||
self.ResetDetection()
|
||||
|
||||
def AllResetDetection(self):
|
||||
self.is_final = False
|
||||
self.data_buf_start_frame = 0
|
||||
self.frm_cnt = 0
|
||||
self.latest_confirmed_speech_frame = 0
|
||||
self.lastest_confirmed_silence_frame = -1
|
||||
self.continous_silence_frame_count = 0
|
||||
self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
|
||||
self.confirmed_start_frame = -1
|
||||
self.confirmed_end_frame = -1
|
||||
self.number_end_time_detected = 0
|
||||
self.sil_frame = 0
|
||||
self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
|
||||
self.noise_average_decibel = -100.0
|
||||
self.pre_end_silence_detected = False
|
||||
self.next_seg = True
|
||||
|
||||
self.output_data_buf = []
|
||||
self.output_data_buf_offset = 0
|
||||
self.frame_probs = []
|
||||
self.max_end_sil_frame_cnt_thresh = (
|
||||
self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
|
||||
)
|
||||
self.speech_noise_thres = self.vad_opts.speech_noise_thres
|
||||
self.scores = None
|
||||
self.idx_pre_chunk = 0
|
||||
self.max_time_out = False
|
||||
self.decibel = []
|
||||
self.data_buf_size = 0
|
||||
self.data_buf_all_size = 0
|
||||
self.waveform = None
|
||||
self.ResetDetection()
|
||||
|
||||
def ResetDetection(self):
|
||||
self.continous_silence_frame_count = 0
|
||||
self.latest_confirmed_speech_frame = 0
|
||||
self.lastest_confirmed_silence_frame = -1
|
||||
self.confirmed_start_frame = -1
|
||||
self.confirmed_end_frame = -1
|
||||
self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
|
||||
self.windows_detector.Reset()
|
||||
self.sil_frame = 0
|
||||
self.frame_probs = []
|
||||
|
||||
def ComputeDecibel(self) -> None:
|
||||
frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
|
||||
frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
|
||||
if self.data_buf_all_size == 0:
|
||||
self.data_buf_all_size = len(self.waveform[0])
|
||||
self.data_buf_size = self.data_buf_all_size
|
||||
else:
|
||||
self.data_buf_all_size += len(self.waveform[0])
|
||||
for offset in range(
|
||||
0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length
|
||||
):
|
||||
self.decibel.append(
|
||||
10
|
||||
* math.log10(
|
||||
np.square((self.waveform[0][offset : offset + frame_sample_length])).sum()
|
||||
+ 0.000001
|
||||
)
|
||||
)
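# Each decibel entry is 10 * log10(frame energy) over a frame_length_ms
# window advanced by frame_in_ms; the 1e-6 term keeps log10 finite on
# digital silence.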
|
||||
|
||||
def ComputeScores(self, scores: np.ndarray) -> None:
|
||||
# scores = self.encoder(feats, in_cache) # return B * T * D
|
||||
self.vad_opts.nn_eval_block_size = scores.shape[1]
|
||||
self.frm_cnt += scores.shape[1] # count total frames
|
||||
self.scores = scores
|
||||
|
||||
def PopDataBufTillFrame(self, frame_idx: int) -> None: # need check again
|
||||
while self.data_buf_start_frame < frame_idx:
|
||||
if self.data_buf_size >= int(
|
||||
self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000
|
||||
):
|
||||
self.data_buf_start_frame += 1
|
||||
self.data_buf_size = self.data_buf_all_size - self.data_buf_start_frame * int(
|
||||
self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000
|
||||
)
|
||||
|
||||
def PopDataToOutputBuf(
|
||||
self,
|
||||
start_frm: int,
|
||||
frm_cnt: int,
|
||||
first_frm_is_start_point: bool,
|
||||
last_frm_is_end_point: bool,
|
||||
end_point_is_sent_end: bool,
|
||||
) -> None:
|
||||
self.PopDataBufTillFrame(start_frm)
|
||||
expected_sample_number = int(
|
||||
frm_cnt * self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000
|
||||
)
|
||||
if last_frm_is_end_point:
|
||||
extra_sample = max(
|
||||
0,
|
||||
int(
|
||||
self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000
|
||||
- self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000
|
||||
),
|
||||
)
|
||||
expected_sample_number += int(extra_sample)
|
||||
if end_point_is_sent_end:
|
||||
expected_sample_number = max(expected_sample_number, self.data_buf_size)
|
||||
if self.data_buf_size < expected_sample_number:
|
||||
print("error in calling pop data_buf\n")
|
||||
|
||||
if len(self.output_data_buf) == 0 or first_frm_is_start_point:
|
||||
self.output_data_buf.append(E2EVadSpeechBufWithDoa())
|
||||
self.output_data_buf[-1].Reset()
|
||||
self.output_data_buf[-1].start_ms = start_frm * self.vad_opts.frame_in_ms
|
||||
self.output_data_buf[-1].end_ms = self.output_data_buf[-1].start_ms
|
||||
self.output_data_buf[-1].doa = 0
|
||||
cur_seg = self.output_data_buf[-1]
|
||||
if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
|
||||
print("warning\n")
|
||||
out_pos = len(cur_seg.buffer)  # cur_seg.buffer is not modified here for now
|
||||
data_to_pop = 0
|
||||
if end_point_is_sent_end:
|
||||
data_to_pop = expected_sample_number
|
||||
else:
|
||||
data_to_pop = int(
|
||||
frm_cnt * self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000
|
||||
)
|
||||
if data_to_pop > self.data_buf_size:
|
||||
print("VAD data_to_pop is bigger than self.data_buf_size!!!\n")
|
||||
data_to_pop = self.data_buf_size
|
||||
expected_sample_number = self.data_buf_size
|
||||
|
||||
cur_seg.doa = 0
|
||||
for sample_cpy_out in range(0, data_to_pop):
|
||||
# cur_seg.buffer[out_pos ++] = data_buf_.back();
|
||||
out_pos += 1
|
||||
for sample_cpy_out in range(data_to_pop, expected_sample_number):
|
||||
# cur_seg.buffer[out_pos++] = data_buf_.back()
|
||||
out_pos += 1
|
||||
if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
|
||||
print("Something wrong with the VAD algorithm\n")
|
||||
self.data_buf_start_frame += frm_cnt
|
||||
cur_seg.end_ms = (start_frm + frm_cnt) * self.vad_opts.frame_in_ms
|
||||
if first_frm_is_start_point:
|
||||
cur_seg.contain_seg_start_point = True
|
||||
if last_frm_is_end_point:
|
||||
cur_seg.contain_seg_end_point = True
|
||||
|
||||
def OnSilenceDetected(self, valid_frame: int):
|
||||
self.lastest_confirmed_silence_frame = valid_frame
|
||||
if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
|
||||
self.PopDataBufTillFrame(valid_frame)
|
||||
# silence_detected_callback_
|
||||
# pass
|
||||
|
||||
def OnVoiceDetected(self, valid_frame: int) -> None:
|
||||
self.latest_confirmed_speech_frame = valid_frame
|
||||
self.PopDataToOutputBuf(valid_frame, 1, False, False, False)
|
||||
|
||||
def OnVoiceStart(self, start_frame: int, fake_result: bool = False) -> None:
|
||||
if self.vad_opts.do_start_point_detection:
|
||||
pass
|
||||
if self.confirmed_start_frame != -1:
|
||||
print("not reset vad properly\n")
|
||||
else:
|
||||
self.confirmed_start_frame = start_frame
|
||||
|
||||
if (
|
||||
not fake_result
|
||||
and self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected
|
||||
):
|
||||
self.PopDataToOutputBuf(self.confirmed_start_frame, 1, True, False, False)
|
||||
|
||||
def OnVoiceEnd(self, end_frame: int, fake_result: bool, is_last_frame: bool) -> None:
|
||||
for t in range(self.latest_confirmed_speech_frame + 1, end_frame):
|
||||
self.OnVoiceDetected(t)
|
||||
if self.vad_opts.do_end_point_detection:
|
||||
pass
|
||||
if self.confirmed_end_frame != -1:
|
||||
print("not reset vad properly\n")
|
||||
else:
|
||||
self.confirmed_end_frame = end_frame
|
||||
if not fake_result:
|
||||
self.sil_frame = 0
|
||||
self.PopDataToOutputBuf(self.confirmed_end_frame, 1, False, True, is_last_frame)
|
||||
self.number_end_time_detected += 1
|
||||
|
||||
def MaybeOnVoiceEndIfLastFrame(self, is_final_frame: bool, cur_frm_idx: int) -> None:
|
||||
if is_final_frame:
|
||||
self.OnVoiceEnd(cur_frm_idx, False, True)
|
||||
self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
|
||||
|
||||
def GetLatency(self) -> int:
|
||||
return int(self.LatencyFrmNumAtStartPoint() * self.vad_opts.frame_in_ms)
|
||||
|
||||
def LatencyFrmNumAtStartPoint(self) -> int:
|
||||
vad_latency = self.windows_detector.GetWinSize()
|
||||
if self.vad_opts.do_extend:
|
||||
vad_latency += int(self.vad_opts.lookback_time_start_point / self.vad_opts.frame_in_ms)
|
||||
return vad_latency
|
||||
|
||||
def GetFrameState(self, t: int) -> FrameState:
|
||||
frame_state = FrameState.kFrameStateInvalid
|
||||
cur_decibel = self.decibel[t]
|
||||
cur_snr = cur_decibel - self.noise_average_decibel
|
||||
# for each frame, calc log posterior probability of each state
|
||||
if cur_decibel < self.vad_opts.decibel_thres:
|
||||
frame_state = FrameState.kFrameStateSil
|
||||
self.DetectOneFrame(frame_state, t, False)
|
||||
return frame_state
|
||||
|
||||
sum_score = 0.0
|
||||
noise_prob = 0.0
|
||||
assert len(self.sil_pdf_ids) == self.vad_opts.silence_pdf_num
|
||||
if len(self.sil_pdf_ids) > 0:
|
||||
assert len(self.scores) == 1  # only batch_size = 1 is supported
|
||||
sil_pdf_scores = [
|
||||
self.scores[0][t - self.idx_pre_chunk][sil_pdf_id]
|
||||
for sil_pdf_id in self.sil_pdf_ids
|
||||
]
|
||||
sum_score = sum(sil_pdf_scores)
|
||||
noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
|
||||
total_score = 1.0
|
||||
sum_score = total_score - sum_score
|
||||
speech_prob = math.log(sum_score)
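# speech_prob is the log of the non-silence mass (1 - silence score); it is
# compared against noise_prob after exponentiation below.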
|
||||
if self.vad_opts.output_frame_probs:
|
||||
frame_prob = E2EVadFrameProb()
|
||||
frame_prob.noise_prob = noise_prob
|
||||
frame_prob.speech_prob = speech_prob
|
||||
frame_prob.score = sum_score
|
||||
frame_prob.frame_id = t
|
||||
self.frame_probs.append(frame_prob)
|
||||
if math.exp(speech_prob) >= math.exp(noise_prob) + self.speech_noise_thres:
|
||||
if cur_snr >= self.vad_opts.snr_thres and cur_decibel >= self.vad_opts.decibel_thres:
|
||||
frame_state = FrameState.kFrameStateSpeech
|
||||
else:
|
||||
frame_state = FrameState.kFrameStateSil
|
||||
else:
|
||||
frame_state = FrameState.kFrameStateSil
|
||||
if self.noise_average_decibel < -99.9:
|
||||
self.noise_average_decibel = cur_decibel
|
||||
else:
|
||||
self.noise_average_decibel = (
|
||||
cur_decibel
|
||||
+ self.noise_average_decibel * (self.vad_opts.noise_frame_num_used_for_snr - 1)
|
||||
) / self.vad_opts.noise_frame_num_used_for_snr
|
||||
|
||||
return frame_state
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
score: np.ndarray,
|
||||
waveform: np.ndarray,
|
||||
is_final: bool = False,
|
||||
max_end_sil: int = 800,
|
||||
online: bool = False,
|
||||
):
|
||||
self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
|
||||
self.waveform = waveform # compute decibel for each frame
|
||||
self.ComputeDecibel()
|
||||
self.ComputeScores(score)
|
||||
if not is_final:
|
||||
self.DetectCommonFrames()
|
||||
else:
|
||||
self.DetectLastFrames()
|
||||
segments = []
|
||||
for batch_num in range(0, score.shape[0]): # only support batch_size = 1 now
|
||||
segment_batch = []
|
||||
if len(self.output_data_buf) > 0:
|
||||
for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
|
||||
if online:
|
||||
if not self.output_data_buf[i].contain_seg_start_point:
|
||||
continue
|
||||
if not self.next_seg and not self.output_data_buf[i].contain_seg_end_point:
|
||||
continue
|
||||
start_ms = self.output_data_buf[i].start_ms if self.next_seg else -1
|
||||
if self.output_data_buf[i].contain_seg_end_point:
|
||||
end_ms = self.output_data_buf[i].end_ms
|
||||
self.next_seg = True
|
||||
self.output_data_buf_offset += 1
|
||||
else:
|
||||
end_ms = -1
|
||||
self.next_seg = False
|
||||
else:
|
||||
if not is_final and (
|
||||
not self.output_data_buf[i].contain_seg_start_point
|
||||
or not self.output_data_buf[i].contain_seg_end_point
|
||||
):
|
||||
continue
|
||||
start_ms = self.output_data_buf[i].start_ms
|
||||
end_ms = self.output_data_buf[i].end_ms
|
||||
self.output_data_buf_offset += 1
|
||||
segment = [start_ms, end_ms]
|
||||
segment_batch.append(segment)
|
||||
|
||||
if segment_batch:
|
||||
segments.append(segment_batch)
|
||||
if is_final:
|
||||
# reset class variables and clear the dict for the next query
|
||||
self.AllResetDetection()
|
||||
return segments
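# segments holds one list per batch item of [start_ms, end_ms] pairs; in
# online mode a boundary that is not yet confirmed is reported as -1.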
|
||||
|
||||
def DetectCommonFrames(self) -> int:
|
||||
if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
|
||||
return 0
|
||||
for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
|
||||
frame_state = FrameState.kFrameStateInvalid
|
||||
frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
|
||||
self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
|
||||
self.idx_pre_chunk += self.scores.shape[1]
|
||||
return 0
|
||||
|
||||
def DetectLastFrames(self) -> int:
|
||||
if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
|
||||
return 0
|
||||
for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
|
||||
frame_state = FrameState.kFrameStateInvalid
|
||||
frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
|
||||
if i != 0:
|
||||
self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
|
||||
else:
|
||||
self.DetectOneFrame(frame_state, self.frm_cnt - 1, True)
|
||||
|
||||
return 0
|
||||
|
||||
def DetectOneFrame(
|
||||
self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool
|
||||
) -> None:
|
||||
tmp_cur_frm_state = FrameState.kFrameStateInvalid
|
||||
if cur_frm_state == FrameState.kFrameStateSpeech:
|
||||
if math.fabs(1.0) > self.vad_opts.fe_prior_thres:
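# note: math.fabs(1.0) is a constant, so with the default fe_prior_thres
# (1e-4) this condition is always true and speech frames pass through.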
|
||||
tmp_cur_frm_state = FrameState.kFrameStateSpeech
|
||||
else:
|
||||
tmp_cur_frm_state = FrameState.kFrameStateSil
|
||||
elif cur_frm_state == FrameState.kFrameStateSil:
|
||||
tmp_cur_frm_state = FrameState.kFrameStateSil
|
||||
state_change = self.windows_detector.DetectOneFrame(tmp_cur_frm_state, cur_frm_idx)
|
||||
frm_shift_in_ms = self.vad_opts.frame_in_ms
|
||||
if AudioChangeState.kChangeStateSil2Speech == state_change:
|
||||
silence_frame_count = self.continous_silence_frame_count
|
||||
self.continous_silence_frame_count = 0
|
||||
self.pre_end_silence_detected = False
|
||||
start_frame = 0
|
||||
if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
|
||||
start_frame = max(
|
||||
self.data_buf_start_frame, cur_frm_idx - self.LatencyFrmNumAtStartPoint()
|
||||
)
|
||||
self.OnVoiceStart(start_frame)
|
||||
self.vad_state_machine = VadStateMachine.kVadInStateInSpeechSegment
|
||||
for t in range(start_frame + 1, cur_frm_idx + 1):
|
||||
self.OnVoiceDetected(t)
|
||||
elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
|
||||
for t in range(self.latest_confirmed_speech_frame + 1, cur_frm_idx):
|
||||
self.OnVoiceDetected(t)
|
||||
if (
|
||||
cur_frm_idx - self.confirmed_start_frame + 1
|
||||
> self.vad_opts.max_single_segment_time / frm_shift_in_ms
|
||||
):
|
||||
self.OnVoiceEnd(cur_frm_idx, False, False)
|
||||
self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
|
||||
elif not is_final_frame:
|
||||
self.OnVoiceDetected(cur_frm_idx)
|
||||
else:
|
||||
self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
|
||||
else:
|
||||
pass
|
||||
elif AudioChangeState.kChangeStateSpeech2Sil == state_change:
|
||||
self.continous_silence_frame_count = 0
|
||||
if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
|
||||
pass
|
||||
elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
|
||||
if (
|
||||
cur_frm_idx - self.confirmed_start_frame + 1
|
||||
> self.vad_opts.max_single_segment_time / frm_shift_in_ms
|
||||
):
|
||||
self.OnVoiceEnd(cur_frm_idx, False, False)
|
||||
self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
|
||||
elif not is_final_frame:
|
||||
self.OnVoiceDetected(cur_frm_idx)
|
||||
else:
|
||||
self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
|
||||
else:
|
||||
pass
|
||||
elif AudioChangeState.kChangeStateSpeech2Speech == state_change:
|
||||
self.continous_silence_frame_count = 0
|
||||
if self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
|
||||
if (
|
||||
cur_frm_idx - self.confirmed_start_frame + 1
|
||||
> self.vad_opts.max_single_segment_time / frm_shift_in_ms
|
||||
):
|
||||
self.max_time_out = True
|
||||
self.OnVoiceEnd(cur_frm_idx, False, False)
|
||||
self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
|
||||
elif not is_final_frame:
|
||||
self.OnVoiceDetected(cur_frm_idx)
|
||||
else:
|
||||
self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
|
||||
else:
|
||||
pass
|
||||
elif AudioChangeState.kChangeStateSil2Sil == state_change:
|
||||
self.continous_silence_frame_count += 1
|
||||
if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
|
||||
# silence timeout, return zero length decision
|
||||
if (
|
||||
(self.vad_opts.detect_mode == VadDetectMode.kVadSingleUtteranceDetectMode.value)
|
||||
and (
|
||||
self.continous_silence_frame_count * frm_shift_in_ms
|
||||
> self.vad_opts.max_start_silence_time
|
||||
)
|
||||
) or (is_final_frame and self.number_end_time_detected == 0):
|
||||
for t in range(self.lastest_confirmed_silence_frame + 1, cur_frm_idx):
|
||||
self.OnSilenceDetected(t)
|
||||
self.OnVoiceStart(0, True)
|
||||
self.OnVoiceEnd(0, True, False)
|
||||
self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
|
||||
else:
|
||||
if cur_frm_idx >= self.LatencyFrmNumAtStartPoint():
|
||||
self.OnSilenceDetected(cur_frm_idx - self.LatencyFrmNumAtStartPoint())
|
||||
elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
|
||||
if (
|
||||
self.continous_silence_frame_count * frm_shift_in_ms
|
||||
>= self.max_end_sil_frame_cnt_thresh
|
||||
):
|
||||
lookback_frame = int(self.max_end_sil_frame_cnt_thresh / frm_shift_in_ms)
|
||||
if self.vad_opts.do_extend:
|
||||
lookback_frame -= int(
|
||||
self.vad_opts.lookahead_time_end_point / frm_shift_in_ms
|
||||
)
|
||||
lookback_frame -= 1
|
||||
lookback_frame = max(0, lookback_frame)
|
||||
self.OnVoiceEnd(cur_frm_idx - lookback_frame, False, False)
|
||||
self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
|
||||
elif (
|
||||
cur_frm_idx - self.confirmed_start_frame + 1
|
||||
> self.vad_opts.max_single_segment_time / frm_shift_in_ms
|
||||
):
|
||||
self.OnVoiceEnd(cur_frm_idx, False, False)
|
||||
self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
|
||||
elif self.vad_opts.do_extend and not is_final_frame:
|
||||
if self.continous_silence_frame_count <= int(
|
||||
self.vad_opts.lookahead_time_end_point / frm_shift_in_ms
|
||||
):
|
||||
self.OnVoiceDetected(cur_frm_idx)
|
||||
else:
|
||||
self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
|
||||
else:
|
||||
pass
|
||||
|
||||
if (
|
||||
self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected
|
||||
and self.vad_opts.detect_mode == VadDetectMode.kVadMutipleUtteranceDetectMode.value
|
||||
):
|
||||
self.ResetDetection()
|
||||
433
modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py
vendored
Normal file
@@ -0,0 +1,433 @@
|
||||
# -*- encoding: utf-8 -*-
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
|
||||
import copy
|
||||
|
||||
import numpy as np
|
||||
import kaldi_native_fbank as knf
|
||||
|
||||
root_dir = Path(__file__).resolve().parent
|
||||
|
||||
logger_initialized = {}
|
||||
|
||||
|
||||
class WavFrontend:
|
||||
"""Conventional frontend structure for ASR."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
cmvn_file: str = None,
|
||||
fs: int = 16000,
|
||||
window: str = "hamming",
|
||||
n_mels: int = 80,
|
||||
frame_length: int = 25,
|
||||
frame_shift: int = 10,
|
||||
lfr_m: int = 1,
|
||||
lfr_n: int = 1,
|
||||
dither: float = 1.0,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
|
||||
opts = knf.FbankOptions()
|
||||
opts.frame_opts.samp_freq = fs
|
||||
opts.frame_opts.dither = dither
|
||||
opts.frame_opts.window_type = window
|
||||
opts.frame_opts.frame_shift_ms = float(frame_shift)
|
||||
opts.frame_opts.frame_length_ms = float(frame_length)
|
||||
opts.mel_opts.num_bins = n_mels
|
||||
opts.energy_floor = 0
|
||||
opts.frame_opts.snip_edges = True
|
||||
opts.mel_opts.debug_mel = False
|
||||
self.opts = opts
|
||||
|
||||
self.lfr_m = lfr_m
|
||||
self.lfr_n = lfr_n
|
||||
self.cmvn_file = cmvn_file
|
||||
|
||||
if self.cmvn_file:
|
||||
self.cmvn = self.load_cmvn()
|
||||
self.fbank_fn = None
|
||||
self.fbank_beg_idx = 0
|
||||
self.reset_status()
|
||||
|
||||
def fbank(self, waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
|
||||
waveform = waveform * (1 << 15)
|
||||
self.fbank_fn = knf.OnlineFbank(self.opts)
|
||||
self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
|
||||
frames = self.fbank_fn.num_frames_ready
|
||||
mat = np.empty([frames, self.opts.mel_opts.num_bins])
|
||||
for i in range(frames):
|
||||
mat[i, :] = self.fbank_fn.get_frame(i)
|
||||
feat = mat.astype(np.float32)
|
||||
feat_len = np.array(mat.shape[0]).astype(np.int32)
|
||||
return feat, feat_len
|
||||
|
||||
def fbank_online(self, waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
|
||||
waveform = waveform * (1 << 15)
|
||||
# self.fbank_fn = knf.OnlineFbank(self.opts)
|
||||
self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
|
||||
frames = self.fbank_fn.num_frames_ready
|
||||
mat = np.empty([frames, self.opts.mel_opts.num_bins])
|
||||
for i in range(self.fbank_beg_idx, frames):
|
||||
mat[i, :] = self.fbank_fn.get_frame(i)
|
||||
# self.fbank_beg_idx += (frames-self.fbank_beg_idx)
|
||||
feat = mat.astype(np.float32)
|
||||
feat_len = np.array(mat.shape[0]).astype(np.int32)
|
||||
return feat, feat_len
|
||||
|
||||
def reset_status(self):
|
||||
self.fbank_fn = knf.OnlineFbank(self.opts)
|
||||
self.fbank_beg_idx = 0
|
||||
|
||||
def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
|
||||
if self.lfr_m != 1 or self.lfr_n != 1:
|
||||
feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
|
||||
|
||||
if self.cmvn_file:
|
||||
feat = self.apply_cmvn(feat)
|
||||
|
||||
feat_len = np.array(feat.shape[0]).astype(np.int32)
|
||||
return feat, feat_len
|
||||
|
||||
@staticmethod
|
||||
def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
|
||||
LFR_inputs = []
|
||||
|
||||
T = inputs.shape[0]
|
||||
T_lfr = int(np.ceil(T / lfr_n))
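# Low frame rate (LFR): stack lfr_m consecutive fbank frames into one
# feature and advance by lfr_n frames; e.g. lfr_m=7, lfr_n=6 turns 80-dim
# frames into 560-dim features at roughly a 60 ms hop.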
|
||||
left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
|
||||
inputs = np.vstack((left_padding, inputs))
|
||||
T = T + (lfr_m - 1) // 2
|
||||
for i in range(T_lfr):
|
||||
if lfr_m <= T - i * lfr_n:
|
||||
LFR_inputs.append((inputs[i * lfr_n : i * lfr_n + lfr_m]).reshape(1, -1))
|
||||
else:
|
||||
# process last LFR frame
|
||||
num_padding = lfr_m - (T - i * lfr_n)
|
||||
frame = inputs[i * lfr_n :].reshape(-1)
|
||||
for _ in range(num_padding):
|
||||
frame = np.hstack((frame, inputs[-1]))
|
||||
|
||||
LFR_inputs.append(frame)
|
||||
LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
|
||||
return LFR_outputs
|
||||
|
||||
def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
|
||||
"""
|
||||
Apply CMVN with mvn data
|
||||
"""
|
||||
frame, dim = inputs.shape
|
||||
means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
|
||||
vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
|
||||
inputs = (inputs + means) * vars
|
||||
return inputs
|
||||
|
||||
def load_cmvn(
|
||||
self,
|
||||
) -> np.ndarray:
|
||||
with open(self.cmvn_file, "r", encoding="utf-8") as f:
|
||||
lines = f.readlines()
|
||||
|
||||
means_list = []
|
||||
vars_list = []
|
||||
for i in range(len(lines)):
|
||||
line_item = lines[i].split()
|
||||
if line_item[0] == "<AddShift>":
|
||||
line_item = lines[i + 1].split()
|
||||
if line_item[0] == "<LearnRateCoef>":
|
||||
add_shift_line = line_item[3 : (len(line_item) - 1)]
|
||||
means_list = list(add_shift_line)
|
||||
continue
|
||||
elif line_item[0] == "<Rescale>":
|
||||
line_item = lines[i + 1].split()
|
||||
if line_item[0] == "<LearnRateCoef>":
|
||||
rescale_line = line_item[3 : (len(line_item) - 1)]
|
||||
vars_list = list(rescale_line)
|
||||
continue
|
||||
|
||||
means = np.array(means_list).astype(np.float64)
|
||||
vars = np.array(vars_list).astype(np.float64)
|
||||
cmvn = np.array([means, vars])
|
||||
return cmvn
|
||||
|
||||
|
||||
class WavFrontendOnline(WavFrontend):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
# self.fbank_fn = knf.OnlineFbank(self.opts)
|
||||
# add variables
|
||||
self.frame_sample_length = int(
|
||||
self.opts.frame_opts.frame_length_ms * self.opts.frame_opts.samp_freq / 1000
|
||||
)
|
||||
self.frame_shift_sample_length = int(
|
||||
self.opts.frame_opts.frame_shift_ms * self.opts.frame_opts.samp_freq / 1000
|
||||
)
|
||||
self.waveform = None
|
||||
self.reserve_waveforms = None
|
||||
self.input_cache = None
|
||||
self.lfr_splice_cache = []
|
||||
|
||||
@staticmethod
|
||||
# `inputs` already has the cache concatenated in front
|
||||
def apply_lfr(
|
||||
inputs: np.ndarray, lfr_m: int, lfr_n: int, is_final: bool = False
|
||||
) -> Tuple[np.ndarray, np.ndarray, int]:
|
||||
"""
|
||||
Apply lfr with data
|
||||
"""
|
||||
|
||||
LFR_inputs = []
|
||||
T = inputs.shape[0] # include the right context
|
||||
T_lfr = int(
|
||||
np.ceil((T - (lfr_m - 1) // 2) / lfr_n)
|
||||
) # minus the right context: (lfr_m - 1) // 2
|
||||
splice_idx = T_lfr
|
||||
for i in range(T_lfr):
|
||||
if lfr_m <= T - i * lfr_n:
|
||||
LFR_inputs.append((inputs[i * lfr_n : i * lfr_n + lfr_m]).reshape(1, -1))
|
||||
else: # process last LFR frame
|
||||
if is_final:
|
||||
num_padding = lfr_m - (T - i * lfr_n)
|
||||
frame = (inputs[i * lfr_n :]).reshape(-1)
|
||||
for _ in range(num_padding):
|
||||
frame = np.hstack((frame, inputs[-1]))
|
||||
LFR_inputs.append(frame)
|
||||
else:
|
||||
# update splice_idx and break the circle
|
||||
splice_idx = i
|
||||
break
|
||||
splice_idx = min(T - 1, splice_idx * lfr_n)
|
||||
lfr_splice_cache = inputs[splice_idx:, :]
|
||||
LFR_outputs = np.vstack(LFR_inputs)
|
||||
return LFR_outputs.astype(np.float32), lfr_splice_cache, splice_idx
|
||||
|
||||
@staticmethod
|
||||
def compute_frame_num(
|
||||
sample_length: int, frame_sample_length: int, frame_shift_sample_length: int
|
||||
) -> int:
|
||||
frame_num = int((sample_length - frame_sample_length) / frame_shift_sample_length + 1)
|
||||
return frame_num if frame_num >= 1 and sample_length >= frame_sample_length else 0
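# e.g. at 16 kHz with a 25 ms window and 10 ms shift, 4000 samples give
# int((4000 - 400) / 160 + 1) = 23 full frames.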
|
||||
|
||||
def fbank(
|
||||
self, input: np.ndarray, input_lengths: np.ndarray
|
||||
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
|
||||
self.fbank_fn = knf.OnlineFbank(self.opts)
|
||||
batch_size = input.shape[0]
|
||||
if self.input_cache is None:
|
||||
self.input_cache = np.empty((batch_size, 0), dtype=np.float32)
|
||||
input = np.concatenate((self.input_cache, input), axis=1)
|
||||
frame_num = self.compute_frame_num(
|
||||
input.shape[-1], self.frame_sample_length, self.frame_shift_sample_length
|
||||
)
|
||||
# update self.in_cache
|
||||
self.input_cache = input[
|
||||
:, -(input.shape[-1] - frame_num * self.frame_shift_sample_length) :
|
||||
]
|
||||
waveforms = np.empty(0, dtype=np.float32)
|
||||
feats_pad = np.empty(0, dtype=np.float32)
|
||||
feats_lens = np.empty(0, dtype=np.int32)
|
||||
if frame_num:
|
||||
waveforms = []
|
||||
feats = []
|
||||
feats_lens = []
|
||||
for i in range(batch_size):
|
||||
waveform = input[i]
|
||||
waveforms.append(
|
||||
waveform[
|
||||
: (
|
||||
(frame_num - 1) * self.frame_shift_sample_length
|
||||
+ self.frame_sample_length
|
||||
)
|
||||
]
|
||||
)
|
||||
waveform = waveform * (1 << 15)
|
||||
|
||||
self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
|
||||
frames = self.fbank_fn.num_frames_ready
|
||||
mat = np.empty([frames, self.opts.mel_opts.num_bins])
|
||||
for i in range(frames):
|
||||
mat[i, :] = self.fbank_fn.get_frame(i)
|
||||
feat = mat.astype(np.float32)
|
||||
feat_len = np.array(mat.shape[0]).astype(np.int32)
|
||||
feats.append(feat)
|
||||
feats_lens.append(feat_len)
|
||||
|
||||
waveforms = np.stack(waveforms)
|
||||
feats_lens = np.array(feats_lens)
|
||||
feats_pad = np.array(feats)
|
||||
self.fbanks = feats_pad
|
||||
self.fbanks_lens = copy.deepcopy(feats_lens)
|
||||
return waveforms, feats_pad, feats_lens
|
||||
|
||||
def get_fbank(self) -> Tuple[np.ndarray, np.ndarray]:
|
||||
return self.fbanks, self.fbanks_lens
|
||||
|
||||
def lfr_cmvn(
|
||||
self, input: np.ndarray, input_lengths: np.ndarray, is_final: bool = False
|
||||
) -> Tuple[np.ndarray, np.ndarray, List[int]]:
|
||||
batch_size = input.shape[0]
|
||||
feats = []
|
||||
feats_lens = []
|
||||
lfr_splice_frame_idxs = []
|
||||
for i in range(batch_size):
|
||||
mat = input[i, : input_lengths[i], :]
|
||||
lfr_splice_frame_idx = -1
|
||||
if self.lfr_m != 1 or self.lfr_n != 1:
|
||||
# update self.lfr_splice_cache in self.apply_lfr
|
||||
mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(
|
||||
mat, self.lfr_m, self.lfr_n, is_final
|
||||
)
|
||||
if self.cmvn_file is not None:
|
||||
mat = self.apply_cmvn(mat)
|
||||
feat_length = mat.shape[0]
|
||||
feats.append(mat)
|
||||
feats_lens.append(feat_length)
|
||||
lfr_splice_frame_idxs.append(lfr_splice_frame_idx)
|
||||
|
||||
feats_lens = np.array(feats_lens)
|
||||
feats_pad = np.array(feats)
|
||||
return feats_pad, feats_lens, lfr_splice_frame_idxs
|
||||
|
||||
def extract_fbank(
|
||||
self, input: np.ndarray, input_lengths: np.ndarray, is_final: bool = False
|
||||
) -> Tuple[np.ndarray, np.ndarray]:
|
||||
batch_size = input.shape[0]
|
||||
assert (
|
||||
batch_size == 1
|
||||
), "we support to extract feature online only when the batch size is equal to 1 now"
|
||||
waveforms, feats, feats_lengths = self.fbank(input, input_lengths) # input shape: B T D
|
||||
if feats.shape[0]:
|
||||
self.waveforms = (
|
||||
waveforms
|
||||
if self.reserve_waveforms is None
|
||||
else np.concatenate((self.reserve_waveforms, waveforms), axis=1)
|
||||
)
|
||||
if not self.lfr_splice_cache:
|
||||
for i in range(batch_size):
|
||||
self.lfr_splice_cache.append(
|
||||
np.expand_dims(feats[i][0, :], axis=0).repeat((self.lfr_m - 1) // 2, axis=0)
|
||||
)
|
||||
|
||||
if feats_lengths[0] + self.lfr_splice_cache[0].shape[0] >= self.lfr_m:
|
||||
lfr_splice_cache_np = np.stack(self.lfr_splice_cache) # B T D
|
||||
feats = np.concatenate((lfr_splice_cache_np, feats), axis=1)
|
||||
feats_lengths += lfr_splice_cache_np[0].shape[0]
|
||||
frame_from_waveforms = int(
|
||||
(self.waveforms.shape[1] - self.frame_sample_length)
|
||||
/ self.frame_shift_sample_length
|
||||
+ 1
|
||||
)
|
||||
minus_frame = (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0
|
||||
feats, feats_lengths, lfr_splice_frame_idxs = self.lfr_cmvn(
|
||||
feats, feats_lengths, is_final
|
||||
)
|
||||
if self.lfr_m == 1:
|
||||
self.reserve_waveforms = None
|
||||
else:
|
||||
reserve_frame_idx = lfr_splice_frame_idxs[0] - minus_frame
|
||||
# print('reserve_frame_idx: ' + str(reserve_frame_idx))
|
||||
# print('frame_frame: ' + str(frame_from_waveforms))
|
||||
self.reserve_waveforms = self.waveforms[
|
||||
:,
|
||||
reserve_frame_idx
|
||||
* self.frame_shift_sample_length : frame_from_waveforms
|
||||
* self.frame_shift_sample_length,
|
||||
]
|
||||
sample_length = (
|
||||
frame_from_waveforms - 1
|
||||
) * self.frame_shift_sample_length + self.frame_sample_length
|
||||
self.waveforms = self.waveforms[:, :sample_length]
|
||||
else:
|
||||
# update self.reserve_waveforms and self.lfr_splice_cache
|
||||
self.reserve_waveforms = self.waveforms[
|
||||
:, : -(self.frame_sample_length - self.frame_shift_sample_length)
|
||||
]
|
||||
for i in range(batch_size):
|
||||
self.lfr_splice_cache[i] = np.concatenate(
|
||||
(self.lfr_splice_cache[i], feats[i]), axis=0
|
||||
)
|
||||
return np.empty(0, dtype=np.float32), feats_lengths
|
||||
else:
|
||||
if is_final:
|
||||
self.waveforms = (
|
||||
waveforms if self.reserve_waveforms is None else self.reserve_waveforms
|
||||
)
|
||||
feats = np.stack(self.lfr_splice_cache)
|
||||
feats_lengths = np.zeros(batch_size, dtype=np.int32) + feats.shape[1]
|
||||
feats, feats_lengths, _ = self.lfr_cmvn(feats, feats_lengths, is_final)
|
||||
if is_final:
|
||||
self.cache_reset()
|
||||
return feats, feats_lengths
|
||||
|
||||
def get_waveforms(self):
|
||||
return self.waveforms
|
||||
|
||||
def cache_reset(self):
|
||||
self.fbank_fn = knf.OnlineFbank(self.opts)
|
||||
self.reserve_waveforms = None
|
||||
self.input_cache = None
|
||||
self.lfr_splice_cache = []
|
||||
|
||||
|
||||
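# Usage sketch for the streaming front end (illustrative only: the constructor
# keyword arguments come from config["frontend_conf"], `wav` is assumed to be a
# 16 kHz mono float32 waveform, and the 600 ms chunk size is an arbitrary choice):
#
#     frontend = WavFrontendOnline(cmvn_file="am.mvn", **config["frontend_conf"])
#     chunk_samples = 9600
#     for start in range(0, len(wav), chunk_samples):
#         chunk = wav[None, start : start + chunk_samples]
#         is_final = start + chunk_samples >= len(wav)
#         lens = np.array([chunk.shape[1]], dtype=np.int32)
#         feats, feats_len = frontend.extract_fbank(chunk, lens, is_final=is_final)
#         # feats stays empty until enough frames are buffered to fill one LFR window
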
def load_bytes(input):
    middle_data = np.frombuffer(input, dtype=np.int16)
    middle_data = np.asarray(middle_data)
    if middle_data.dtype.kind not in "iu":
        raise TypeError("'middle_data' must be an array of integers")
    dtype = np.dtype("float32")
    if dtype.kind != "f":
        raise TypeError("'dtype' must be a floating point type")

    # scale 16-bit PCM into [-1.0, 1.0)
    i = np.iinfo(middle_data.dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
    return array

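# Example (a sketch; `pcm` stands for raw 16-bit little-endian PCM bytes, e.g.
# the data chunk of a WAV file without its header):
#
#     samples = load_bytes(pcm)   # float32 ndarray scaled into [-1.0, 1.0)
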
class SinusoidalPositionEncoderOnline:
    """Streaming positional encoding."""

    def encode(self, positions: np.ndarray = None, depth: int = None, dtype: np.dtype = np.float32):
        batch_size = positions.shape[0]
        positions = positions.astype(dtype)
        log_timescale_increment = np.log(np.array([10000], dtype=dtype)) / (depth / 2 - 1)
        inv_timescales = np.exp(np.arange(depth / 2).astype(dtype) * (-log_timescale_increment))
        inv_timescales = np.reshape(inv_timescales, [batch_size, -1])
        scaled_time = np.reshape(positions, [1, -1, 1]) * np.reshape(inv_timescales, [1, 1, -1])
        encoding = np.concatenate((np.sin(scaled_time), np.cos(scaled_time)), axis=2)
        return encoding.astype(dtype)

    def forward(self, x, start_idx=0):
        batch_size, timesteps, input_dim = x.shape
        positions = np.arange(1, timesteps + 1 + start_idx)[None, :]
        position_encoding = self.encode(positions, input_dim, x.dtype)

        return x + position_encoding[:, start_idx : start_idx + timesteps]

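# The encoding concatenates sine and cosine halves at geometrically spaced
# frequencies (the standard transformer scheme); streaming use only shifts the
# position index. Sketch, assuming x has shape (1, T, d):
#
#     pe = SinusoidalPositionEncoderOnline()
#     y1 = pe.forward(x[:, :10, :], start_idx=0)    # positions 1..10
#     y2 = pe.forward(x[:, 10:, :], start_idx=10)   # continues at position 11
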
def test():
    path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
    import librosa

    cmvn_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn"
    config_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml"
    from funasr.runtime.python.onnxruntime.rapid_paraformer.utils.utils import read_yaml

    config = read_yaml(config_file)
    waveform, _ = librosa.load(path, sr=None)
    frontend = WavFrontend(
        cmvn_file=cmvn_file,
        **config["frontend_conf"],
    )
    speech, _ = frontend.fbank_online(waveform)  # 1d numpy array of shape (sample,)
    feat, feat_len = frontend.lfr_cmvn(
        speech
    )  # 2d, (frame, 450), np.float32; torch.from_numpy() plus unsqueeze gives (1, frame, 450)

    frontend.reset_status()  # clear cache
    return feat, feat_len


if __name__ == "__main__":
    test()
418
modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_onnx/utils/postprocess_utils.py
vendored
Normal file
@@ -0,0 +1,418 @@
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import string
import logging
from typing import Any, List, Union


def isChinese(ch: str):
    # CJK unified ideographs, plus the ASCII digits 0-9
    if "\u4e00" <= ch <= "\u9fff" or "\u0030" <= ch <= "\u0039":
        return True
    return False


def isAllChinese(word: Union[List[Any], str]):
    word_lists = []
    for i in word:
        cur = i.replace(" ", "")
        cur = cur.replace("</s>", "")
        cur = cur.replace("<s>", "")
        word_lists.append(cur)

    if len(word_lists) == 0:
        return False

    for ch in word_lists:
        if isChinese(ch) is False:
            return False
    return True


def isAllAlpha(word: Union[List[Any], str]):
    word_lists = []
    for i in word:
        cur = i.replace(" ", "")
        cur = cur.replace("</s>", "")
        cur = cur.replace("<s>", "")
        word_lists.append(cur)

    if len(word_lists) == 0:
        return False

    for ch in word_lists:
        if ch.isalpha() is False and ch != "'":
            return False
        elif ch.isalpha() is True and isChinese(ch) is True:
            return False

    return True


def abbr_dispose(words: List[Any], time_stamp: List[List] = None) -> List[Any]:
    words_size = len(words)
    word_lists = []
    abbr_begin = []
    abbr_end = []
    last_num = -1
    ts_lists = []
    ts_nums = []
    ts_index = 0
    for num in range(words_size):
        if num <= last_num:
            continue

        if len(words[num]) == 1 and words[num].encode("utf-8").isalpha():
            if (
                num + 1 < words_size
                and words[num + 1] == " "
                and num + 2 < words_size
                and len(words[num + 2]) == 1
                and words[num + 2].encode("utf-8").isalpha()
            ):
                # found the beginning of an abbreviation
                abbr_begin.append(num)
                num += 2
                abbr_end.append(num)
                # search forward for the end of the abbreviation
                while True:
                    num += 1
                    if num < words_size and words[num] == " ":
                        num += 1
                        if (
                            num < words_size
                            and len(words[num]) == 1
                            and words[num].encode("utf-8").isalpha()
                        ):
                            abbr_end.pop()
                            abbr_end.append(num)
                            last_num = num
                        else:
                            break
                    else:
                        break

    for num in range(words_size):
        if words[num] == " ":
            ts_nums.append(ts_index)
        else:
            ts_nums.append(ts_index)
            ts_index += 1
    last_num = -1
    for num in range(words_size):
        if num <= last_num:
            continue

        if num in abbr_begin:
            if time_stamp is not None:
                begin = time_stamp[ts_nums[num]][0]
            word_lists.append(words[num].upper())
            num += 1
            while num < words_size:
                if num in abbr_end:
                    word_lists.append(words[num].upper())
                    last_num = num
                    break
                else:
                    if words[num].encode("utf-8").isalpha():
                        word_lists.append(words[num].upper())
                num += 1
            if time_stamp is not None:
                end = time_stamp[ts_nums[num]][1]
                ts_lists.append([begin, end])
        else:
            word_lists.append(words[num])
            if time_stamp is not None and words[num] != " ":
                begin = time_stamp[ts_nums[num]][0]
                end = time_stamp[ts_nums[num]][1]
                ts_lists.append([begin, end])
                begin = end

    if time_stamp is not None:
        return word_lists, ts_lists
    else:
        return word_lists

def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
    middle_lists = []
    word_lists = []
    word_item = ""
    ts_lists = []

    # wash the word list: drop sentence markers and unknown tokens
    for i in words:
        word = ""
        if isinstance(i, str):
            word = i
        else:
            word = i.decode("utf-8")

        if word in ["<s>", "</s>", "<unk>"]:
            continue
        else:
            middle_lists.append(word)

    # all Chinese characters
    if isAllChinese(middle_lists):
        for i, ch in enumerate(middle_lists):
            word_lists.append(ch.replace(" ", ""))
        if time_stamp is not None:
            ts_lists = time_stamp

    # all alphabetic characters
    elif isAllAlpha(middle_lists):
        ts_flag = True
        for i, ch in enumerate(middle_lists):
            if ts_flag and time_stamp is not None:
                begin = time_stamp[i][0]
                end = time_stamp[i][1]
            word = ""
            if "@@" in ch:
                # BPE continuation piece: strip the marker and keep accumulating
                word = ch.replace("@@", "")
                word_item += word
                if time_stamp is not None:
                    ts_flag = False
                    end = time_stamp[i][1]
            else:
                word_item += ch
                word_lists.append(word_item)
                word_lists.append(" ")
                word_item = ""
                if time_stamp is not None:
                    ts_flag = True
                    end = time_stamp[i][1]
                    ts_lists.append([begin, end])
                    begin = end

    # mixed characters
    else:
        alpha_blank = False
        ts_flag = True
        begin = -1
        end = -1
        for i, ch in enumerate(middle_lists):
            if ts_flag and time_stamp is not None:
                begin = time_stamp[i][0]
                end = time_stamp[i][1]
            word = ""
            if isAllChinese(ch):
                if alpha_blank is True:
                    word_lists.pop()
                word_lists.append(ch)
                alpha_blank = False
                if time_stamp is not None:
                    ts_flag = True
                    ts_lists.append([begin, end])
                    begin = end
            elif "@@" in ch:
                word = ch.replace("@@", "")
                word_item += word
                alpha_blank = False
                if time_stamp is not None:
                    ts_flag = False
                    end = time_stamp[i][1]
            elif isAllAlpha(ch):
                word_item += ch
                word_lists.append(word_item)
                word_lists.append(" ")
                word_item = ""
                alpha_blank = True
                if time_stamp is not None:
                    ts_flag = True
                    end = time_stamp[i][1]
                    ts_lists.append([begin, end])
                    begin = end
            else:
                raise ValueError("invalid character: {}".format(ch))

    if time_stamp is not None:
        word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
        real_word_lists = []
        for ch in word_lists:
            if ch != " ":
                real_word_lists.append(ch)
        sentence = " ".join(real_word_lists).strip()
        return sentence, ts_lists, real_word_lists
    else:
        word_lists = abbr_dispose(word_lists)
        real_word_lists = []
        for ch in word_lists:
            if ch != " ":
                real_word_lists.append(ch)
        sentence = "".join(word_lists).strip()
        return sentence, real_word_lists

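# Illustrative merge behaviour (a sketch; "@@" marks a BPE piece that continues
# into the next token):
#
#     sentence_postprocess(["你", "好"])       -> ("你好", ["你", "好"])
#     sentence_postprocess(["he@@", "llo"])    -> ("hello", ["hello"])
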
def sentence_postprocess_sentencepiece(words):
    middle_lists = []
    word_lists = []
    word_item = ""

    # wash the word list: drop sentence markers and unknown tokens
    for i in words:
        word = ""
        if isinstance(i, str):
            word = i
        else:
            word = i.decode("utf-8")

        if word in ["<s>", "</s>", "<unk>", "<OOV>"]:
            continue
        else:
            middle_lists.append(word)

    # "\u2581" is the sentencepiece word-boundary marker
    for i, ch in enumerate(middle_lists):
        word = ""
        if "\u2581" in ch and i == 0:
            word_item = ""
            word = ch.replace("\u2581", "")
            word_item += word
        elif "\u2581" in ch and i != 0:
            word_lists.append(word_item)
            word_lists.append(" ")
            word_item = ""
            word = ch.replace("\u2581", "")
            word_item += word
        else:
            word_item += ch
    if word_item is not None:
        word_lists.append(word_item)
    # word_lists = abbr_dispose(word_lists)
    real_word_lists = []
    for ch in word_lists:
        if ch != " ":
            if ch == "i":
                ch = ch.replace("i", "I")
            elif ch == "i'm":
                ch = ch.replace("i'm", "I'm")
            elif ch == "i've":
                ch = ch.replace("i've", "I've")
            elif ch == "i'll":
                ch = ch.replace("i'll", "I'll")
            real_word_lists.append(ch)
    sentence = "".join(word_lists)
    return sentence, real_word_lists


emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}

event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "🤧",
}

lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "❓",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
    "<|Cry|>": "😭",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "😷",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {
    "🎼",
    "👏",
    "😀",
    "😭",
    "🤧",
    "😷",
}


def format_str_v2(s):
    sptk_dict = {}
    for sptk in emoji_dict:
        sptk_dict[sptk] = s.count(sptk)
        s = s.replace(sptk, "")
    # keep the most frequent emotion tag, defaulting to neutral
    emo = "<|NEUTRAL|>"
    for e in emo_dict:
        if sptk_dict[e] > sptk_dict[emo]:
            emo = e
    # prepend one marker per detected event, append the winning emotion
    for e in event_dict:
        if sptk_dict[e] > 0:
            s = event_dict[e] + s
    s = s + emo_dict[emo]

    for emoji in emo_set.union(event_set):
        s = s.replace(" " + emoji, emoji)
        s = s.replace(emoji + " ", emoji)
    return s.strip()


def rich_transcription_postprocess(s):
    def get_emo(s):
        return s[-1] if s[-1] in emo_set else None

    def get_event(s):
        return s[0] if s[0] in event_set else None

    s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
    new_s = " " + s_list[0]
    cur_ent_event = get_event(new_s)
    for i in range(1, len(s_list)):
        if len(s_list[i]) == 0:
            continue
        if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) is not None:
            s_list[i] = s_list[i][1:]
        cur_ent_event = get_event(s_list[i])
        if get_emo(s_list[i]) is not None and get_emo(s_list[i]) == get_emo(new_s):
            new_s = new_s[:-1]
        new_s += s_list[i].strip().lstrip()
    new_s = new_s.replace("The.", " ")
    return new_s.strip()

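# End-to-end sketch of the rich-transcription cleanup (the tag string is an
# illustrative SenseVoice-style output):
#
#     raw = "<|en|><|HAPPY|><|Speech|><|withitn|>Hello there."
#     rich_transcription_postprocess(raw)   # -> "Hello there.😊"
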
@@ -0,0 +1,53 @@
from pathlib import Path
from typing import Iterable
from typing import List
from typing import Union

import sentencepiece as spm


class SentencepiecesTokenizer:
    def __init__(self, bpemodel: Union[Path, str], **kwargs):
        super().__init__(**kwargs)
        self.bpemodel = str(bpemodel)
        # NOTE(kamo):
        # Don't build SentencePieceProcessor in __init__()
        # because it's not picklable and it may cause the following error,
        # "TypeError: can't pickle SwigPyObject objects",
        # when giving it as an argument of "multiprocessing.Process()".
        self.sp = None
        self._build_sentence_piece_processor()

    def __repr__(self):
        return f'{self.__class__.__name__}(model="{self.bpemodel}")'

    def _build_sentence_piece_processor(self):
        # Build SentencePieceProcessor lazily.
        if self.sp is None:
            self.sp = spm.SentencePieceProcessor()
            self.sp.load(self.bpemodel)

    def text2tokens(self, line: str) -> List[str]:
        self._build_sentence_piece_processor()
        return self.sp.EncodeAsPieces(line)

    def tokens2text(self, tokens: Iterable[str]) -> str:
        self._build_sentence_piece_processor()
        return self.sp.DecodePieces(list(tokens))

    def encode(self, line: str, **kwargs) -> List[int]:
        self._build_sentence_piece_processor()
        return self.sp.EncodeAsIds(line)

    def decode(self, line: List[int], **kwargs):
        self._build_sentence_piece_processor()
        return self.sp.DecodeIds(line)

    def get_vocab_size(self):
        return self.sp.GetPieceSize()

    def ids2tokens(self, *args, **kwargs):
        return self.decode(*args, **kwargs)

    def tokens2ids(self, *args, **kwargs):
        return self.encode(*args, **kwargs)

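# Usage sketch (assumes a trained sentencepiece model file "bpe.model"; the ids
# shown are placeholders that depend on the model):
#
#     tokenizer = SentencepiecesTokenizer("bpe.model")
#     ids = tokenizer.encode("hello world")          # e.g. [151, 88]
#     text = tokenizer.decode(ids)                   # "hello world"
#     pieces = tokenizer.text2tokens("hello world")  # e.g. ["▁hello", "▁world"]
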
66
modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_onnx/utils/timestamp_utils.py
vendored
Normal file
@@ -0,0 +1,66 @@
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import numpy as np


def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0, total_offset=-1.5):
    if not len(char_list):
        return "", []
    START_END_THRESHOLD = 5
    MAX_TOKEN_DURATION = 30
    TIME_RATE = 10.0 * 6 / 1000 / 3  # LFR-6 frames at 10 ms shift, 3x upsampled -> 0.02 s per frame
    cif_peak = us_cif_peak.reshape(-1)
    num_frames = cif_peak.shape[-1]
    if char_list[-1] == "</s>":
        char_list = char_list[:-1]
    timestamp_list = []
    new_char_list = []
    # for a bicif model trained with large data, cif2 actually fires when a character starts,
    # so treat the frames between two peaks as the duration of the former token
    fire_place = np.where(cif_peak > 1.0 - 1e-4)[0] + total_offset  # np format
    num_peak = len(fire_place)
    assert num_peak == len(char_list) + 1  # the number of peaks is supposed to be tokens + 1
    # begin silence
    if fire_place[0] > START_END_THRESHOLD:
        timestamp_list.append([0.0, fire_place[0] * TIME_RATE])
        new_char_list.append("<sil>")
    # token timestamps
    for i in range(len(fire_place) - 1):
        new_char_list.append(char_list[i])
        if (
            i == len(fire_place) - 2
            or MAX_TOKEN_DURATION < 0
            or fire_place[i + 1] - fire_place[i] < MAX_TOKEN_DURATION
        ):
            timestamp_list.append([fire_place[i] * TIME_RATE, fire_place[i + 1] * TIME_RATE])
        else:
            # if the gap is too long, split it into the token's duration and a trailing silence
            _split = fire_place[i] + MAX_TOKEN_DURATION
            timestamp_list.append([fire_place[i] * TIME_RATE, _split * TIME_RATE])
            timestamp_list.append([_split * TIME_RATE, fire_place[i + 1] * TIME_RATE])
            new_char_list.append("<sil>")
    # tail token and end silence
    if num_frames - fire_place[-1] > START_END_THRESHOLD:
        _end = (num_frames + fire_place[-1]) / 2
        timestamp_list[-1][1] = _end * TIME_RATE
        timestamp_list.append([_end * TIME_RATE, num_frames * TIME_RATE])
        new_char_list.append("<sil>")
    else:
        timestamp_list[-1][1] = num_frames * TIME_RATE
    if begin_time:  # add the offset of the VAD segment
        for i in range(len(timestamp_list)):
            timestamp_list[i][0] = timestamp_list[i][0] + begin_time / 1000.0
            timestamp_list[i][1] = timestamp_list[i][1] + begin_time / 1000.0
    assert len(new_char_list) == len(timestamp_list)
    res_str = ""
    for char, timestamp in zip(new_char_list, timestamp_list):
        res_str += "{} {} {};".format(char, timestamp[0], timestamp[1])
    res = []
    for char, timestamp in zip(new_char_list, timestamp_list):
        if char != "<sil>":
            res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
    return res_str, res

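# Worked example of the frame-to-seconds conversion (illustrative numbers):
# with TIME_RATE = 0.02 s, a CIF peak at upsampled frame 50 maps to 1.0 s, so
# peaks at frames [0, 50, 100] for char_list = ["你", "好"] yield roughly
# [[0, 1000], [1000, 2000]] in milliseconds (before the -1.5 frame total_offset
# is applied).
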
395
modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
vendored
Normal file
@@ -0,0 +1,395 @@
# -*- encoding: utf-8 -*-

import functools
import logging
from pathlib import Path
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union

import re
import numpy as np
import yaml

try:
    from onnxruntime import (
        GraphOptimizationLevel,
        InferenceSession,
        SessionOptions,
        get_available_providers,
        get_device,
    )
except ImportError:
    print("please pip3 install onnxruntime")
import jieba
import warnings

root_dir = Path(__file__).resolve().parent

logger_initialized = {}


def pad_list(xs, pad_value, max_len=None):
    n_batch = len(xs)
    if max_len is None:
        max_len = max(x.size(0) for x in xs)
    # numpy version of: xs[0].new(n_batch, max_len, *xs[0].size()[1:]).fill_(pad_value)
    pad = (np.zeros((n_batch, max_len)) + pad_value).astype(np.int32)
    for i in range(n_batch):
        pad[i, : xs[i].shape[0]] = xs[i]

    return pad


"""
def make_pad_mask(lengths, xs=None, length_dim=-1, maxlen=None):
    if length_dim == 0:
        raise ValueError("length_dim cannot be 0: {}".format(length_dim))

    if not isinstance(lengths, list):
        lengths = lengths.tolist()
    bs = int(len(lengths))
    if maxlen is None:
        if xs is None:
            maxlen = int(max(lengths))
        else:
            maxlen = xs.size(length_dim)
    else:
        assert xs is None
        assert maxlen >= int(max(lengths))

    seq_range = torch.arange(0, maxlen, dtype=torch.int64)
    seq_range_expand = seq_range.unsqueeze(0).expand(bs, maxlen)
    seq_length_expand = seq_range_expand.new(lengths).unsqueeze(-1)
    mask = seq_range_expand >= seq_length_expand

    if xs is not None:
        assert xs.size(0) == bs, (xs.size(0), bs)

        if length_dim < 0:
            length_dim = xs.dim() + length_dim
        # ind = (:, None, ..., None, :, , None, ..., None)
        ind = tuple(
            slice(None) if i in (0, length_dim) else None for i in range(xs.dim())
        )
        mask = mask[ind].expand_as(xs).to(xs.device)
    return mask
"""


class TokenIDConverter:
    def __init__(
        self,
        token_list: Union[List, str],
    ):

        self.token_list = token_list
        self.unk_symbol = token_list[-1]
        self.token2id = {v: i for i, v in enumerate(self.token_list)}
        self.unk_id = self.token2id[self.unk_symbol]

    def get_num_vocabulary_size(self) -> int:
        return len(self.token_list)

    def ids2tokens(self, integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
        if isinstance(integers, np.ndarray) and integers.ndim != 1:
            raise TokenIDConverterError(f"Must be 1 dim ndarray, but got {integers.ndim}")
        return [self.token_list[i] for i in integers]

    def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
        return [self.token2id.get(i, self.unk_id) for i in tokens]


class CharTokenizer:
    def __init__(
        self,
        symbol_value: Union[Path, str, Iterable[str]] = None,
        space_symbol: str = "<space>",
        remove_non_linguistic_symbols: bool = False,
    ):

        self.space_symbol = space_symbol
        self.non_linguistic_symbols = self.load_symbols(symbol_value)
        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols

    @staticmethod
    def load_symbols(value: Union[Path, str, Iterable[str]] = None) -> Set:
        if value is None:
            return set()

        # isinstance() does not accept subscripted generics such as Iterable[str],
        # so treat any non-path value (e.g. a list or set of symbols) directly
        if not isinstance(value, (Path, str)):
            return set(value)

        file_path = Path(value)
        if not file_path.exists():
            logging.warning("%s doesn't exist.", file_path)
            return set()

        with file_path.open("r", encoding="utf-8") as f:
            return set(line.rstrip() for line in f)

    def text2tokens(self, line: Union[str, list]) -> List[str]:
        tokens = []
        while len(line) != 0:
            for w in self.non_linguistic_symbols:
                if line.startswith(w):
                    if not self.remove_non_linguistic_symbols:
                        tokens.append(line[: len(w)])
                    line = line[len(w) :]
                    break
            else:
                t = line[0]
                if t == " ":
                    t = "<space>"
                tokens.append(t)
                line = line[1:]
        return tokens

    def tokens2text(self, tokens: Iterable[str]) -> str:
        tokens = [t if t != self.space_symbol else " " for t in tokens]
        return "".join(tokens)

    def __repr__(self):
        return (
            f"{self.__class__.__name__}("
            f'space_symbol="{self.space_symbol}"'
            f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
            f")"
        )


class Hypothesis(NamedTuple):
    """Hypothesis data type."""

    yseq: np.ndarray
    score: Union[float, np.ndarray] = 0
    scores: Dict[str, Union[float, np.ndarray]] = dict()
    states: Dict[str, Any] = dict()

    def asdict(self) -> dict:
        """Convert data to JSON-friendly dict."""
        return self._replace(
            yseq=self.yseq.tolist(),
            score=float(self.score),
            scores={k: float(v) for k, v in self.scores.items()},
        )._asdict()


class TokenIDConverterError(Exception):
    pass


class ONNXRuntimeError(Exception):
    pass


class OrtInferSession:
    def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
        device_id = str(device_id)
        sess_opt = SessionOptions()
        sess_opt.intra_op_num_threads = intra_op_num_threads
        sess_opt.log_severity_level = 4
        sess_opt.enable_cpu_mem_arena = False
        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        cuda_ep = "CUDAExecutionProvider"
        cuda_provider_options = {
            "device_id": device_id,
            "arena_extend_strategy": "kNextPowerOfTwo",
            "cudnn_conv_algo_search": "EXHAUSTIVE",
            "do_copy_in_default_stream": "true",
        }
        cpu_ep = "CPUExecutionProvider"
        cpu_provider_options = {
            "arena_extend_strategy": "kSameAsRequested",
        }

        EP_list = []
        if device_id != "-1" and get_device() == "GPU" and cuda_ep in get_available_providers():
            EP_list = [(cuda_ep, cuda_provider_options)]
        EP_list.append((cpu_ep, cpu_provider_options))

        self._verify_model(model_file)
        self.session = InferenceSession(model_file, sess_options=sess_opt, providers=EP_list)

        if device_id != "-1" and cuda_ep not in self.session.get_providers():
            warnings.warn(
                f"{cuda_ep} is not available in the current environment, so inference "
                f"automatically falls back to {cpu_ep}.\n"
                "Please ensure the installed onnxruntime-gpu version matches your CUDA and cuDNN "
                "versions; their compatibility matrix is documented on the official web site: "
                "https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html",
                RuntimeWarning,
            )

    def __call__(self, input_content: List[np.ndarray]) -> np.ndarray:
        input_dict = dict(zip(self.get_input_names(), input_content))
        try:
            return self.session.run(self.get_output_names(), input_dict)
        except Exception as e:
            raise ONNXRuntimeError("ONNXRuntime inference failed.") from e

    def get_input_names(
        self,
    ):
        return [v.name for v in self.session.get_inputs()]

    def get_output_names(
        self,
    ):
        return [v.name for v in self.session.get_outputs()]

    def get_character_list(self, key: str = "character"):
        return self.meta_dict[key].splitlines()

    def have_key(self, key: str = "character") -> bool:
        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
        if key in self.meta_dict.keys():
            return True
        return False

    @staticmethod
    def _verify_model(model_path):
        model_path = Path(model_path)
        if not model_path.exists():
            raise FileNotFoundError(f"{model_path} does not exist.")
        if not model_path.is_file():
            raise FileExistsError(f"{model_path} is not a file.")

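# Usage sketch (the path is illustrative): inputs are fed in the order reported
# by get_input_names(); a CUDA provider is requested only when device_id != -1
# and onnxruntime-gpu actually exposes it, otherwise the session falls back to CPU.
#
#     sess = OrtInferSession("model.onnx", device_id=0, intra_op_num_threads=4)
#     outputs = sess([feats.astype(np.float32), feats_len.astype(np.int32)])
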
def split_to_mini_sentence(words: list, word_limit: int = 20):
    assert word_limit > 1
    if len(words) <= word_limit:
        return [words]
    sentences = []
    length = len(words)
    sentence_len = length // word_limit
    for i in range(sentence_len):
        sentences.append(words[i * word_limit : (i + 1) * word_limit])
    if length % word_limit > 0:
        sentences.append(words[sentence_len * word_limit :])
    return sentences

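# Example: split_to_mini_sentence(list("abcdefg"), word_limit=3)
# -> [["a", "b", "c"], ["d", "e", "f"], ["g"]]
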
def code_mix_split_words(text: str):
    words = []
    segs = text.split()
    for seg in segs:
        # There is no space in seg.
        current_word = ""
        for c in seg:
            if len(c.encode()) == 1:
                # This is an ASCII char.
                current_word += c
            else:
                # This is a Chinese char.
                if len(current_word) > 0:
                    words.append(current_word)
                    current_word = ""
                words.append(c)
        if len(current_word) > 0:
            words.append(current_word)
    return words

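# Example: code_mix_split_words("hello你好world")
# -> ["hello", "你", "好", "world"]
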
def isEnglish(text: str):
    if re.search("^[a-zA-Z']+$", text):
        return True
    else:
        return False


def join_chinese_and_english(input_list):
    line = ""
    for token in input_list:
        if isEnglish(token):
            line = line + " " + token
        else:
            line = line + token

    line = line.strip()
    return line


def code_mix_split_words_jieba(seg_dict_file: str):
    jieba.load_userdict(seg_dict_file)

    def _fn(text: str):
        input_list = text.split()
        token_list_all = []
        language_list = []
        token_list_tmp = []
        language_flag = None
        for token in input_list:
            if isEnglish(token) and language_flag == "Chinese":
                token_list_all.append(token_list_tmp)
                language_list.append("Chinese")
                token_list_tmp = []
            elif not isEnglish(token) and language_flag == "English":
                token_list_all.append(token_list_tmp)
                language_list.append("English")
                token_list_tmp = []

            token_list_tmp.append(token)

            if isEnglish(token):
                language_flag = "English"
            else:
                language_flag = "Chinese"

        if token_list_tmp:
            token_list_all.append(token_list_tmp)
            language_list.append(language_flag)

        result_list = []
        for token_list_tmp, language_flag in zip(token_list_all, language_list):
            if language_flag == "English":
                result_list.extend(token_list_tmp)
            else:
                seg_list = jieba.cut(join_chinese_and_english(token_list_tmp), HMM=False)
                result_list.extend(seg_list)

        return result_list

    return _fn


def read_yaml(yaml_path: Union[str, Path]) -> Dict:
    if not Path(yaml_path).exists():
        raise FileExistsError(f"The {yaml_path} does not exist.")

    with open(str(yaml_path), "rb") as f:
        data = yaml.load(f, Loader=yaml.Loader)
    return data


@functools.lru_cache()
def get_logger(name="funasr_onnx"):
    """Initialize and get a logger by name.

    If the logger has not been initialized, this method will initialize the
    logger by adding one or two handlers, otherwise the initialized logger will
    be directly returned. During initialization, a StreamHandler will always be
    added.

    Args:
        name (str): Logger name.
    Returns:
        logging.Logger: The expected logger.
    """
    logger = logging.getLogger(name)
    if name in logger_initialized:
        return logger

    for logger_name in logger_initialized:
        if name.startswith(logger_name):
            return logger

    formatter = logging.Formatter(
        "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%Y/%m/%d %H:%M:%S"
    )

    sh = logging.StreamHandler()
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    logger_initialized[name] = True
    logger.propagate = False
    logging.basicConfig(level=logging.ERROR)
    return logger
330
modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_onnx/vad_bin.py
vendored
Normal file
@@ -0,0 +1,330 @@
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import os.path
from pathlib import Path
from typing import List, Union, Tuple

import copy
import librosa
import numpy as np

from .utils.utils import ONNXRuntimeError, OrtInferSession, get_logger, read_yaml
from .utils.frontend import WavFrontend, WavFrontendOnline
from .utils.e2e_vad import E2EVadModel

logging = get_logger()


class Fsmn_vad:
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Deep-FSMN for Large Vocabulary Continuous Speech Recognition
    https://arxiv.org/abs/1803.05030
    """

    def __init__(
        self,
        model_dir: Union[str, Path] = None,
        batch_size: int = 1,
        device_id: Union[str, int] = "-1",
        quantize: bool = False,
        intra_op_num_threads: int = 4,
        max_end_sil: int = None,
        cache_dir: str = None,
        **kwargs,
    ):

        if not Path(model_dir).exists():
            try:
                from modelscope.hub.snapshot_download import snapshot_download
            except ImportError:
                raise ImportError(
                    "You are downloading a model from modelscope, please install modelscope "
                    "and try again. To install modelscope, you could:\n"
                    "\npip3 install -U modelscope\n"
                    "For the users in China, you could install with the command:\n"
                    "\npip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                )
            try:
                model_dir = snapshot_download(model_dir, cache_dir=cache_dir)
            except Exception:
                raise ValueError(
                    "model_dir must be a model_name in modelscope or a local path downloaded "
                    "from modelscope, but is {}".format(model_dir)
                )

        model_file = os.path.join(model_dir, "model.onnx")
        if quantize:
            model_file = os.path.join(model_dir, "model_quant.onnx")
        if not os.path.exists(model_file):
            print(".onnx does not exist, begin to export onnx")
            try:
                from funasr import AutoModel
            except ImportError:
                raise ImportError(
                    "You are exporting onnx, please install funasr and try again. "
                    "To install funasr, you could:\n"
                    "\npip3 install -U funasr\n"
                    "For the users in China, you could install with the command:\n"
                    "\npip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                )

            model = AutoModel(model=model_dir)
            model_dir = model.export(type="onnx", quantize=quantize, **kwargs)
        config_file = os.path.join(model_dir, "config.yaml")
        cmvn_file = os.path.join(model_dir, "am.mvn")
        config = read_yaml(config_file)

        self.frontend = WavFrontend(cmvn_file=cmvn_file, **config["frontend_conf"])
        self.ort_infer = OrtInferSession(
            model_file, device_id, intra_op_num_threads=intra_op_num_threads
        )
        self.batch_size = batch_size
        self.vad_scorer = E2EVadModel(config["model_conf"])
        self.max_end_sil = (
            max_end_sil if max_end_sil is not None else config["model_conf"]["max_end_silence_time"]
        )
        self.encoder_conf = config["encoder_conf"]

    def prepare_cache(self, in_cache: list = None):
        # avoid a mutable default argument: the cache would otherwise be shared
        # (and keep growing) across calls that rely on the default
        in_cache = in_cache if in_cache is not None else []
        if len(in_cache) > 0:
            return in_cache
        fsmn_layers = self.encoder_conf["fsmn_layers"]
        proj_dim = self.encoder_conf["proj_dim"]
        lorder = self.encoder_conf["lorder"]
        for i in range(fsmn_layers):
            cache = np.zeros((1, proj_dim, lorder - 1, 1)).astype(np.float32)
            in_cache.append(cache)
        return in_cache

    def __call__(self, audio_in: Union[str, np.ndarray, List[str]], **kwargs) -> List:
        waveform_list = self.load_data(audio_in, self.frontend.opts.frame_opts.samp_freq)
        waveform_nums = len(waveform_list)
        is_final = kwargs.get("is_final", False)

        # independent result lists; [[]] * n would alias one list n times
        segments = [[] for _ in range(self.batch_size)]
        for beg_idx in range(0, waveform_nums, self.batch_size):

            end_idx = min(waveform_nums, beg_idx + self.batch_size)
            waveform = waveform_list[beg_idx:end_idx]
            feats, feats_len = self.extract_feat(waveform)
            waveform = np.array(waveform)
            param_dict = kwargs.get("param_dict", dict())
            in_cache = param_dict.get("in_cache", list())
            in_cache = self.prepare_cache(in_cache)
            try:
                t_offset = 0
                step = int(min(feats_len.max(), 6000))
                for t_offset in range(0, int(feats_len), min(step, feats_len - t_offset)):
                    if t_offset + step >= feats_len - 1:
                        step = feats_len - t_offset
                        is_final = True
                    else:
                        is_final = False
                    feats_package = feats[:, t_offset : int(t_offset + step), :]
                    waveform_package = waveform[
                        :,
                        t_offset
                        * 160 : min(waveform.shape[-1], (int(t_offset + step) - 1) * 160 + 400),
                    ]

                    inputs = [feats_package]
                    inputs.extend(in_cache)
                    scores, out_caches = self.infer(inputs)
                    in_cache = out_caches
                    segments_part = self.vad_scorer(
                        scores,
                        waveform_package,
                        is_final=is_final,
                        max_end_sil=self.max_end_sil,
                        online=False,
                    )

                    if segments_part:
                        for batch_num in range(0, self.batch_size):
                            segments[batch_num] += segments_part[batch_num]

            except ONNXRuntimeError:
                logging.warning("input wav is silence or noise")
                segments = ""

        return segments

    def load_data(self, wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
        def load_wav(path: str) -> np.ndarray:
            waveform, _ = librosa.load(path, sr=fs)
            return waveform

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            return [load_wav(path) for path in wav_content]

        raise TypeError(f"The type of {wav_content} is not in [str, np.ndarray, list]")

    def extract_feat(self, waveform_list: List[np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
        feats, feats_len = [], []
        for waveform in waveform_list:
            speech, _ = self.frontend.fbank(waveform)
            feat, feat_len = self.frontend.lfr_cmvn(speech)
            feats.append(feat)
            feats_len.append(feat_len)

        feats = self.pad_feats(feats, np.max(feats_len))
        feats_len = np.array(feats_len).astype(np.int32)
        return feats, feats_len

    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, "constant", constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def infer(self, feats: List) -> Tuple[np.ndarray, np.ndarray]:

        outputs = self.ort_infer(feats)
        scores, out_caches = outputs[0], outputs[1:]
        return scores, out_caches


class Fsmn_vad_online:
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Deep-FSMN for Large Vocabulary Continuous Speech Recognition
    https://arxiv.org/abs/1803.05030
    """

    def __init__(
        self,
        model_dir: Union[str, Path] = None,
        batch_size: int = 1,
        device_id: Union[str, int] = "-1",
        quantize: bool = False,
        intra_op_num_threads: int = 4,
        max_end_sil: int = None,
        cache_dir: str = None,
        **kwargs,
    ):
        if not Path(model_dir).exists():
            try:
                from modelscope.hub.snapshot_download import snapshot_download
            except ImportError:
                raise ImportError(
                    "You are downloading a model from modelscope, please install modelscope "
                    "and try again. To install modelscope, you could:\n"
                    "\npip3 install -U modelscope\n"
                    "For the users in China, you could install with the command:\n"
                    "\npip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                )
            try:
                model_dir = snapshot_download(model_dir, cache_dir=cache_dir)
            except Exception:
                raise ValueError(
                    "model_dir must be a model_name in modelscope or a local path downloaded "
                    "from modelscope, but is {}".format(model_dir)
                )

        model_file = os.path.join(model_dir, "model.onnx")
        if quantize:
            model_file = os.path.join(model_dir, "model_quant.onnx")
        if not os.path.exists(model_file):
            print(".onnx does not exist, begin to export onnx")
            try:
                from funasr import AutoModel
            except ImportError:
                raise ImportError(
                    "You are exporting onnx, please install funasr and try again. "
                    "To install funasr, you could:\n"
                    "\npip3 install -U funasr\n"
                    "For the users in China, you could install with the command:\n"
                    "\npip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                )

            model = AutoModel(model=model_dir)
            model_dir = model.export(type="onnx", quantize=quantize, **kwargs)

        config_file = os.path.join(model_dir, "config.yaml")
        cmvn_file = os.path.join(model_dir, "am.mvn")
        config = read_yaml(config_file)

        self.frontend = WavFrontendOnline(cmvn_file=cmvn_file, **config["frontend_conf"])
        self.ort_infer = OrtInferSession(
            model_file, device_id, intra_op_num_threads=intra_op_num_threads
        )
        self.batch_size = batch_size
        self.vad_scorer = E2EVadModel(config["model_conf"])
        self.max_end_sil = (
            max_end_sil if max_end_sil is not None else config["model_conf"]["max_end_silence_time"]
        )
        self.encoder_conf = config["encoder_conf"]

    def prepare_cache(self, in_cache: list = None):
        # see Fsmn_vad.prepare_cache: avoid a shared mutable default argument
        in_cache = in_cache if in_cache is not None else []
        if len(in_cache) > 0:
            return in_cache
        fsmn_layers = self.encoder_conf["fsmn_layers"]
        proj_dim = self.encoder_conf["proj_dim"]
        lorder = self.encoder_conf["lorder"]
        for i in range(fsmn_layers):
            cache = np.zeros((1, proj_dim, lorder - 1, 1)).astype(np.float32)
            in_cache.append(cache)
        return in_cache

    def __call__(self, audio_in: np.ndarray, **kwargs) -> List:
        waveforms = np.expand_dims(audio_in, axis=0)

        param_dict = kwargs.get("param_dict", dict())
        is_final = param_dict.get("is_final", False)
        feats, feats_len = self.extract_feat(waveforms, is_final)
        segments = []
        if feats.size != 0:
            in_cache = param_dict.get("in_cache", list())
            in_cache = self.prepare_cache(in_cache)
            try:
                inputs = [feats]
                inputs.extend(in_cache)
                scores, out_caches = self.infer(inputs)
                param_dict["in_cache"] = out_caches
                waveforms = self.frontend.get_waveforms()
                segments = self.vad_scorer(
                    scores, waveforms, is_final=is_final, max_end_sil=self.max_end_sil, online=True
                )

            except ONNXRuntimeError:
                logging.warning("input wav is silence or noise")
                segments = []
        return segments

    def load_data(self, wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
        def load_wav(path: str) -> np.ndarray:
            waveform, _ = librosa.load(path, sr=fs)
            return waveform

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            return [load_wav(path) for path in wav_content]

        raise TypeError(f"The type of {wav_content} is not in [str, np.ndarray, list]")

    def extract_feat(
        self, waveforms: np.ndarray, is_final: bool = False
    ) -> Tuple[np.ndarray, np.ndarray]:
        waveforms_lens = np.zeros(waveforms.shape[0]).astype(np.int32)
        for idx, waveform in enumerate(waveforms):
            waveforms_lens[idx] = waveform.shape[-1]

        feats, feats_len = self.frontend.extract_fbank(waveforms, waveforms_lens, is_final)
        return feats.astype(np.float32), feats_len.astype(np.int32)

    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, "constant", constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def infer(self, feats: List) -> Tuple[np.ndarray, np.ndarray]:

        outputs = self.ort_infer(feats)
        scores, out_caches = outputs[0], outputs[1:]
        return scores, out_caches

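# Streaming usage sketch (the model name and chunk size are illustrative;
# `param_dict` carries the FSMN cache across calls):
#
#     vad = Fsmn_vad_online("damo/speech_fsmn_vad_zh-cn-16k-common-pytorch")
#     param_dict = {"in_cache": []}
#     chunk = 9600  # 600 ms at 16 kHz
#     for start in range(0, len(wav), chunk):
#         param_dict["is_final"] = start + chunk >= len(wav)
#         segments = vad(wav[start : start + chunk], param_dict=param_dict)
#         if segments:
#             print(segments)
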
43
modules/python/vendors/FunASR/runtime/python/onnxruntime/funasr_server_http.py
vendored
Normal file
@@ -0,0 +1,43 @@
import argparse
import base64
import io
import soundfile as sf
import uvicorn
from fastapi import FastAPI, Body

app = FastAPI()

from funasr_onnx import Paraformer

model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-onnx"
model = Paraformer(model_dir, batch_size=1, quantize=True)


async def recognition_onnx(waveform):
    result = model(waveform)[0]["preds"][0]
    return result


@app.post("/api/asr")
async def asr(item: dict = Body(...)):
    try:
        audio_bytes = base64.b64decode(bytes(item["wav_base64"], "utf-8"))
        waveform, _ = sf.read(io.BytesIO(audio_bytes))
        result = await recognition_onnx(waveform)
        ret = {"results": result, "code": 0}
    except Exception:
        print("request failed; returning an error response")
        ret = {"results": "", "code": 1}
    return ret


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="API Service")
    parser.add_argument("--listen", default="0.0.0.0", type=str, help="the network to listen")
    parser.add_argument("--port", default=8888, type=int, help="the port to listen")
    args = parser.parse_args()

    print("start...")
    print("server on:", args)

    uvicorn.run(app, host=args.listen, port=args.port)

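# Client-side sketch for the endpoint above (the server must be running on
# localhost:8888; "asr_example.wav" is a placeholder path):
#
#     import base64, requests
#     with open("asr_example.wav", "rb") as f:
#         payload = {"wav_base64": base64.b64encode(f.read()).decode("utf-8")}
#     resp = requests.post("http://127.0.0.1:8888/api/asr", json=payload)
#     print(resp.json())  # {"results": "...", "code": 0}
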
49
modules/python/vendors/FunASR/runtime/python/onnxruntime/setup.py
vendored
Normal file
@@ -0,0 +1,49 @@
# -*- encoding: utf-8 -*-
from pathlib import Path
import setuptools


def get_readme():
    root_dir = Path(__file__).resolve().parent
    readme_path = str(root_dir / "README.md")
    print(readme_path)
    with open(readme_path, "r", encoding="utf-8") as f:
        readme = f.read()
    return readme


MODULE_NAME = "funasr_onnx"
VERSION_NUM = "0.4.1"

setuptools.setup(
    name=MODULE_NAME,
    version=VERSION_NUM,
    platforms="Any",
    url="https://github.com/alibaba-damo-academy/FunASR.git",
    author="Speech Lab of DAMO Academy, Alibaba Group",
    author_email="funasr@list.alibaba-inc.com",
    description="FunASR: A Fundamental End-to-End Speech Recognition Toolkit",
    license="MIT",
    long_description=get_readme(),
    long_description_content_type="text/markdown",
    include_package_data=True,
    install_requires=[
        "librosa",
        "onnxruntime>=1.7.0",
        "scipy",
        "numpy<=1.26.4",
        "kaldi-native-fbank",
        "PyYAML>=5.1.2",
        "onnx",
        "sentencepiece",
    ],
    packages=[MODULE_NAME, f"{MODULE_NAME}.utils"],
    keywords=["funasr", "asr"],
    classifiers=[
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
)