Sync from bytedesk-private: update

This commit is contained in:
jack ning
2024-12-14 10:43:18 +08:00
parent 476eebb101
commit 5e082909e4
3421 changed files with 812709 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
# Triton server image for the SenseVoice model repository.
FROM nvcr.io/nvidia/tritonserver:24.05-py3
# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
# Please choose previous tritonserver:xx.xx if you encounter cuda driver mismatch issue
LABEL maintainer="NVIDIA"
LABEL repository="tritonserver"

RUN pip install --no-cache-dir torch

# cmake is required to build kaldifeat from source below.
# The apt lists are removed in the same layer so they do not end up in the image.
RUN apt-get update && \
    apt-get -y install cmake && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

RUN pip install --no-cache-dir -U "huggingface_hub[cli]" "tritonclient[all]" soundfile pyyaml torchaudio sentencepiece

# CUDA architectures kaldifeat is compiled for.
ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0"

# Build kaldifeat from source; the sed call patches get_version.py before the build.
RUN git clone https://github.com/csukuangfj/kaldifeat && \
    cd kaldifeat && \
    sed -i 's/in running_cuda_version//g' get_version.py && \
    python3 setup.py install

# Download the model repository and drop the downloader's metadata folders in the
# SAME layer (a separate `RUN rm` would not shrink the image). `rm -rf` is used
# because the metadata folder name depends on the huggingface_hub release
# (`.huggingface` or `.cache`), and a missing folder must not fail the build.
# NOTE(review): presumably the folders are removed so Triton does not try to load
# them as models -- confirm.
RUN huggingface-cli download --local-dir ./model_repo_sense_voice_small yuekai/model_repo_sense_voice_small && \
    rm -rf ./model_repo_sense_voice_small/.huggingface ./model_repo_sense_voice_small/.cache

View File

@@ -0,0 +1,19 @@
# Triton server image used for the Paraformer model repositories.
FROM nvcr.io/nvidia/tritonserver:23.01-py3
# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
# Please choose previous tritonserver:xx.xx if you encounter cuda driver mismatch issue
LABEL maintainer="NVIDIA"
LABEL repository="tritonserver"

# System packages; the apt lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get -y install \
    python3-dev \
    cmake \
    libsndfile1 && \
    rm -rf /var/lib/apt/lists/*

# Optional PyPI mirror: append `-i https://pypi.tuna.tsinghua.edu.cn/simple`
RUN pip3 install --no-cache-dir torch torchaudio
RUN pip3 install --no-cache-dir kaldifeat pyyaml

# Dependency for client
RUN pip3 install --no-cache-dir soundfile grpcio-tools tritonclient

WORKDIR /workspace

View File

@@ -0,0 +1,81 @@
## Triton Inference Serving Best Practice for SenseVoice
### Quick Start
Directly launch the service using docker compose.
```sh
docker compose up --build
```
### Build Image
Build the docker image from scratch.
```sh
# build from scratch; run this from the directory that contains the Dockerfile/ folder (i.e. Dockerfile/Dockerfile.sensevoice)
docker build . -f Dockerfile/Dockerfile.sensevoice -t soar97/triton-sensevoice:24.05
```
### Create Docker Container
```sh
your_mount_dir=/mnt:/mnt
docker run -it --name "sensevoice-server" --gpus all --net host -v $your_mount_dir --shm-size=2g soar97/triton-sensevoice:24.05
```
### Export SenseVoice Model to Onnx
Please follow the official FunASR guide to export the SenseVoice ONNX file. You also need to download the tokenizer file yourself.
### Launch Server
Log of directory tree:
```sh
model_repo_sense_voice_small
|-- encoder
| |-- 1
| | `-- model.onnx -> /your/path/model.onnx
| `-- config.pbtxt
|-- feature_extractor
| |-- 1
| | `-- model.py
| |-- am.mvn
| |-- config.pbtxt
| `-- config.yaml
|-- scoring
| |-- 1
| | `-- model.py
| |-- chn_jpn_yue_eng_ko_spectok.bpe.model -> /your/path/chn_jpn_yue_eng_ko_spectok.bpe.model
| `-- config.pbtxt
`-- sensevoice
|-- 1
`-- config.pbtxt
8 directories, 10 files
# launch the service
tritonserver --model-repository /workspace/model_repo_sense_voice_small \
--pinned-memory-pool-byte-size=512000000 \
--cuda-memory-pool-byte-size=0:1024000000
```
### Benchmark using Dataset
```sh
git clone https://github.com/yuekaizhang/Triton-ASR-Client.git
cd Triton-ASR-Client
num_task=32
python3 client.py \
--server-addr localhost \
--server-port 10086 \
--model-name sensevoice \
--compute-cer \
--num-tasks $num_task \
--batch-size 16 \
--manifest-dir ./datasets/aishell1_test
```
The benchmark results below are based on the Aishell1 test set with a single V100; the total audio duration is 36108.919 seconds.
|concurrent-tasks | batch-size-per-task | processing time(s) | RTF |
|----------|--------------------|------------|---------------------|
| 32 (onnx fp32) | 16 | 67.09 | 0.0019|
| 32 (onnx fp32) | 1 | 82.04 | 0.0023|
(Note: for batch-size-per-task=1 cases, tritonserver could use dynamic batching to improve throughput.)
## Acknowledgements
This part originates from the NVIDIA CISI project. We also have TTS and NLP solutions deployed on Triton Inference Server. If you are interested, please contact us.

View File

@@ -0,0 +1,85 @@
## Inference with Triton
### Steps:
1. Prepare model repo files
```sh
git-lfs install
git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git
pretrained_model_dir=$(pwd)/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
cp $pretrained_model_dir/am.mvn ./model_repo_paraformer_large_offline/feature_extractor/
cp $pretrained_model_dir/config.yaml ./model_repo_paraformer_large_offline/feature_extractor/
# Refer here to get model.onnx (https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/export/README.md)
cp <exported_onnx_dir>/model.onnx ./model_repo_paraformer_large_offline/encoder/1/
```
Log of directory tree:
```sh
model_repo_paraformer_large_offline/
|-- encoder
| |-- 1
| | `-- model.onnx
| `-- config.pbtxt
|-- feature_extractor
| |-- 1
| | `-- model.py
| |-- config.pbtxt
| |-- am.mvn
| `-- config.yaml
|-- infer_pipeline
| |-- 1
| `-- config.pbtxt
`-- scoring
|-- 1
| `-- model.py
`-- config.pbtxt
8 directories, 9 files
```
2. Follow the instructions below to launch the Triton server
```sh
# using docker image Dockerfile/Dockerfile.server
docker build . -f Dockerfile/Dockerfile.server -t triton-paraformer:23.01
docker run -it --rm --name "paraformer_triton_server" --gpus all -v <path_host/model_repo_paraformer_large_offline>:/workspace/ --shm-size 1g --net host triton-paraformer:23.01
# launch the service
tritonserver --model-repository /workspace/model_repo_paraformer_large_offline \
--pinned-memory-pool-byte-size=512000000 \
--cuda-memory-pool-byte-size=0:1024000000
```
### Performance benchmark
Benchmark [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) based on Aishell1 test set with a single V100, the total audio duration is 36108.919 seconds.
```sh
# For client container:
docker run -it --rm --name "client_test" --net host --gpus all -v <path_host/triton_gpu/client>:/workspace/ soar97/triton-k2:22.12.1 # noqa
# For aishell manifests:
apt-get install git-lfs
git-lfs install
git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests
sudo mkdir -p /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell
tar xf ./aishell-test-dev-manifests/data_aishell.tar.gz -C /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell/ # noqa
serveraddr=localhost
manifest_path=/workspace/aishell-test-dev-manifests/data/fbank/aishell_cuts_test.jsonl.gz
num_task=60
python3 client/decode_manifest_triton.py \
--server-addr $serveraddr \
--compute-cer \
--model-name infer_pipeline \
--num-tasks $num_task \
--manifest-filename $manifest_path
```
(Note: the service was fully warmed up before benchmarking.)
|concurrent-tasks | processing time(s) | RTF |
|----------|--------------------|------------|
| 60 (onnx fp32) | 116.0 | 0.0032|
## Acknowledgements
This part originates from the NVIDIA CISI project. We also have TTS and NLP solutions deployed on Triton Inference Server. If you are interested, please contact us.

View File

@@ -0,0 +1,64 @@
### Steps:
1. Prepare model repo files
* git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online-onnx.git
* Convert lfr_cmvn_pe.onnx model. For example: python export_lfr_cmvn_pe_onnx.py
* If you export to onnx, you should have several model files in `${MODEL_DIR}`:
```
├── README.md
└── model_repo_paraformer_large_online
├── cif_search
│   ├── 1
│   │   └── model.py
│   └── config.pbtxt
├── decoder
│   ├── 1
│   │   └── decoder.onnx
│   └── config.pbtxt
├── encoder
│   ├── 1
│   │   └── model.onnx
│   └── config.pbtxt
├── feature_extractor
│   ├── 1
│   │   └── model.py
│   ├── config.pbtxt
│   └── config.yaml
├── lfr_cmvn_pe
│   ├── 1
│   │   └── lfr_cmvn_pe.onnx
│   ├── am.mvn
│   ├── config.pbtxt
│   └── export_lfr_cmvn_pe_onnx.py
└── streaming_paraformer
├── 1
└── config.pbtxt
```
2. Follow the instructions below to launch the Triton server
```sh
# using docker image Dockerfile/Dockerfile.server
docker build . -f Dockerfile/Dockerfile.server -t triton-paraformer:23.01
docker run -it --rm --name "paraformer_triton_server" --gpus all -v <path_host/model_repo_paraformer_large_online>:/workspace/ --shm-size 1g --net host triton-paraformer:23.01
# launch the service
cd /workspace
tritonserver --model-repository model_repo_paraformer_large_online \
--pinned-memory-pool-byte-size=512000000 \
--cuda-memory-pool-byte-size=0:1024000000
```
### Performance benchmark with a single A10
* FP32, ONNX, [paraformer large online](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online-onnx/summary). Our chunk size is 10 * 960 / 16000 = 0.6 s, so the latency of each request must stay below 0.6 s for the service to run in real time.
| Concurrency | Throughput | Latency_p50 (ms) | Latency_p90 (ms) | Latency_p95 (ms) | Latency_p99 (ms) |
|-------------|------------|------------------|------------------|------------------|------------------|
| 20 | 309.252 | 56.913 | 76.267 | 85.598 | 138.462 |
| 40 | 391.058 | 97.911 | 145.509 | 150.545 | 185.399 |
| 60 | 426.269 | 138.244 | 185.855 | 201.016 | 236.528 |
| 80 | 431.781 | 170.991 | 227.983 | 252.453 | 412.273 |
| 100 | 473.351 | 206.205 | 262.612 | 288.964 | 463.337 |

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,191 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import multiprocessing
from multiprocessing import Pool
import argparse
import os
import tritonclient.grpc as grpcclient
from utils import cal_cer
from speech_client import *
import numpy as np
if __name__ == "__main__":
    # Command-line client: recognizes one wav file, or every file listed in a
    # wav.scp, by fanning the files out to a pool of worker processes that each
    # talk to the Triton server over gRPC. Prints the CER when references exist.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        required=False,
        default=False,
        help="Enable verbose output",
    )
    parser.add_argument(
        "-u",
        "--url",
        type=str,
        required=False,
        default="localhost:10086",
        # The help text previously advertised localhost:8001, which did not
        # match the actual default above.
        help="Inference server URL. Default is " "localhost:10086.",
    )
    parser.add_argument(
        "--model_name",
        required=False,
        default="attention_rescoring",
        choices=["attention_rescoring", "streaming_wenet", "infer_pipeline"],
        help="the model to send request to",
    )
    parser.add_argument(
        "--wavscp",
        type=str,
        required=False,
        default=None,
        help="audio_id \t wav_path",
    )
    parser.add_argument(
        "--trans",
        type=str,
        required=False,
        default=None,
        help="audio_id \t text",
    )
    parser.add_argument(
        "--data_dir",
        type=str,
        required=False,
        default=None,
        help="path prefix for wav_path in wavscp/audio_file",
    )
    parser.add_argument(
        "--audio_file",
        type=str,
        required=False,
        default=None,
        help="single wav file path",
    )
    # below arguments are for streaming
    # Please check onnx_config.yaml and train.yaml
    parser.add_argument("--streaming", action="store_true", required=False)
    parser.add_argument(
        "--sample_rate",
        type=int,
        required=False,
        default=16000,
        help="sample rate used in training",
    )
    parser.add_argument(
        "--frame_length_ms",
        type=int,
        required=False,
        default=25,
        help="frame length",
    )
    parser.add_argument(
        "--frame_shift_ms",
        type=int,
        required=False,
        default=10,
        help="frame shift length",
    )
    parser.add_argument(
        "--chunk_size",
        type=int,
        required=False,
        default=16,
        help="chunk size default is 16",
    )
    parser.add_argument(
        "--context",
        type=int,
        required=False,
        default=7,
        help="subsampling context",
    )
    parser.add_argument(
        "--subsampling",
        type=int,
        required=False,
        default=4,
        help="subsampling rate",
    )

    FLAGS = parser.parse_args()
    print(FLAGS)

    # load data
    filenames = []
    transcripts = []
    if FLAGS.audio_file is not None:
        path = FLAGS.audio_file
        if FLAGS.data_dir:
            path = os.path.join(FLAGS.data_dir, path)
        if os.path.exists(path):
            filenames = [path]
        else:
            # Previously a missing file was skipped without any message.
            print("Audio file not found: {}".format(path))
    elif FLAGS.wavscp is not None:
        audio_data = {}
        with open(FLAGS.wavscp, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines
                aid, path = line.split("\t", 1)
                if FLAGS.data_dir:
                    path = os.path.join(FLAGS.data_dir, path)
                audio_data[aid] = {"path": path}
        # References are optional: without --trans we only decode and skip CER
        # (the original code crashed on open(None)).
        have_refs = FLAGS.trans is not None
        if have_refs:
            with open(FLAGS.trans, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    aid, text = line.split("\t", 1)
                    audio_data[aid]["text"] = text
        for value in audio_data.values():
            filenames.append(value["path"])
            if have_refs:
                transcripts.append(value["text"])

    # cpu_count() // 2 is 0 on a single-CPU host, which would make both
    # np.array_split and Pool fail; always keep at least one worker.
    num_workers = max(1, multiprocessing.cpu_count() // 2)

    if FLAGS.streaming:
        speech_client_cls = StreamingSpeechClient
    else:
        speech_client_cls = OfflineSpeechClient

    def single_job(client_files):
        """Recognize one worker's share of files.

        ``client_files`` is an ``(idx, audio_files)`` pair. A dedicated gRPC
        client is opened for the call; every file is passed to
        ``speech_client.recognize`` together with ``idx`` and the returned
        results are concatenated into one list, which is returned.
        """
        with grpcclient.InferenceServerClient(
            url=FLAGS.url, verbose=FLAGS.verbose
        ) as triton_client:
            protocol_client = grpcclient
            speech_client = speech_client_cls(
                triton_client, FLAGS.model_name, protocol_client, FLAGS
            )
            idx, audio_files = client_files
            predictions = []
            for li in audio_files:
                result = speech_client.recognize(li, idx)
                print("Recognized {}:{}".format(li, result[0]))
                predictions += result
        return predictions

    # start to do inference
    # Group requests in batches: one contiguous slice of the file list per worker
    predictions = []
    tasks = []
    splits = np.array_split(filenames, num_workers)
    for idx, per_split in enumerate(splits):
        cur_files = per_split.tolist()
        tasks.append((idx, cur_files))

    with Pool(processes=num_workers) as pool:
        predictions = pool.map(single_job, tasks)

    # pool.map returns one list per worker; flatten them in worker order.
    predictions = [item for sublist in predictions for item in sublist]
    if transcripts:
        cer = cal_cer(predictions, transcripts)
        print("CER is: {}".format(cer))

View File

@@ -0,0 +1,518 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
# 2023 Nvidia (authors: Yuekai Zhang)
# See LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script loads a manifest in lhotse format and sends it to the server
for decoding, in parallel.
Usage:
# For offline wenet server
./decode_manifest_triton.py \
--server-addr localhost \
--compute-cer \
--model-name attention_rescoring \
--num-tasks 300 \
--manifest-filename ./aishell-test-dev-manifests/data/fbank/aishell_cuts_test.jsonl.gz # noqa
# For streaming wenet server
./decode_manifest_triton.py \
--server-addr localhost \
--streaming \
--compute-cer \
--context 7 \
--model-name streaming_wenet \
--num-tasks 300 \
--manifest-filename ./aishell-test-dev-manifests/data/fbank/aishell_cuts_test.jsonl.gz # noqa
# For simulate streaming mode wenet server
./decode_manifest_triton.py \
--server-addr localhost \
--simulate-streaming \
--compute-cer \
--context 7 \
--model-name streaming_wenet \
--num-tasks 300 \
--manifest-filename ./aishell-test-dev-manifests/data/fbank/aishell_cuts_test.jsonl.gz # noqa
# For test container:
docker run -it --rm --name "wenet_client_test" --net host --gpus all soar97/triton-k2:22.12.1 # noqa
# For aishell manifests:
apt-get install git-lfs
git-lfs install
git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests
sudo mkdir -p /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell
tar xf ./aishell-test-dev-manifests/data_aishell.tar.gz -C /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell/ # noqa
"""
import argparse
import asyncio
import math
import time
import types
from pathlib import Path
import json
import numpy as np
import tritonclient
import tritonclient.grpc.aio as grpcclient
from lhotse import CutSet, load_manifest
from tritonclient.utils import np_to_triton_dtype
from icefall.utils import store_transcripts, write_error_stats
DEFAULT_MANIFEST_FILENAME = "/mnt/samsung-t7/yuekai/aishell-test-dev-manifests/data/fbank/aishell_cuts_test.jsonl.gz"  # noqa


def get_args():
    """Build the command-line parser and return the parsed arguments.

    Options cover the server endpoint, the manifest to decode, the Triton
    model name, client concurrency, logging frequency, CER/WER selection,
    the streaming modes with their chunking parameters, and the output path
    for the server statistics dump.
    """
    cli = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Endpoint, data and concurrency.
    cli.add_argument("--server-addr", type=str, default="localhost", help="Address of the server")
    cli.add_argument("--server-port", type=int, default=8001, help="Port of the server")
    cli.add_argument(
        "--manifest-filename",
        type=str,
        default=DEFAULT_MANIFEST_FILENAME,
        help="Path to the manifest for decoding",
    )
    cli.add_argument(
        "--model-name",
        type=str,
        default="transducer",
        help="triton model_repo module name to request",
    )
    cli.add_argument("--num-tasks", type=int, default=50, help="Number of tasks to use for sending")
    cli.add_argument(
        "--log-interval",
        type=int,
        default=5,
        help="Controls how frequently we print the log.",
    )

    # Boolean switches (store_true flags default to False).
    switches = (
        (
            "--compute-cer",
            "True to compute CER, e.g., for Chinese.\n"
            "False to compute WER, e.g., for English words.\n",
        ),
        ("--streaming", "True for streaming ASR.\n"),
        (
            "--simulate-streaming",
            "True for strictly simulate streaming ASR.\n"
            "Threads will sleep to simulate the real speaking scene.\n",
        ),
    )
    for flag, text in switches:
        cli.add_argument(flag, action="store_true", help=text)

    # Integer parameters that only matter in the streaming modes.
    int_options = (
        ("--chunk_size", 16, "chunk size default is 16"),
        ("--context", -1, "subsampling context for wenet"),
        ("--encoder_right_context", 2, "encoder right context"),
        ("--subsampling", 4, "subsampling rate"),
    )
    for flag, default_value, text in int_options:
        cli.add_argument(flag, type=int, default=default_value, help=text)

    cli.add_argument(
        "--stats_file",
        type=str,
        default="./stats.json",
        help="output of stats anaylasis",
    )
    return cli.parse_args()
async def send(
    cuts: CutSet,
    name: str,
    triton_client: tritonclient.grpc.aio.InferenceServerClient,
    protocol_client: types.ModuleType,
    log_interval: int,
    compute_cer: bool,
    model_name: str,
):
    """Decode every cut with one offline (whole-utterance) request each.

    Args:
        cuts: the cuts handled by this task; each one is used through
            ``load_audio()``, ``duration``, ``id`` and ``supervisions[0].text``.
        name: label printed in the progress log.
        triton_client: async gRPC client used for ``infer``.
        protocol_client: module providing ``InferInput`` / ``InferRequestedOutput``.
        log_interval: progress is printed every ``log_interval`` cuts.
        compute_cer: if true, reference and hypothesis are compared as
            character lists; otherwise as whitespace-separated words.
        model_name: name of the model to request.

    Returns:
        ``(total_duration, results)`` where ``results`` is a list of
        ``(cut_id, reference_tokens, hypothesis_tokens)``.
    """
    sample_rate = 16000
    results = []
    total_duration = 0.0

    for i, c in enumerate(cuts):
        if i % log_interval == 0:
            print(f"{name}: {i}/{len(cuts)}")

        waveform = c.load_audio().reshape(-1).astype(np.float32)

        # Zero-pad up to the next multiple of 10 seconds (an exact multiple
        # still gains one extra 10 s block).
        padded_len = 10 * sample_rate * (int(len(waveform) / sample_rate // 10) + 1)
        samples = np.zeros((1, padded_len), dtype=np.float32)
        samples[0, : len(waveform)] = waveform
        lengths = np.array([[len(waveform)]], dtype=np.int32)

        wav_input = protocol_client.InferInput(
            "WAV", samples.shape, np_to_triton_dtype(samples.dtype)
        )
        len_input = protocol_client.InferInput(
            "WAV_LENS", lengths.shape, np_to_triton_dtype(lengths.dtype)
        )
        inputs = [wav_input, len_input]
        wav_input.set_data_from_numpy(samples)
        len_input.set_data_from_numpy(lengths)
        outputs = [protocol_client.InferRequestedOutput("TRANSCRIPTS")]

        sequence_id = 10086 + i
        response = await triton_client.infer(
            model_name, inputs, request_id=str(sequence_id), outputs=outputs
        )

        decoding_results = response.as_numpy("TRANSCRIPTS")[0]
        if type(decoding_results) is np.ndarray:
            decoding_results = b" ".join(decoding_results).decode("utf-8")
        else:
            # For wenet
            decoding_results = decoding_results.decode("utf-8")

        total_duration += c.duration

        ref = c.supervisions[0].text.split()
        hyp = decoding_results.split()
        if compute_cer:
            # Character-level comparison: drop the spaces, then split into characters.
            ref = list("".join(ref))
            hyp = list("".join(hyp))
        results.append((c.id, ref, hyp))

    return total_duration, results
async def send_streaming(
    cuts: CutSet,
    name: str,
    triton_client: tritonclient.grpc.aio.InferenceServerClient,
    protocol_client: types.ModuleType,
    log_interval: int,
    compute_cer: bool,
    model_name: str,
    first_chunk_in_secs: float,
    other_chunk_in_secs: float,
    task_index: int,
    simulate_mode: bool = False,
):
    """Decode every cut chunk by chunk through a Triton sequence.

    Each waveform is cut into a first chunk of ``first_chunk_in_secs`` and
    subsequent chunks of ``other_chunk_in_secs``. Every chunk is zero-padded
    to its nominal size and sent with the sequence start/end flags set on the
    first/last chunk. The transcript of the last response is kept as the
    hypothesis for the cut.

    Args:
        cuts: the cuts handled by this task; each one is used through
            ``load_audio()``, ``duration``, ``id`` and ``supervisions[0].text``.
        name: label printed in the progress log.
        triton_client: async gRPC client used for ``infer``.
        protocol_client: module providing ``InferInput`` / ``InferRequestedOutput``.
        log_interval: progress is printed every ``log_interval`` cuts.
        compute_cer: if true, compare character lists instead of word lists.
        model_name: name of the model to request.
        first_chunk_in_secs: duration of the first chunk of every utterance.
        other_chunk_in_secs: duration of every following chunk.
        task_index: index of this task; ``task_index + 10086`` is the sequence id.
        simulate_mode: if true, sleep for each chunk's audio duration before
            sending it.

    Returns:
        ``(total_duration, results, latency_data)`` where ``latency_data`` holds
        one ``(request_seconds, chunk_audio_seconds)`` pair per chunk.

    Raises:
        ValueError: if a chunk duration is shorter than one sample, which
            would otherwise make the chunking loop spin forever.
    """
    sample_rate = 16000
    first_stride = int(first_chunk_in_secs * sample_rate)
    other_stride = int(other_chunk_in_secs * sample_rate)
    if first_stride <= 0 or other_stride <= 0:
        raise ValueError("chunk durations must cover at least one sample")

    total_duration = 0.0
    results = []
    latency_data = []

    for i, c in enumerate(cuts):
        if i % log_interval == 0:
            print(f"{name}: {i}/{len(cuts)}")

        waveform = c.load_audio().reshape(-1).astype(np.float32)

        # Cut the waveform into chunks; only the last one may be short.
        wav_segs = []
        j = 0
        while j < len(waveform):
            stride = first_stride if j == 0 else other_stride
            wav_segs.append(waveform[j : j + stride])
            j += len(wav_segs[-1])

        sequence_id = task_index + 10086
        # Reset per cut so an empty waveform yields an empty hypothesis instead
        # of a NameError or the previous cut's transcript.
        decoding_results = ""

        for idx, seg in enumerate(wav_segs):
            chunk_len = len(seg)

            if simulate_mode:
                await asyncio.sleep(chunk_len / sample_rate)

            chunk_start = time.time()

            # Zero-pad the chunk to its nominal size; the true length goes in WAV_LENS.
            chunk_samples = first_stride if idx == 0 else other_stride
            expect_input = np.zeros((1, chunk_samples), dtype=np.float32)
            expect_input[0][0:chunk_len] = seg
            input0_data = expect_input
            input1_data = np.array([[chunk_len]], dtype=np.int32)

            inputs = [
                protocol_client.InferInput(
                    "WAV",
                    input0_data.shape,
                    np_to_triton_dtype(input0_data.dtype),
                ),
                protocol_client.InferInput(
                    "WAV_LENS",
                    input1_data.shape,
                    np_to_triton_dtype(input1_data.dtype),
                ),
            ]
            inputs[0].set_data_from_numpy(input0_data)
            inputs[1].set_data_from_numpy(input1_data)

            outputs = [protocol_client.InferRequestedOutput("TRANSCRIPTS")]

            response = await triton_client.infer(
                model_name,
                inputs,
                outputs=outputs,
                sequence_id=sequence_id,
                sequence_start=idx == 0,
                sequence_end=idx == len(wav_segs) - 1,
            )

            decoding_results = response.as_numpy("TRANSCRIPTS")
            if isinstance(decoding_results, np.ndarray):
                decoding_results = b" ".join(decoding_results).decode("utf-8")
            else:
                # For wenet
                decoding_results = response.as_numpy("TRANSCRIPTS")[0].decode("utf-8")

            # Wall-clock time of this request (the simulated sleep is excluded).
            chunk_end = time.time() - chunk_start
            latency_data.append((chunk_end, chunk_len / sample_rate))

        total_duration += c.duration

        ref = c.supervisions[0].text.split()
        hyp = decoding_results.split()
        if compute_cer:
            # Character-level comparison: drop the spaces, then split into characters.
            ref = list("".join(ref))
            hyp = list("".join(hyp))
        results.append((c.id, ref, hyp))

    return total_duration, results, latency_data
async def main():
    """Decode a manifest against a Triton server and write the reports.

    The manifest is split into ``--num-tasks`` parts that are decoded
    concurrently, either offline (``send``) or chunk by chunk
    (``send_streaming``). Afterwards the RTF (plus latency statistics in the
    streaming modes) is printed and written to ``rtf.txt``, the transcripts and
    error statistics go to ``recogs-<name>.txt`` / ``errs-<name>.txt``, and the
    server inference statistics are dumped to ``--stats_file``.
    """
    args = get_args()
    filename = args.manifest_filename
    url = f"{args.server_addr}:{args.server_port}"
    num_tasks = args.num_tasks
    log_interval = args.log_interval
    compute_cer = args.compute_cer
    streaming_mode = args.streaming or args.simulate_streaming

    cuts = load_manifest(filename)
    cuts_list = cuts.split(num_tasks)

    if streaming_mode:
        # The two streaming flags are mutually exclusive.
        assert not (args.streaming and args.simulate_streaming)

        frame_shift_ms = 10
        frame_length_ms = 25
        add_frames = math.ceil((frame_length_ms - frame_shift_ms) / frame_shift_ms)
        # decode_window_length: input sequence length of streaming encoder
        if args.context > 0:
            # decode window length calculation for wenet
            decode_window_length = (args.chunk_size - 1) * args.subsampling + args.context
        else:
            # decode window length calculation for icefall
            decode_window_length = (
                args.chunk_size + 2 + args.encoder_right_context
            ) * args.subsampling + 3

        first_chunk_ms = (decode_window_length + add_frames) * frame_shift_ms
        first_chunk_in_secs = first_chunk_ms / 1000
        other_chunk_in_secs = args.chunk_size * args.subsampling * frame_shift_ms / 1000

    triton_client = grpcclient.InferenceServerClient(url=url, verbose=False)
    protocol_client = grpcclient

    try:
        tasks = []
        start_time = time.time()
        for i in range(num_tasks):
            if streaming_mode:
                job = send_streaming(
                    cuts=cuts_list[i],
                    name=f"task-{i}",
                    triton_client=triton_client,
                    protocol_client=protocol_client,
                    log_interval=log_interval,
                    compute_cer=compute_cer,
                    model_name=args.model_name,
                    first_chunk_in_secs=first_chunk_in_secs,
                    other_chunk_in_secs=other_chunk_in_secs,
                    task_index=i,
                    # Only --simulate-streaming sleeps between chunks.
                    simulate_mode=args.simulate_streaming,
                )
            else:
                job = send(
                    cuts=cuts_list[i],
                    name=f"task-{i}",
                    triton_client=triton_client,
                    protocol_client=protocol_client,
                    log_interval=log_interval,
                    compute_cer=compute_cer,
                    model_name=args.model_name,
                )
            tasks.append(asyncio.create_task(job))

        ans_list = await asyncio.gather(*tasks)
        elapsed = time.time() - start_time

        results = []
        total_duration = 0.0
        latency_data = []
        for ans in ans_list:
            total_duration += ans[0]
            results += ans[1]
            if streaming_mode:
                latency_data += ans[2]

        # Guard against an empty manifest instead of raising ZeroDivisionError.
        rtf = elapsed / total_duration if total_duration > 0 else float("nan")

        s = f"RTF: {rtf:.4f}\n"
        s += f"total_duration: {total_duration:.3f} seconds\n"
        s += f"({total_duration/3600:.2f} hours)\n"
        s += f"processing time: {elapsed:.3f} seconds " f"({elapsed/3600:.2f} hours)\n"

        if streaming_mode and latency_data:
            latency_list = [chunk_end for (chunk_end, chunk_duration) in latency_data]
            latency_ms = sum(latency_list) / float(len(latency_list)) * 1000.0
            # NOTE(review): this is the variance of latencies in seconds scaled
            # by 1000, which is neither s^2 nor ms^2. Kept as-is so reported
            # numbers stay comparable -- confirm the intended unit.
            latency_variance = np.var(latency_list, dtype=np.float64) * 1000.0
            s += f"latency_variance: {latency_variance:.2f}\n"
            s += f"latency_50_percentile: {np.percentile(latency_list, 50) * 1000.0:.2f}\n"
            s += f"latency_90_percentile: {np.percentile(latency_list, 90) * 1000.0:.2f}\n"
            s += f"latency_99_percentile: {np.percentile(latency_list, 99) * 1000.0:.2f}\n"
            s += f"average_latency_ms: {latency_ms:.2f}\n"

        print(s)

        with open("rtf.txt", "w") as f:
            f.write(s)

        name = Path(filename).stem.split(".")[0]
        results = sorted(results)
        store_transcripts(filename=f"recogs-{name}.txt", texts=results)

        with open(f"errs-{name}.txt", "w") as f:
            write_error_stats(f, "test-set", results, enable_log=True)

        with open(f"errs-{name}.txt", "r") as f:
            print(f.readline())  # WER
            print(f.readline())  # Detailed errors

        if args.stats_file:
            stats = await triton_client.get_inference_statistics(model_name="", as_json=True)
            with open(args.stats_file, "w") as f:
                json.dump(stats, f)
    finally:
        # Release the gRPC channel even when decoding or reporting fails.
        await triton_client.close()


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,536 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
# 2023 Nvidia (authors: Yuekai Zhang)
# 2023 Recurrent.ai (authors: Songtao Shi)
# See LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script loads a manifest in nemo format and sends it to the server
for decoding, in parallel.
{'audio_filepath':'','text':'',duration:}\n
{'audio_filepath':'','text':'',duration:}\n
Usage:
# For aishell manifests:
apt-get install git-lfs
git-lfs install
git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests
sudo mkdir -p ./aishell-test-dev-manifests/aishell
tar xf ./aishell-test-dev-manifests/data_aishell.tar.gz -C ./aishell-test-dev-manifests/aishell # noqa
# cmd run
manifest_path='./client/aishell_test.txt'
serveraddr=localhost
num_task=60
python3 client/decode_manifest_triton_wo_cuts.py \
--server-addr $serveraddr \
--compute-cer \
--model-name infer_pipeline \
--num-tasks $num_task \
--manifest-filename $manifest_path \
"""
from pydub import AudioSegment
import argparse
import asyncio
import math
import time
import types
from pathlib import Path
import json
import os
import numpy as np
import tritonclient
import tritonclient.grpc.aio as grpcclient
from tritonclient.utils import np_to_triton_dtype
from icefall.utils import store_transcripts, write_error_stats
DEFAULT_MANIFEST_FILENAME = "./aishell_test.txt"  # noqa
# NOTE(review): DEFAULT_ROOT is assigned twice; the second, machine-specific
# path is the one that takes effect. Confirm which default is intended.
DEFAULT_ROOT = "./"
DEFAULT_ROOT = "/mfs/songtao/researchcode/FunASR/data/"


def get_args():
    """Build the command-line parser and return the parsed arguments.

    Options cover the server endpoint, the manifest to decode, the Triton
    model name, client concurrency, logging frequency, CER/WER selection,
    the streaming modes with their chunking parameters, and the output path
    for the server statistics dump.
    """
    cli = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Endpoint, data and concurrency.
    cli.add_argument("--server-addr", type=str, default="localhost", help="Address of the server")
    cli.add_argument("--server-port", type=int, default=8001, help="Port of the server")
    cli.add_argument(
        "--manifest-filename",
        type=str,
        default=DEFAULT_MANIFEST_FILENAME,
        help="Path to the manifest for decoding",
    )
    cli.add_argument(
        "--model-name",
        type=str,
        default="transducer",
        help="triton model_repo module name to request",
    )
    cli.add_argument("--num-tasks", type=int, default=50, help="Number of tasks to use for sending")
    cli.add_argument(
        "--log-interval",
        type=int,
        default=5,
        help="Controls how frequently we print the log.",
    )

    # Boolean switches (store_true flags default to False).
    switches = (
        (
            "--compute-cer",
            "True to compute CER, e.g., for Chinese.\n"
            "False to compute WER, e.g., for English words.\n",
        ),
        ("--streaming", "True for streaming ASR.\n"),
        (
            "--simulate-streaming",
            "True for strictly simulate streaming ASR.\n"
            "Threads will sleep to simulate the real speaking scene.\n",
        ),
    )
    for flag, text in switches:
        cli.add_argument(flag, action="store_true", help=text)

    # Integer parameters that only matter in the streaming modes.
    int_options = (
        ("--chunk_size", 16, "chunk size default is 16"),
        ("--context", -1, "subsampling context for wenet"),
        ("--encoder_right_context", 2, "encoder right context"),
        ("--subsampling", 4, "subsampling rate"),
    )
    for flag, default_value, text in int_options:
        cli.add_argument(flag, type=int, default=default_value, help=text)

    cli.add_argument(
        "--stats_file",
        type=str,
        default="./stats.json",
        help="output of stats anaylasis",
    )
    return cli.parse_args()
def load_manifest(fp):
    """Read a line-based manifest into a list of dicts.

    Every non-blank line must hold one record, written either as JSON or as a
    Python dict literal (e.g. ``{'audio_filepath': ..., 'text': ..., 'duration': ...}``).
    Each record gets an ``"id"`` key holding its zero-based line index.

    Args:
        fp: path of the manifest file.

    Returns:
        The list of records, in file order.

    Raises:
        ValueError / SyntaxError: if a line is neither valid JSON nor a valid
            Python literal.
    """
    import ast
    import json

    data = []
    with open(fp, encoding="utf-8") as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing empty lines
            # SECURITY: the original used eval() here, which executes arbitrary
            # code found in the manifest. Records are plain data, so parse them
            # as data instead: JSON first, Python literal as a fallback.
            try:
                dp = json.loads(line)
            except ValueError:
                dp = ast.literal_eval(line)
            dp["id"] = i
            data.append(dp)
    return data
def split_dps(dps, num_tasks):
    """Split ``dps`` into exactly ``num_tasks`` lists of near-equal size.

    Every split first receives a contiguous run of ``len(dps) // num_tasks``
    items; the leftover tail items are then appended one each to the first
    splits. The previous implementation raised ``IndexError`` when the length
    was divisible by ``num_tasks`` (or returned one split too few) and returned
    more than ``num_tasks`` splits when the remainder exceeded the chunk size.
    Whenever the old code did return ``num_tasks`` splits, the result here is
    identical.

    Args:
        dps: list of data points.
        num_tasks: number of splits to produce.

    Returns:
        A list of ``num_tasks`` lists that together contain every item once.
    """
    assert len(dps) > num_tasks
    per_task = len(dps) // num_tasks
    tail_start = per_task * num_tasks
    leftover = len(dps) - tail_start

    dps_splited = []
    for k in range(num_tasks):
        chunk = list(dps[k * per_task : (k + 1) * per_task])
        if k < leftover:
            # Hand one of the tail items to each of the first `leftover` splits.
            chunk.append(dps[tail_start + k])
        dps_splited.append(chunk)
    return dps_splited
def load_audio(path):
    """Load a wav file as 16 kHz mono float32 samples.

    Args:
        path: path of the wav file.

    Returns:
        ``(samples, duration_seconds)``: the samples divided by 32768.0 and
        cast to float32, plus the ``duration_seconds`` reported by pydub for
        the converted segment.
    """
    segment = AudioSegment.from_wav(path)
    segment = segment.set_frame_rate(16000)
    segment = segment.set_channels(1)
    raw_samples = np.array(segment.get_array_of_samples())
    # NOTE(review): the 32768.0 divisor presumably assumes 16-bit PCM -- confirm
    # for inputs with another sample width.
    scaled = raw_samples / 32768.0
    return scaled.astype(np.float32), segment.duration_seconds
async def send(
    dps: list,
    name: str,
    triton_client: tritonclient.grpc.aio.InferenceServerClient,
    protocol_client: types.ModuleType,
    log_interval: int,
    compute_cer: bool,
    model_name: str,
):
    """Decode every manifest record with one offline request each.

    Args:
        dps: manifest records; each one is read through the keys
            ``"audio_filepath"`` (joined onto ``DEFAULT_ROOT``), ``"text"``
            and ``"id"``.
        name: label printed in the progress log.
        triton_client: async gRPC client used for ``infer``.
        protocol_client: module providing ``InferInput`` / ``InferRequestedOutput``.
        log_interval: progress is printed every ``log_interval`` records.
        compute_cer: if true, reference and hypothesis are compared as
            character lists; otherwise as whitespace-separated words.
        model_name: name of the model to request.

    Returns:
        ``(total_duration, results)`` where ``results`` is a list of
        ``(record_id, reference_tokens, hypothesis_tokens)``.
    """
    sample_rate = 16000
    results = []
    total_duration = 0.0

    for i, dp in enumerate(dps):
        if i % log_interval == 0:
            print(f"{name}: {i}/{len(dps)}")

        wav_path = os.path.join(DEFAULT_ROOT, dp["audio_filepath"])
        waveform, duration = load_audio(wav_path)

        # Zero-pad up to the next multiple of 10 seconds (an exact multiple
        # still gains one extra 10 s block).
        padded_len = 10 * sample_rate * (int(len(waveform) / sample_rate // 10) + 1)
        samples = np.zeros((1, padded_len), dtype=np.float32)
        samples[0, : len(waveform)] = waveform
        lengths = np.array([[len(waveform)]], dtype=np.int32)

        wav_input = protocol_client.InferInput(
            "WAV", samples.shape, np_to_triton_dtype(samples.dtype)
        )
        len_input = protocol_client.InferInput(
            "WAV_LENS", lengths.shape, np_to_triton_dtype(lengths.dtype)
        )
        inputs = [wav_input, len_input]
        wav_input.set_data_from_numpy(samples)
        len_input.set_data_from_numpy(lengths)
        outputs = [protocol_client.InferRequestedOutput("TRANSCRIPTS")]

        sequence_id = 10086 + i
        response = await triton_client.infer(
            model_name, inputs, request_id=str(sequence_id), outputs=outputs
        )

        decoding_results = response.as_numpy("TRANSCRIPTS")[0]
        if type(decoding_results) is np.ndarray:
            decoding_results = b" ".join(decoding_results).decode("utf-8")
        else:
            # For wenet
            decoding_results = decoding_results.decode("utf-8")

        total_duration += duration

        ref = dp["text"].split()
        hyp = decoding_results.split()
        if compute_cer:
            # Character-level comparison: drop the spaces, then split into characters.
            ref = list("".join(ref))
            hyp = list("".join(hyp))
        results.append((dp["id"], ref, hyp))

    return total_duration, results
async def send_streaming(
    dps: list,
    name: str,
    triton_client: tritonclient.grpc.aio.InferenceServerClient,
    protocol_client: types.ModuleType,
    log_interval: int,
    compute_cer: bool,
    model_name: str,
    first_chunk_in_secs: float,
    other_chunk_in_secs: float,
    task_index: int,
    simulate_mode: bool = False,
):
    """Transcribe manifest entries by streaming each utterance chunk by chunk.

    Every utterance is cut into one first chunk of ``first_chunk_in_secs`` and
    following chunks of ``other_chunk_in_secs``; each chunk is zero-padded to
    its nominal size and sent as one step of a Triton sequence.

    Args:
        dps: Manifest entries; each is read for ``audio_filepath``, ``text``
            and ``id``.
        name: Label used in progress messages.
        triton_client: Async Triton gRPC client used for ``infer``.
        protocol_client: Module providing ``InferInput`` and
            ``InferRequestedOutput``.
        log_interval: Print progress every ``log_interval`` utterances.
        compute_cer: When true, score on characters instead of words.
        model_name: Name of the Triton model to call.
        first_chunk_in_secs: Duration of the first chunk in seconds.
        other_chunk_in_secs: Duration of every later chunk in seconds.
        task_index: Index of this task; used to derive the sequence id.
        simulate_mode: When true, sleep for each chunk's duration before
            sending it.

    Returns:
        ``(total_duration, results, latency_data)`` where ``latency_data`` is
        a list of ``(request_seconds, chunk_seconds)`` pairs, one per chunk.

    Raises:
        ValueError: If a chunk duration maps to fewer than one sample (the
            chunking loop could otherwise never advance).
    """
    sample_rate = 16000
    first_chunk_samples = int(first_chunk_in_secs * sample_rate)
    other_chunk_samples = int(other_chunk_in_secs * sample_rate)
    # The same id is reused for each utterance of this task; every utterance
    # is delimited by its own sequence_start / sequence_end flags.
    sequence_id = task_index + 10086

    total_duration = 0.0
    results = []
    latency_data = []

    for i, dp in enumerate(dps):
        if i % log_interval == 0:
            print(f"{name}: {i}/{len(dps)}")

        # NOTE(review): unlike send(), the path is not joined with
        # DEFAULT_ROOT here — confirm which behaviour is intended.
        waveform, duration = load_audio(dp["audio_filepath"])

        wav_segs = []
        offset = 0
        while offset < len(waveform):
            stride = first_chunk_samples if offset == 0 else other_chunk_samples
            if stride <= 0:
                raise ValueError(
                    "chunk duration must cover at least one sample "
                    f"(got {stride} samples)"
                )
            wav_segs.append(waveform[offset : offset + stride])
            offset += len(wav_segs[-1])

        # Stays empty when the utterance has no samples at all.
        decoding_results = ""
        last_idx = len(wav_segs) - 1
        for idx, seg in enumerate(wav_segs):
            chunk_len = len(seg)
            if simulate_mode:
                await asyncio.sleep(chunk_len / sample_rate)

            chunk_start = time.time()
            chunk_samples = first_chunk_samples if idx == 0 else other_chunk_samples
            input0_data = np.zeros((1, chunk_samples), dtype=np.float32)
            input0_data[0][0:chunk_len] = seg
            input1_data = np.array([[chunk_len]], dtype=np.int32)

            inputs = [
                protocol_client.InferInput(
                    "WAV",
                    input0_data.shape,
                    np_to_triton_dtype(input0_data.dtype),
                ),
                protocol_client.InferInput(
                    "WAV_LENS",
                    input1_data.shape,
                    np_to_triton_dtype(input1_data.dtype),
                ),
            ]
            inputs[0].set_data_from_numpy(input0_data)
            inputs[1].set_data_from_numpy(input1_data)
            outputs = [protocol_client.InferRequestedOutput("TRANSCRIPTS")]

            response = await triton_client.infer(
                model_name,
                inputs,
                outputs=outputs,
                sequence_id=sequence_id,
                sequence_start=idx == 0,
                sequence_end=idx == last_idx,
            )

            transcripts = response.as_numpy("TRANSCRIPTS")
            if isinstance(transcripts, np.ndarray):
                # Flatten first so that both a 1-D list of byte strings and a
                # batched (1, N) output can be joined; bytes.join cannot
                # consume the rows of a 2-D object array.
                decoding_results = b" ".join(transcripts.reshape(-1).tolist()).decode("utf-8")
            else:
                # For wenet
                decoding_results = transcripts[0].decode("utf-8")

            chunk_end = time.time() - chunk_start
            latency_data.append((chunk_end, chunk_len / sample_rate))

        total_duration += duration

        ref = dp["text"].split()
        hyp = decoding_results.split()
        if compute_cer:
            # Character-level scoring: drop whitespace, then split into chars.
            ref = list("".join(ref))
            hyp = list("".join(hyp))
        results.append((dp["id"], ref, hyp))

    return total_duration, results, latency_data
async def main():
    """Run the benchmark: dispatch tasks, then write RTF, latency and error reports.

    Reads the manifest named by the command-line arguments, splits it across
    ``num_tasks`` concurrent tasks (offline, streaming or simulated
    streaming), and writes ``rtf.txt``, ``recogs-<name>.txt`` and
    ``errs-<name>.txt``. When ``--stats_file`` is set, the server inference
    statistics are dumped there as JSON.
    """
    args = get_args()
    filename = args.manifest_filename
    url = f"{args.server_addr}:{args.server_port}"
    num_tasks = args.num_tasks
    streaming_mode = args.streaming or args.simulate_streaming

    dps = load_manifest(filename)
    dps_list = split_dps(dps, num_tasks)

    triton_client = grpcclient.InferenceServerClient(url=url, verbose=False)
    protocol_client = grpcclient

    try:
        if streaming_mode:
            frame_shift_ms = 10
            frame_length_ms = 25
            add_frames = math.ceil((frame_length_ms - frame_shift_ms) / frame_shift_ms)
            # decode_window_length: input sequence length of streaming encoder
            if args.context > 0:
                # decode window length calculation for wenet
                decode_window_length = (args.chunk_size - 1) * args.subsampling + args.context
            else:
                # decode window length calculation for icefall
                decode_window_length = (
                    args.chunk_size + 2 + args.encoder_right_context
                ) * args.subsampling + 3
            first_chunk_ms = (decode_window_length + add_frames) * frame_shift_ms
            first_chunk_in_secs = first_chunk_ms / 1000
            other_chunk_in_secs = args.chunk_size * args.subsampling * frame_shift_ms / 1000

        start_time = time.time()
        tasks = []
        for i in range(num_tasks):
            common = dict(
                dps=dps_list[i],
                name=f"task-{i}",
                triton_client=triton_client,
                protocol_client=protocol_client,
                log_interval=args.log_interval,
                compute_cer=args.compute_cer,
                model_name=args.model_name,
            )
            if streaming_mode:
                if args.streaming:
                    assert not args.simulate_streaming
                coro = send_streaming(
                    **common,
                    first_chunk_in_secs=first_chunk_in_secs,
                    other_chunk_in_secs=other_chunk_in_secs,
                    task_index=i,
                    # Real-time pacing only applies to simulated streaming.
                    simulate_mode=not args.streaming,
                )
            else:
                coro = send(**common)
            tasks.append(asyncio.create_task(coro))

        ans_list = await asyncio.gather(*tasks)
        elapsed = time.time() - start_time

        results = []
        total_duration = 0.0
        latency_data = []
        for ans in ans_list:
            total_duration += ans[0]
            results += ans[1]
            if streaming_mode:
                latency_data += ans[2]

        # An empty manifest has no audio; report NaN instead of dividing by 0.
        rtf = elapsed / total_duration if total_duration > 0 else float("nan")

        s = f"RTF: {rtf:.4f}\n"
        s += f"total_duration: {total_duration:.3f} seconds\n"
        s += f"({total_duration/3600:.2f} hours)\n"
        s += f"processing time: {elapsed:.3f} seconds " f"({elapsed/3600:.2f} hours)\n"

        if streaming_mode and latency_data:
            # Convert to milliseconds once so every statistic shares a unit;
            # the variance is therefore in ms^2 (scaling a variance taken in
            # seconds by 1000 would be neither s^2 nor ms^2).
            latency_ms = (
                np.asarray([lat for (lat, _chunk_secs) in latency_data], dtype=np.float64)
                * 1000.0
            )
            s += f"latency_variance: {np.var(latency_ms):.2f}\n"
            s += f"latency_50_percentile: {np.percentile(latency_ms, 50):.2f}\n"
            s += f"latency_90_percentile: {np.percentile(latency_ms, 90):.2f}\n"
            s += f"latency_99_percentile: {np.percentile(latency_ms, 99):.2f}\n"
            s += f"average_latency_ms: {latency_ms.mean():.2f}\n"

        print(s)

        with open("rtf.txt", "w") as f:
            f.write(s)

        name = Path(filename).stem.split(".")[0]
        results = sorted(results)
        store_transcripts(filename=f"recogs-{name}.txt", texts=results)

        with open(f"errs-{name}.txt", "w") as f:
            write_error_stats(f, "test-set", results, enable_log=True)

        with open(f"errs-{name}.txt", "r") as f:
            print(f.readline())  # WER
            print(f.readline())  # Detailed errors

        if args.stats_file:
            stats = await triton_client.get_inference_statistics(model_name="", as_json=True)
            with open(args.stats_file, "w") as f:
                json.dump(stats, f)
    finally:
        # Release the gRPC channel even when a task or report step fails.
        await triton_client.close()


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,140 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tritonclient.utils import np_to_triton_dtype
import numpy as np
import math
import soundfile as sf
class OfflineSpeechClient(object):
    """Client that sends a whole audio file to the model in one request."""

    def __init__(self, triton_client, model_name, protocol_client, args):
        """Store the client handles; ``args`` is accepted but not read."""
        self.model_name = model_name
        self.protocol_client = protocol_client
        self.triton_client = triton_client

    def recognize(self, wav_file, idx=0):
        """Transcribe ``wav_file`` and return the text in a one-element list.

        Args:
            wav_file: Path (or file object) passed to ``soundfile.read``.
            idx: Offset added to 10086 to form the request id.

        Returns:
            ``[transcript]`` where ``transcript`` is the UTF-8 decoded first
            element of the ``TRANSCRIPTS`` output.
        """
        waveform, sample_rate = sf.read(wav_file)
        num_samples = len(waveform)

        # Padding the waveform up to a whole number of seconds would give the
        # server more uniform input shapes, e.g.:
        #   target_seconds = math.ceil(num_samples / sample_rate)
        #   padded = np.zeros([1, target_seconds * sample_rate])
        #   padded[0][0:num_samples] = waveform
        tensors = (
            ("WAV", np.array([waveform], dtype=np.float32)),
            ("WAV_LENS", np.array([[num_samples]], dtype=np.int32)),
        )

        inputs = []
        for tensor_name, data in tensors:
            infer_input = self.protocol_client.InferInput(
                tensor_name, data.shape, np_to_triton_dtype(data.dtype)
            )
            infer_input.set_data_from_numpy(data)
            inputs.append(infer_input)

        response = self.triton_client.infer(
            self.model_name,
            inputs,
            request_id=str(10086 + idx),
            outputs=[self.protocol_client.InferRequestedOutput("TRANSCRIPTS")],
        )
        transcript = response.as_numpy("TRANSCRIPTS")[0].decode("utf-8")
        return [transcript]
class StreamingSpeechClient(object):
    """Client that feeds an audio file to the model chunk by chunk."""

    def __init__(self, triton_client, model_name, protocol_client, args):
        """Store the client handles and derive the chunk durations.

        Args:
            triton_client: Client object whose ``infer`` is called per chunk.
            model_name: Name of the Triton model to call.
            protocol_client: Module providing ``InferInput`` and
                ``InferRequestedOutput``.
            args: Object with ``chunk_size``, ``subsampling``, ``context``,
                ``frame_shift_ms`` and ``frame_length_ms`` attributes.
        """
        self.triton_client = triton_client
        self.protocol_client = protocol_client
        self.model_name = model_name
        chunk_size = args.chunk_size
        subsampling = args.subsampling
        context = args.context
        frame_shift_ms = args.frame_shift_ms
        frame_length_ms = args.frame_length_ms
        # for the first chunk
        # we need additional frames to generate
        # the exact first chunk length frames
        # since the subsampling will look ahead several frames
        first_chunk_length = (chunk_size - 1) * subsampling + context
        add_frames = math.ceil((frame_length_ms - frame_shift_ms) / frame_shift_ms)
        first_chunk_ms = (first_chunk_length + add_frames) * frame_shift_ms
        other_chunk_ms = chunk_size * subsampling * frame_shift_ms
        self.first_chunk_in_secs = first_chunk_ms / 1000
        self.other_chunk_in_secs = other_chunk_ms / 1000

    def recognize(self, wav_file, idx=0):
        """Stream ``wav_file`` to the model and return the final transcript.

        Args:
            wav_file: Path (or file object) passed to ``soundfile.read``.
            idx: Offset added to 10086 to form the sequence id.

        Returns:
            ``[transcript]`` holding the text returned for the last chunk, or
            ``[""]`` when the file contains no samples.

        Raises:
            ValueError: If a chunk duration maps to fewer than one sample (the
                chunking loop could otherwise never advance).
        """
        waveform, sample_rate = sf.read(wav_file)
        first_chunk_samples = int(self.first_chunk_in_secs * sample_rate)
        other_chunk_samples = int(self.other_chunk_in_secs * sample_rate)

        wav_segs = []
        offset = 0
        while offset < len(waveform):
            stride = first_chunk_samples if offset == 0 else other_chunk_samples
            if stride <= 0:
                raise ValueError(
                    "chunk duration must cover at least one sample "
                    f"(got {stride} samples)"
                )
            wav_segs.append(waveform[offset : offset + stride])
            offset += len(wav_segs[-1])

        sequence_id = idx + 10086
        # Defined up front so an empty file returns [""] instead of failing.
        result = ""
        last_chunk = len(wav_segs) - 1
        # simulate streaming
        # (a separate loop variable keeps the ``idx`` argument intact)
        for chunk_idx, seg in enumerate(wav_segs):
            chunk_len = len(seg)
            chunk_samples = first_chunk_samples if chunk_idx == 0 else other_chunk_samples
            input0_data = np.zeros((1, chunk_samples), dtype=np.float32)
            input0_data[0][0:chunk_len] = seg
            input1_data = np.array([[chunk_len]], dtype=np.int32)

            inputs = [
                self.protocol_client.InferInput(
                    "WAV",
                    input0_data.shape,
                    np_to_triton_dtype(input0_data.dtype),
                ),
                self.protocol_client.InferInput(
                    "WAV_LENS",
                    input1_data.shape,
                    np_to_triton_dtype(input1_data.dtype),
                ),
            ]
            inputs[0].set_data_from_numpy(input0_data)
            inputs[1].set_data_from_numpy(input1_data)
            outputs = [self.protocol_client.InferRequestedOutput("TRANSCRIPTS")]

            response = self.triton_client.infer(
                self.model_name,
                inputs,
                outputs=outputs,
                sequence_id=sequence_id,
                sequence_start=chunk_idx == 0,
                sequence_end=chunk_idx == last_chunk,
            )
            result = response.as_numpy("TRANSCRIPTS")[0].decode("utf-8")
            # Chunks are reported 1-based, as before.
            print("Get response from {}th chunk: {}".format(chunk_idx + 1, result))
        return [result]

View File

@@ -0,0 +1,60 @@
import numpy as np
def _levenshtein_distance(ref, hyp):
"""Levenshtein distance is a string metric for measuring the difference
between two sequences. Informally, the levenshtein disctance is defined as
the minimum number of single-character edits (substitutions, insertions or
deletions) required to change one word into the other. We can naturally
extend the edits to word level when calculate levenshtein disctance for
two sentences.
"""
m = len(ref)
n = len(hyp)
# special case
if ref == hyp:
return 0
if m == 0:
return n
if n == 0:
return m
if m < n:
ref, hyp = hyp, ref
m, n = n, m
# use O(min(m, n)) space
distance = np.zeros((2, n + 1), dtype=np.int32)
# initialize distance matrix
for j in range(n + 1):
distance[0][j] = j
# calculate levenshtein distance
for i in range(1, m + 1):
prev_row_idx = (i - 1) % 2
cur_row_idx = i % 2
distance[cur_row_idx][0] = i
for j in range(1, n + 1):
if ref[i - 1] == hyp[j - 1]:
distance[cur_row_idx][j] = distance[prev_row_idx][j - 1]
else:
s_num = distance[prev_row_idx][j - 1] + 1
i_num = distance[cur_row_idx][j - 1] + 1
d_num = distance[prev_row_idx][j] + 1
distance[cur_row_idx][j] = min(s_num, i_num, d_num)
return distance[m % 2][n]
def cal_cer(references, predictions):
    """Compute the character error rate over paired references and predictions.

    Args:
        references: Iterable of reference strings.
        predictions: Iterable of predicted strings, paired with
            ``references`` by position (``zip`` stops at the shorter one).

    Returns:
        Total edit distance divided by the total number of reference
        characters. When there are no reference characters the result is
        ``0.0`` if there are also no errors and ``inf`` otherwise, instead of
        raising ``ZeroDivisionError``.
    """
    errors = 0
    lengths = 0
    for ref, pred in zip(references, predictions):
        cur_ref = list(ref)
        cur_hyp = list(pred)
        errors += _levenshtein_distance(cur_ref, cur_hyp)
        lengths += len(cur_ref)
    if lengths == 0:
        # No reference characters: the rate is undefined, so report a perfect
        # score only when nothing was hypothesised either.
        return 0.0 if errors == 0 else float("inf")
    return float(errors) / lengths

View File

@@ -0,0 +1,18 @@
# Compose definition for a single "asr" service running the
# soar97/triton-sensevoice image.
services:
asr:
image: soar97/triton-sensevoice:24.05
# Host ports 10085-10087 are mapped onto container ports 8000-8002
# (presumably Triton's HTTP, gRPC and metrics endpoints — confirm against run.sh).
ports:
- "10085:8000"
- "10086:8001"
- "10087:8002"
environment:
- PYTHONIOENCODING=utf-8
# Reserve NVIDIA GPU 0 for the container.
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['0']
capabilities: [gpu]
# Start the server through run.sh inside the model repository directory.
command: >
/bin/bash -c "cd ./model_repo_sense_voice_small && bash run.sh"

View File

@@ -0,0 +1,62 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Triton model configuration for the ONNX Runtime "encoder" model.
name: "encoder"
backend: "onnxruntime"
default_model_filename: "model.onnx"
max_batch_size: 64
# Inputs: variable-length feature frames of width 560 plus one length value
# per item (the [1] dim is reshaped away before reaching the model).
input [
{
name: "speech"
data_type: TYPE_FP32
dims: [-1, 560]
},
{
name: "speech_lengths"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
}
]
# Outputs: per-position logits of width 8404 and the token count per item.
output [
{
name: "logits"
data_type: TYPE_FP32
dims: [-1, 8404]
},
{
name: "token_num"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
}
]
# Batch requests on the server side, waiting at most 500 microseconds.
dynamic_batching {
preferred_batch_size: [ 2,4,8,16,32,64 ]
max_queue_delay_microseconds: 500
}
# NOTE(review): "2" presumably selects an ONNX Runtime cuDNN conv algorithm
# search mode — confirm the meaning against the onnxruntime backend docs.
parameters { key: "cudnn_conv_algo_search" value: { string_value: "2" } }
# One model instance, placed on GPU.
instance_group [
{
count: 1
kind: KIND_GPU
}
]

View File

@@ -0,0 +1,314 @@
#!/usr/bin/env python3
#
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import triton_python_backend_utils as pb_utils
from torch.utils.dlpack import to_dlpack
import torch
import numpy as np
import kaldifeat
import _kaldifeat
from typing import List
import json
import yaml
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
class LFR(torch.nn.Module):
    """Batched frame stacking and skipping (low frame rate).

    Adapted from https://github.com/Mddct/devil-asr/blob/main/patch/lfr.py
    """

    def __init__(self, m: int = 7, n: int = 6) -> None:
        """Configure the stacking window ``m`` and the frame step ``n``.

        m = 1, n = 1: features are returned unchanged.
        m = 1, n > 1: frames are skipped.
        m > 1, n = 1: frames are stacked (right context only).
        m > 1, n > 1: full LFR (stack ``m`` frames every ``n`` frames).
        """
        super().__init__()
        self.m = m
        self.n = n
        # Number of copies of the first frame prepended to every sequence.
        self.left_padding_nums = math.ceil((self.m - 1) // 2)

    def forward(
        self, input_tensor: torch.Tensor, input_lens: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Stack ``m`` consecutive frames every ``n`` frames.

        Args:
            input_tensor: Features unpacked as ``(B, T, D)``.
            input_lens: Per-item frame counts, reshaped to ``(B, 1, 1)`` below.

        Returns:
            ``(stacked, new_len)`` where ``stacked`` is reshaped to
            ``(B, -1, D * m)`` and ``new_len`` is the padded length of each
            item floor-divided by ``n``.
        """
        batch, _, feat_dim = input_tensor.size()
        left_pad = self.left_padding_nums
        device = input_tensor.device
        index_dtype = input_lens.dtype

        # Per-item lengths after left/right padding.
        num_windows = torch.ceil(input_lens / self.n)
        lens_with_left_pad = input_lens + left_pad
        last_window_fill = lens_with_left_pad - self.n * (num_windows - 1)
        right_pad = torch.where(
            self.m >= last_window_fill,
            self.m - last_window_fill,
            0,
        )
        padded_lens = left_pad + input_lens + right_pad
        new_len = padded_lens // self.n
        max_padded_len = padded_lens.max().int()

        # Replicate the last valid frame on the right and the first frame on
        # the left, then concatenate along time.
        last_frame_index = (input_lens - 1).view(batch, 1, 1).repeat(1, 1, feat_dim)  # [B,1,D]
        last_frames = torch.gather(input_tensor, 1, last_frame_index)
        right_block = last_frames.repeat(1, right_pad.max().int(), 1)
        left_block = input_tensor[:, 0:1, :].repeat(1, left_pad, 1)
        padded = torch.cat([left_block, input_tensor, right_block], dim=1)

        # Gather indices: keep a position where a mask holds, otherwise point
        # at the final padded position.
        positions = (
            torch.arange(max_padded_len, device=device, dtype=index_dtype)
            .unsqueeze(0)
            .repeat(batch, 1)
        )  # [B, T_all_max]
        real_mask = positions < lens_with_left_pad.unsqueeze(1)  # [B, T_all_max]
        inside_mask = torch.logical_not(positions >= padded_lens.unsqueeze(1)) & real_mask
        fallback = torch.ones(max_padded_len, dtype=index_dtype, device=device).unsqueeze(
            0
        ).repeat(batch, 1) * (max_padded_len - 1)  # [B, T_all_max]
        gather_index = torch.where(torch.logical_or(real_mask, inside_mask), positions, fallback)
        padded = torch.gather(padded, 1, gather_index.unsqueeze(2).repeat(1, 1, feat_dim))

        # Sliding windows of m frames taken every n frames.
        windows = padded.unfold(1, self.m, step=self.n).transpose(2, 3)
        return windows.reshape(batch, -1, feat_dim * self.m), new_len
class WavFrontend:
    """Conventional frontend structure for ASR."""

    def __init__(
        self,
        cmvn_file: str = None,
        fs: int = 16000,
        window: str = "hamming",
        n_mels: int = 80,
        frame_length: int = 25,
        frame_shift: int = 10,
        filter_length_min: int = -1,
        filter_length_max: float = -1,
        lfr_m: int = 7,
        lfr_n: int = 6,
        dither: float = 1.0,
    ) -> None:
        """Store the frontend settings, build the LFR module and load CMVN.

        Args:
            cmvn_file: Path of the CMVN statistics file; when empty or
                ``None`` no statistics are loaded.
            lfr_m: Stacking window passed to ``LFR``.
            lfr_n: Frame step passed to ``LFR``.

        The remaining arguments are stored as attributes of the same name.
        """
        self.fs = fs
        self.window = window
        self.n_mels = n_mels
        self.frame_length = frame_length
        self.frame_shift = frame_shift
        self.filter_length_min = filter_length_min
        self.filter_length_max = filter_length_max
        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.lfr = LFR(lfr_m, lfr_n)
        self.cmvn_file = cmvn_file
        self.dither = dither
        # Always define the attribute so apply_cmvn_batch can test for it.
        self.cmvn = self.load_cmvn() if self.cmvn_file else None

    def apply_cmvn_batch(self, inputs: torch.Tensor) -> torch.Tensor:
        """Apply CMVN to a batch: ``(inputs + shift) * scale``.

        Args:
            inputs: Tensor whose last dimension is the feature dimension.

        Returns:
            The normalised tensor (promoted to float64 by the float64
            statistics), or ``inputs`` unchanged when no CMVN file was
            loaded.
        """
        if self.cmvn is None:
            return inputs
        dim = inputs.shape[-1]
        # Rows 0 and 1 hold the additive shift and the multiplicative scale;
        # a (1, dim) tensor broadcasts over the batch and frame dimensions.
        shift = torch.from_numpy(self.cmvn[0:1, :dim]).to(inputs.device)
        scale = torch.from_numpy(self.cmvn[1:2, :dim]).to(inputs.device)
        return (inputs + shift) * scale

    def load_cmvn(
        self,
    ) -> np.ndarray:
        """Parse ``self.cmvn_file`` into a ``(2, dim)`` float64 array.

        The values are taken from the ``<LearnRateCoef>`` line following the
        ``<AddShift>`` marker (row 0) and the ``<Rescale>`` marker (row 1),
        skipping the first three tokens and the last token of that line.

        Raises:
            ValueError: If either section is missing from the file.
        """
        with open(self.cmvn_file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        means_list = []
        scales_list = []
        for i, line in enumerate(lines):
            tokens = line.split()
            # Blank lines and a marker on the very last line have nothing to
            # parse; indexing them would raise IndexError.
            if not tokens or i + 1 >= len(lines):
                continue
            if tokens[0] == "<AddShift>":
                values = lines[i + 1].split()
                if values and values[0] == "<LearnRateCoef>":
                    means_list = values[3:-1]
            elif tokens[0] == "<Rescale>":
                values = lines[i + 1].split()
                if values and values[0] == "<LearnRateCoef>":
                    scales_list = values[3:-1]

        if not means_list or not scales_list:
            raise ValueError(
                f"{self.cmvn_file}: missing <AddShift>/<Rescale> statistics"
            )

        means = np.array(means_list).astype(np.float64)
        scales = np.array(scales_list).astype(np.float64)
        return np.array([means, scales])
class Fbank(torch.nn.Module):
    """``torch.nn.Module`` wrapper around ``kaldifeat.Fbank``."""

    def __init__(self, opts):
        """Build the wrapped extractor from the given options object."""
        super().__init__()
        self.fbank = kaldifeat.Fbank(opts)

    def forward(self, waves: List[torch.Tensor]):
        """Return whatever the wrapped extractor produces for ``waves``."""
        features = self.fbank(waves)
        return features
class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.

    This model turns raw waveforms into the ``speech`` / ``speech_lengths``
    tensors: fbank features, then LFR stacking, then CMVN.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name

        Raises
        ------
        ValueError
          If the ``config_path`` or ``cmvn_path`` parameter is missing.
        """
        self.model_config = model_config = json.loads(args["model_config"])
        self.max_batch_size = max(model_config["max_batch_size"], 1)
        # NOTE(review): the instance device id passed in ``args`` is not used;
        # every instance runs on the default CUDA device.
        self.device = "cuda"

        # Get OUTPUT0 configuration and map its Triton type to a torch dtype.
        output0_config = pb_utils.get_output_config_by_name(model_config, "speech")
        output0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])
        self.output0_dtype = torch.float32 if output0_dtype == np.float32 else torch.float16

        # Get OUTPUT1 configuration (kept as a numpy type).
        output1_config = pb_utils.get_output_config_by_name(model_config, "speech_lengths")
        self.output1_dtype = pb_utils.triton_string_to_numpy(output1_config["data_type"])

        config = None
        cmvn_path = None
        for key, value in self.model_config["parameters"].items():
            value = value["string_value"]
            if key == "config_path":
                # SECURITY NOTE: yaml.Loader can construct arbitrary Python
                # objects; only point config_path at trusted files.
                with open(str(value), "rb") as f:
                    config = yaml.load(f, Loader=yaml.Loader)
            elif key == "cmvn_path":
                cmvn_path = str(value)
        if config is None or cmvn_path is None:
            raise ValueError(
                "model parameters must define both 'config_path' and 'cmvn_path'"
            )

        frontend_conf = config["frontend_conf"]
        opts = kaldifeat.FbankOptions()
        opts.frame_opts.dither = 1.0  # TODO: 0.0 or 1.0
        opts.frame_opts.window_type = frontend_conf["window"]
        opts.mel_opts.num_bins = int(frontend_conf["n_mels"])
        opts.frame_opts.frame_shift_ms = float(frontend_conf["frame_shift"])
        opts.frame_opts.frame_length_ms = float(frontend_conf["frame_length"])
        opts.frame_opts.samp_freq = int(frontend_conf["fs"])
        opts.device = torch.device(self.device)
        self.opts = opts

        self.feature_extractor = Fbank(self.opts)
        self.feature_size = opts.mel_opts.num_bins
        self.frontend = WavFrontend(cmvn_file=cmvn_path, **frontend_conf)

    def extract_feat(
        self, waveform_list: List[np.ndarray]
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute LFR + CMVN features for a list of waveforms.

        Args:
            waveform_list: One numpy array per utterance; each is squeezed
                before feature extraction.

        Returns:
            ``(feats, feats_len)``: the feature batch cast to
            ``self.output0_dtype`` and the per-utterance lengths as int32.
        """
        wavs = [
            torch.from_numpy(waveform).float().squeeze().to(self.device)
            for waveform in waveform_list
        ]
        features = self.feature_extractor(wavs)
        features_len = [feature.shape[0] for feature in features]

        # Zero-pad every utterance to the longest one in the batch.
        speech = torch.zeros(
            (len(features), max(features_len), self.opts.mel_opts.num_bins),
            dtype=self.output0_dtype,
            device=self.device,
        )
        for i, feature in enumerate(features):
            speech[i, : int(features_len[i])] = feature
        speech_lens = torch.tensor(features_len, dtype=torch.int64).to(self.device)

        feats, feats_len = self.frontend.lfr(speech, speech_lens)
        feats_len = feats_len.type(torch.int32)
        feats = self.frontend.apply_cmvn_batch(feats)
        feats = feats.type(self.output0_dtype)
        return feats, feats_len

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        if not requests:
            return []

        total_waves = []
        for request in requests:
            input0 = pb_utils.get_input_tensor_by_name(request, "wav")
            # NOTE(review): the "wav_lens" input is not consulted; features
            # are computed over each full waveform as received.
            # Rescale by 2**15 before feature extraction.
            total_waves.append(input0.as_numpy() * (1 << 15))  # b x -1

        features, feats_len = self.extract_feat(total_waves)
        feats_len_cpu = feats_len.cpu()

        responses = []
        for i in range(features.shape[0]):
            valid_frames = int(feats_len_cpu[i])
            # Trim along the time axis (dim 1). Chaining [i:i+1][:n] sliced
            # the size-1 batch axis instead and never removed padding.
            speech = features[i : i + 1, :valid_frames].contiguous().cpu()
            speech_lengths = feats_len_cpu[i].unsqueeze(0).unsqueeze(0)
            out0 = pb_utils.Tensor.from_dlpack("speech", to_dlpack(speech))
            out1 = pb_utils.Tensor.from_dlpack("speech_lengths", to_dlpack(speech_lengths))
            responses.append(pb_utils.InferenceResponse(output_tensors=[out0, out1]))
        return responses

View File

@@ -0,0 +1,81 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Triton model configuration for the Python-backend "feature_extractor" model.
name: "feature_extractor"
backend: "python"
max_batch_size: 64
# NOTE(review): the model.py shown alongside this config reads only
# "cmvn_path" and "config_path"; the other keys look informational — confirm.
parameters [
{
key: "num_mel_bins",
value: { string_value: "80"}
},
{
key: "frame_shift_in_ms"
value: { string_value: "10"}
},
{
key: "frame_length_in_ms"
value: { string_value: "25"}
},
{
key: "sample_rate"
value: { string_value: "16000"}
},
{
key: "cmvn_path"
value: { string_value: "./model_repo_paraformer_large_offline/feature_extractor/am.mvn"}
},
{
key: "config_path"
value: { string_value: "./model_repo_paraformer_large_offline/feature_extractor/config.yaml"}
}
]
# Inputs: a variable-length FP32 waveform and one INT32 length per item.
input [
{
name: "wav"
data_type: TYPE_FP32
dims: [-1]
},
{
name: "wav_lens"
data_type: TYPE_INT32
dims: [1]
}
]
# Outputs: variable-length feature frames of width 560 and one length per item.
output [
{
name: "speech"
data_type: TYPE_FP32
dims: [-1, 560] # 80
},
{
name: "speech_lengths"
data_type: TYPE_INT32
dims: [1]
}
]
# Dynamic batching enabled with default settings.
dynamic_batching {
}
# Two model instances, placed on GPU.
instance_group [
{
count: 2
kind: KIND_GPU
}
]

View File

@@ -0,0 +1,99 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Ensemble that chains three models: feature_extractor -> encoder -> scoring.
# Clients send WAV / WAV_LENS and receive TRANSCRIPTS.
name: "infer_pipeline"
platform: "ensemble"
max_batch_size: 64 #MAX_BATCH
input [
{
name: "WAV"
data_type: TYPE_FP32
dims: [-1]
},
{
name: "WAV_LENS"
data_type: TYPE_INT32
dims: [1]
}
]
output [
{
name: "TRANSCRIPTS"
data_type: TYPE_STRING
dims: [1]
}
]
# In each step, input_map/output_map bind the step model's tensor name (key)
# to an ensemble-level tensor name (value); model_version -1 is used throughout.
ensemble_scheduling {
step [
{
# Step 1: waveform -> SPEECH / SPEECH_LENGTHS.
model_name: "feature_extractor"
model_version: -1
input_map {
key: "wav"
value: "WAV"
}
input_map {
key: "wav_lens"
value: "WAV_LENS"
}
output_map {
key: "speech"
value: "SPEECH"
}
output_map {
key: "speech_lengths"
value: "SPEECH_LENGTHS"
}
},
{
# Step 2: SPEECH / SPEECH_LENGTHS -> logits / token_num.
model_name: "encoder"
model_version: -1
input_map {
key: "speech"
value: "SPEECH"
}
input_map {
key: "speech_lengths"
value: "SPEECH_LENGTHS"
}
output_map {
key: "logits"
value: "logits"
}
output_map {
key: "token_num"
value: "token_num"
}
},
{
# Step 3: logits / token_num -> TRANSCRIPTS (the scoring model's OUTPUT0).
model_name: "scoring"
model_version: -1
input_map {
key: "logits"
value: "logits"
}
input_map {
key: "token_num"
value: "token_num"
}
output_map {
key: "OUTPUT0"
value: "TRANSCRIPTS"
}
}
]
}

View File

@@ -0,0 +1,157 @@
#!/usr/bin/env python3
#
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import triton_python_backend_utils as pb_utils
import numpy as np
import torch
from torch.utils.dlpack import from_dlpack
import json
import os
import yaml
class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.

    This model converts encoder logits into text by taking the arg-max token
    at every position and mapping token ids through the vocabulary.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        self.model_config = model_config = json.loads(args["model_config"])
        self.max_batch_size = max(model_config["max_batch_size"], 1)

        # Get OUTPUT0 configuration and convert its Triton type to numpy.
        output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0")
        self.out0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])

        self.init_vocab(self.model_config["parameters"])

    def init_vocab(self, parameters):
        """Read decoding settings and the vocabulary from the model parameters.

        Args:
            parameters: Mapping of parameter name to a dict holding a
                ``string_value`` entry.

        Raises:
            ValueError: If no ``vocabulary`` parameter is present; failing at
                load time is clearer than an attribute error on first request.
        """
        # Defaults are stored on the instance so they exist even when the
        # corresponding parameter is absent from the model config.
        self.blank_id = 0
        self.ignore_id = None
        self.lm_path = None
        vocab = None
        for key, value in parameters.items():
            value = value["string_value"]
            if key == "blank_id":
                self.blank_id = int(value)
            elif key == "lm_path":
                self.lm_path = value
            elif key == "vocabulary":
                vocab = self.load_vocab(value)
            elif key == "ignore_id":
                self.ignore_id = int(value)
        if vocab is None:
            raise ValueError("model parameters must define 'vocabulary'")
        self.vocab_dict = vocab

    def load_vocab(self, vocab_file):
        """Return the ``token_list`` entry of the YAML file ``vocab_file``."""
        # SECURITY NOTE: yaml.Loader can construct arbitrary Python objects;
        # only point the vocabulary parameter at trusted files.
        with open(str(vocab_file), "rb") as f:
            config = yaml.load(f, Loader=yaml.Loader)
        return config["token_list"]

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        # Every Python backend must iterate through list of requests and create
        # an instance of pb_utils.InferenceResponse class for each of them. You
        # should avoid storing any of the input Tensors in the class attributes
        # as they will be overridden in subsequent inference requests. You can
        # make a copy of the underlying NumPy array and store it if it is
        # required.
        assert len(self.vocab_dict) == 8404, len(self.vocab_dict)
        if not requests:
            return []

        logits_list, token_num_list = [], []
        for request in requests:
            in_0 = pb_utils.get_input_tensor_by_name(request, "logits")
            in_1 = pb_utils.get_input_tensor_by_name(request, "token_num")
            logits = from_dlpack(in_0.to_dlpack())
            # Plain int: usable both as a tensor size and as a slice bound.
            token_num = int(from_dlpack(in_1.to_dlpack()).cpu())
            assert logits.shape[0] == 1
            logits_list.append(logits)
            token_num_list.append(token_num)

        # Zero-pad to the longest sequence. Padded rows are all zeros, so their
        # arg-max is id 0, which the filter below removes.
        logits_batch = torch.zeros(
            len(logits_list),
            max(token_num_list),
            len(self.vocab_dict),
            dtype=torch.float32,
            device=logits_list[-1].device,
        )
        for i, (logits, token_num) in enumerate(zip(logits_list, token_num_list)):
            logits_batch[i][:token_num] = logits[0][:token_num]

        yseq_batch = logits_batch.argmax(axis=-1).tolist()
        # Ids 0 and 2 are dropped from the output
        # (presumably blank and end-of-sentence — TODO confirm against the vocabulary).
        skipped_ids = (0, 2)
        hyps = []
        for yseq in yseq_batch:
            tokens = [self.vocab_dict[t] for t in yseq if t not in skipped_ids]
            text = "".join([t if t != "<space>" else " " for t in tokens])
            hyps.append(text.encode("utf-8"))

        responses = []
        for i in range(len(requests)):
            sents = np.array(hyps[i : i + 1])
            out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype))
            responses.append(pb_utils.InferenceResponse(output_tensors=[out0]))
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print("Cleaning up...")

View File

@@ -0,0 +1,67 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "scoring"
backend: "python"
max_batch_size: 64
parameters [
{
key: "ignore_id",
value: { string_value: "-1"}
},
{
key: "vocabulary",
value: { string_value: "./model_repo_paraformer_large_offline/feature_extractor/config.yaml"}
},
{
key: "lm_path"
value: { string_value: "#lm_path"}
},
{ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
value: {string_value:"no"}
}
]
input [
{
name: "logits"
data_type: TYPE_FP32
dims: [-1, 8404]
},
{
name: "token_num"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
}
]
output [
{
name: "OUTPUT0"
data_type: TYPE_STRING
dims: [1]
}
]
dynamic_batching {
}
instance_group [
{
count: 2
kind: KIND_CPU
}
]

View File

@@ -0,0 +1,273 @@
# Created on 2024-01-01
# Author: GuAn Zhu
import triton_python_backend_utils as pb_utils
import numpy as np
from torch.utils.dlpack import from_dlpack
import json
import yaml
import asyncio
from collections import OrderedDict
class LimitedDict(OrderedDict):
    """Insertion-ordered dict that holds at most ``max_length`` entries.

    Inserting a *new* key into a full dict first evicts the oldest entry.
    Re-assigning a key that is already present only updates its value: the
    dict does not grow in that case, so nothing is evicted.

    Parameters
    ----------
    max_length : int
        Maximum number of entries kept at any time.
    """

    def __init__(self, max_length):
        super().__init__()
        self.max_length = max_length

    def __setitem__(self, key, value):
        # Only a brand-new key can grow the dict, so only then make room.
        # Evicting on every assignment would silently drop an unrelated
        # entry whenever an existing key is updated while the dict is full.
        if key not in self and len(self) >= self.max_length:
            self.popitem(last=False)
        super().__setitem__(key, value)
class CIFSearch:
    """Chunk-by-chunk CIF accumulator with carry-over state between calls.

    Adapted from
    https://github.com/alibaba-damo-academy/FunASR/blob/main/runtime/python/onnxruntime/funasr_onnx/paraformer_online_bin.py

    Per-frame weights (``alphas``) are summed along time; every time the
    running sum reaches ``cif_threshold`` one weighted-sum frame is emitted.
    The weight and frame left over at the end of a chunk are stored in
    ``self.cache`` and prepended to the next chunk.

    The initial cache has shapes (1, 1, 512) and (1, 1) and is concatenated
    with the inputs along the time axis, so inputs must have batch size 1
    and hidden size 512.
    """

    def __init__(self):
        self.cache = {
            "cif_hidden": np.zeros((1, 1, 512)).astype(np.float32),
            "cif_alphas": np.zeros((1, 1)).astype(np.float32),
            # Set to True by the caller before the final chunk of a stream.
            "last_chunk": False,
        }
        # Only alphas in [chunk_size[0], chunk_size[0] + chunk_size[1]) are kept.
        self.chunk_size = [5, 10, 5]
        # Extra weight appended on the last chunk.
        self.tail_threshold = 0.45
        self.cif_threshold = 1.0

    def infer(self, hidden, alphas):
        """Integrate one chunk and return the frames that fired.

        Parameters
        ----------
        hidden : np.ndarray
            Array of shape (batch, time, hidden_size).
        alphas : np.ndarray
            Array of shape (batch, time) with the per-frame weights. It is
            not modified; the masking below works on a private copy.

        Returns
        -------
        tuple of np.ndarray
            ``(frames, lengths)``: float32 frames of shape
            (batch, max_tokens, hidden_size), zero padded per batch item,
            and int32 token counts of shape (batch,).
        """
        batch_size, len_time, hidden_size = hidden.shape
        token_length = []
        list_frames = []
        cache_alphas = []
        cache_hiddens = []

        # Copy first: the caller's array may alias the request's input
        # buffer, and the masking below must not write into it.
        alphas = np.array(alphas, copy=True)
        alphas[:, : self.chunk_size[0]] = 0.0
        alphas[:, sum(self.chunk_size[:2]) :] = 0.0

        # Prepend what was left over from the previous chunk.
        if self.cache is not None and "cif_alphas" in self.cache and "cif_hidden" in self.cache:
            hidden = np.concatenate((self.cache["cif_hidden"], hidden), axis=1)
            alphas = np.concatenate((self.cache["cif_alphas"], alphas), axis=1)

        # On the final chunk append one zero frame carrying tail_threshold.
        if self.cache is not None and "last_chunk" in self.cache and self.cache["last_chunk"]:
            tail_hidden = np.zeros((batch_size, 1, hidden_size)).astype(np.float32)
            tail_alphas = np.array([[self.tail_threshold]]).astype(np.float32)
            tail_alphas = np.tile(tail_alphas, (batch_size, 1))
            hidden = np.concatenate((hidden, tail_hidden), axis=1)
            alphas = np.concatenate((alphas, tail_alphas), axis=1)

        len_time = alphas.shape[1]
        for b in range(batch_size):
            integrate = 0.0
            frames = np.zeros(hidden_size).astype(np.float32)
            list_frame = []
            for t in range(len_time):
                alpha = alphas[b][t]
                if alpha + integrate < self.cif_threshold:
                    # Below the threshold: keep accumulating.
                    integrate += alpha
                    frames += alpha * hidden[b][t]
                else:
                    # Fire: top the frame up to the threshold and emit it.
                    frames += (self.cif_threshold - integrate) * hidden[b][t]
                    list_frame.append(frames)
                    integrate += alpha
                    integrate -= self.cif_threshold
                    # Rebind (not in place) so the emitted frame stays intact;
                    # the remainder of this step's weight starts the next frame.
                    frames = integrate * hidden[b][t]

            cache_alphas.append(integrate)
            if integrate > 0.0:
                cache_hiddens.append(frames / integrate)
            else:
                cache_hiddens.append(frames)

            token_length.append(len(list_frame))
            list_frames.append(list_frame)

        max_token_len = max(token_length)
        list_ls = []
        for b in range(batch_size):
            pad_frames = np.zeros((max_token_len - token_length[b], hidden_size)).astype(np.float32)
            if token_length[b] == 0:
                list_ls.append(pad_frames)
            else:
                list_ls.append(np.concatenate((list_frames[b], pad_frames), axis=0))

        # Store the leftovers for the next chunk.
        self.cache["cif_alphas"] = np.stack(cache_alphas, axis=0)
        self.cache["cif_alphas"] = np.expand_dims(self.cache["cif_alphas"], axis=0)
        self.cache["cif_hidden"] = np.stack(cache_hiddens, axis=0)
        self.cache["cif_hidden"] = np.expand_dims(self.cache["cif_hidden"], axis=0)

        return np.stack(list_ls, axis=0).astype(np.float32), np.stack(token_length, axis=0).astype(
            np.int32
        )
class TritonPythonModel:
    """Triton Python-backend model that runs CIF search and calls the decoder.

    Every Python model must use the class name "TritonPythonModel".
    """

    def initialize(self, args):
        """Called once when the model is loaded; sets up per-instance state.

        Parameters
        ----------
        args : dict
            Both keys and values are strings. The dictionary keys and values are:
            * model_config: A JSON string containing the model configuration
            * model_instance_kind: A string containing model instance kind
            * model_instance_device_id: A string containing model instance device ID
            * model_repository: Model repository path
            * model_version: Model version
            * model_name: Model name
        """
        self.model_config = model_config = json.loads(args["model_config"])
        self.max_batch_size = max(model_config["max_batch_size"], 1)

        # Numpy dtype of the "transcripts" output declared in the config.
        output0_config = pb_utils.get_output_config_by_name(model_config, "transcripts")
        self.out0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])

        self.init_vocab(self.model_config["parameters"])

        # Per-sequence state keyed by correlation id, bounded to 1024 entries.
        self.cif_search_cache = LimitedDict(1024)
        # 1 until the first decoder request of a sequence has been built.
        self.start = LimitedDict(1024)

    def init_vocab(self, parameters):
        """Load ``self.vocab_dict`` from the "vocabulary" model parameter."""
        for key, value in parameters.items():
            if key == "vocabulary":
                self.vocab_dict = self.load_vocab(value["string_value"])

    def load_vocab(self, vocab_file):
        """Return the ``token_list`` entry of the YAML file ``vocab_file``."""
        with open(str(vocab_file), "rb") as f:
            # NOTE(review): yaml.Loader can construct arbitrary Python objects.
            # Keep this only while the config file is trusted; otherwise
            # consider yaml.safe_load.
            config = yaml.load(f, Loader=yaml.Loader)
        return config["token_list"]

    async def execute(self, requests):
        """Run CIF search for each request and decode the frames that fired.

        Parameters
        ----------
        requests : list
            A list of pb_utils.InferenceRequest

        Returns
        -------
        list
            A list of pb_utils.InferenceResponse, one per request and in the
            same order. A request whose decoder call failed gets an error
            response; the other requests in the batch are still answered.
        """
        batch_end = []
        batch_corrid = []
        qualified_corrid = []
        batch_result = {}
        batch_error = {}
        inference_response_awaits = []

        for request in requests:
            hidden = pb_utils.get_input_tensor_by_name(request, "enc")
            hidden = from_dlpack(hidden.to_dlpack()).cpu().numpy()
            alphas = pb_utils.get_input_tensor_by_name(request, "alphas")
            alphas = from_dlpack(alphas.to_dlpack()).cpu().numpy()
            hidden_len = pb_utils.get_input_tensor_by_name(request, "enc_len")
            hidden_len = from_dlpack(hidden_len.to_dlpack()).cpu().numpy()

            in_start = pb_utils.get_input_tensor_by_name(request, "START")
            start = in_start.as_numpy()[0][0]
            in_corrid = pb_utils.get_input_tensor_by_name(request, "CORRID")
            corrid = in_corrid.as_numpy()[0][0]
            in_end = pb_utils.get_input_tensor_by_name(request, "END")
            end = in_end.as_numpy()[0][0]

            batch_end.append(end)
            batch_corrid.append(corrid)

            if start:
                self.cif_search_cache[corrid] = CIFSearch()
                self.start[corrid] = 1
            if end:
                self.cif_search_cache[corrid].cache["last_chunk"] = True

            acoustic, acoustic_len = self.cif_search_cache[corrid].infer(hidden, alphas)
            # Default transcript; stays empty if no frame fired in this chunk.
            batch_result[corrid] = ""
            if acoustic.shape[1] == 0:
                continue

            qualified_corrid.append(corrid)
            input_tensors = [
                pb_utils.Tensor("enc", hidden),
                pb_utils.Tensor("enc_len", np.array([hidden_len], dtype=np.int32)),
                pb_utils.Tensor("acoustic_embeds", acoustic),
                pb_utils.Tensor("acoustic_embeds_len", np.array([acoustic_len], dtype=np.int32)),
            ]

            # Flags on the decoder request: 1 = first request of the
            # sequence, 2 = last, 3 = both, 0 = neither. The start marker is
            # kept until a chunk actually reaches the decoder. A missing entry
            # (evicted from the bounded dict) is treated as already started
            # instead of failing the whole batch with KeyError.
            is_first = self.start.get(corrid, 0)
            if is_first and end:
                flag = 3
            elif end:
                flag = 2
            elif is_first:
                flag = 1
                self.start[corrid] = 0
            else:
                flag = 0

            inference_request = pb_utils.InferenceRequest(
                model_name="decoder",
                requested_output_names=["sample_ids"],
                inputs=input_tensors,
                request_id="",
                correlation_id=corrid,
                flags=flag,
            )
            inference_response_awaits.append(inference_request.async_exec())

        inference_responses = await asyncio.gather(*inference_response_awaits)

        for index_corrid, inference_response in zip(qualified_corrid, inference_responses):
            if inference_response.has_error():
                # Remember the failure for this request only, so the rest of
                # the batch is still answered and END cleanup still runs.
                batch_error[index_corrid] = inference_response.error().message()
                continue
            sample_ids = pb_utils.get_output_tensor_by_name(inference_response, "sample_ids")
            token_ids = from_dlpack(sample_ids.to_dlpack()).cpu().numpy()[0]
            # Change integer-ids to tokens
            tokens = [self.vocab_dict[token_id] for token_id in token_ids]
            batch_result[index_corrid] = "".join(tokens)

        responses = []
        for index_corrid, is_end in zip(batch_corrid, batch_end):
            if index_corrid in batch_error:
                inference_response = pb_utils.InferenceResponse(
                    output_tensors=[],
                    error=pb_utils.TritonError(batch_error[index_corrid]),
                )
            else:
                sent = np.array([batch_result[index_corrid]])
                out0 = pb_utils.Tensor("transcripts", sent.astype(self.out0_dtype))
                inference_response = pb_utils.InferenceResponse(output_tensors=[out0])
            responses.append(inference_response)

            if is_end:
                # pop() rather than del: the entry may already have been
                # evicted from the bounded dict.
                self.cif_search_cache.pop(index_corrid, None)
                self.start.pop(index_corrid, None)
        return responses

    def finalize(self):
        """Called once when the model is unloaded; nothing to release here."""
        print("Cleaning up...")

View File

@@ -0,0 +1,111 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Created on 2024-01-01
# Author: GuAn Zhu
name: "cif_search"
backend: "python"
max_batch_size: 128
sequence_batching{
max_sequence_idle_microseconds: 15000000
oldest {
max_candidate_sequences: 1024
preferred_batch_size: [32, 64, 128]
}
control_input [
{
name: "START",
control [
{
kind: CONTROL_SEQUENCE_START
fp32_false_true: [0, 1]
}
]
},
{
name: "READY"
control [
{
kind: CONTROL_SEQUENCE_READY
fp32_false_true: [0, 1]
}
]
},
{
name: "CORRID",
control [
{
kind: CONTROL_SEQUENCE_CORRID
data_type: TYPE_UINT64
}
]
},
{
name: "END",
control [
{
kind: CONTROL_SEQUENCE_END
fp32_false_true: [0, 1]
}
]
}
]
}
parameters [
{
key: "vocabulary",
value: { string_value: "model_repo_paraformer_large_online/feature_extractor/config.yaml"}
},
{ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
value: {string_value:"no"}
}
]
input [
{
name: "enc"
data_type: TYPE_FP32
dims: [-1, 512]
},
{
name: "enc_len"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
},
{
name: 'alphas'
data_type: TYPE_FP32
dims: [-1]
}
]
output [
{
name: "transcripts"
data_type: TYPE_STRING
dims: [1]
}
]
instance_group [
{
count: 6
kind: KIND_CPU
}
]

View File

@@ -0,0 +1,274 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Created on 2024-01-01
# Author: GuAn Zhu
name: "decoder"
backend: "onnxruntime"
default_model_filename: "decoder.onnx"
max_batch_size: 128
sequence_batching{
max_sequence_idle_microseconds: 15000000
oldest {
max_candidate_sequences: 1024
preferred_batch_size: [16, 32, 64]
}
control_input [
]
state [
{
input_name: "in_cache_0"
output_name: "out_cache_0"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_1"
output_name: "out_cache_1"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_2"
output_name: "out_cache_2"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_3"
output_name: "out_cache_3"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_4"
output_name: "out_cache_4"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_5"
output_name: "out_cache_5"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_6"
output_name: "out_cache_6"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_7"
output_name: "out_cache_7"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_8"
output_name: "out_cache_8"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_9"
output_name: "out_cache_9"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_10"
output_name: "out_cache_10"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_11"
output_name: "out_cache_11"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_12"
output_name: "out_cache_12"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_13"
output_name: "out_cache_13"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_14"
output_name: "out_cache_14"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
},
{
input_name: "in_cache_15"
output_name: "out_cache_15"
data_type: TYPE_FP32
dims: [ 512, 10 ]
initial_state: {
data_type: TYPE_FP32
dims: [ 512, 10]
zero_data: true
name: "initial state"
}
}
]
}
input [
{
name: "enc"
data_type: TYPE_FP32
dims: [-1, 512]
},
{
name: "enc_len"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
},
{
name: "acoustic_embeds"
data_type: TYPE_FP32
dims: [-1, 512]
},
{
name: "acoustic_embeds_len"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
}
]
output [
{
name: "logits"
data_type: TYPE_FP32
dims: [-1, 8404]
},
{
name: "sample_ids"
data_type: TYPE_INT64
dims: [-1]
}
]
instance_group [
{
count: 1
kind: KIND_GPU
}
]

View File

@@ -0,0 +1,79 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Created on 2024-01-01
# Author: GuAn Zhu
name: "encoder"
backend: "onnxruntime"
default_model_filename: "model.onnx"
max_batch_size: 128
sequence_batching{
max_sequence_idle_microseconds: 15000000
oldest {
max_candidate_sequences: 1024
preferred_batch_size: [32, 64, 128]
max_queue_delay_microseconds: 300
}
control_input [
]
state [
]
}
input [
{
name: "speech"
data_type: TYPE_FP32
dims: [-1, 560]
},
{
name: "speech_lengths"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
}
]
output [
{
name: "enc"
data_type: TYPE_FP32
dims: [-1, 512]
},
{
name: "enc_len"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
},
{
name: "alphas"
data_type: TYPE_FP32
dims: [-1]
}
]
parameters { key: "cudnn_conv_algo_search" value: { string_value: "2" } }
instance_group [
{
count: 1
kind: KIND_GPU
}
]

View File

@@ -0,0 +1,216 @@
# Created on 2024-01-01
# Author: GuAn Zhu
# Modified from NVIDIA's implementation in WeNet:
# https://github.com/wenet-e2e/wenet/blob/main/runtime/gpu/model_repo_stateful/feature_extractor/1/model.py
import triton_python_backend_utils as pb_utils
from torch.utils.dlpack import from_dlpack
import torch
import kaldifeat
from typing import List
import json
import numpy as np
import yaml
from collections import OrderedDict
class LimitedDict(OrderedDict):
    """Insertion-ordered dict that holds at most ``max_length`` entries.

    Inserting a *new* key into a full dict first evicts the oldest entry.
    Re-assigning a key that is already present only updates its value: the
    dict does not grow in that case, so nothing is evicted.

    Parameters
    ----------
    max_length : int
        Maximum number of entries kept at any time.
    """

    def __init__(self, max_length):
        super().__init__()
        self.max_length = max_length

    def __setitem__(self, key, value):
        # Evict only when a new key would push the dict past its limit;
        # updating an existing key must not drop an unrelated entry.
        if key not in self and len(self) >= self.max_length:
            self.popitem(last=False)
        super().__setitem__(key, value)
class Fbank(torch.nn.Module):
    """``torch.nn.Module`` wrapper around a ``kaldifeat.Fbank`` built from ``opts``."""

    def __init__(self, opts):
        super().__init__()
        self.fbank = kaldifeat.Fbank(opts)

    def forward(self, waves: List[torch.Tensor]):
        """Pass the list ``waves`` to the wrapped extractor and return its result."""
        extractor = self.fbank
        return extractor(waves)
class Feat(object):
    """Per-sequence buffers of raw samples and feature frames.

    Parameters
    ----------
    seqid :
        Identifier of the sequence this buffer belongs to.
    offset_ms : float
        Amount of audio, in milliseconds, kept after each segment is taken.
    sample_rate : int
        Samples per second, used to convert ``offset_ms`` to samples.
    frame_stride : int or float
        Number of frames dropped from the frame buffer per ``get_frames`` call.
    device : str
        Torch device on which the buffers live.
    """

    def __init__(self, seqid, offset_ms, sample_rate, frame_stride, device="cpu"):
        self.seqid = seqid
        self.sample_rate = sample_rate
        self.wav = torch.tensor([], device=device)
        # Number of trailing samples carried over to the next segment.
        self.offset = int(offset_ms / 1000 * sample_rate)
        self.frames = None
        self.frame_stride = int(frame_stride)
        self.device = device
        self.lfr_m = 7

    def add_wavs(self, wav: torch.Tensor):
        """Append ``wav`` (1-D samples) to the sample buffer."""
        wav = wav.to(self.device)
        self.wav = torch.cat((self.wav, wav), dim=0)

    def get_seg_wav(self):
        """Return all buffered samples and keep only the last ``offset`` of them."""
        seg = self.wav[:]
        # A plain ``self.wav[-self.offset:]`` keeps the *whole* buffer when
        # offset is 0 (``-0`` is 0), so compute the start index explicitly.
        keep_from = max(self.wav.shape[0] - self.offset, 0)
        self.wav = self.wav[keep_from:]
        return seg

    def add_frames(self, frames: torch.Tensor):
        """Append ``frames`` (seq_len x feat_sz) to the frame buffer.

        On the first call the first frame is repeated ``(lfr_m - 1) // 2``
        times in front of the buffer.
        """
        if self.frames is None:
            left_pad = frames[0, :].repeat((self.lfr_m - 1) // 2, 1)
            self.frames = torch.cat((left_pad, frames), dim=0)
        else:
            self.frames = torch.cat([self.frames, frames], dim=0)

    def get_frames(self, num_frames: int):
        """Return the first ``num_frames`` frames, then drop ``frame_stride`` frames."""
        seg = self.frames[0:num_frames]
        self.frames = self.frames[self.frame_stride :]
        return seg
class TritonPythonModel:
    """Triton Python-backend entry point for the streaming feature extractor.

    Triton requires every Python model to expose a class with exactly this name.
    """

    def initialize(self, args):
        """Read the model configuration once at load time and build the extractor.

        Parameters
        ----------
        args : dict
            String keys and values supplied by Triton. Only
            ``args["model_config"]`` (a JSON string) is read here.
        """
        model_config = json.loads(args["model_config"])
        self.model_config = model_config
        self.max_batch_size = max(model_config["max_batch_size"], 1)

        runs_on_gpu = "GPU" in model_config["instance_group"][0]["kind"]
        self.device = "cuda" if runs_on_gpu else "cpu"

        # Output dtype and shape come from the declared "speech" output.
        speech_cfg = pb_utils.get_output_config_by_name(model_config, "speech")
        self.output0_dtype = pb_utils.triton_string_to_numpy(speech_cfg["data_type"])
        self.dtype = torch.float32 if self.output0_dtype == np.float32 else torch.float16
        self.feature_size = speech_cfg["dims"][-1]
        self.decoding_window = speech_cfg["dims"][-2]

        params = self.model_config["parameters"]
        for name, entry in params.items():
            text = entry["string_value"]
            if name == "config_path":
                with open(str(text), "rb") as stream:
                    config = yaml.load(stream, Loader=yaml.Loader)

        # Fbank options are taken from the "frontend_conf" section of the YAML.
        opts = kaldifeat.FbankOptions()
        frame_opts = opts.frame_opts
        frame_opts.dither = 0.0
        frame_opts.window_type = config["frontend_conf"]["window"]
        opts.mel_opts.num_bins = int(config["frontend_conf"]["n_mels"])
        frame_opts.frame_shift_ms = float(config["frontend_conf"]["frame_shift"])
        frame_opts.frame_length_ms = float(config["frontend_conf"]["frame_length"])
        frame_opts.samp_freq = int(config["frontend_conf"]["fs"])
        opts.device = torch.device(self.device)
        self.opts = opts
        self.feature_extractor = Fbank(self.opts)

        # Per-sequence buffers keyed by correlation id.
        self.seq_feat = LimitedDict(1024)

        chunk_size_s = float(params["chunk_size_s"]["string_value"])
        sample_rate = opts.frame_opts.samp_freq
        frame_shift_ms = opts.frame_opts.frame_shift_ms
        frame_length_ms = opts.frame_opts.frame_length_ms
        self.chunk_size = int(chunk_size_s * sample_rate)
        self.frame_stride = (chunk_size_s * 1000) // frame_shift_ms
        self.offset_ms = self.get_offset(frame_length_ms, frame_shift_ms)
        self.sample_rate = sample_rate

    def get_offset(self, frame_length_ms, frame_shift_ms):
        """Return the largest multiple of ``frame_shift_ms`` that stays below ``frame_length_ms``."""
        offset_ms = 0
        while True:
            if offset_ms + frame_shift_ms >= frame_length_ms:
                return offset_ms
            offset_ms += frame_shift_ms

    def _read_control(self, request, name):
        """Return element [0][0] of the named control tensor of ``request``."""
        tensor = pb_utils.get_input_tensor_by_name(request, name)
        return tensor.as_numpy()[0][0]

    def execute(self, requests):
        """Extract one window of features for every request.

        Parameters
        ----------
        requests : list
            A list of pb_utils.InferenceRequest

        Returns
        -------
        list
            A list of pb_utils.InferenceResponse, one per request and in the
            same order.
        """
        waves = []
        seq_ids = []
        finished = {}

        for request in requests:
            wav_tensor = pb_utils.get_input_tensor_by_name(request, "wav")
            wav = from_dlpack(wav_tensor.to_dlpack())[0]
            n_samples = len(wav)
            if n_samples < self.chunk_size:
                # Zero-pad a short chunk up to the configured chunk size.
                padded = torch.zeros(self.chunk_size, dtype=torch.float32, device=self.device)
                padded[0:n_samples] = wav[:]
                wav = padded

            start = self._read_control(request, "START")
            ready = self._read_control(request, "READY")
            corrid = self._read_control(request, "CORRID")
            end = self._read_control(request, "END")

            if start:
                self.seq_feat[corrid] = Feat(
                    corrid, self.offset_ms, self.sample_rate, self.frame_stride, self.device
                )
            if ready:
                self.seq_feat[corrid].add_wavs(wav)

            seq_ids.append(corrid)
            if end:
                finished[corrid] = 1

            # Samples are multiplied by 32768 before feature extraction.
            waves.append(self.seq_feat[corrid].get_seg_wav() * 32768)

        features = self.feature_extractor(waves)

        responses = []
        for corrid, frames in zip(seq_ids, features):
            state = self.seq_feat[corrid]
            state.add_frames(frames)
            speech = state.get_frames(self.decoding_window)
            speech_out = pb_utils.Tensor("speech", speech.unsqueeze(0).to("cpu").numpy())
            responses.append(pb_utils.InferenceResponse(output_tensors=[speech_out]))
            if corrid in finished:
                del self.seq_feat[corrid]
        return responses

    def finalize(self):
        """Called when the model is unloaded; only prints a message."""
        print("Remove feature extractor!")

View File

@@ -0,0 +1,109 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Created on 2024-01-01
# Author: GuAn Zhu
name: "feature_extractor"
backend: "python"
max_batch_size: 128
parameters [
{
key: "chunk_size_s",
value: { string_value: "0.6"}
},
{
key: "config_path"
value: { string_value: "model_repo_paraformer_large_online/feature_extractor/config.yaml"}
}
]
sequence_batching{
max_sequence_idle_microseconds: 15000000
oldest {
max_candidate_sequences: 1024
preferred_batch_size: [32, 64, 128]
max_queue_delay_microseconds: 300
}
control_input [
{
name: "START",
control [
{
kind: CONTROL_SEQUENCE_START
fp32_false_true: [0, 1]
}
]
},
{
name: "READY"
control [
{
kind: CONTROL_SEQUENCE_READY
fp32_false_true: [0, 1]
}
]
},
{
name: "CORRID",
control [
{
kind: CONTROL_SEQUENCE_CORRID
data_type: TYPE_UINT64
}
]
},
{
name: "END",
control [
{
kind: CONTROL_SEQUENCE_END
fp32_false_true: [0, 1]
}
]
}
]
}
input [
{
name: "wav"
data_type: TYPE_FP32
dims: [-1]
},
{
name: "wav_lens"
data_type: TYPE_INT32
dims: [1]
}
]
output [
{
name: "speech"
data_type: TYPE_FP32
dims: [61, 80] # [frames returned per request (decoding window), feature size]
}
]
instance_group [
{
count: 1
kind: KIND_GPU
}
]

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,85 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Created on 2024-01-01
# Author: GuAn Zhu
name: "lfr_cmvn_pe"
backend: "onnxruntime"
default_model_filename: "lfr_cmvn_pe.onnx"
max_batch_size: 128
sequence_batching{
max_sequence_idle_microseconds: 15000000
oldest {
max_candidate_sequences: 1024
preferred_batch_size: [32, 64, 128]
max_queue_delay_microseconds: 300
}
control_input [
]
state [
{
input_name: "cache"
output_name: "r_cache"
data_type: TYPE_FP32
dims: [10, 560]
initial_state: {
data_type: TYPE_FP32
dims: [10, 560]
zero_data: true
name: "initial state"
}
},
{
input_name: "offset"
output_name: "r_offset"
data_type: TYPE_INT32
dims: [1]
initial_state: {
data_type: TYPE_INT32
dims: [1]
zero_data: true
name: "initial state"
}
}
]
}
input [
{
name: "chunk_xs"
data_type: TYPE_FP32
dims: [61, 80]
}
]
output [
{
name: "chunk_xs_out"
data_type: TYPE_FP32
dims: [-1, 560]
},
{
name: "chunk_xs_out_len"
data_type: TYPE_INT32
dims: [-1]
}
]
instance_group [
{
count: 1
kind: KIND_GPU
}
]

View File

@@ -0,0 +1,142 @@
# Created on 2024-01-01
# Author: GuAn Zhu
import torch
import numpy as np
import math
import torch.nn.functional as F
class LFR_CMVN_PE(torch.nn.Module):
    """Frame stacking (LFR), mean/scale normalisation (CMVN) and positional encoding.

    Parameters
    ----------
    mean, istd : torch.Tensor
        Shift and scale applied as ``(x + mean) * istd``; shapes must match.
    m, n : int
        Window size and step used to stack consecutive frames.
    max_len : int
        Number of positions in the precomputed positional-encoding table.
    encoder_input_size : int
        Width of the positional-encoding table.
    encoder_output_size : int
        Features are scaled by ``sqrt(encoder_output_size)``.
    """

    def __init__(
        self,
        mean: torch.Tensor,
        istd: torch.Tensor,
        m: int = 7,
        n: int = 6,
        max_len: int = 5000,
        encoder_input_size: int = 560,
        encoder_output_size: int = 512,
    ):
        super().__init__()
        # LFR
        self.m = m
        self.n = n
        self.subsample = (m - 1) // 2
        # CMVN
        assert mean.shape == istd.shape
        # The buffer can be accessed from this module using self.mean
        self.register_buffer("mean", mean)
        self.register_buffer("istd", istd)
        # PE
        self.encoder_input_size = encoder_input_size
        self.encoder_output_size = encoder_output_size
        self.max_len = max_len
        pe = torch.zeros(self.max_len, self.encoder_input_size)
        position = torch.arange(0, self.max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange((self.encoder_input_size / 2), dtype=torch.float32)
            * -(math.log(10000.0) / (self.encoder_input_size / 2 - 1))
        )
        # First half of each row is sin, second half is cos.
        pe[:, 0::1] = torch.cat(
            (torch.sin(position * div_term), torch.cos(position * div_term)), dim=1
        )
        # Registered as a buffer so the table follows the module across
        # .to(device); non-persistent so state_dict() keeps its previous keys.
        self.register_buffer("pe", pe, persistent=False)

    def forward(self, x, cache, offset):
        """
        Args:
            x (torch.Tensor): (batch, max_len, feat_dim)
            cache (torch.Tensor): frames prepended to the output along dim 1
            offset (torch.Tensor): integer positions already consumed,
                broadcast against the per-frame position indices
        Returns:
            tuple: ``(r_x, r_x_len, r_cache, r_offset)`` where ``r_cache`` is
            the normalised chunk with positional encoding added, ``r_x`` is
            ``cache`` followed by ``r_cache``, ``r_x_len`` is a (batch, 1)
            int32 tensor holding ``r_x.size(1)``, and ``r_offset`` is
            ``offset`` advanced by the number of stacked frames.
        """
        B, _, D = x.size()
        # Stack m consecutive frames every n frames into one wide frame.
        x = x.unfold(1, self.m, step=self.n).transpose(2, 3)
        x = x.view(B, -1, D * self.m)
        x = (x + self.mean) * self.istd
        x = x * (self.encoder_output_size**0.5)
        # Positions are 1-based and continue from `offset`; create the index
        # on the input's device so the module also works off-CPU.
        steps = torch.arange(1, x.size(1) + 1, device=x.device).to(dtype=torch.int32)
        index = offset + steps
        pos_emb = F.embedding(index, self.pe)  # B X T X d_model
        r_cache = x + pos_emb
        r_x = torch.cat((cache, r_cache), dim=1)
        r_offset = offset + x.size(1)
        r_x_len = torch.ones((B, 1), dtype=torch.int32, device=x.device) * r_x.size(1)
        return r_x, r_x_len, r_cache, r_offset
def load_cmvn(cmvn_file):
    """Read the shift and scale vectors from a text CMVN file.

    The values are taken from the ``<LearnRateCoef>`` line that directly
    follows an ``<AddShift>`` line (shift) or a ``<Rescale>`` line (scale).
    On that line the first three fields and the last field are skipped.
    Blank lines are ignored, and a tag with no following line is skipped.

    Parameters
    ----------
    cmvn_file : str
        Path to the CMVN file, read as UTF-8.

    Returns
    -------
    tuple of torch.Tensor
        ``(means, vars)`` as 1-D float32 tensors; a tensor is empty if its
        section was not found.
    """
    with open(cmvn_file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    means_list = []
    vars_list = []
    for i, line in enumerate(lines):
        line_item = line.split()
        # A blank line splits to []; a tag on the very last line has no
        # value line after it. Neither case carries data.
        if not line_item or i + 1 >= len(lines):
            continue
        tag = line_item[0]
        if tag not in ("<AddShift>", "<Rescale>"):
            continue
        value_item = lines[i + 1].split()
        if not value_item or value_item[0] != "<LearnRateCoef>":
            continue
        values = list(value_item[3 : (len(value_item) - 1)])
        if tag == "<AddShift>":
            means_list = values
        else:
            vars_list = values

    means = torch.from_numpy(np.array(means_list).astype(np.float32))
    vars = torch.from_numpy(np.array(vars_list).astype(np.float32))
    return means, vars
if __name__ == "__main__":
    # Export LFR_CMVN_PE to ONNX using the statistics stored in "am.mvn".
    import os

    means, vars = load_cmvn("am.mvn")
    # Repeat the statistics once per stacked output frame (10 rows).
    means = torch.tile(means, (10, 1))
    vars = torch.tile(vars, (10, 1))
    model = LFR_CMVN_PE(means, vars)
    model.eval()

    input_names = ["chunk_xs", "cache", "offset"]
    output_names = ["chunk_xs_out", "chunk_xs_out_len", "r_cache", "r_offset"]
    # Only the batch dimension is dynamic for every input and output.
    dynamic_axes = {name: {0: "B"} for name in input_names + output_names}

    input_data1 = torch.randn(4, 61, 80).to(torch.float32)
    input_data2 = torch.randn(4, 10, 560).to(torch.float32)
    # The offset is used to index the positional-encoding table, so the dummy
    # value must be a valid non-negative position. Random values cast to
    # int32 can be negative and make tracing fail intermittently.
    input_data3 = torch.zeros(4, 1, dtype=torch.int32)

    onnx_path = "./1/lfr_cmvn_pe.onnx"
    # The version directory may not exist yet in a fresh model repository.
    os.makedirs(os.path.dirname(onnx_path), exist_ok=True)
    torch.onnx.export(
        model,
        (input_data1, input_data2, input_data3),
        onnx_path,
        export_params=True,
        opset_version=11,
        do_constant_folding=True,
        input_names=input_names,
        output_names=output_names,
        dynamic_axes=dynamic_axes,
        verbose=False,
    )
    print("export to onnx model succeed!")

View File

@@ -0,0 +1,122 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Created on 2024-01-01
# Author: GuAn Zhu
# Ensemble definition: chains feature_extractor -> lfr_cmvn_pe -> encoder ->
# cif_search so that a client sends WAV / WAV_LENS and receives TRANSCRIPTS.
name: "streaming_paraformer"
platform: "ensemble"
max_batch_size: 128 #MAX_BATCH
# Client-facing inputs.
input [
{
name: "WAV"
data_type: TYPE_FP32
dims: [-1]
},
{
name: "WAV_LENS"
data_type: TYPE_INT32
dims: [1]
}
]
# Client-facing output.
output [
{
name: "TRANSCRIPTS"
data_type: TYPE_STRING
dims: [1]
}
]
# In each step, `key` is the tensor name inside the step's model and `value`
# is the ensemble-level tensor it is wired to.
ensemble_scheduling {
step [
{
# Step 1: waveform -> SPEECH.
model_name: "feature_extractor"
model_version: -1
input_map {
key: "wav"
value: "WAV"
}
input_map {
key: "wav_lens"
value: "WAV_LENS"
}
output_map {
key: "speech"
value: "SPEECH"
}
},
{
# Step 2: SPEECH -> CHUNK_XS_OUT / CHUNK_XS_OUT_LEN.
# NOTE(review): the ONNX export script for lfr_cmvn_pe also names `cache` and
# `offset` inputs and `r_cache` / `r_offset` outputs; none are mapped here.
# Presumably they are handled by that model's own config - confirm.
model_name: "lfr_cmvn_pe"
model_version: -1
input_map {
key: "chunk_xs"
value: "SPEECH"
}
output_map {
key: "chunk_xs_out"
value: "CHUNK_XS_OUT"
}
output_map {
key: "chunk_xs_out_len"
value: "CHUNK_XS_OUT_LEN"
}
},
{
# Step 3: encoder consumes the step-2 outputs and produces ENC, ENC_LEN, ALPHAS.
model_name: "encoder"
model_version: -1
input_map {
key: "speech"
value: "CHUNK_XS_OUT"
}
input_map {
key: "speech_lengths"
value: "CHUNK_XS_OUT_LEN"
}
output_map {
key: "enc"
value: "ENC"
}
output_map {
key: "enc_len"
value: "ENC_LEN"
}
output_map {
key: "alphas"
value: "ALPHAS"
}
},
{
# Step 4: cif_search turns the encoder outputs into the final TRANSCRIPTS.
model_name: "cif_search"
model_version: -1
input_map {
key: "enc"
value: "ENC"
}
input_map {
key: "enc_len"
value: "ENC_LEN"
}
input_map {
key: "alphas"
value: "ALPHAS"
}
output_map {
key: "transcripts"
value: "TRANSCRIPTS"
}
}
]
}

View File

@@ -0,0 +1 @@
/mnt/samsung-t7/yuekai/asr/funaudiollm/SenseVoice/model.onnx

View File

@@ -0,0 +1,71 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Encoder model: served from "model.onnx" by the onnxruntime backend.
name: "encoder"
backend: "onnxruntime"
default_model_filename: "model.onnx"
max_batch_size: 16
# With max_batch_size > 0 the batch dimension is implicit, so `dims` below
# describe one batch item only.
input [
{
# Feature frames; the last dimension is fixed at 560, the first is variable.
name: "speech"
data_type: TYPE_FP32
dims: [-1, 560]
},
{
# Declared as [1] per item and reshaped to an empty shape before reaching the
# model (see the `reshape` section of Triton's model-configuration docs).
name: "speech_lengths"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
},
{
# Same [1] -> scalar reshape as speech_lengths.
name: "language"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
},
{
# Same [1] -> scalar reshape as speech_lengths.
name: "textnorm"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
}
]
output [
{
# Variable-length sequence of 25055-wide rows.
name: "ctc_logits"
data_type: TYPE_FP32
dims: [-1, 25055]
},
{
# Scalar per item in the model, exposed to Triton as [1].
name: "encoder_out_lens"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
}
]
# Empty block: dynamic batching enabled with default settings.
dynamic_batching {
}
# NOTE(review): per the onnxruntime backend documentation "2" should select the
# DEFAULT cuDNN convolution algorithm search - confirm for the version in use.
parameters { key: "cudnn_conv_algo_search" value: { string_value: "2" } }
# One model instance, placed on GPU.
instance_group [
{
count: 1
kind: KIND_GPU
}
]

View File

@@ -0,0 +1,325 @@
#!/usr/bin/env python3
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import triton_python_backend_utils as pb_utils
from torch.utils.dlpack import to_dlpack
import torch
import numpy as np
import kaldifeat
import _kaldifeat
from typing import List
import json
import yaml
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
class LFR(torch.nn.Module):
    """Batch LFR: https://github.com/Mddct/devil-asr/blob/main/patch/lfr.py

    Stacks ``m`` consecutive frames into one output frame and advances ``n``
    input frames between output frames, for a whole padded batch at once.
    """

    def __init__(self, m: int = 7, n: int = 6) -> None:
        """
        Actually, this implements stacking frames and skipping frames.
        if m = 1 and n = 1, just return the origin features.
        if m = 1 and n > 1, it works like skipping.
        if m > 1 and n = 1, it works like stacking but only support right frames.
        if m > 1 and n > 1, it works like LFR.

        Args:
            m: Number of frames stacked into each output frame.
            n: Number of input frames skipped between output frames.
        """
        super().__init__()
        self.m = m
        self.n = n
        # (m - 1) // 2 is already an integer, so math.ceil leaves it unchanged.
        self.left_padding_nums = math.ceil((self.m - 1) // 2)

    def forward(
        self, input_tensor: torch.Tensor, input_lens: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply LFR to a padded batch.

        Args:
            input_tensor: 3-D tensor unpacked as ``[B, T, D]``.
            input_lens: Per-item number of valid frames; viewed as ``[B, 1, 1]``
                below, so it must hold ``B`` elements.

        Returns:
            ``(stacked, new_len)`` where ``stacked`` has shape
            ``[B, -1, D * m]`` and ``new_len`` is ``T_all // n`` per item.
            NOTE(review): ``new_len`` is derived from a true division, so it is
            presumably floating point - the caller should cast it; confirm.
        """
        B, _, D = input_tensor.size()
        # Number of output frames per item: ceil(len / n).
        n_lfr = torch.ceil(input_lens / self.n)

        prepad_nums = input_lens + self.left_padding_nums

        # Frames to append on the right so the last window of size m is full;
        # zero when the left-padded sequence already covers that window.
        right_padding_nums = torch.where(
            self.m >= (prepad_nums - self.n * (n_lfr - 1)),
            self.m - (prepad_nums - self.n * (n_lfr - 1)),
            0,
        )

        # Per-item total length after left and right padding.
        T_all = self.left_padding_nums + input_lens + right_padding_nums

        new_len = T_all // self.n

        T_all_max = T_all.max().int()

        # Gather each item's last valid frame and repeat it as right padding.
        tail_frames_index = (input_lens - 1).view(B, 1, 1).repeat(1, 1, D)  # [B,1,D]

        tail_frames = torch.gather(input_tensor, 1, tail_frames_index)
        tail_frames = tail_frames.repeat(1, right_padding_nums.max().int(), 1)
        # Repeat the first frame as left padding.
        head_frames = input_tensor[:, 0:1, :].repeat(1, self.left_padding_nums, 1)

        # stack
        input_tensor = torch.cat([head_frames, input_tensor, tail_frames], dim=1)

        # Position indices 0 .. T_all_max - 1, repeated for every batch item.
        index = (
            torch.arange(T_all_max, device=input_tensor.device, dtype=input_lens.dtype)
            .unsqueeze(0)
            .repeat(B, 1)
        )  # [B, T_all_max]
        # True for positions inside the left padding plus the item's valid frames.
        index_mask = index < (self.left_padding_nums + input_lens).unsqueeze(1)  # [B, T_all_max]

        # Positions below T_all that are also inside index_mask.
        tail_index_mask = torch.logical_not(index >= (T_all.unsqueeze(1))) & index_mask
        # Fallback index (T_all_max - 1) used for every position outside the masks.
        tail = torch.ones(T_all_max, dtype=input_lens.dtype, device=input_tensor.device).unsqueeze(
            0
        ).repeat(B, 1) * (
            T_all_max - 1
        )  # [B, T_all_max]
        indices = torch.where(torch.logical_or(index_mask, tail_index_mask), index, tail)
        # Re-read the frames at the selected positions.
        input_tensor = torch.gather(input_tensor, 1, indices.unsqueeze(2).repeat(1, 1, D))

        # Sliding windows of m frames with stride n; the transpose puts the window
        # axis before the feature axis so the reshape concatenates frames in order.
        input_tensor = input_tensor.unfold(1, self.m, step=self.n).transpose(2, 3)

        return input_tensor.reshape(B, -1, D * self.m), new_len
class WavFrontend:
    """Conventional frontend structure for ASR.

    Holds the frontend settings, an ``LFR`` module and, when a CMVN file is
    configured, the shift/scale statistics parsed from it.
    """

    def __init__(
        self,
        cmvn_file: str = None,
        fs: int = 16000,
        window: str = "hamming",
        n_mels: int = 80,
        frame_length: int = 25,
        frame_shift: int = 10,
        filter_length_min: int = -1,
        filter_length_max: float = -1,
        lfr_m: int = 7,
        lfr_n: int = 6,
        dither: float = 1.0,
    ) -> None:
        """Store the settings and load the CMVN statistics if a file is given.

        Args:
            cmvn_file: Path to the CMVN text file, or ``None`` / empty to skip.
            fs, window, n_mels, frame_length, frame_shift, filter_length_min,
            filter_length_max, dither: Stored as attributes unchanged.
            lfr_m, lfr_n: Stored and also passed to ``LFR``.
        """
        self.fs = fs
        self.window = window
        self.n_mels = n_mels
        self.frame_length = frame_length
        self.frame_shift = frame_shift
        self.filter_length_min = filter_length_min
        self.filter_length_max = filter_length_max
        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.lfr = LFR(lfr_m, lfr_n)
        self.cmvn_file = cmvn_file
        self.dither = dither
        # Always define the attribute so a missing CMVN file produces a clear
        # error later instead of an AttributeError.
        self.cmvn = self.load_cmvn() if self.cmvn_file else None

    def apply_cmvn_batch(self, inputs: torch.Tensor) -> torch.Tensor:
        """
        Apply CMVN with mvn data

        Computes ``(inputs + shift) * scale`` over the last dimension.

        Args:
            inputs: Feature tensor whose last dimension is the feature size.

        Returns:
            The normalised tensor. The statistics are float64, so the result
            follows torch's type promotion with ``inputs``.

        Raises:
            ValueError: If no CMVN statistics were loaded.
        """
        if self.cmvn is None:
            raise ValueError("CMVN statistics are not loaded; set cmvn_file first")
        dim = inputs.shape[-1]
        # Shape [1, dim] broadcasts over the leading dimensions, which gives the
        # same result as tiling the statistics once per frame.
        means = torch.from_numpy(self.cmvn[0:1, :dim]).to(inputs.device)
        scales = torch.from_numpy(self.cmvn[1:2, :dim]).to(inputs.device)
        return (inputs + means) * scales

    def load_cmvn(
        self,
    ) -> np.ndarray:
        """Parse ``self.cmvn_file`` into a float64 array.

        For an ``<AddShift>`` or ``<Rescale>`` line that is directly followed
        by a ``<LearnRateCoef>`` line, keeps every token of the following line
        except the first three and the last one.

        Returns:
            ``np.array([means, vars])`` with the shift values in row 0 and the
            scale values in row 1.
        """
        with open(self.cmvn_file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        means_list = []
        scales_list = []
        for i, line in enumerate(lines):
            line_item = line.split()
            # Blank lines split to an empty list; indexing them used to raise.
            if not line_item or line_item[0] not in ("<AddShift>", "<Rescale>"):
                continue
            # A header on the very last line has no value line to read.
            if i + 1 >= len(lines):
                continue
            next_item = lines[i + 1].split()
            if not next_item or next_item[0] != "<LearnRateCoef>":
                continue
            values = next_item[3 : (len(next_item) - 1)]
            if line_item[0] == "<AddShift>":
                means_list = list(values)
            else:
                scales_list = list(values)

        means = np.array(means_list).astype(np.float64)
        scales = np.array(scales_list).astype(np.float64)
        cmvn = np.array([means, scales])
        return cmvn
class Fbank(torch.nn.Module):
    """``torch.nn.Module`` wrapper that delegates to ``kaldifeat.Fbank``."""

    def __init__(self, opts):
        """Create the wrapped extractor from ``opts``."""
        super().__init__()
        self.fbank = kaldifeat.Fbank(opts)

    def forward(self, waves: List[torch.Tensor]):
        """Return the wrapped extractor's result for the list ``waves``."""
        extractor = self.fbank
        return extractor(waves)
class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.

    This model converts batches of waveforms into the ``speech`` and
    ``speech_lengths`` outputs: filterbank features, then LFR stacking, then
    CMVN.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name

        Raises
        ------
        ValueError
          If the "config_path" or "cmvn_path" model parameter is missing.
        """
        self.model_config = model_config = json.loads(args["model_config"])
        self.max_batch_size = max(model_config["max_batch_size"], 1)

        # Run on the GPU Triton assigned to this instance; a bare "cuda" would
        # put every instance on the default device. Falls back to "cuda" when
        # the instance information is unavailable.
        self.device = "cuda"
        device_id = args.get("model_instance_device_id")
        if args.get("model_instance_kind") == "GPU" and device_id not in (None, ""):
            self.device = f"cuda:{device_id}"

        # Get OUTPUT0 configuration
        output0_config = pb_utils.get_output_config_by_name(model_config, "speech")
        # Convert Triton types to numpy types
        output0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])
        if output0_dtype == np.float32:
            self.output0_dtype = torch.float32
        else:
            self.output0_dtype = torch.float16

        # Get OUTPUT1 configuration
        output1_config = pb_utils.get_output_config_by_name(model_config, "speech_lengths")
        # Convert Triton types to numpy types
        self.output1_dtype = pb_utils.triton_string_to_numpy(output1_config["data_type"])

        config = None
        cmvn_path = None
        for key, value in self.model_config["parameters"].items():
            value = value["string_value"]
            if key == "config_path":
                # NOTE(review): yaml.Loader can construct arbitrary Python
                # objects; switch to yaml.safe_load if this file may come from
                # an untrusted source.
                with open(str(value), "rb") as f:
                    config = yaml.load(f, Loader=yaml.Loader)
            elif key == "cmvn_path":
                cmvn_path = str(value)
        if config is None:
            raise ValueError('model parameter "config_path" is required')
        if cmvn_path is None:
            raise ValueError('model parameter "cmvn_path" is required')
        # The CMVN path from the Triton parameters overrides the YAML value.
        config["frontend_conf"]["cmvn_file"] = cmvn_path

        opts = kaldifeat.FbankOptions()
        opts.frame_opts.dither = 1.0  # TODO: 0.0 or 1.0
        opts.frame_opts.window_type = config["frontend_conf"]["window"]
        opts.mel_opts.num_bins = int(config["frontend_conf"]["n_mels"])
        opts.frame_opts.frame_shift_ms = float(config["frontend_conf"]["frame_shift"])
        opts.frame_opts.frame_length_ms = float(config["frontend_conf"]["frame_length"])
        opts.frame_opts.samp_freq = int(config["frontend_conf"]["fs"])
        opts.device = torch.device(self.device)
        self.opts = opts
        self.feature_extractor = Fbank(self.opts)
        self.feature_size = opts.mel_opts.num_bins

        self.frontend = WavFrontend(**config["frontend_conf"])

    def extract_feat(self, waveform_list: List[np.ndarray]) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute LFR + CMVN features for a list of waveforms.

        Args:
            waveform_list: One numpy array per utterance; each is squeezed
                before feature extraction.

        Returns:
            ``(feats, feats_len)``: ``feats`` is cast to ``self.output0_dtype``
            and ``feats_len`` to ``torch.int32``; both live on ``self.device``.
        """
        wavs = [
            torch.from_numpy(waveform).float().squeeze().to(self.device)
            for waveform in waveform_list
        ]

        features = self.feature_extractor(wavs)
        features_len = [feature.shape[0] for feature in features]
        # Zero-pad every utterance to the longest one in the batch.
        speech = torch.zeros(
            (len(features), max(features_len), self.opts.mel_opts.num_bins),
            dtype=self.output0_dtype,
            device=self.device,
        )
        for i, feature in enumerate(features):
            speech[i, : int(features_len[i])] = feature
        speech_lens = torch.tensor(features_len, dtype=torch.int64).to(self.device)

        feats, feats_len = self.frontend.lfr(speech, speech_lens)
        feats_len = feats_len.type(torch.int32)

        feats = self.frontend.apply_cmvn_batch(feats)
        feats = feats.type(self.output0_dtype)

        return feats, feats_len

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        if not requests:
            return []

        batch_count = []
        total_waves = []
        for request in requests:
            input0 = pb_utils.get_input_tensor_by_name(request, "wav")
            # Scale the samples by 2**15.
            cur_b_wav = input0.as_numpy() * (1 << 15)  # b x -1

            # "wav_lens" is deliberately not used to strip padding: trimming
            # gives per-utterance lengths, after which the encoder may not be
            # able to batch requests.
            batch_count.append(cur_b_wav.shape[0])

            # convert the bx-1 numpy array into a 1x-1 list of arrays
            cur_b_wav_list = [np.expand_dims(cur_b_wav[i], 0) for i in range(cur_b_wav.shape[0])]
            total_waves.extend(cur_b_wav_list)

        # One feature-extraction pass over the utterances of all requests.
        features, feats_len = self.extract_feat(total_waves)

        responses = []
        i = 0
        for batch in batch_count:
            # Slice this request's rows back out of the merged batch.
            speech = features[i : i + batch]
            speech_lengths = feats_len[i : i + batch].unsqueeze(1)

            speech, speech_lengths = speech.cpu(), speech_lengths.cpu()
            out0 = pb_utils.Tensor.from_dlpack("speech", to_dlpack(speech))
            out1 = pb_utils.Tensor.from_dlpack("speech_lengths", to_dlpack(speech_lengths))
            inference_response = pb_utils.InferenceResponse(output_tensors=[out0, out1])
            responses.append(inference_response)
            i += batch

        return responses

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,81 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Python-backend model that turns waveforms into the `speech` features.
name: "feature_extractor"
backend: "python"
max_batch_size: 16
# NOTE(review): the model.py that accompanies this config only reads
# "config_path" and "cmvn_path"; the first four parameters appear to be unused
# there (the values are taken from config.yaml instead) - confirm.
parameters [
{
key: "num_mel_bins",
value: { string_value: "80"}
},
{
key: "frame_shift_in_ms"
value: { string_value: "10"}
},
{
key: "frame_length_in_ms"
value: { string_value: "25"}
},
{
key: "sample_rate"
value: { string_value: "16000"}
},
{
# Relative path; presumably resolved against the directory tritonserver is
# started from - confirm.
key: "cmvn_path"
value: { string_value: "./model_repo_sense_voice_small/feature_extractor/am.mvn"}
},
{
key: "config_path"
value: { string_value: "./model_repo_sense_voice_small/feature_extractor/config.yaml"}
}
]
input [
{
# Waveform samples, variable length per item.
name: "wav"
data_type: TYPE_FP32
dims: [-1]
},
{
name: "wav_lens"
data_type: TYPE_INT32
dims: [1]
}
]
output [
{
name: "speech"
data_type: TYPE_FP32
dims: [-1, 560] # 560 = n_mels (80) * lfr_m (7) after LFR frame stacking
},
{
name: "speech_lengths"
data_type: TYPE_INT32
dims: [1]
}
]
# Empty block: dynamic batching enabled with default settings.
dynamic_batching {
}
# Two model instances, placed on GPU.
instance_group [
{
count: 2
kind: KIND_GPU
}
]

View File

@@ -0,0 +1,97 @@
encoder: SenseVoiceEncoderSmall
encoder_conf:
output_size: 512
attention_heads: 4
linear_units: 2048
num_blocks: 50
tp_blocks: 20
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: pe
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
sanm_shfit: 0
selfattention_layer_type: sanm
model: SenseVoiceSmall
model_conf:
length_normalized_loss: true
sos: 1
eos: 2
ignore_id: -1
tokenizer: SentencepiecesTokenizer
tokenizer_conf:
bpemodel: null
unk_symbol: <unk>
split_with_space: true
frontend: WavFrontend
frontend_conf:
fs: 16000
window: hamming
n_mels: 80
frame_length: 25
frame_shift: 10
lfr_m: 7
lfr_n: 6
cmvn_file: null
dataset: SenseVoiceCTCDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
data_split_num: 32
batch_type: token
batch_size: 14000
max_token_length: 2000
min_token_length: 60
max_source_length: 2000
min_source_length: 60
max_target_length: 200
min_target_length: 0
shuffle: true
num_workers: 4
sos: ${model_conf.sos}
eos: ${model_conf.eos}
IndexDSJsonl: IndexDSJsonl
retry: 20
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 20
keep_nbest_models: 10
avg_nbest_model: 10
log_interval: 100
resume: true
validate_interval: 10000
save_checkpoint_interval: 10000
optim: adamw
optim_conf:
lr: 0.00002
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import triton_python_backend_utils as pb_utils
import numpy as np
import torch
from torch.utils.dlpack import from_dlpack
import json
import os
import yaml
import sentencepiece as spm
class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.

    This model greedily decodes ``ctc_logits`` into text with a SentencePiece
    tokenizer and returns it as ``OUTPUT0``.
    """

    # Token ids removed after collapsing repeats (blank_id and EOS).
    _SKIP_IDS = (0, 2)

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        self.model_config = model_config = json.loads(args["model_config"])
        self.max_batch_size = max(model_config["max_batch_size"], 1)

        # # Get OUTPUT0 configuration
        output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0")
        # # Convert Triton types to numpy types
        self.out0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])

        self.init_tokenizer(self.model_config["parameters"])

    def init_tokenizer(self, parameters):
        """Load the SentencePiece model named by the "tokenizer_path" parameter.

        Args:
            parameters: The ``parameters`` mapping of the model config; each
                value is a dict with a ``"string_value"`` entry.

        Raises:
            ValueError: If "tokenizer_path" is not among the parameters.
        """
        tokenizer_path = None
        for key, value in parameters.items():
            if key == "tokenizer_path":
                tokenizer_path = value["string_value"]
        if tokenizer_path is None:
            raise ValueError('model parameter "tokenizer_path" is required')
        self.tokenizer = spm.SentencePieceProcessor()
        self.tokenizer.Load(tokenizer_path)

    @staticmethod
    def _collapse_ctc_path(path, length):
        """Greedy CTC post-processing for ONE utterance.

        Args:
            path: 1-D tensor of per-frame argmax token ids.
            length: Number of leading frames to keep; clamped to
                ``[0, len(path)]``.

        Returns:
            A list of token ids with consecutive repeats merged and the ids in
            ``_SKIP_IDS`` removed.
        """
        length = max(0, min(int(length), int(path.shape[0])))
        if length == 0:
            return []
        # 1-D call: merges runs within this utterance only.
        merged = torch.unique_consecutive(path[:length]).tolist()
        return [tok for tok in merged if tok not in TritonPythonModel._SKIP_IDS]

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        # Every Python backend must iterate through list of requests and create
        # an instance of pb_utils.InferenceResponse class for each of them. You
        # should avoid storing any of the input Tensors in the class attributes
        # as they will be overridden in subsequent inference requests. You can
        # make a copy of the underlying NumPy array and store it if it is
        # required.
        responses = []
        for request in requests:
            # Each request is decoded on its own: requests may have different
            # time lengths, so their logits cannot be concatenated safely.
            in_0 = pb_utils.get_input_tensor_by_name(request, "ctc_logits")
            logits = from_dlpack(in_0.to_dlpack())
            batch, frames = logits.shape[0], logits.shape[1]

            # Use the per-utterance lengths when the input is supplied and has
            # one entry per row; otherwise decode every frame.
            # NOTE(review): assumes "encoder_out_lens" counts the valid frames
            # of "ctc_logits" - confirm against the exported encoder.
            lengths = [frames] * batch
            in_1 = pb_utils.get_input_tensor_by_name(request, "encoder_out_lens")
            if in_1 is not None:
                given = from_dlpack(in_1.to_dlpack()).reshape(-1).tolist()
                if len(given) == batch:
                    lengths = given

            best_path = logits.argmax(dim=-1)
            # Collapse repeats row by row. unique_consecutive with dim=-1 on
            # the 2-D batch would compare whole columns across utterances and
            # leave most repeats in place.
            hyps = [
                self.tokenizer.DecodeIds(self._collapse_ctc_path(best_path[b], lengths[b]))
                for b in range(batch)
            ]

            sents = np.array(hyps)
            out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype))
            inference_response = pb_utils.InferenceResponse(output_tensors=[out0])
            responses.append(inference_response)

        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print("Cleaning up...")

View File

@@ -0,0 +1 @@
/mnt/samsung-t7/yuekai/asr/funaudiollm/SenseVoiceSmall/chn_jpn_yue_eng_ko_spectok.bpe.model

View File

@@ -0,0 +1,59 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Python-backend model that decodes `ctc_logits` into text (`OUTPUT0`).
name: "scoring"
backend: "python"
max_batch_size: 16
parameters [
{
# SentencePiece model loaded by the Python code.
# Relative path; presumably resolved against the directory tritonserver is
# started from - confirm.
key: "tokenizer_path",
value: { string_value: "./model_repo_sense_voice_small/scoring/chn_jpn_yue_eng_ko_spectok.bpe.model"}
},
# NOTE(review): per the Triton Python backend docs, "no" lets input tensors
# arrive in GPU memory instead of being copied to CPU first - confirm.
{ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
value: {string_value:"no"}
}
]
input [
{
# Variable-length sequence of 25055-wide rows.
name: "ctc_logits"
data_type: TYPE_FP32
dims: [-1, 25055]
},
{
# Declared as [1] per item and reshaped to an empty shape for the model.
name: "encoder_out_lens"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
}
]
output [
{
# One string per batch item.
name: "OUTPUT0"
data_type: TYPE_STRING
dims: [1]
}
]
# Empty block: dynamic batching enabled with default settings.
dynamic_batching {
}
# Two model instances, placed on CPU.
instance_group [
{
count: 2
kind: KIND_CPU
}
]

View File

@@ -0,0 +1,117 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Ensemble definition: chains feature_extractor -> encoder -> scoring so that a
# client sends WAV / WAV_LENS / LANGUAGE / TEXT_NORM and receives TRANSCRIPTS.
name: "sensevoice"
platform: "ensemble"
max_batch_size: 16
# Client-facing inputs.
input [
{
name: "WAV"
data_type: TYPE_FP32
dims: [-1]
},
{
name: "WAV_LENS"
data_type: TYPE_INT32
dims: [1]
},
{
name: "LANGUAGE"
data_type: TYPE_INT32
dims: [1]
},
{
name: "TEXT_NORM"
data_type: TYPE_INT32
dims: [1]
}
]
# Client-facing output.
output [
{
name: "TRANSCRIPTS"
data_type: TYPE_STRING
dims: [1]
}
]
# In each step, `key` is the tensor name inside the step's model and `value`
# is the ensemble-level tensor it is wired to.
ensemble_scheduling {
step [
{
# Step 1: waveform -> SPEECH / SPEECH_LENGTHS.
model_name: "feature_extractor"
model_version: -1
input_map {
key: "wav"
value: "WAV"
}
input_map {
key: "wav_lens"
value: "WAV_LENS"
}
output_map {
key: "speech"
value: "SPEECH"
}
output_map {
key: "speech_lengths"
value: "SPEECH_LENGTHS"
}
},
{
# Step 2: encoder consumes the features plus the client-supplied LANGUAGE and
# TEXT_NORM, and produces ctc_logits / encoder_out_lens.
model_name: "encoder"
model_version: -1
input_map {
key: "speech"
value: "SPEECH"
}
input_map {
key: "speech_lengths"
value: "SPEECH_LENGTHS"
}
input_map {
key: "language"
value: "LANGUAGE"
}
input_map {
key: "textnorm"
value: "TEXT_NORM"
}
output_map {
key: "ctc_logits"
value: "ctc_logits"
}
output_map {
key: "encoder_out_lens"
value: "encoder_out_lens"
}
},
{
# Step 3: scoring turns the encoder outputs into the final TRANSCRIPTS.
model_name: "scoring"
model_version: -1
input_map {
key: "ctc_logits"
value: "ctc_logits"
}
input_map {
key: "encoder_out_lens"
value: "encoder_out_lens"
}
output_map {
key: "OUTPUT0"
value: "TRANSCRIPTS"
}
}
]
}