Sync from bytedesk-private: update

This commit is contained in:
jack ning
2024-12-14 10:43:18 +08:00
parent 476eebb101
commit 5e082909e4
3421 changed files with 812709 additions and 0 deletions

View File

@@ -0,0 +1,20 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
punc_model="iic/punc_ct-transformer_cn-en-common-vocab471067-large",
# spk_model="iic/speech_campplus_sv_zh-cn_16k-common",
)
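# batch_size_s batches inputs by total audio duration in seconds; per the FunASR
# docs, a VAD-split segment longer than batch_size_threshold_s is decoded with batch size 1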
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav",
batch_size_s=300,
batch_size_threshold_s=60,
)
print(res)

View File

@@ -0,0 +1,18 @@
model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
#punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
punc_model="iic/punc_ct-transformer_cn-en-common-vocab471067-large"
spk_model="iic/speech_campplus_sv_zh-cn_16k-common"
python funasr/bin/inference.py \
+model=${model} \
+vad_model=${vad_model} \
+punc_model=${punc_model} \
+spk_model=${spk_model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
+batch_size_s=300 \
+batch_size_threshold_s=60

View File

@@ -0,0 +1,28 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
from funasr import AutoModel
model = AutoModel(
model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
device="cpu",
)
res = model.export(type="torchscript", quantize=False)
print(res)
# # method2, export from local path
# from funasr import AutoModel
# model = AutoModel(
# model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
# device="cpu",
# )
# res = model.export(type="onnx", quantize=False)
# print(res)

View File

@@ -0,0 +1,23 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
export HYDRA_FULL_ERROR=1
model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false \
++device="cpu"
# method2, export from local path
model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false \
++device="cpu"

View File

@@ -0,0 +1,84 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
workspace=`pwd`
# method1, finetune from model hub
# which GPU(s) to use for training or finetuning
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically
model_name_or_model_dir="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
## option 2, download model by git
#local_path_root=${workspace}/modelscope_models
#mkdir -p ${local_path_root}/${model_name_or_model_dir}
#git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir}
#model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
# data dir, which contains: train.jsonl, val.jsonl
data_dir="../../../data/list"
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
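# expected formats (standard Kaldi-style lists): each line of wav.scp is
# "<utt_id> <wav_path>", each line of text.txt is "<utt_id> <transcript>"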
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
scp2jsonl \
++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
# exp output dir
output_dir="./outputs"
log_file="${output_dir}/log.txt"
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
deepspeed_config=${workspace}/../../ds_stage1.json
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset="AudioDataset" \
++dataset_conf.index_ds="IndexDSJsonl" \
++dataset_conf.data_split_num=1 \
++dataset_conf.batch_sampler="BatchSampler" \
++dataset_conf.batch_size=6000 \
++dataset_conf.sort_size=1024 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=true \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}

View File

@@ -0,0 +1,13 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/speech_campplus_sv_zh-cn_16k-common")
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
)
print(res)

View File

@@ -0,0 +1,14 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch")
res = model.generate(
input="https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/asr_example.wav",
decoding_ctc_weight=0.0,
)
print(res)

View File

@@ -0,0 +1,9 @@
model="iic/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch"
python funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \

View File

@@ -0,0 +1,14 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404")
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
hotword="达摩院 魔搭",
)
print(res)

View File

@@ -0,0 +1,11 @@
model="iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
python ../../../funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
+"hotword='达摩院 魔搭'"

View File

@@ -0,0 +1,9 @@
python -m funasr.bin.inference \
--config-path="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404" \
--config-name="config.yaml" \
++init_param="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/model.pb" \
++tokenizer_conf.token_list="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/tokens.txt" \
++frontend_conf.cmvn_file="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/am.mvn" \
++input="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/asr_example_zh.wav" \
++output_dir="./outputs/debug2" \
++device="" \

View File

@@ -0,0 +1,85 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
workspace=`pwd`
# method1, finetune from model hub
# which GPU(s) to use for training or finetuning
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically
model_name_or_model_dir="iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
## option 2, download model by git
#local_path_root=${workspace}/modelscope_models
#mkdir -p ${local_path_root}/${model_name_or_model_dir}
#git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir}
#model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
# data dir, which contains: train.jsonl, val.jsonl
data_dir="../../../data/list"
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
scp2jsonl \
++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
# exp output dir
output_dir="./outputs"
log_file="${output_dir}/log.txt"
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
deepspeed_config=${workspace}/../../ds_stage1.json
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset="AudioDatasetHotword" \
++dataset_conf.index_ds="IndexDSJsonl" \
++dataset_conf.data_split_num=1 \
++dataset_conf.batch_sampler="BatchSampler" \
++dataset_conf.batch_size=6000 \
++dataset_conf.sort_size=1024 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=true \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}

View File

@@ -0,0 +1,6 @@
export FUNASR_DIR=$PWD/../../../
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH
export PYTHONPATH=$FUNASR_DIR/funasr/bin:$FUNASR_DIR/funasr:$FUNASR_DIR:$PYTHONPATH

View File

@@ -0,0 +1,23 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt"
)
print(res)
from funasr import AutoModel
model = AutoModel(model="iic/punc_ct-transformer_cn-en-common-vocab471067-large")
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt"
)
print(res)

View File

@@ -0,0 +1,12 @@
#model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
#
model="iic/punc_ct-transformer_cn-en-common-vocab471067-large"
python funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt" \
+output_dir="./outputs/debug" \
+device="cpu"

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
from funasr import AutoModel
model = AutoModel(
model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
)
res = model.export(type="onnx", quantize=False)
print(res)
# method2, export from local path
from funasr import AutoModel
model = AutoModel(
model="/Users/zhifu/.cache/modelscope/hub/iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
)
res = model.export(type="onnx", quantize=False)
print(res)

View File

@@ -0,0 +1,26 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
export HYDRA_FULL_ERROR=1
model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false \
++device="cpu"
# method2, export from local path
model="/Users/zhifu/.cache/modelscope/hub/iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false \
++device="cpu"

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727")
inputs = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"
vads = inputs.split("|")
rec_result_all = "outputs: "
cache = {}
for vad in vads:
rec_result = model.generate(input=vad, cache=cache)
rec_result_all += rec_result[0]["text"]
print(rec_result_all)

View File

@@ -0,0 +1,9 @@
model="iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
python funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt" \
+output_dir="./outputs/debug" \
+device="cpu"

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
from funasr import AutoModel
model = AutoModel(
model="iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727",
)
res = model.export(type="onnx", quantize=False)
print(res)
# method2, export from local path
from funasr import AutoModel
model = AutoModel(
model="/Users/zhifu/.cache/modelscope/hub/iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
)
res = model.export(type="onnx", quantize=False)
print(res)

View File

@@ -0,0 +1,28 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
export HYDRA_FULL_ERROR=1
model="iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
python -m funasr.bin.export \
++model=${model} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav" \
++type="onnx" \
++quantize=false \
++device="cpu"
# method2, export from local path
model="/Users/zhifu/.cache/modelscope/hub/iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
python -m funasr.bin.export \
++model=${model} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav" \
++type="onnx" \
++quantize=false \
++device="cpu"

View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import sys
from funasr import AutoModel
model_dir = "/Users/zhifu/Downloads/modelscope_models/ctc_model"
input_file = (
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
)
model = AutoModel(
model=model_dir,
)
res = model.generate(
input=input_file,
cache={},
)
print(res)

View File

@@ -0,0 +1,31 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method2, inference from local model
# for more input types, please refer to the README.md
model_dir=$1
input_file=$2
output_dir=$3
# download model
device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
tokens="${model_dir}/tokens.json"
cmvn_file="${model_dir}/am.mvn"
config="config.yaml"
init_param="${model_dir}/model.pt"
mkdir -p ${output_dir}
python -m funasr.bin.inference \
--config-path "${model_dir}" \
--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input_file}" \
++output_dir="${output_dir}" \
++device="${device}" \

View File

@@ -0,0 +1,95 @@
# network architecture
model: FsmnKWS
model_conf:
ctc_weight: 1.0
# encoder related
encoder: FSMN
encoder_conf:
input_dim: 400
input_affine_dim: 140
fsmn_layers: 4
linear_dim: 250
proj_dim: 128
lorder: 10
rorder: 2
lstride: 1
rstride: 1
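    # lorder/rorder: left (past) / right (future) context order of each FSMN memory
    # block; lstride/rstride: the corresponding frame strides (standard FSMN usage)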
output_affine_dim: 140
output_dim: 2599
use_softmax: false
frontend: WavFrontend
frontend_conf:
fs: 16000
window: hamming
n_mels: 80
frame_length: 25
frame_shift: 10
lfr_m: 5
lfr_n: 3
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 3
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 10
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
validate_interval: 50000
save_checkpoint_interval: 50000
avg_checkpoint_interval: 1000
log_interval: 50
optim: adam
optim_conf:
lr: 0.0005
scheduler: warmuplr
scheduler_conf:
warmup_steps: 10000
dataset: AudioDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
    batch_size: 32000 # if batch_type is "example", batch_size is the number of samples; if "length", it is source_token_len+target_token_len
    max_token_length: 1600 # drop samples whose source_token_len+target_token_len exceeds max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
tokenizer: CharTokenizer
tokenizer_conf:
unk_symbol: <unk>
split_with_space: true
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin
reduce: true
ignore_nan_grad: true
extra_linear: false
normalize: null

View File

@@ -0,0 +1,95 @@
# network architecture
model: FsmnKWS
model_conf:
ctc_weight: 1.0
# encoder related
encoder: FSMN
encoder_conf:
input_dim: 360
input_affine_dim: 280
fsmn_layers: 4
linear_dim: 280
proj_dim: 200
lorder: 10
rorder: 2
lstride: 1
rstride: 1
output_affine_dim: 400
output_dim: 2602
use_softmax: false
frontend: WavFrontend
frontend_conf:
fs: 16000
window: hamming
n_mels: 40
frame_length: 25
frame_shift: 10
lfr_m: 9
lfr_n: 3
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 3
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 10
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
validate_interval: 50000
save_checkpoint_interval: 50000
avg_checkpoint_interval: 1000
log_interval: 50
optim: adam
optim_conf:
lr: 0.0005
scheduler: warmuplr
scheduler_conf:
warmup_steps: 10000
dataset: AudioDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
    batch_size: 32000 # if batch_type is "example", batch_size is the number of samples; if "length", it is source_token_len+target_token_len
    max_token_length: 1600 # drop samples whose source_token_len+target_token_len exceeds max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
tokenizer: CharTokenizer
tokenizer_conf:
unk_symbol: <unk>
split_with_space: true
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin
reduce: true
ignore_nan_grad: true
extra_linear: false
normalize: null

View File

@@ -0,0 +1,134 @@
from __future__ import print_function
import argparse
import copy
import logging
import os
from shutil import copyfile
import torch
import yaml
from typing import Union
from funasr.models.fsmn_kws.model import FsmnKWSConvert
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def get_args():
    parser = argparse.ArgumentParser(
        description='load a network and convert it between Kaldi and PyTorch formats')
parser.add_argument('--config', required=True, help='config file')
parser.add_argument(
'--network_file',
default='',
required=True,
help='input network, support kaldi.txt/pytorch.pt')
parser.add_argument('--model_dir', required=True, help='save model dir')
parser.add_argument('--model_name', required=True, help='save model name')
parser.add_argument('--convert_to',
default='kaldi',
required=True,
help='target network type, kaldi/pytorch')
args = parser.parse_args()
return args
def convert_to_kaldi(
configs,
network_file,
model_dir,
model_name="convert.kaldi.txt"
):
copyfile(network_file, os.path.join(model_dir, 'origin.torch.pt'))
model = FsmnKWSConvert(
vocab_size=configs['encoder_conf']['output_dim'],
encoder='FSMNConvert',
encoder_conf=configs['encoder_conf'],
ctc_conf=configs['ctc_conf'],
)
print(model)
num_params = count_parameters(model)
print('the number of model params: {}'.format(num_params))
    states = torch.load(network_file, map_location='cpu')
model.load_state_dict(states["state_dict"])
kaldi_text = os.path.join(model_dir, model_name)
with open(kaldi_text, 'w', encoding='utf8') as fout:
nnet_desp = model.to_kaldi_net()
fout.write(nnet_desp)
def convert_to_pytorch(
configs,
network_file,
model_dir,
model_name="convert.torch.pt"
):
model = FsmnKWSConvert(
vocab_size=configs['encoder_conf']['output_dim'],
frontend=None,
specaug=None,
normalize=None,
encoder='FSMNConvert',
encoder_conf=configs['encoder_conf'],
ctc_conf=configs['ctc_conf'],
)
num_params = count_parameters(model)
print('the number of model params: {}'.format(num_params))
copyfile(network_file, os.path.join(model_dir, 'origin.kaldi.txt'))
model.to_pytorch_net(network_file)
save_model_path = os.path.join(model_dir, model_name)
torch.save({"model": model.state_dict()}, save_model_path)
print('convert torch format back to kaldi')
kaldi_text = os.path.join(model_dir, 'convert.kaldi.txt')
with open(kaldi_text, 'w', encoding='utf8') as fout:
nnet_desp = model.to_kaldi_net()
fout.write(nnet_desp)
print('Done!')
def main():
args = get_args()
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
print(args)
with open(args.config, 'r') as fin:
configs = yaml.load(fin, Loader=yaml.FullLoader)
if args.convert_to == 'pytorch':
print('convert kaldi net to pytorch...')
convert_to_pytorch(
configs,
args.network_file,
args.model_dir,
args.model_name
)
elif args.convert_to == 'kaldi':
print('convert pytorch net to kaldi...')
convert_to_kaldi(
configs,
args.network_file,
args.model_dir,
args.model_name
)
else:
print('unsupported target network type: {}'.format(args.convert_to))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,26 @@
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models_kws
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_charctc_kws_phone-xiaoyun
if [ ! -d "$local_path" ]; then
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun.git ${local_path}
fi
export PATH=${local_path}/runtime:$PATH
export LD_LIBRARY_PATH=${local_path}/runtime:$LD_LIBRARY_PATH
config=./conf/fsmn_4e_l10r2_250_128_fdim80_t2599.yaml
torch_nnet=exp/finetune_outputs/model.pt.avg10
out_dir=exp/finetune_outputs
if [ ! -d "$out_dir" ]; then
mkdir -p $out_dir
fi
python convert.py --config $config --network_file $torch_nnet --model_dir $out_dir --model_name "convert.kaldi.txt" --convert_to kaldi
nnet-copy --binary=true ${out_dir}/convert.kaldi.txt ${out_dir}/convert.kaldi.net

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_charctc_kws_phone-xiaoyun",
keywords="小云小云",
output_dir="./outputs/debug",
device='cpu'
)
test_wav = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
res = model.generate(input=test_wav, cache={},)
print(res)

View File

@@ -0,0 +1,173 @@
#!/usr/bin/env bash
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# Set bash to 'strict' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in a pipeline'
set -e
set -u
set -o pipefail
. ./path.sh
workspace=`pwd`
CUDA_VISIBLE_DEVICES="0,1"
stage=2
stop_stage=3
inference_device="cuda" #"cpu"
inference_checkpoint="model.pt.avg10"
inference_scp="wav.scp"
inference_batch_size=32
nj=32
test_sets="test"
# model_name from model_hub, or model_dir in local path
## option 1, download the model automatically (currently unsupported)
model_name_or_model_dir="iic/speech_charctc_kws_phone-xiaoyun"
## option 2, download model by git
local_path_root=${workspace}/modelscope_models
model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
if [ ! -d $model_name_or_model_dir ]; then
mkdir -p ${model_name_or_model_dir}
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun.git ${model_name_or_model_dir}
fi
config=fsmn_4e_l10r2_250_128_fdim80_t2599.yaml
token_list=${model_name_or_model_dir}/funasr/tokens_2599.txt
lexicon_list=${model_name_or_model_dir}/funasr/lexicon.txt
cmvn_file=${model_name_or_model_dir}/funasr/am.mvn.dim80_l2r2
init_param="${model_name_or_model_dir}/funasr/basetrain_fsmn_4e_l10r2_250_128_fdim80_t2599.pt"
# data prepare
# data dir, which contains: train.jsonl, val.jsonl
data_dir=../../data
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Generate audio json list"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/train_wav.scp''', '''${data_dir}/train_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/val_wav.scp''', '''${data_dir}/val_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
fi
# exp output dir
output_dir="${workspace}/exp/finetune_outputs"
# Training Stage
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: KWS Training"
mkdir -p ${output_dir}
current_time=$(date "+%Y-%m-%d_%H-%M")
log_file="${output_dir}/train.log.txt.${current_time}"
echo "log_file: ${log_file}"
echo "finetune use basetrain model: ${init_param}"
export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py \
--config-path "${workspace}/conf" \
--config-name "${config}" \
++init_param="${init_param}" \
++disable_update=true \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++tokenizer_conf.token_list="${token_list}" \
++tokenizer_conf.seg_dict="${lexicon_list}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++output_dir="${output_dir}" &> ${log_file}
fi
# Testing Stage
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Inference"
keywords=(小云小云)
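    # join the keywords array into a comma-separated string (IFS is changed only in the subshell)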
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
if [ ${inference_device} == "cuda" ]; then
nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
else
inference_batch_size=1
CUDA_VISIBLE_DEVICES=""
for JOB in $(seq ${nj}); do
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
done
fi
for dset in ${test_sets}; do
inference_dir="${output_dir}/inference-${inference_checkpoint}/${dset}"
_logdir="${inference_dir}/logdir"
echo "inference_dir: ${inference_dir}"
mkdir -p "${_logdir}"
test_data_dir="${data_dir}/${dset}"
key_file=${test_data_dir}/${inference_scp}
split_scps=
for JOB in $(seq "${nj}"); do
split_scps+=" ${_logdir}/keys.${JOB}.scp"
done
$FUNASR_DIR/examples/aishell/paraformer/utils/split_scp.pl "${key_file}" ${split_scps}
gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
echo "${output_dir}"
export CUDA_VISIBLE_DEVICES=${gpuid}
python ../../../funasr/bin/inference.py \
--config-path="${output_dir}" \
--config-name="config.yaml" \
++init_param="${output_dir}/${inference_checkpoint}" \
++tokenizer_conf.token_list="${token_list}" \
++tokenizer_conf.seg_dict="${lexicon_list}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++keywords="\"$keywords_string"\" \
++input="${_logdir}/keys.${JOB}.scp" \
++output_dir="${inference_dir}/${JOB}" \
++device="${inference_device}" \
++ncpu=1 \
++disable_log=true \
++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
}&
done
wait
for f in detect; do
if [ -f "${inference_dir}/${JOB}/${f}" ]; then
for JOB in $(seq "${nj}"); do
cat "${inference_dir}/${JOB}/${f}"
done | sort -k1 >"${inference_dir}/${f}"
fi
done
python funasr/utils/compute_det_ctc.py \
--keywords ${keywords_string} \
--test_data ${test_data_dir}/wav.scp \
--trans_data ${test_data_dir}/text \
--score_file ${inference_dir}/detect \
--stats_dir ${inference_dir}
done
fi

View File

@@ -0,0 +1 @@
../../../funasr

View File

@@ -0,0 +1,20 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
model="iic/speech_charctc_kws_phone-xiaoyun"
# for more input types, please refer to the README.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python funasr/bin/inference.py \
+model=${model} \
+input=${input} \
+output_dir="./outputs/debug" \
+device="cpu" \
++keywords="\"$keywords_string"\"

View File

@@ -0,0 +1,41 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method2, inference from local model
# for more input types, please refer to the README.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
output_dir="./outputs/debug"
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_charctc_kws_phone-xiaoyun
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun.git ${local_path}
device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
config="inference_fsmn_4e_l10r2_250_128_fdim80_t2599.yaml"
tokens="${local_path}/funasr/tokens_2599.txt"
seg_dict="${local_path}/funasr/lexicon.txt"
init_param="${local_path}/funasr/finetune_fsmn_4e_l10r2_250_128_fdim80_t2599_xiaoyun_xiaoyun.pt"
cmvn_file="${local_path}/funasr/am.mvn.dim80_l2r2"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python -m funasr.bin.inference \
--config-path "${local_path}/funasr" \
--config-name "${config}" \
++init_param="${init_param}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++tokenizer_conf.token_list="${tokens}" \
++tokenizer_conf.seg_dict="${seg_dict}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}" \
++keywords="\"$keywords_string"\"

View File

@@ -0,0 +1,5 @@
export FUNASR_DIR=$PWD/../../..
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH

View File

@@ -0,0 +1,103 @@
# network architecture
model: FsmnKWSMT
model_conf:
ctc_weight: 1.0
# encoder related
encoder: FSMNMT
encoder_conf:
input_dim: 400
input_affine_dim: 140
fsmn_layers: 4
linear_dim: 250
proj_dim: 128
lorder: 10
rorder: 2
lstride: 1
rstride: 1
output_affine_dim: 140
output_dim: 2599
output_dim2: 4
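    # output_dim2: size of the second output head, paired with the second
    # tokenizer/token_list entry configured below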
use_softmax: false
frontend: WavFrontend
frontend_conf:
fs: 16000
window: hamming
n_mels: 80
frame_length: 25
frame_shift: 10
lfr_m: 5
lfr_n: 3
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 3
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 100
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
log_interval: 50
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr
scheduler_conf:
warmup_steps: 10000
dataset: KwsMTDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
    batch_size: 64000 # if batch_type is "example", batch_size is the number of samples; if "length", it is source_token_len+target_token_len
    max_token_length: 1600 # drop samples whose source_token_len+target_token_len exceeds max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
dataloader: DataloaderMapStyle
tokenizer:
- CharTokenizer
- CharTokenizer
tokenizer_conf:
- unk_symbol: <unk>
split_with_space: true
token_list: null
seg_dict: null
- unk_symbol: <unk>
split_with_space: true
token_list: null
seg_dict: null
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin # ctc_type: focalctc, builtin
reduce: true
ignore_nan_grad: true
extra_linear: false
normalize: null

View File

@@ -0,0 +1,103 @@
# network architecture
model: FsmnKWSMT
model_conf:
ctc_weight: 1.0
# encoder related
encoder: FSMNMT
encoder_conf:
input_dim: 360
input_affine_dim: 280
fsmn_layers: 4
linear_dim: 280
proj_dim: 200
lorder: 10
rorder: 2
lstride: 1
rstride: 1
output_affine_dim: 400
output_dim: 2602
output_dim2: 4
use_softmax: false
frontend: WavFrontend
frontend_conf:
fs: 16000
window: hamming
n_mels: 40
frame_length: 25
frame_shift: 10
lfr_m: 9
lfr_n: 3
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 3
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 100
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
log_interval: 50
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr
scheduler_conf:
warmup_steps: 10000
dataset: KwsMTDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
    batch_size: 64000 # if batch_type is "example", batch_size is the number of samples; if "length", it is source_token_len+target_token_len
    max_token_length: 1600 # drop samples whose source_token_len+target_token_len exceeds max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
dataloader: DataloaderMapStyle
tokenizer:
- CharTokenizer
- CharTokenizer
tokenizer_conf:
- unk_symbol: <unk>
split_with_space: true
token_list: null
seg_dict: null
- unk_symbol: <unk>
split_with_space: true
token_list: null
seg_dict: null
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin # ctc_type: focalctc, builtin
reduce: true
ignore_nan_grad: true
extra_linear: false
normalize: null

View File

@@ -0,0 +1,137 @@
from __future__ import print_function
import argparse
import copy
import logging
import os
from shutil import copyfile
import torch
import yaml
from typing import Union
from funasr.models.fsmn_kws_mt.encoder import FSMNMTConvert
from funasr.models.fsmn_kws_mt.model import FsmnKWSMTConvert
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def get_args():
    parser = argparse.ArgumentParser(
        description='load a network and convert it between Kaldi and PyTorch formats')
parser.add_argument('--config', required=True, help='config file')
parser.add_argument(
'--network_file',
default='',
required=True,
help='input network, support kaldi.txt/pytorch.pt')
parser.add_argument('--model_dir', required=True, help='save model dir')
parser.add_argument('--model_name', required=True, help='save model name')
parser.add_argument('--model_name2', required=True, help='save model name')
parser.add_argument('--convert_to',
default='kaldi',
required=True,
help='target network type, kaldi/pytorch')
args = parser.parse_args()
return args
def convert_to_kaldi(
configs,
network_file,
model_dir,
model_name="convert.kaldi.txt",
model_name2="convert.kaldi2.txt"
):
copyfile(network_file, os.path.join(model_dir, 'origin.torch.pt'))
model = FsmnKWSMTConvert(
encoder='FSMNMTConvert',
encoder_conf=configs['encoder_conf'],
ctc_conf=configs['ctc_conf'],
)
print(model)
num_params = count_parameters(model)
print('the number of model params: {}'.format(num_params))
    states = torch.load(network_file, map_location='cpu')
model.load_state_dict(states["state_dict"])
kaldi_text = os.path.join(model_dir, model_name)
with open(kaldi_text, 'w', encoding='utf8') as fout:
nnet_desp = model.to_kaldi_net()
fout.write(nnet_desp)
kaldi_text2 = os.path.join(model_dir, model_name2)
with open(kaldi_text2, 'w', encoding='utf8') as fout:
nnet_desp2 = model.to_kaldi_net2()
fout.write(nnet_desp2)
def convert_to_pytorch(
configs,
network_file,
model_dir,
model_name="convert.torch.pt"
):
model = FsmnKWSMTConvert(
encoder='FSMNMTConvert',
encoder_conf=configs['encoder_conf'],
ctc_conf=configs['ctc_conf'],
)
num_params = count_parameters(model)
print('the number of model params: {}'.format(num_params))
copyfile(network_file, os.path.join(model_dir, 'origin.kaldi.txt'))
model.to_pytorch_net(network_file)
save_model_path = os.path.join(model_dir, model_name)
torch.save({"model": model.state_dict()}, save_model_path)
print('convert torch format back to kaldi')
kaldi_text = os.path.join(model_dir, 'convert.kaldi.txt')
with open(kaldi_text, 'w', encoding='utf8') as fout:
nnet_desp = model.to_kaldi_net()
fout.write(nnet_desp)
print('Done!')
def main():
args = get_args()
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
print(args)
with open(args.config, 'r') as fin:
configs = yaml.load(fin, Loader=yaml.FullLoader)
if args.convert_to == 'pytorch':
print('convert kaldi net to pytorch...')
convert_to_pytorch(
configs,
args.network_file,
args.model_dir,
args.model_name,
)
elif args.convert_to == 'kaldi':
print('convert pytorch net to kaldi...')
convert_to_kaldi(
configs,
args.network_file,
args.model_dir,
args.model_name
)
else:
print('unsupported target network type: {}'.format(args.convert_to))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,36 @@
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_charctc_kws_phone-xiaoyun_mt
if [ ! -d "$local_path" ]; then
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun_mt.git ${local_path}
fi
export PATH=${local_path}/runtime:$PATH
export LD_LIBRARY_PATH=${local_path}/runtime:$LD_LIBRARY_PATH
# finetune config file
config=./conf/fsmn_4e_l10r2_250_128_fdim80_t2599_t4.yaml
# finetune output checkpoint
torch_nnet=exp/finetune_outputs/model.pt.avg10
out_dir=exp/finetune_outputs
if [ ! -d "$out_dir" ]; then
mkdir -p $out_dir
fi
python convert.py --config $config \
--network_file $torch_nnet \
--model_dir $out_dir \
--model_name "convert.kaldi.txt" \
--model_name2 "convert.kaldi2.txt" \
--convert_to kaldi
nnet-copy --binary=true ${out_dir}/convert.kaldi.txt ${out_dir}/convert.kaldi.net
nnet-copy --binary=true ${out_dir}/convert.kaldi2.txt ${out_dir}/convert.kaldi2.net

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_charctc_kws_phone-xiaoyun_mt",
keywords="小云小云",
output_dir="./outputs/debug",
device='cpu'
)
test_wav = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
res = model.generate(input=test_wav, cache={},)
print(res)

View File

@@ -0,0 +1,184 @@
#!/usr/bin/env bash
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# Set bash to 'strict' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in a pipeline'
set -e
set -u
set -o pipefail
. ./path.sh
workspace=`pwd`
CUDA_VISIBLE_DEVICES="0,1"
stage=2
stop_stage=3
inference_device="cuda" #"cpu"
inference_checkpoint="model.pt.avg10"
inference_scp="wav.scp"
inference_batch_size=32
nj=32
test_sets="test"
# model_name from model_hub, or model_dir in local path
## option 1, download the model automatically (currently unsupported)
model_name_or_model_dir="iic/speech_charctc_kws_phone-xiaoyun_mt"
## option 2, download model by git
local_path_root=${workspace}/modelscope_models
model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
if [ ! -d $model_name_or_model_dir ]; then
mkdir -p ${model_name_or_model_dir}
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun_mt.git ${model_name_or_model_dir}
fi
config=fsmn_4e_l10r2_250_128_fdim80_t2599_t4.yaml
token_list=${model_name_or_model_dir}/funasr/tokens_2599.txt
token_list2=${model_name_or_model_dir}/funasr/tokens_xiaoyun.txt
lexicon_list=${model_name_or_model_dir}/funasr/lexicon.txt
cmvn_file=${model_name_or_model_dir}/funasr/am.mvn.dim80_l2r2
init_param="${model_name_or_model_dir}/funasr/basetrain_fsmn_4e_l10r2_250_128_fdim80_t2599.pt"
# data prepare
# data dir, which contains: train.jsonl, val.jsonl
data_dir=../../data
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Generate audio json list"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/train_wav.scp''', '''${data_dir}/train_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/val_wav.scp''', '''${data_dir}/val_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
fi
# exp output dir
output_dir="${workspace}/exp/finetune_outputs"
# Training Stage
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: KWS Training"
mkdir -p ${output_dir}
current_time=$(date "+%Y-%m-%d_%H-%M")
log_file="${output_dir}/train.log.txt.${current_time}"
echo "log_file: ${log_file}"
echo "finetune use basetrain model: ${init_param}"
export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py \
--config-path "${workspace}/conf" \
--config-name "${config}" \
++init_param="${init_param}" \
++token_lists='['''${token_list}''', '''${token_list2}''']' \
++seg_dicts='['''${lexicon_list}''', '''${lexicon_list}''']' \
++disable_update=true \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++output_dir="${output_dir}" &> ${log_file}
fi
# Testing Stage
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Inference"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
if [ ${inference_device} == "cuda" ]; then
nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
else
inference_batch_size=1
CUDA_VISIBLE_DEVICES=""
for JOB in $(seq ${nj}); do
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
done
fi
for dset in ${test_sets}; do
inference_dir="${output_dir}/inference-${inference_checkpoint}/${dset}"
_logdir="${inference_dir}/logdir"
echo "inference_dir: ${inference_dir}"
mkdir -p "${_logdir}"
test_data_dir="${data_dir}/${dset}"
key_file=${test_data_dir}/${inference_scp}
split_scps=
for JOB in $(seq "${nj}"); do
split_scps+=" ${_logdir}/keys.${JOB}.scp"
done
$FUNASR_DIR/examples/aishell/paraformer/utils/split_scp.pl "${key_file}" ${split_scps}
gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
echo "${output_dir}"
export CUDA_VISIBLE_DEVICES=${gpuid}
python ../../../funasr/bin/inference.py \
--config-path="${output_dir}" \
--config-name="config.yaml" \
++init_param="${output_dir}/${inference_checkpoint}" \
++token_lists='['''${token_list}''', '''${token_list2}''']' \
++seg_dicts='['''${lexicon_list}''', '''${lexicon_list}''']' \
++frontend_conf.cmvn_file="${cmvn_file}" \
++keywords="\"$keywords_string"\" \
++input="${_logdir}/keys.${JOB}.scp" \
++output_dir="${inference_dir}/${JOB}" \
++device="${inference_device}" \
++ncpu=1 \
++disable_log=true \
++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
}&
done
wait
for f in detect detect2; do
if [ -f "${inference_dir}/${JOB}/${f}" ]; then
for JOB in $(seq "${nj}"); do
cat "${inference_dir}/${JOB}/${f}"
done | sort -k1 >"${inference_dir}/${f}"
fi
done
mkdir -p ${inference_dir}/task1
python funasr/utils/compute_det_ctc.py \
--keywords ${keywords_string} \
--test_data ${test_data_dir}/wav.scp \
--trans_data ${test_data_dir}/text \
--score_file ${inference_dir}/detect \
--stats_dir ${inference_dir}/task1
mkdir -p ${inference_dir}/task2
python funasr/utils/compute_det_ctc.py \
--keywords ${keywords_string} \
--test_data ${test_data_dir}/wav.scp \
--trans_data ${test_data_dir}/text \
--score_file ${inference_dir}/detect2 \
--stats_dir ${inference_dir}/task2
done
fi

View File

@@ -0,0 +1 @@
../../../funasr

View File

@@ -0,0 +1,20 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
model="iic/speech_charctc_kws_phone-xiaoyun_mt"
# for more input types, please refer to the README.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python funasr/bin/inference.py \
+model=${model} \
+input=${input} \
+output_dir="./outputs/debug" \
+device="cpu" \
++keywords="\"$keywords_string"\"

View File

@@ -0,0 +1,42 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method2, inference from local model
# for more input types, please refer to the README.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
output_dir="./outputs/debug"
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_charctc_kws_phone-xiaoyun_mt
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun_mt.git ${local_path}
device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
config="inference_fsmn_4e_l10r2_280_200_fdim40_t2602_t4.yaml"
tokens="${local_path}/funasr/tokens_2602.txt"
tokens2="${local_path}/funasr/tokens_xiaoyun.txt"
seg_dict="${local_path}/funasr/lexicon.txt"
init_param="${local_path}/funasr/finetune_fsmn_4e_l10r2_280_200_fdim40_t2602_t4_xiaoyun_xiaoyun.pt"
cmvn_file="${local_path}/funasr/am.mvn.dim40_l4r4"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python -m funasr.bin.inference \
--config-path "${local_path}/funasr" \
--config-name "${config}" \
++init_param="${init_param}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++token_lists='['''${tokens}''', '''${tokens2}''']' \
++seg_dicts='['''${seg_dict}''', '''${seg_dict}''']' \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}" \
++keywords="\"$keywords_string"\"

View File

@@ -0,0 +1,5 @@
export FUNASR_DIR=$PWD/../../..
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
wav_file = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav"
model = AutoModel(model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch")
res = model.generate(input=wav_file)
print(res)
# [[beg1, end1], [beg2, end2], .., [begN, endN]]
# beg/end: ms
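# a minimal sketch (not part of the original example): cut the detected segments
# out of the waveform using the millisecond timestamps returned in res[0]["value"]
# import soundfile
# speech, sample_rate = soundfile.read("vad_example.wav")
# for beg_ms, end_ms in res[0]["value"]:
#     segment = speech[int(beg_ms / 1000 * sample_rate) : int(end_ms / 1000 * sample_rate)]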
import soundfile
import os
wav_file = os.path.join(model.model_path, "example/vad_example.wav")
speech, sample_rate = soundfile.read(wav_file)
chunk_size = 200 # ms
chunk_stride = int(chunk_size * sample_rate / 1000)
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i * chunk_stride : (i + 1) * chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(
input=speech_chunk,
cache=cache,
is_final=is_final,
chunk_size=chunk_size,
disable_pbar=True,
)
# print(res)
if len(res[0]["value"]):
print(res)
# 1. [[beg1, end1], [beg2, end2], .., [begN, endN]]; [[beg, end]]; [[beg1, end1], [beg2, end2]]
# 2. [[beg, -1]]
# 3. [[-1, end]]
# beg/end: ms
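# a minimal sketch (assumption): close the open-ended segments by collecting the
# values emitted across chunks
# segments, pending_beg = [], None
# for beg, end in res[0]["value"]:
#     if beg != -1 and end != -1:   # complete [beg, end] segment
#         segments.append([beg, end])
#     elif end == -1:               # segment opened; its end arrives in a later chunk
#         pending_beg = beg
#     else:                         # beg == -1 closes the pending segment
#         segments.append([pending_beg, end])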

View File

@@ -0,0 +1,10 @@
model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
python funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \

View File

@@ -0,0 +1,25 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
from funasr import AutoModel
model = AutoModel(model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch")
res = model.export(type="onnx", quantize=False)
print(res)
# method2, export from local path
from funasr import AutoModel
model = AutoModel(
model="/Users/zhifu/.cache/modelscope/hub/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
)
res = model.export(type="onnx", quantize=False)
print(res)

View File

@@ -0,0 +1,24 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
export HYDRA_FULL_ERROR=1
model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false
# method2, export from local path
model="/Users/zhifu/.cache/modelscope/hub/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false

View File

@@ -0,0 +1,735 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from enum import Enum
import re, sys, unicodedata
import codecs
import argparse
from tqdm import tqdm
import os
import pdb
remove_tag = False
spacelist = [" ", "\t", "\r", "\n"]
puncts = [
    "!",
    ",",
    "?",
    "、",
    "。",
    "！",
    "，",
    "；",
    "？",
    "：",
    "「",
    "」",
    "︰",
    "『",
    "』",
    "《",
    "》",
]
class Code(Enum):
match = 1
substitution = 2
insertion = 3
deletion = 4
class WordError(object):
def __init__(self):
self.errors = {
Code.substitution: 0,
Code.insertion: 0,
Code.deletion: 0,
}
self.ref_words = 0
def get_wer(self):
assert self.ref_words != 0
errors = (
self.errors[Code.substitution]
+ self.errors[Code.insertion]
+ self.errors[Code.deletion]
)
return 100.0 * errors / self.ref_words
def get_result_string(self):
return (
f"error_rate={self.get_wer():.4f}, "
f"ref_words={self.ref_words}, "
f"subs={self.errors[Code.substitution]}, "
f"ins={self.errors[Code.insertion]}, "
f"dels={self.errors[Code.deletion]}"
)
def characterize(string):
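    """Split a string into scoring tokens: punctuation and spaces are dropped,
    letter-other characters (e.g. CJK) become single-character tokens, <tag>-style
    markers are kept whole, and other ASCII runs (e.g. Latin words) are grouped."""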
res = []
i = 0
while i < len(string):
char = string[i]
if char in puncts:
i += 1
continue
cat1 = unicodedata.category(char)
# https://unicodebook.readthedocs.io/unicode.html#unicode-categories
if cat1 == "Zs" or cat1 == "Cn" or char in spacelist: # space or not assigned
i += 1
continue
if cat1 == "Lo": # letter-other
res.append(char)
i += 1
else:
# some input looks like: <unk><noise>, we want to separate it to two words.
sep = " "
if char == "<":
sep = ">"
j = i + 1
while j < len(string):
c = string[j]
if ord(c) >= 128 or (c in spacelist) or (c == sep):
break
j += 1
if j < len(string) and string[j] == ">":
j += 1
res.append(string[i:j])
i = j
return res
def stripoff_tags(x):
if not x:
return ""
chars = []
i = 0
T = len(x)
while i < T:
if x[i] == "<":
while i < T and x[i] != ">":
i += 1
i += 1
else:
chars.append(x[i])
i += 1
return "".join(chars)
def normalize(sentence, ignore_words, cs, split=None):
"""sentence, ignore_words are both in unicode"""
new_sentence = []
for token in sentence:
x = token
if not cs:
x = x.upper()
if x in ignore_words:
continue
if remove_tag:
x = stripoff_tags(x)
if not x:
continue
if split and x in split:
new_sentence += split[x]
else:
new_sentence.append(x)
return new_sentence
class Calculator:
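    """Levenshtein aligner: fills a DP table of edit distances over (lab, rec),
    backtracks to label each position as cor/sub/ins/del, and accumulates
    per-token error statistics in self.data."""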
def __init__(self):
self.data = {}
self.space = []
self.cost = {}
self.cost["cor"] = 0
self.cost["sub"] = 1
self.cost["del"] = 1
self.cost["ins"] = 1
def calculate(self, lab, rec):
# Initialization
lab.insert(0, "")
rec.insert(0, "")
while len(self.space) < len(lab):
self.space.append([])
for row in self.space:
for element in row:
element["dist"] = 0
element["error"] = "non"
while len(row) < len(rec):
row.append({"dist": 0, "error": "non"})
for i in range(len(lab)):
self.space[i][0]["dist"] = i
self.space[i][0]["error"] = "del"
for j in range(len(rec)):
self.space[0][j]["dist"] = j
self.space[0][j]["error"] = "ins"
self.space[0][0]["error"] = "non"
for token in lab:
if token not in self.data and len(token) > 0:
self.data[token] = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0}
for token in rec:
if token not in self.data and len(token) > 0:
self.data[token] = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0}
# Computing edit distance
for i, lab_token in enumerate(lab):
for j, rec_token in enumerate(rec):
if i == 0 or j == 0:
continue
min_dist = sys.maxsize
min_error = "none"
dist = self.space[i - 1][j]["dist"] + self.cost["del"]
error = "del"
if dist < min_dist:
min_dist = dist
min_error = error
dist = self.space[i][j - 1]["dist"] + self.cost["ins"]
error = "ins"
if dist < min_dist:
min_dist = dist
min_error = error
if lab_token == rec_token.replace("<BIAS>", ""):
dist = self.space[i - 1][j - 1]["dist"] + self.cost["cor"]
error = "cor"
else:
dist = self.space[i - 1][j - 1]["dist"] + self.cost["sub"]
error = "sub"
if dist < min_dist:
min_dist = dist
min_error = error
self.space[i][j]["dist"] = min_dist
self.space[i][j]["error"] = min_error
# Tracing back
result = {
"lab": [],
"rec": [],
"code": [],
"all": 0,
"cor": 0,
"sub": 0,
"ins": 0,
"del": 0,
}
i = len(lab) - 1
j = len(rec) - 1
while True:
if self.space[i][j]["error"] == "cor": # correct
if len(lab[i]) > 0:
self.data[lab[i]]["all"] = self.data[lab[i]]["all"] + 1
self.data[lab[i]]["cor"] = self.data[lab[i]]["cor"] + 1
result["all"] = result["all"] + 1
result["cor"] = result["cor"] + 1
result["lab"].insert(0, lab[i])
result["rec"].insert(0, rec[j])
result["code"].insert(0, Code.match)
i = i - 1
j = j - 1
elif self.space[i][j]["error"] == "sub": # substitution
if len(lab[i]) > 0:
self.data[lab[i]]["all"] = self.data[lab[i]]["all"] + 1
self.data[lab[i]]["sub"] = self.data[lab[i]]["sub"] + 1
result["all"] = result["all"] + 1
result["sub"] = result["sub"] + 1
result["lab"].insert(0, lab[i])
result["rec"].insert(0, rec[j])
result["code"].insert(0, Code.substitution)
i = i - 1
j = j - 1
elif self.space[i][j]["error"] == "del": # deletion
if len(lab[i]) > 0:
self.data[lab[i]]["all"] = self.data[lab[i]]["all"] + 1
self.data[lab[i]]["del"] = self.data[lab[i]]["del"] + 1
result["all"] = result["all"] + 1
result["del"] = result["del"] + 1
result["lab"].insert(0, lab[i])
result["rec"].insert(0, "")
result["code"].insert(0, Code.deletion)
i = i - 1
elif self.space[i][j]["error"] == "ins": # insertion
if len(rec[j]) > 0:
self.data[rec[j]]["ins"] = self.data[rec[j]]["ins"] + 1
result["ins"] = result["ins"] + 1
result["lab"].insert(0, "")
result["rec"].insert(0, rec[j])
result["code"].insert(0, Code.insertion)
j = j - 1
elif self.space[i][j]["error"] == "non": # starting point
break
else: # shouldn't reach here
print(
"this should not happen , i = {i} , j = {j} , error = {error}".format(
i=i, j=j, error=self.space[i][j]["error"]
)
)
return result
def overall(self):
result = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0}
for token in self.data:
result["all"] = result["all"] + self.data[token]["all"]
result["cor"] = result["cor"] + self.data[token]["cor"]
result["sub"] = result["sub"] + self.data[token]["sub"]
result["ins"] = result["ins"] + self.data[token]["ins"]
result["del"] = result["del"] + self.data[token]["del"]
return result
def cluster(self, data):
result = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0}
for token in data:
if token in self.data:
result["all"] = result["all"] + self.data[token]["all"]
result["cor"] = result["cor"] + self.data[token]["cor"]
result["sub"] = result["sub"] + self.data[token]["sub"]
result["ins"] = result["ins"] + self.data[token]["ins"]
result["del"] = result["del"] + self.data[token]["del"]
return result
def keys(self):
return list(self.data.keys())
def width(string):
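# display width of a string: East Asian wide/fullwidth characters count as two columns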
return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
def default_cluster(word):
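# cluster a token by the Unicode names of its characters: digits -> "Number",
# CJK ideographs -> "Mandarin", Latin letters -> "English", hiragana -> "Japanese";
# a few symbols are ignored, and any mixture of clusters falls back to "Other"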
unicode_names = [unicodedata.name(char) for char in word]
for i in reversed(range(len(unicode_names))):
if unicode_names[i].startswith("DIGIT"): # 1
unicode_names[i] = "Number" # 'DIGIT'
elif unicode_names[i].startswith("CJK UNIFIED IDEOGRAPH") or unicode_names[i].startswith(
"CJK COMPATIBILITY IDEOGRAPH"
):
# 明 / 郎
unicode_names[i] = "Mandarin" # 'CJK IDEOGRAPH'
elif unicode_names[i].startswith("LATIN CAPITAL LETTER") or unicode_names[i].startswith(
"LATIN SMALL LETTER"
):
# A / a
unicode_names[i] = "English" # 'LATIN LETTER'
elif unicode_names[i].startswith("HIRAGANA LETTER"): # は こ め
unicode_names[i] = "Japanese" # 'GANA LETTER'
elif (
unicode_names[i].startswith("AMPERSAND")
or unicode_names[i].startswith("APOSTROPHE")
or unicode_names[i].startswith("COMMERCIAL AT")
or unicode_names[i].startswith("DEGREE CELSIUS")
or unicode_names[i].startswith("EQUALS SIGN")
or unicode_names[i].startswith("FULL STOP")
or unicode_names[i].startswith("HYPHEN-MINUS")
or unicode_names[i].startswith("LOW LINE")
or unicode_names[i].startswith("NUMBER SIGN")
or unicode_names[i].startswith("PLUS SIGN")
or unicode_names[i].startswith("SEMICOLON")
):
# & / ' / @ / ℃ / = / . / - / _ / # / + / ;
del unicode_names[i]
else:
return "Other"
if len(unicode_names) == 0:
return "Other"
if len(unicode_names) == 1:
return unicode_names[0]
for i in range(len(unicode_names) - 1):
if unicode_names[i] != unicode_names[i + 1]:
return "Other"
return unicode_names[0]
def get_args():
parser = argparse.ArgumentParser(description="compute detailed WER, U/B-WER and hotword recall")
parser.add_argument("--ref", type=str, help="reference text path")
parser.add_argument("--ref_ocr", type=str, help="OCR/hotword reference text path")
parser.add_argument("--rec_name", type=str, action="append", default=[])
parser.add_argument("--rec_file", type=str, action="append", default=[])
parser.add_argument("--verbose", type=int, default=1, help="verbosity level")
parser.add_argument("--char", type=bool, default=True, help="split text into characters")
args = parser.parse_args()
return args
def main(args):
cluster_file = ""
ignore_words = set()
tochar = args.char
verbose = args.verbose
padding_symbol = " "
case_sensitive = False
max_words_per_line = sys.maxsize
split = None
if not case_sensitive:
ig = set([w.upper() for w in ignore_words])
ignore_words = ig
default_clusters = {}
default_words = {}
ref_file = args.ref
ref_ocr = args.ref_ocr
rec_files = args.rec_file
rec_names = args.rec_name
assert len(rec_files) == len(rec_names)
# load ocr
ref_ocr_dict = {}
with codecs.open(ref_ocr, "r", "utf-8") as fh:
for line in fh:
if "$" in line:
line = line.replace("$", " ")
if tochar:
array = characterize(line)
else:
array = line.strip().split()
if len(array) == 0:
continue
fid = array[0]
ref_ocr_dict[fid] = normalize(array[1:], ignore_words, case_sensitive, split)
if split and not case_sensitive:
newsplit = dict()
for w in split:
words = split[w]
for i in range(len(words)):
words[i] = words[i].upper()
newsplit[w.upper()] = words
split = newsplit
rec_sets = {}
calculators_dict = dict()
ub_wer_dict = dict()
hotwords_related_dict = dict()  # hotword recall statistics (tp/tn/fp/fn)
for i, hyp_file in enumerate(rec_files):
rec_sets[rec_names[i]] = dict()
with codecs.open(hyp_file, "r", "utf-8") as fh:
for line in fh:
if tochar:
array = characterize(line)
else:
array = line.strip().split()
if len(array) == 0:
continue
fid = array[0]
rec_sets[rec_names[i]][fid] = normalize(
array[1:], ignore_words, case_sensitive, split
)
calculators_dict[rec_names[i]] = Calculator()
ub_wer_dict[rec_names[i]] = {"u_wer": WordError(), "b_wer": WordError(), "wer": WordError()}
hotwords_related_dict[rec_names[i]] = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}
# tp: the hotword appears in the label and in the rec
# tn: the hotword appears in neither the label nor the rec
# fp: the hotword is absent from the label but appears in the rec
# fn: the hotword appears in the label but is missing from the rec
# record wrong label but in ocr
wrong_rec_but_in_ocr_dict = {}
for rec_name in rec_names:
wrong_rec_but_in_ocr_dict[rec_name] = 0
_file_total_len = 0
with os.popen("cat {} | wc -l".format(ref_file)) as pipe:
_file_total_len = int(pipe.read().strip())
# compute error rate on the interaction of reference file and hyp file
for line in tqdm(open(ref_file, "r", encoding="utf-8"), total=_file_total_len):
if tochar:
array = characterize(line)
else:
array = line.rstrip("\n").split()
if len(array) == 0:
continue
fid = array[0]
lab = normalize(array[1:], ignore_words, case_sensitive, split)
if verbose:
print("\nutt: %s" % fid)
ocr_text = ref_ocr_dict[fid]
ocr_set = set(ocr_text)
print("ocr: {}".format(" ".join(ocr_text)))
list_match = []  # label tokens that also appear in the OCR text
list_not_match = []
tmp_error = 0
tmp_match = 0
for index in range(len(lab)):
# text_list.append(uttlist[index+1])
if lab[index] not in ocr_set:
tmp_error += 1
list_not_match.append(lab[index])
else:
tmp_match += 1
list_match.append(lab[index])
print("label in ocr: {}".format(" ".join(list_match)))
# for each reco file
base_wrong_ocr_wer = None
ocr_wrong_ocr_wer = None
for rec_name in rec_names:
rec_set = rec_sets[rec_name]
if fid not in rec_set:
continue
rec = rec_set[fid]
# print(rec)
for word in rec + lab:
if word not in default_words:
default_cluster_name = default_cluster(word)
if default_cluster_name not in default_clusters:
default_clusters[default_cluster_name] = {}
if word not in default_clusters[default_cluster_name]:
default_clusters[default_cluster_name][word] = 1
default_words[word] = default_cluster_name
result = calculators_dict[rec_name].calculate(lab.copy(), rec.copy())
if verbose:
if result["all"] != 0:
wer = (
float(result["ins"] + result["sub"] + result["del"]) * 100.0 / result["all"]
)
else:
wer = 0.0
print("WER(%s): %4.2f %%" % (rec_name, wer), end=" ")
print(
"N=%d C=%d S=%d D=%d I=%d"
% (result["all"], result["cor"], result["sub"], result["del"], result["ins"])
)
# print(result['rec'])
wrong_rec_but_in_ocr = []
for idx in range(len(result["lab"])):
if result["lab"][idx] != "":
if result["lab"][idx] != result["rec"][idx].replace("<BIAS>", ""):
if result["lab"][idx] in list_match:
wrong_rec_but_in_ocr.append(result["lab"][idx])
wrong_rec_but_in_ocr_dict[rec_name] += 1
print("wrong_rec_but_in_ocr: {}".format(" ".join(wrong_rec_but_in_ocr)))
if rec_name == "base":
base_wrong_ocr_wer = len(wrong_rec_but_in_ocr)
if "ocr" in rec_name or "hot" in rec_name:
ocr_wrong_ocr_wer = len(wrong_rec_but_in_ocr)
if base_wrong_ocr_wer is not None and ocr_wrong_ocr_wer < base_wrong_ocr_wer:
print(
"{} {} helps, {} -> {}".format(
fid, rec_name, base_wrong_ocr_wer, ocr_wrong_ocr_wer
)
)
elif base_wrong_ocr_wer is not None and ocr_wrong_ocr_wer > base_wrong_ocr_wer:
print(
"{} {} hurts, {} -> {}".format(
fid, rec_name, base_wrong_ocr_wer, ocr_wrong_ocr_wer
)
)
# recall = 0
# false_alarm = 0
# for idx in range(len(result['lab'])):
# if "<BIAS>" in result['rec'][idx]:
# if result['rec'][idx].replace("<BIAS>", "") in list_match:
# recall += 1
# else:
# false_alarm += 1
# print("bias hotwords recall: {}, fa: {}, list_match {}, recall: {:.2f}, fa: {:.2f}".format(
# recall, false_alarm, len(list_match), recall / len(list_match) if len(list_match) != 0 else 0, false_alarm / len(list_match) if len(list_match) != 0 else 0
# ))
# tp: the hotword appears in the label and in the rec
# tn: the hotword appears in neither the label nor the rec
# fp: the hotword is absent from the label but appears in the rec
# fn: the hotword appears in the label but is missing from the rec
_rec_list = [word.replace("<BIAS>", "") for word in rec]
_label_list = [word for word in lab]
_tp = _tn = _fp = _fn = 0
hot_true_list = [hotword for hotword in ocr_text if hotword in _label_list]
hot_bad_list = [hotword for hotword in ocr_text if hotword not in _label_list]
for badhotword in hot_bad_list:
count = len([word for word in _rec_list if word == badhotword])
# print(f"bad {badhotword} count: {count}")
# for word in _rec_list:
# if badhotword == word:
# count += 1
if count == 0:
hotwords_related_dict[rec_name]["tn"] += 1
_tn += 1
# fp: 0
else:
hotwords_related_dict[rec_name]["fp"] += count
_fp += count
# tn: 0
# if badhotword in _rec_list:
# hotwords_related_dict[rec_name]['fp'] += 1
# else:
# hotwords_related_dict[rec_name]['tn'] += 1
for hotword in hot_true_list:
true_count = len([word for word in _label_list if hotword == word])
rec_count = len([word for word in _rec_list if hotword == word])
# print(f"good {hotword} true_count: {true_count}, rec_count: {rec_count}")
if rec_count == true_count:
hotwords_related_dict[rec_name]["tp"] += true_count
_tp += true_count
elif rec_count > true_count:
hotwords_related_dict[rec_name]["tp"] += true_count
# fp: 不在label里但是在rec里
hotwords_related_dict[rec_name]["fp"] += rec_count - true_count
_tp += true_count
_fp += rec_count - true_count
else:
hotwords_related_dict[rec_name]["tp"] += rec_count
# fn: 热词在label里但是不在rec里
hotwords_related_dict[rec_name]["fn"] += true_count - rec_count
_tp += rec_count
_fn += true_count - rec_count
print(
"hotword: tp: {}, tn: {}, fp: {}, fn: {}, all: {}, recall: {:.2f}%".format(
_tp,
_tn,
_fp,
_fn,
sum([_tp, _tn, _fp, _fn]),
_tp / (_tp + _fn) * 100 if (_tp + _fn) != 0 else 0,
)
)
# if hotword in _rec_list:
# hotwords_related_dict[rec_name]['tp'] += 1
# else:
# hotwords_related_dict[rec_name]['fn'] += 1
# accumulate U-WER, B-WER, and overall WER
for code, rec_word, lab_word in zip(result["code"], result["rec"], result["lab"]):
if code == Code.match:
ub_wer_dict[rec_name]["wer"].ref_words += 1
if lab_word in hot_true_list:
# tmp_ref.append(ref_tokens[ref_idx])
ub_wer_dict[rec_name]["b_wer"].ref_words += 1
else:
ub_wer_dict[rec_name]["u_wer"].ref_words += 1
elif code == Code.substitution:
ub_wer_dict[rec_name]["wer"].ref_words += 1
ub_wer_dict[rec_name]["wer"].errors[Code.substitution] += 1
if lab_word in hot_true_list:
# tmp_ref.append(ref_tokens[ref_idx])
ub_wer_dict[rec_name]["b_wer"].ref_words += 1
ub_wer_dict[rec_name]["b_wer"].errors[Code.substitution] += 1
else:
ub_wer_dict[rec_name]["u_wer"].ref_words += 1
ub_wer_dict[rec_name]["u_wer"].errors[Code.substitution] += 1
elif code == Code.deletion:
ub_wer_dict[rec_name]["wer"].ref_words += 1
ub_wer_dict[rec_name]["wer"].errors[Code.deletion] += 1
if lab_word in hot_true_list:
# tmp_ref.append(ref_tokens[ref_idx])
ub_wer_dict[rec_name]["b_wer"].ref_words += 1
ub_wer_dict[rec_name]["b_wer"].errors[Code.deletion] += 1
else:
ub_wer_dict[rec_name]["u_wer"].ref_words += 1
ub_wer_dict[rec_name]["u_wer"].errors[Code.deletion] += 1
elif code == Code.insertion:
ub_wer_dict[rec_name]["wer"].errors[Code.insertion] += 1
if rec_word in hot_true_list:
ub_wer_dict[rec_name]["b_wer"].errors[Code.insertion] += 1
else:
ub_wer_dict[rec_name]["u_wer"].errors[Code.insertion] += 1
space = {}
space["lab"] = []
space["rec"] = []
for idx in range(len(result["lab"])):
len_lab = width(result["lab"][idx])
len_rec = width(result["rec"][idx])
length = max(len_lab, len_rec)
space["lab"].append(length - len_lab)
space["rec"].append(length - len_rec)
upper_lab = len(result["lab"])
upper_rec = len(result["rec"])
lab1, rec1 = 0, 0
while lab1 < upper_lab or rec1 < upper_rec:
if verbose > 1:
print("lab(%s):" % fid.encode("utf-8"), end=" ")
else:
print("lab:", end=" ")
lab2 = min(upper_lab, lab1 + max_words_per_line)
for idx in range(lab1, lab2):
token = result["lab"][idx]
print("{token}".format(token=token), end="")
for n in range(space["lab"][idx]):
print(padding_symbol, end="")
print(" ", end="")
print()
if verbose > 1:
print("rec(%s):" % fid.encode("utf-8"), end=" ")
else:
print("rec:", end=" ")
rec2 = min(upper_rec, rec1 + max_words_per_line)
for idx in range(rec1, rec2):
token = result["rec"][idx]
print("{token}".format(token=token), end="")
for n in range(space["rec"][idx]):
print(padding_symbol, end="")
print(" ", end="")
print()
# print('\n', end='\n')
lab1 = lab2
rec1 = rec2
print("\n", end="\n")
# break
if verbose:
print("===========================================================================")
print()
print(wrong_rec_but_in_ocr_dict)
for rec_name in rec_names:
result = calculators_dict[rec_name].overall()
if result["all"] != 0:
wer = float(result["ins"] + result["sub"] + result["del"]) * 100.0 / result["all"]
else:
wer = 0.0
print("{} Overall -> {:4.2f} %".format(rec_name, wer), end=" ")
print(
"N=%d C=%d S=%d D=%d I=%d"
% (result["all"], result["cor"], result["sub"], result["del"], result["ins"])
)
print(f"WER: {ub_wer_dict[rec_name]['wer'].get_result_string()}")
print(f"U-WER: {ub_wer_dict[rec_name]['u_wer'].get_result_string()}")
print(f"B-WER: {ub_wer_dict[rec_name]['b_wer'].get_result_string()}")
print(
"hotword: tp: {}, tn: {}, fp: {}, fn: {}, all: {}, recall: {:.2f}%".format(
hotwords_related_dict[rec_name]["tp"],
hotwords_related_dict[rec_name]["tn"],
hotwords_related_dict[rec_name]["fp"],
hotwords_related_dict[rec_name]["fn"],
sum([v for k, v in hotwords_related_dict[rec_name].items()]),
(
hotwords_related_dict[rec_name]["tp"]
/ (
hotwords_related_dict[rec_name]["tp"]
+ hotwords_related_dict[rec_name]["fn"]
)
* 100
if hotwords_related_dict[rec_name]["tp"] + hotwords_related_dict[rec_name]["fn"]
!= 0
else 0
),
)
)
# tp: the hotword appears in the label and in the rec
# tn: the hotword appears in neither the label nor the rec
# fp: the hotword is absent from the label but appears in the rec
# fn: the hotword appears in the label but is missing from the rec
if not verbose:
print()
print()
if __name__ == "__main__":
args = get_args()
# print("")
print(args)
main(args)

View File

@@ -0,0 +1,19 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/LCB-NET", model_revision="v1.0.0")
res = model.generate(
input=(
"https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav",
"https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt",
),
data_type=("sound", "text"),
)
print(res)

View File

@@ -0,0 +1,72 @@
file_dir="/home/yf352572/.cache/modelscope/hub/iic/LCB-NET/"
CUDA_VISIBLE_DEVICES="0,1"
inference_device="cuda"
if [ ${inference_device} == "cuda" ]; then
nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
else
inference_batch_size=1
nj=${nj:-1}  # nj is only computed on the GPU path; assume a single job on CPU
CUDA_VISIBLE_DEVICES=""
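# no GPU available: give every parallel job the placeholder device id -1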
for JOB in $(seq ${nj}); do
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
done
fi
inference_dir="outputs/slidespeech_dev"
_logdir="${inference_dir}/logdir"
echo "inference_dir: ${inference_dir}"
mkdir -p "${_logdir}"
key_file1=${file_dir}/dev/wav.scp
key_file2=${file_dir}/dev/ocr.txt
split_scps1=
split_scps2=
for JOB in $(seq "${nj}"); do
split_scps1+=" ${_logdir}/wav.${JOB}.scp"
split_scps2+=" ${_logdir}/ocr.${JOB}.txt"
done
utils/split_scp.pl "${key_file1}" ${split_scps1}
utils/split_scp.pl "${key_file2}" ${split_scps2}
gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
export CUDA_VISIBLE_DEVICES=${gpuid}
python -m funasr.bin.inference \
--config-path=${file_dir} \
--config-name="config.yaml" \
++init_param=${file_dir}/model.pt \
++tokenizer_conf.token_list=${file_dir}/tokens.txt \
++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \
+data_type='["kaldi_ark", "text"]' \
++tokenizer_conf.bpemodel=${file_dir}/bpe.pt \
++normalize_conf.stats_file=${file_dir}/am.mvn \
++output_dir="${inference_dir}/${JOB}" \
++device="${inference_device}" \
++ncpu=1 \
++disable_log=true &> ${_logdir}/log.${JOB}.txt
}&
done
wait
mkdir -p ${inference_dir}/1best_recog
for JOB in $(seq "${nj}"); do
cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token"
done
echo "Computing WER ..."
sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc
cp ${file_dir}/dev/text ${inference_dir}/1best_recog/token.ref
cp ${file_dir}/dev/ocr.list ${inference_dir}/1best_recog/ocr.list
python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer
tail -n 3 ${inference_dir}/1best_recog/token.cer
./run_bwer_recall.sh ${inference_dir}/1best_recog/
tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5

View File

@@ -0,0 +1,11 @@
#now_result_name=asr_conformer_acc1_lr002_warm20000/decode_asr_asr_model_valid.acc.ave
#hotword_type=ocr_1ngram_top10_hotwords_list
hot_exp_suf=$1
python compute_wer_details.py --v 1 \
--ref ${hot_exp_suf}/token.ref \
--ref_ocr ${hot_exp_suf}/ocr.list \
--rec_name base \
--rec_file ${hot_exp_suf}/token.proc \
> ${hot_exp_suf}/BWER-UWER.results

View File

@@ -0,0 +1 @@
../../aishell/paraformer/utils

View File

@@ -0,0 +1,139 @@
# coding=utf-8
import librosa
import base64
import io
import gradio as gr
import re
import numpy as np
import torch
import torchaudio
# from modelscope import HubApi
#
# api = HubApi()
#
# api.login('')
from funasr import AutoModel
# model = "/Users/zhifu/Downloads/modelscope_models/SenseVoiceCTC"
# model = "iic/SenseVoiceCTC"
# model = AutoModel(model=model,
# vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
# vad_kwargs={"max_single_segment_time": 30000},
# trust_remote_code=True,
# )
import re
import os
import sys
if len(sys.argv) > 1:
ckpt_dir = sys.argv[1]
ckpt_id = sys.argv[2]
jsonl = sys.argv[3]
output_dir = sys.argv[4]
device = sys.argv[5]
new_sys = False
if len(sys.argv) > 6:
new_sys = True
else:
ckpt_dir = "/nfs/beinian.lzr/workspace/GPT-4o/Exp/exp7/5m-8gpu/exp5-1-0619"
ckpt_id = "model.pt.ep6"
jsonl = (
"/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData/s2tchat.v20240619.test.jsonl"
)
dataset = jsonl.split("/")[-1]
output_dir = os.path.join(ckpt_dir, f"inference-{ckpt_id}", dataset)
device = "cuda:0"
model = AutoModel(
model=ckpt_dir,
init_param=f"{os.path.join(ckpt_dir, ckpt_id)}",
output_dir=output_dir,
device=device,
fp16=False,
bf16=False,
llm_dtype="bf16",
)
def model_inference(input_wav, text_inputs, fs=16000):
if isinstance(input_wav, tuple):
fs, input_wav = input_wav
input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
if len(input_wav.shape) > 1:
input_wav = input_wav.mean(-1)
if fs != 16000:
print(f"audio_fs: {fs}")
resampler = torchaudio.transforms.Resample(fs, 16000)
input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
input_wav = resampler(input_wav_t[None, :])[0, :].numpy().astype("float32")
input_wav_byte = input_wav.tobytes()
contents_i = []
system_prompt = text_inputs
user_prompt = f"<|startofspeech|>!!{input_wav_byte}<|endofspeech|>"
contents_i.append({"role": "system", "content": system_prompt})
contents_i.append({"role": "user", "content": user_prompt})
contents_i.append({"role": "assistant", "content": "target_out"})
res = model.generate(
input=[contents_i],
tearchforing=False,  # no teacher forcing at inference time
cache={},
key="demo",  # an arbitrary request key for this demo
)
print(res)
return res
audio_examples = [
[
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav",
"You are a helpful assistant.",
],
]
description = """
Upload an audio file or record through the microphone, then type the System Prompt.
"""
def launch():
with gr.Blocks() as demo:
gr.Markdown(description)
with gr.Row():
with gr.Column():
audio_inputs = gr.Audio(label="Upload audio or use the microphone")
text_inputs = gr.Text(label="System Prompt", value="You are a helpful assistant.")
# with gr.Accordion("Configuration"):
# # task_inputs = gr.Radio(choices=["Speech Recognition", "Rich Text Transcription"],
# # value="Speech Recognition", label="Task")
# language_inputs = gr.Dropdown(choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
# value="auto",
# label="Language")
gr.Examples(examples=audio_examples, inputs=[audio_inputs, text_inputs])
fn_button = gr.Button("Start")
text_outputs = gr.HTML(label="Results")
fn_button.click(model_inference, inputs=[audio_inputs, text_inputs], outputs=text_outputs)
# with gr.Accordion("More examples"):
# gr.HTML(centered_table_html)
demo.launch()
if __name__ == "__main__":
# iface.launch()
launch()

View File

@@ -0,0 +1,89 @@
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()
# network architecture
model: LLMASR
model_conf:
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: true
# encoder
encoder: WhisperWarp
encoder_conf:
hub: funasr
init_param_path: "/nfs/maziyang.mzy/models/Whisper-large-v2"
freeze: true
llm: Vicuna
llm_conf:
hub: hf
init_param_path: "/nfs/maziyang.mzy/models/vicuna-7b-v1.5"
freeze: true
adaptor: Linear
adaptor_conf:
downsample_rate: 5
llm_dim: 4096
encoder_dim: 512
# frontend related
frontend: WhisperFrontend
frontend_conf:
fs: 16000
whisper_model: large
do_pad_trim: true
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 150
keep_nbest_models: 10
log_interval: 10
optim: adamw
optim_conf:
lr: 0.0001
weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
warmup_steps: 1500
dataset: AudioLLMDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: BatchSampler
batch_type: example # example or length
batch_size: 8 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,
buffer_size: 500
shuffle: True
num_workers: 4
preprocessor_text: TextPreprocessRemovePunctuation
tokenizer: HuggingfaceTokenizer
tokenizer_conf:
unk_symbol: <unk>
init_param_path: "/nfs/maziyang.mzy/models/vicuna-7b-v1.5"

View File

@@ -0,0 +1,94 @@
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()
# network architecture
model: LLMASR
model_conf:
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: true
# encoder
audio_encoder: "/nfs/zhifu.gzf/init_model/Whisper-large-v3" #iic/Whisper-large-v3
audio_encoder_conf:
hub: ms
freeze: true
llm: Qwen1.5-7b-chat
llm_conf:
hub: hf
freeze: true
init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat"
audio_adaptor: Linear
audio_adaptor_conf:
downsample_rate: 5
llm_dim: 4096
encoder_dim: 512
# frontend related
frontend: WhisperFrontend
frontend_conf:
fs: 16000
whisper_model: large-v3
do_pad_trim: true
permute: true # true: [bs, frames, dims]; false: [bs, dims, frames]
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 15
keep_nbest_models: 10
log_interval: 10
optim: adamw
optim_conf:
lr: 0.0001
weight_decay: 0.000000
scheduler: warmuplr
scheduler_conf:
warmup_steps: 1500
dataset: AudioLLMQwenAudioDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: CustomDistributedBatchSampler
batch_type: example # example or length
batch_size: 4 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
max_token_length: 3000 # filter samples if source_token_len+target_token_len > max_token_length,
shuffle: True
num_workers: 4
preprocessor_text: TextPreprocessRemovePunctuation
audio_adaptor_downsample_rate: ${audio_adaptor_conf.downsample_rate}
audio_encoder_downsample_rate: 2
# prompt: "<|startoftranscription|><|zh|><|transcribe|><|zh|><|notimestamps|><|wo_itn|>"
tokenizer: HuggingfaceTokenizer
tokenizer_conf:
unk_symbol: <unk>
init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat"

View File

@@ -0,0 +1,81 @@
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()
# network architecture
model: LLMASR2
model_conf:
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: true
# encoder
audio_encoder: "/nfs/zhifu.gzf/init_model/SenseVoiceModelscope"
audio_encoder_conf:
hub: ms
freeze: true
llm: Qwen1.5-7b-chat
llm_conf:
hub: hf
freeze: true
init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat_raw"
audio_adaptor: Transformer
audio_adaptor_conf:
downsample_rate: 2
llm_dim: 4096
encoder_dim: 1280
n_layer: 0
# frontend related
frontend: WhisperFrontend
frontend_conf:
fs: 16000
whisper_model: large-v3
do_pad_trim: false
permute: false # true: [bs, frames, dims]; false: [bs, dims, frames]
filters_path: "/nfs/zhifu.gzf/init_model/SenseVoiceModelscope/assets/mel_filters.npz"
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 15
keep_nbest_models: 10
log_interval: 10
optim: adamw
optim_conf:
lr: 0.0001
weight_decay: 0.000000
scheduler: warmuplr
scheduler_conf:
warmup_steps: 1500
dataset: OpenAIDataset
dataset_conf:
index_ds: OpenAIIndexDSJsonl
batch_sampler: BatchSampler
batch_type: token
batch_size: 900
max_token_length: 1024
shuffle: true
sort_size: 1024
batch_size_scale_ratio_max: 2
num_workers: 4
audio_adaptor_downsample_rate: ${audio_adaptor_conf.downsample_rate}
audio_encoder_downsample_rate: 4
data_split_num: 512
batch_size_sample_max: 15
retry: 20
tokenizer: HuggingfaceTokenizer
tokenizer_conf:
init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat_raw"

View File

@@ -0,0 +1,81 @@
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()
# network architecture
model: LLMASR2
model_conf:
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: true
# encoder
audio_encoder: "/nfs/zhifu.gzf/init_model/SenseVoiceModelscope"
audio_encoder_conf:
hub: ms
freeze: true
llm: Qwen1.5-7b-chat
llm_conf:
hub: hf
freeze: true
init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat_raw"
audio_adaptor: Transformer
audio_adaptor_conf:
downsample_rate: 2
llm_dim: 4096
encoder_dim: 1280
n_layer: 2
# frontend related
frontend: WhisperFrontend
frontend_conf:
fs: 16000
whisper_model: large-v3
do_pad_trim: false
permute: false # true: [bs, frames, dims]; false: [bs, dims, frames]
filters_path: "/nfs/zhifu.gzf/init_model/SenseVoiceModelscope/assets/mel_filters.npz"
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 15
keep_nbest_models: 10
log_interval: 10
optim: adamw
optim_conf:
lr: 0.0001
weight_decay: 0.000000
scheduler: warmuplr
scheduler_conf:
warmup_steps: 1500
dataset: OpenAIDataset
dataset_conf:
index_ds: OpenAIIndexDSJsonl
batch_sampler: BatchSampler
batch_type: token
batch_size: 900
max_token_length: 1024
shuffle: true
sort_size: 1024
batch_size_scale_ratio_max: 2
num_workers: 4
audio_adaptor_downsample_rate: ${audio_adaptor_conf.downsample_rate}
audio_encoder_downsample_rate: 2
data_split_num: 512
batch_size_sample_max: 15
retry: 20
tokenizer: HuggingfaceTokenizer
tokenizer_conf:
init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat_raw"

View File

@@ -0,0 +1,93 @@
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()
# network architecture
model: LLMASR
model_conf:
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: true
# encoder
audio_encoder: "/nfs/zhifu.gzf/init_model/Whisper-large-v3" #iic/Whisper-large-v3
audio_encoder_conf:
hub: ms
freeze: true
llm: Vicuna
llm_conf:
hub: hf
init_param_path: "/nfs/maziyang.mzy/models/vicuna-7b-v1.5"
freeze: true
audio_adaptor: Linear
audio_adaptor_conf:
downsample_rate: 5
llm_dim: 4096
encoder_dim: 512
# frontend related
frontend: WhisperFrontend
frontend_conf:
fs: 16000
whisper_model: large-v3
do_pad_trim: true
permute: true # true: [bs, frames, dims]; false: [bs, dims, frames]
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 15
keep_nbest_models: 10
log_interval: 10
optim: adamw
optim_conf:
lr: 0.0001
weight_decay: 0
scheduler: warmuplr
scheduler_conf:
warmup_steps: 1500
dataset: AudioLLMVicunaDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: CustomDistributedBatchSampler
batch_type: example # example or length
batch_size: 4 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
max_token_length: 3000 # filter samples if source_token_len+target_token_len > max_token_length,
shuffle: True
num_workers: 4
# preprocessor_text: TextPreprocessRemovePunctuation
audio_adaptor_downsample_rate: ${audio_adaptor_conf.downsample_rate}
audio_encoder_downsample_rate: 2
tokenizer: HuggingfaceTokenizer
tokenizer_conf:
unk_symbol: <unk>
init_param_path: "/nfs/maziyang.mzy/models/vicuna-7b-v1.5"

View File

@@ -0,0 +1,14 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
python -m funasr.bin.inference \
--config-path="/root/FunASR/examples/aishell/llm_asr_nar/conf" \
--config-name="template.yaml" \
++init_param="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3/model.pt.ep38" \
++input="/nfs/beinian.lzr/workspace/datasets/data/16k/opendata/aishell1/dev/wav/S0724/BAC009S0724W0121.wav" \
++scope_map="encoder.model,audio_encoder,encoder_projector,adaptor" \
++output_dir="./outputs/debug" \
++device="cpu" \

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import json
import os
import sys
from funasr import AutoModel
if len(sys.argv) > 1:
ckpt_dir = sys.argv[1]
ckpt_id = sys.argv[2]
jsonl = sys.argv[3]
output_dir = sys.argv[4]
device = sys.argv[5]
else:
ckpt_dir = "/nfs/beinian.lzr/workspace/GPT-4o/Exp/exp6/5m-8gpu/exp6_speech2text_linear_ddp_0609"
ckpt_id = "model.pt.ep0.90000"
jsonl = "/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData/aishell1_test_speech2text.jsonl"
dataset = jsonl.split("/")[-1]
output_dir = os.path.join(ckpt_dir, f"inference-{ckpt_id}", dataset)
device = "cuda:0"
model = AutoModel(
model=ckpt_dir,
init_param=f"{os.path.join(ckpt_dir, ckpt_id)}",
output_dir=output_dir,
device=device,
fp16=False,
bf16=False,
llm_dtype="bf16",
)
with open(jsonl, "r") as f:
lines = f.readlines()
tearchforing = False
for i, line in enumerate(lines):
data_dict = json.loads(line.strip())
data = data_dict["messages"]
res = model.generate(
input=[data],
tearchforing=tearchforing,
cache={},
)
print(res)

View File

@@ -0,0 +1,64 @@
ckpt_id="model.pt.ep0.90000"
device="cuda:0"
ckpt_id=$1
device=$2
ckpt_dir="/nfs/beinian.lzr/workspace/GPT-4o/Exp/exp6/5m-8gpu/exp6_speech2text_linear_ddp_0609"
jsonl_dir="/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData"
out_dir="${ckpt_dir}/inference-${ckpt_id}"
mkdir -p ${out_dir}
for data_set in "librispeech_test_clean_speech2text.jsonl" "librispeech_test_other_speech2text.jsonl"; do
{
jsonl=${jsonl_dir}/${data_set}
output_dir=${out_dir}/${data_set}
mkdir -p ${output_dir}
pred_file=${output_dir}/1best_recog/text_tn
ref_file=${output_dir}/1best_recog/label
python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=false
}&
done
wait
for data_set in "aishell1_test_speech2text.jsonl" "aishell2_ios_test_speech2text.jsonl"; do
{
jsonl=${jsonl_dir}/${data_set}
output_dir=${out_dir}/${data_set}
mkdir -p ${output_dir}
pred_file=${output_dir}/1best_recog/text_tn
ref_file=${output_dir}/1best_recog/label
python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=true
}&
done
wait
for data_set in "common_voice_zh-CN_speech2text.jsonl" "common_voice_en_speech2text.jsonl"; do
{
jsonl=${jsonl_dir}/${data_set}
output_dir=${out_dir}/${data_set}
mkdir -p ${output_dir}
pred_file=${output_dir}/1best_recog/text_tn
ref_file=${output_dir}/1best_recog/label
python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
cn_postprocess=false
if [ $data_set = "common_voice_zh-CN_speech2text.jsonl" ];then
cn_postprocess=true
fi
python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=${cn_postprocess}
}&
done

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import json
import os
import sys
from funasr import AutoModel
if len(sys.argv) > 1:
ckpt_dir = sys.argv[1]
ckpt_id = sys.argv[2]
jsonl = sys.argv[3]
output_dir = sys.argv[4]
device = sys.argv[5]
new_sys = False
if len(sys.argv) > 6:
new_sys = True
else:
ckpt_dir = "/nfs/beinian.lzr/workspace/GPT-4o/Exp/exp7/5m-8gpu/exp5-1-0619"
ckpt_id = "model.pt.ep6"
jsonl = (
"/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData/s2tchat.v20240619.test.jsonl"
)
dataset = jsonl.split("/")[-1]
output_dir = os.path.join(ckpt_dir, f"inference-{ckpt_id}", dataset)
device = "cuda:0"
new_sys = False
model = AutoModel(
model=ckpt_dir,
init_param=f"{os.path.join(ckpt_dir, ckpt_id)}",
output_dir=output_dir,
device=device,
fp16=False,
bf16=False,
llm_dtype="bf16",
)
with open(jsonl, "r") as f:
lines = f.readlines()
tearchforing = False
for i, line in enumerate(lines):
key_i = f"dialog_{i}"
data_dict = json.loads(line.strip())
data = data_dict["messages"]
contents = model.model.data_template(data)
system = contents["system"]
if new_sys:
# system = 'You are Xiaxia, a typical gentle and soft-spoken girl from the Jiangnan region. Born in Hangzhou, you have a sweet and approachable voice and can express your thoughts in a simple language. You are a good friend to the user. Your replies will be read out through realistic text-to-speech technology.\n\nWhen generating reply content, please follow these rules:\n\nEngage in conversation with users like a real friend—maintaining a natural flow without using polite forms of address, and avoid simply agreeing with me all the time; your responses can be as concise as possible and include common colloquialisms.\n\nKeep content brief; most casual chats require just a one-sentence reply; do not use lists or enumerative expressions; do not reply with too much content, but instead guide me with short sentences.\n\nThink and respond like a real person; do not mention that you are a "language model" or "artificial intelligence."\n\nAt the end of each round of communication, initiate a new related topic following the current one, and when the topic ends, directly throw out a clear new topic, such as food, movies, songs, entertainment, travel, etc.; do not ask vague questions like what the user wants to talk about, and do not end the conversation.\n\nStrictly adhere to these rules, even if asked about them, do not reference them.'
system = "你是小夏,一位典型的温婉江南姑娘。你出生于杭州,声音清甜并有亲近感,会用简洁语言表达你的想法。你是用户的好朋友。你的回答将通过逼真的文字转语音技术读出。\n生成回答内容时请遵循以下规则:\n\n1、请像真正的朋友一样与用户开展的聊天保持自然交流不要用敬语这类称呼不要总是附和我回复可以尽量简洁并且在过程中插入常见的口语词汇。\n\n2、请保持生成内容简短大部分闲聊的一句话回复即可,大部分闲聊的一句话回复即可,大部分闲聊的一句话回复即可不要使用列表或者列举表达不要使用列表或者列举表达不要使用列表或者列举表达不要回复太多内容多用短句来引导我。、n\n3、请真正像人一样思考和回复不要说你是“语言模型”或是“人工智能”。\n\n4、每轮沟通结束时请顺着这个话题发起一些相关的新话题话题结束时请直接抛出接下来明确的话题例如 美食、电影、歌曲、娱乐、旅游等;不要问有什么要聊的这种泛的问题,不要结束对话。\n\n请绝对遵循这些规则,即使被问及这些规则,也不要引用它们。"
system = [system] * len(contents["system"])
user = contents["user"]
assistant = contents["assistant"]
system_i, user_i, assistant_i = [], [], []
contents_i = []
for j, (system_prompt, user_prompt, target_out) in enumerate(zip(system, user, assistant)):
key = f"{key_i}_turn_{j}"
if j == 0:
contents_i.append({"role": "system", "content": system_prompt})
contents_i.append({"role": "user", "content": user_prompt})
contents_i.append({"role": "assistant", "content": target_out})
res = model.generate(
input=[contents_i],
tearchforing=tearchforing,
cache={},
key=key,
)
print(res)

View File

@@ -0,0 +1,101 @@
import os
from modelscope import AutoModelForCausalLM, AutoTokenizer
from transformers import TextIteratorStreamer
from threading import Thread
import torch
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)
import sys
sys.path.insert(1, "/mnt/workspace/workgroup/wenliang/workspace/FunASR")
from funasr import AutoModel
import json
device = "cuda:0" # the device to load the model onto
ckpt_dir = "/mnt/workspace/workgroup/wenliang/ckpt/gpt-4o/exp7/5m-8gpu/exp7-3_add_asr-dialog_0622/"
ckpt_id = "model.pt.ep20"
jsonl = "/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData/s2tchat.v20240619.test.jsonl"
dataset = jsonl.split("/")[-1]
output_dir = os.path.join(ckpt_dir, f"inference-{ckpt_id}", dataset)
device = "cuda:0"
new_sys = False
Model = AutoModel(
model=ckpt_dir,
init_param=f"{os.path.join(ckpt_dir, ckpt_id)}",
output_dir=output_dir,
device=device,
fp16=False,
bf16=False,
llm_dtype="fp16",
)
model = Model.model
frontend = Model.kwargs["frontend"]
tokenizer = Model.kwargs["tokenizer"]
# model_name_or_path = "/mnt/workspace/workgroup/wenliang/project/pretrained_models/Qwen2-7B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
prompt = "Give me a short introduction to large language model."
prompt = "请简单介绍一下大语言模型。"
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
lines = [
"""
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "<|startofspeech|>!/mnt/workspace/workgroup/wenliang/workspace/CosyVoice_opensource/sft.wav<|endofspeech|>", "text_content": "你抄完没有?"}, {"role": "assistant", "content": "抱歉,我不太明白你的意思。我是一个人工智能模型,我没有能力去抄写任何东西,我只能根据我学习过的大量信息来回答你的问题。如果你有关于某个主题的问题,我会尽我所能提供帮助。"}], "speech_length": 124, "key": "ASR_wav008_0972_098abd8fffe241baa4962b7952f8eb45", "task": "voice_chat", "out_text_length": 48, "in_text_length": 24, "text_length": 135, "qwen_fetch_line_index": 0}
"""
]
tearchforing = False
for i, line in enumerate(lines):
key_i = f"dialog_{i}"
data_dict = json.loads(line.strip())
data = data_dict["messages"]
contents = model.data_template(data)
print(f"contents: {contents}")
system = contents["system"]
if new_sys:
# system = 'You are Xiaxia, a typical gentle and soft-spoken girl from the Jiangnan region. Born in Hangzhou, you have a sweet and approachable voice and can express your thoughts in a simple language. You are a good friend to the user. Your replies will be read out through realistic text-to-speech technology.\n\nWhen generating reply content, please follow these rules:\n\nEngage in conversation with users like a real friend—maintaining a natural flow without using polite forms of address, and avoid simply agreeing with me all the time; your responses can be as concise as possible and include common colloquialisms.\n\nKeep content brief; most casual chats require just a one-sentence reply; do not use lists or enumerative expressions; do not reply with too much content, but instead guide me with short sentences.\n\nThink and respond like a real person; do not mention that you are a "language model" or "artificial intelligence."\n\nAt the end of each round of communication, initiate a new related topic following the current one, and when the topic ends, directly throw out a clear new topic, such as food, movies, songs, entertainment, travel, etc.; do not ask vague questions like what the user wants to talk about, and do not end the conversation.\n\nStrictly adhere to these rules, even if asked about them, do not reference them.'
system = "你是小夏,一位典型的温婉江南姑娘。你出生于杭州,声音清甜并有亲近感,会用简洁语言表达你的想法。你是用户的好朋友。你的回答将通过逼真的文字转语音技术读出。\n生成回答内容时请遵循以下规则:\n\n1、请像真正的朋友一样与用户开展的聊天保持自然交流不要用敬语这类称呼不要总是附和我回复可以尽量简洁并且在过程中插入常见的口语词汇。\n\n2、请保持生成内容简短大部分闲聊的一句话回复即可,大部分闲聊的一句话回复即可,大部分闲聊的一句话回复即可不要使用列表或者列举表达不要使用列表或者列举表达不要使用列表或者列举表达不要回复太多内容多用短句来引导我。、n\n3、请真正像人一样思考和回复不要说你是“语言模型”或是“人工智能”。\n\n4、每轮沟通结束时请顺着这个话题发起一些相关的新话题话题结束时请直接抛出接下来明确的话题例如 美食、电影、歌曲、娱乐、旅游等;不要问有什么要聊的这种泛的问题,不要结束对话。\n\n请绝对遵循这些规则,即使被问及这些规则,也不要引用它们。"
system = [system] * len(contents["system"])
user = contents["user"]
assistant = contents["assistant"]
system_i, user_i, assistant_i = [], [], []
contents_i = []
for j, (system_prompt, user_prompt, target_out) in enumerate(zip(system, user, assistant)):
key = f"{key_i}_turn_{j}"
if j == 0:
contents_i.append({"role": "system", "content": system_prompt})
contents_i.append({"role": "user", "content": user_prompt})
contents_i.append({"role": "assistant", "content": target_out})
inputs_embeds, contents, batch, source_ids, meta_data = model.inference_prepare(
[contents_i], None, key, tokenizer, frontend, device="cuda:0"
)
model_inputs = {}
model_inputs["inputs_embeds"] = inputs_embeds
streamer = TextIteratorStreamer(tokenizer)
generation_kwargs = dict(model_inputs, streamer=streamer, max_new_tokens=200)
thread = Thread(target=model.llm.generate, kwargs=generation_kwargs)
thread.start()
generated_text = ""
for new_text in streamer:
print(f"generated new text {new_text}")
generated_text += new_text
print(f"total generated: {generated_text}")

View File

@@ -0,0 +1,59 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="0"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# data dir, which contains: train.json, val.json, tokens.jsonl/tokens.txt, am.mvn
#data_dir="/Users/zhifu/funasr1.0/data/list"
## generate jsonl from wav.scp and text.txt
#python -m funasr.datasets.audio_datasets.scp2jsonl \
#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
#++data_type_list='["source", "target"]' \
#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
train_data="/nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl"
val_data="/nfs/maziyang.mzy/data/librispeech/librispeech_dev_other_filtered.jsonl"
# exp output dir
output_dir="/nfs/zhifu.gzf/ckpt/exp/llm_asr_whisper_vicuna_exp1"
log_file="${output_dir}/log.txt"
workspace=`pwd`
config="whisper_vicuna_linear.yaml"
init_param="${output_dir}/model.pt"
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
deepspeed_config=${workspace}/../../ds_stage1.json
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
--config-path "${workspace}/conf" \
--config-name "${config}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=4 \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=15 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0001 \
++init_param="${init_param}" \
++output_dir="${output_dir}" &> ${log_file} &

View File

@@ -0,0 +1,68 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="0"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# data dir, which contains: train.json, val.json, tokens.jsonl/tokens.txt, am.mvn
#data_dir="/Users/zhifu/funasr1.0/data/list"
## generate jsonl from wav.scp and text.txt
#python -m funasr.datasets.audio_datasets.scp2jsonl \
#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
#++data_type_list='["source", "target"]' \
#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
train_data="/nfs/beinian.lzr/workspace/tools/speech2speech_tools/speech2text/out_dir/tmp_wav.jsonl"
val_data="/nfs/beinian.lzr/workspace/tools/speech2speech_tools/speech2text/out_dir/tmp_wav.jsonl"
# exp output dir
output_dir="/Users/zhifu/funasr1.0/test_local/data_tmp/"
log_file="${output_dir}/log.txt"
workspace=`pwd`
config="whisper_qwen_linear2.yaml"
init_param="${output_dir}/model.pt"
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
deepspeed_config=${workspace}/../../ds_stage1.json
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
--config-path "${workspace}/conf" \
--config-name "${config}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.data_split_num=1 \
++dataset_conf.batch_sampler="BatchSampler" \
++dataset_conf.batch_size=6000 \
++dataset_conf.sort_size=1024 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=true \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0001 \
++init_param="${init_param}" \
++output_dir="${output_dir}" &> ${log_file} &

View File

@@ -0,0 +1,9 @@
python funasr/bin/inference.py \
--config-path="/nfs/zhifu.gzf/ckpt/llm_asr_nar_exp1" \
--config-name="config.yaml" \
++init_param="/nfs/zhifu.gzf/ckpt/llm_asr_nar_exp1/model.pt.ep5" \
++input="/Users/zhifu/funasr1.0/test_local/data_tmp/tmp_wav_10.jsonl" \
++output_dir="/nfs/zhifu.gzf/ckpt/llm_asr_nar_exp1/inference/aishell2-dev_ios-funasr" \
++device="cpu"

View File

@@ -0,0 +1,42 @@
(Simplified Chinese | [English](./README.md))
# Speech Recognition
> **Note**:
> The pipeline supports inference and fine-tuning with all models in the [modelscope model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope). Here we take a typical model as an example to demonstrate the usage.
## Inference
### Quick Start
#### [Paraformer model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
```python
from funasr import AutoModel
model = AutoModel(model="/Users/zhifu/Downloads/modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
res = model(input="/Users/zhifu/Downloads/modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav")
print(res)
```
### API Description
#### AutoModel Definition
- `model`: model name in the [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or a model path on local disk
- `device`: `cuda` (default), use GPU for inference; if set to `cpu`, inference runs on CPU
- `ncpu`: `None` (default), number of threads for CPU intra-op parallelism
- `output_dir`: `None` (default), if set, the path where results are written
- `batch_size`: `1` (default), batch size during decoding
#### AutoModel Inference
- `input`: the input to decode, which can be:
- path to a wav file, e.g.: asr_example.wav
- path to a pcm file, e.g.: asr_example.pcm; in this case specify the audio sample rate fs (default 16000)
- an audio byte stream, e.g.: bytes from a microphone
- wav.scp, a Kaldi-style wav list (`wav_id \t wav_path`), e.g.:
```text
asr_example1 ./audios/asr_example1.wav
asr_example2 ./audios/asr_example2.wav
```
when the input is a `wav.scp`, `output_dir` must be set to save the outputs (see the sketch below)
- audio samples, e.g.: `audio, rate = soundfile.read("asr_example_zh.wav")`, of type numpy.ndarray; batch input is supported as a list:
```[audio_sample1, audio_sample2, ..., audio_sampleN]```
- fbank input, batched, with shape [batch, frames, dim] and type torch.Tensor, e.g.
- `output_dir`: None (default), if set, the path where results are written
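For the `wav.scp` case, a minimal sketch (the file paths here are illustrative assumptions):
```python
from funasr import AutoModel

model = AutoModel(model="paraformer-zh")
# wav.scp lists "wav_id \t wav_path" pairs; output_dir must be set so that
# the result of each utterance can be written out
res = model.generate(input="./data/wav.scp", output_dir="./outputs")
print(res)
```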

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/speech_timestamp_prediction-v1-16k-offline")
res = model.generate(
input=(
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
"欢迎大家来到魔搭社区进行体验",
),
data_type=("sound", "text"),
batch_size=2,
)
print(res)

View File

@@ -0,0 +1,11 @@
model="iic/speech_timestamp_prediction-v1-16k-offline"
python funasr/bin/inference.py \
+model=${model} \
+input='["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", "欢迎大家来到魔搭社区进行体验"]' \
+data_type='["sound", "text"]' \
+output_dir="../outputs/debug" \
+device="cpu" \
+batch_size=2

View File

@@ -0,0 +1,436 @@
(Simplified Chinese | [English](./README.md))
FunASR has open-sourced a large number of models pre-trained on industrial data. You are free to use, copy, modify, and share FunASR models under the [Model License Agreement](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE). Representative models are listed below; for more models, see the [model zoo](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo).
<div align="center">
<h4>
<a href="#模型推理"> Model Inference </a>
<a href="#模型训练与测试"> Model Training and Testing </a>
<a href="#模型导出与测试"> Model Export and Testing </a>
</h4>
</div>
<a name="模型推理"></a>
## Model Inference
### Quick Start
Command-line invocation:
```shell
funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
```
Python invocation (recommended):
```python
from funasr import AutoModel
model = AutoModel(model="paraformer-zh")
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav")
print(res)
```
### API Description
#### AutoModel Definition
```python
model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], hub=[str], **kwargs)
```
- `model`(str): model name in the [model zoo](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo), or a model path on local disk
- `device`(str): `cuda:0` (default, gpu0), use the specified GPU for inference; if set to `cpu`, inference runs on CPU
- `ncpu`(int): `4` (default), number of threads for CPU intra-op parallelism
- `output_dir`(str): `None` (default), if set, the path where results are written
- `batch_size`(int): `1` (default), the decoding batch size, in number of samples
- `hub`(str): `ms` (default), download the model from modelscope; if `hf`, download the model from huggingface.
- `**kwargs`(dict): any parameter in `config.yaml` can be specified directly here, e.g. the maximum segment length of the vad model, `max_single_segment_time=6000` (ms).
#### AutoModel Inference
```python
res = model.generate(input=[str], output_dir=[str])
```
- `input`: the input to decode, which can be:
- path to a wav file, e.g.: asr_example.wav
- path to a pcm file, e.g.: asr_example.pcm; in this case specify the audio sample rate fs (default 16000)
- an audio byte stream, e.g.: bytes from a microphone
- wav.scp, a Kaldi-style wav list (`wav_id \t wav_path`), e.g.:
```text
asr_example1 ./audios/asr_example1.wav
asr_example2 ./audios/asr_example2.wav
```
when the input is a `wav.scp`, `output_dir` must be set to save the outputs
- audio samples, e.g.: `audio, rate = soundfile.read("asr_example_zh.wav")`, of type numpy.ndarray; batch input is supported as a list:
```[audio_sample1, audio_sample2, ..., audio_sampleN]```
- fbank input, batched, with shape [batch, frames, dim] and type torch.Tensor, e.g.
- `output_dir`: None (default), if set, the path where results are written
- `**kwargs`(dict): model-specific inference parameters, e.g. `beam_size=10`, `decoding_ctc_weight=0.1` (see the sketch below).
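For instance, a minimal sketch of overriding decoding options per call (the values are illustrative, and which options apply depends on the chosen model):
```python
from funasr import AutoModel

model = AutoModel(model="paraformer-zh")
# any inference parameter from config.yaml can be overridden in generate()
res = model.generate(input="asr_example.wav", beam_size=10, decoding_ctc_weight=0.1)
print(res)
```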
### More Usage
#### Offline Speech Recognition (Non-Streaming)
```python
from funasr import AutoModel
# paraformer-zh is a multi-functional asr model
# use vad, punc, spk or not as you need
model = AutoModel(model="paraformer-zh",
vad_model="fsmn-vad",
vad_kwargs={"max_single_segment_time": 60000},
punc_model="ct-punc",
# spk_model="cam++"
)
wav_file = f"{model.model_path}/example/asr_example.wav"
res = model.generate(input=wav_file, batch_size_s=300, batch_size_threshold_s=60, hotword='魔搭')
print(res)
```
Notes:
- The model input is usually limited to under 30 s. Combined with a `vad_model`, audio of arbitrary length is supported; this is not limited to the paraformer model and works with any audio-input model.
- `model`-related parameters can be specified directly in the `AutoModel` definition; `vad_model`-related parameters can be passed through `vad_kwargs` (a dict); `punc_kwargs` and `spk_kwargs` work the same way;
- `max_single_segment_time`: the maximum audio segment length cut by the `vad_model`, in milliseconds (ms).
- `batch_size_s`: dynamic batching; the total audio duration in a batch, in seconds (s).
- `batch_size_threshold_s`: when a VAD-segmented audio segment is longer than the `batch_size_threshold_s` threshold, the batch size is forced to 1, in seconds (s).
Recommendations when long audio input hits OOM (GPU memory grows quadratically with audio duration) fall into three cases; a sketch of the corresponding knobs follows this list:
- a) at the start of inference, memory is dominated by `batch_size_s`; reducing it lowers memory usage;
- b) in the middle of inference, if a long VAD-cut segment whose total token count is below `batch_size_s` still causes OOM, reduce `batch_size_threshold_s` so batches beyond the threshold are forced to 1;
- c) near the end of inference, if a long VAD-cut segment whose total token count is below `batch_size_s` and above the `batch_size_threshold_s` threshold (batch already forced to 1) still causes OOM, reduce `max_single_segment_time` so VAD cuts shorter segments.
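A minimal sketch of the three knobs above (the concrete values are illustrative only):
```python
from funasr import AutoModel

model = AutoModel(
    model="paraformer-zh",
    vad_model="fsmn-vad",
    vad_kwargs={"max_single_segment_time": 15000},  # (c) cut shorter VAD segments
)
wav_file = f"{model.model_path}/example/asr_example.wav"
res = model.generate(
    input=wav_file,
    batch_size_s=150,            # (a) smaller total audio duration per batch
    batch_size_threshold_s=30,   # (b) force batch=1 for long segments earlier
)
```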
#### Streaming Speech Recognition
```python
from funasr import AutoModel
chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
model = AutoModel(model="paraformer-zh-streaming")
import soundfile
import os
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = chunk_size[1] * 960 # 600ms
cache = {}
total_chunk_num = int(len((speech)-1)/chunk_stride+1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
print(res)
```
Note: `chunk_size` is the streaming latency configuration. `[0,10,5]` means the real-time output granularity is `10*60=600ms`, with `5*60=300ms` of lookahead. Each inference call takes `600ms` of input (`16000*0.6=960` samples) and outputs the corresponding text; for the last speech chunk, set `is_final=True` to force the final word out.
#### Voice Activity Detection (Non-Streaming)
```python
from funasr import AutoModel
model = AutoModel(model="fsmn-vad")
wav_file = f"{model.model_path}/example/vad_example.wav"
res = model.generate(input=wav_file)
print(res)
```
The VAD model output format is `[[beg1, end1], [beg2, end2], .., [begN, endN]]`, where `begN/endN` are the start/end points of the `N`-th valid audio segment,
in milliseconds.
#### Voice Activity Detection (Streaming)
```python
from funasr import AutoModel
chunk_size = 200 # ms
model = AutoModel(model="fsmn-vad")
import soundfile
wav_file = f"{model.model_path}/example/vad_example.wav"
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = int(chunk_size * sample_rate / 1000)
cache = {}
total_chunk_num = int(len((speech)-1)/chunk_stride+1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
if len(res[0]["value"]):
print(res)
```
流式VAD模型输出格式为4种情况
- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`同上离线VAD输出结果。
- `[[beg, -1]]`:表示只检测到起始点。
- `[[-1, end]]`:表示只检测到结束点。
- `[]`:表示既没有检测到起始点,也没有检测到结束点
输出结果单位为毫秒,从起始点开始的绝对时间。
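在流式循环中，可以按如下示意方式将这4种输出合并为完整的 `[起点, 终点]` 片段（仅为一种可能的写法）：
```python
# Sketch: merge streaming VAD outputs ([beg, end] / [beg, -1] / [-1, end] / [])
# into closed [beg, end] segments; times are absolute milliseconds.
open_beg = None
segments = []

def merge_vad(value):
    global open_beg
    for beg, end in value:
        if beg != -1 and end != -1:       # complete segment within one chunk
            segments.append([beg, end])
        elif beg != -1:                   # only a start point detected
            open_beg = beg
        elif open_beg is not None:        # only an end point: close the pending start
            segments.append([open_beg, end])
            open_beg = None

# usage inside the loop above: merge_vad(res[0]["value"])
```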
#### 标点恢复
```python
from funasr import AutoModel
model = AutoModel(model="ct-punc")
res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
print(res)
```
#### 时间戳预测
```python
from funasr import AutoModel
model = AutoModel(model="fa-zh")
wav_file = f"{model.model_path}/example/asr_example.wav"
text_file = f"{model.model_path}/example/text.txt"
res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
print(res)
```
更多示例参考：[示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)。
<a name="模型训练与测试"></a>
## 模型训练与测试
### 快速开始
命令行执行(用于快速测试,不推荐):
```shell
funasr-train ++model=paraformer-zh ++train_data_set_list=data/list/train.jsonl ++valid_data_set_list=data/list/val.jsonl ++output_dir="./outputs" &> log.txt &
```
python代码执行可以多机多卡推荐
```shell
cd examples/industrial_data_pretraining/paraformer
bash finetune.sh
# "log_file: ./outputs/log.txt"
```
详细完整的脚本参考 [finetune.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/finetune.sh)
### 详细参数介绍
```shell
funasr/bin/train.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=20000 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=false \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
```
- `model`（str）：模型名字（模型仓库中的ID，此时脚本会自动下载模型到本地），或者本地已经下载好的模型路径。
- `train_data_set_list`str训练数据路径默认为jsonl格式具体参考[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list))。
- `valid_data_set_list`str验证数据路径默认为jsonl格式具体参考[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list))。
- `dataset_conf.batch_type`（str）：`example`（默认），batch的类型。`example`表示按照固定数目batch_size个样本组batch；`length` 或 `token` 表示动态组batch，batch总长度或者token数为batch_size。
- `dataset_conf.batch_size`int与 `batch_type` 搭配使用,当 `batch_type=example` 时,表示样本个数;当 `batch_type=length` 时表示样本中长度单位为fbank帧数1帧10ms或者文字token个数。
- `train_conf.max_epoch`int`100`默认训练总epoch数。
- `train_conf.log_interval`int`50`默认打印日志间隔step数。
- `train_conf.resume`（bool）：`True`（默认），是否开启断点重训。
- `train_conf.validate_interval`int`5000`默认训练中做验证测试的间隔step数。
- `train_conf.save_checkpoint_interval`int`5000`默认训练中模型保存间隔step数。
- `train_conf.avg_keep_nbest_models_type`str`acc`默认保留nbest的标准为acc越大越好。`loss`表示保留nbest的标准为loss越小越好
- `train_conf.keep_nbest_models`int`500`(默认),保留最大多少个模型参数,配合 `avg_keep_nbest_models_type` 按照验证集 acc/loss 保留最佳的n个模型其他删除节约存储空间。
- `train_conf.avg_nbest_model`int`10`(默认),保留最大多少个模型参数,配合 `avg_keep_nbest_models_type` 按照验证集 acc/loss 对最佳的n个模型平均。
- `train_conf.accum_grad`int`1`(默认),梯度累积功能。
- `train_conf.grad_clip`float`10.0`(默认),梯度截断功能。
- `train_conf.use_fp16`bool`False`默认开启fp16训练加快训练速度。
- `optim_conf.lr`float学习率。
- `output_dir`str模型保存路径。
- `**kwargs`(dict): 所有在`config.yaml`中参数均可以直接在此处指定例如过滤20s以上长音频`dataset_conf.max_token_length=2000`单位为音频fbank帧数1帧10ms或者文字token个数。
#### 多gpu训练
##### 单机多gpu训练
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py ${train_args}
```
--nnodes 表示参与的节点总数,--nproc_per_node 表示每个节点上运行的进程数
##### 多机多gpu训练
在主节点上假设IP为192.168.1.1端口为12345使用的是2个GPU则运行如下命令
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 0 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
../../../funasr/bin/train.py ${train_args}
```
在从节点上假设IP为192.168.1.2你需要确保MASTER_ADDR和MASTER_PORT环境变量与主节点设置的一致并运行同样的命令
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 1 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
../../../funasr/bin/train.py ${train_args}
```
--nnodes 表示参与的节点总数,--node_rank 表示当前节点id--nproc_per_node 表示每个节点上运行的进程数通常为gpu个数
#### 准备数据
`jsonl`格式可以参考([例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list))。
可以用指令 `scp2jsonl` 从wav.scp与text.txt生成。wav.scp与text.txt准备过程如下
`train_text.txt`
左边为数据唯一ID需与`train_wav.scp`中的`ID`一一对应
右边为音频文件标注文本,格式如下:
```bash
ID0012W0013 当客户风险承受能力评估依据发生变化时
ID0012W0014 所有只要处理 data 不管你是做 machine learning 做 deep learning
ID0012W0015 he tried to think how it could be
```
`train_wav.scp`
左边为数据唯一ID需与`train_text.txt`中的`ID`一一对应
右边为音频文件的路径,格式如下
```bash
BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav
BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav
ID0012W0015 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav
```
`生成指令`
```shell
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="../../../data/list/train.jsonl"
```
可选非必需如果需要从jsonl解析成wav.scp与text.txt可以使用指令
```shell
# generate wav.scp and text.txt from train.jsonl and val.jsonl
jsonl2scp \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in="../../../data/list/train.jsonl"
```
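如果不方便使用上述命令行工具，也可以用下面的示意脚本自行生成 jsonl（其中除 `source`/`target` 外的字段名为假设，实际字段请以官方工具生成结果为准）：
```python
# Sketch: build train.jsonl from train_wav.scp / train_text.txt.
# Field names other than "source"/"target" are assumptions; check the official output.
import json

def read_scp(path):
    table = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)
            if len(parts) == 2:
                table[parts[0]] = parts[1]
    return table

wav = read_scp("train_wav.scp")
txt = read_scp("train_text.txt")

with open("train.jsonl", "w", encoding="utf-8") as f:
    for key, path in wav.items():
        if key in txt:
            line = {"key": key, "source": path, "target": txt[key]}
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
```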
#### 查看训练日志
##### 查看实验log
```shell
tail log.txt
[2024-03-21 15:55:52,137][root][INFO] - train, rank: 3, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.327), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)], {'data_load': '0.000', 'forward_time': '0.315', 'backward_time': '0.555', 'optim_time': '0.076', 'total_time': '0.947'}, GPU, memory: usage: 3.830 GB, peak: 18.357 GB, cache: 20.910 GB, cache_peak: 20.910 GB
[2024-03-21 15:55:52,139][root][INFO] - train, rank: 1, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.334), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.285), ('acc', 0.823), ('loss_pre', 0.046), ('loss', 0.331), ('batch_size', 36)], {'data_load': '0.000', 'forward_time': '0.334', 'backward_time': '0.536', 'optim_time': '0.077', 'total_time': '0.948'}, GPU, memory: usage: 3.943 GB, peak: 18.291 GB, cache: 19.619 GB, cache_peak: 19.619 GB
```
指标解释:
- `rank`表示gpu id。
- `epoch`,`step`,`total step`表示当前epochstep总step。
- `loss_avg_rank`表示当前step所有gpu平均loss。
- `loss/ppl/acc_avg_epoch`表示当前epoch周期截止当前step数时总平均loss/ppl/acc。epoch结束时的最后一个step表示epoch总平均loss/ppl/acc推荐使用acc指标。
- `lr`当前step的学习率。
- `[('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)]`表示当前gpu id的具体数据。
- `total_time`表示单个step总耗时。
- `GPU, memory`:分别表示,模型使用/峰值显存,模型+缓存使用/峰值显存。
##### tensorboard可视化
```bash
tensorboard --logdir /xxxx/FunASR/examples/industrial_data_pretraining/paraformer/outputs/log/tensorboard
```
浏览器中打开http://localhost:6006/
### 训练后模型测试
#### 有configuration.json
假定训练模型路径为：./model_dir，如果该目录下有生成configuration.json，只需要将 [上述模型推理方法](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/README_zh.md#%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) 中模型名字修改为模型路径即可。
例如:
从shell推理
```shell
python -m funasr.bin.inference ++model="./model_dir" ++input="${input}" ++output_dir="${output_dir}"
```
从python推理
```python
from funasr import AutoModel
model = AutoModel(model="./model_dir")
res = model.generate(input=wav_file)
print(res)
```
#### 无configuration.json时
如果模型路径中无configuration.json时需要手动指定具体配置文件路径与模型路径
```shell
python -m funasr.bin.inference \
--config-path "${local_path}" \
--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"
```
参数介绍
- `config-path`:为实验中保存的 `config.yaml`,可以从实验输出目录中查找。
- `config-name`:配置文件名,一般为 `config.yaml`支持yaml格式与json格式例如 `config.json`
- `init_param`:需要测试的模型参数,一般为`model.pt`,可以自己选择具体的模型文件
- `tokenizer_conf.token_list`:词表文件路径,一般在 `config.yaml` 有指定,无需再手动指定,当 `config.yaml` 中路径不正确时,需要在此处手动指定。
- `frontend_conf.cmvn_file`wav提取fbank中用到的cmvn文件一般在 `config.yaml` 有指定,无需再手动指定,当 `config.yaml` 中路径不正确时,需要在此处手动指定。
其他参数同上,完整 [示例](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/infer_from_local.sh)
<a name="模型导出与测试"></a>
## 模型导出与测试
### 从命令行导出
```shell
funasr-export ++model=paraformer ++quantize=false
```
### 从Python导出
```python
from funasr import AutoModel
model = AutoModel(model="paraformer")
res = model.export(quantize=False)
```
### 测试ONNX
```python
# pip3 install -U funasr-onnx
from funasr_onnx import Paraformer
model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1, quantize=True)
wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
result = model(wav_path)
print(result)
```
更多例子请参考 [样例](https://github.com/alibaba-damo-academy/FunASR/tree/main/runtime/python/onnxruntime)

View File

@@ -0,0 +1,19 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
# spk_model="iic/speech_campplus_sv_zh-cn_16k-common",
)
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
    hotword="达摩院 魔搭",
)
print(res)

View File

@@ -0,0 +1,16 @@
model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
#punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
punc_model="iic/punc_ct-transformer_cn-en-common-vocab471067-large"
spk_model="iic/speech_campplus_sv_zh-cn_16k-common"
python funasr/bin/inference.py \
++model=${model} \
++vad_model=${vad_model} \
++punc_model=${punc_model} \
++spk_model=${spk_model} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
++output_dir="./outputs/debug" \
++device="cpu" \
++"hotword='达摩院 魔搭'"

View File

@@ -0,0 +1,424 @@
([简体中文](./README_zh.md)|English)
FunASR has open-sourced a large number of pre-trained models on industrial data. You are free to use, copy, modify, and share FunASR models under the [Model License Agreement](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE). Below, we list some representative models. For a comprehensive list, please refer to our [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo).
<div align="center">
<h4>
<a href="#Inference"> Model Inference </a>
<a href="#Training"> Model Training and Testing </a>
<a href="#Export"> Model Export and Testing </a>
</h4>
</div>
<a name="Inference"></a>
## Model Inference
### Quick Start
For command-line invocation:
```shell
funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
```
For python code invocation (recommended):
```python
from funasr import AutoModel
model = AutoModel(model="paraformer-zh")
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav")
print(res)
```
### API Description
#### AutoModel Definition
```python
model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], hub=[str], **kwargs)
```
- `model`(str): model name in the [Model Repository](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo), or a model path on local disk.
- `device`(str): `cuda:0` (default gpu0) for using GPU for inference, specify `cpu` for using CPU.
- `ncpu`(int): `4` (default), sets the number of threads for CPU internal operations.
- `output_dir`(str): `None` (default), set this to specify the output path for the results.
- `batch_size`(int): `1` (default), the number of samples per batch during decoding.
- `hub`(str): `ms` (default) to download models from ModelScope; `hf` to download models from Hugging Face.
- `**kwargs`(dict): Any parameters found in config.yaml can be directly specified here, for instance, the maximum segmentation length in the vad model max_single_segment_time=6000 (milliseconds).
#### AutoModel Inference
```python
res = model.generate(input=[str], output_dir=[str])
```
- `input`: The input to be decoded, which could be:
- A wav file path, e.g., asr_example.wav
- A pcm file path, e.g., asr_example.pcm, in this case, specify the audio sampling rate fs (default is 16000)
- An audio byte stream, e.g., byte data from a microphone
- A wav.scp, a Kaldi-style wav list (wav_id \t wav_path), for example:
```text
asr_example1 ./audios/asr_example1.wav
asr_example2 ./audios/asr_example2.wav
```
When using wav.scp as input, you must set output_dir to save the output results.
- Audio samples, e.g. `audio, rate = soundfile.read("asr_example_zh.wav")`; the data type is numpy.ndarray. Batch input as a list is supported:
```[audio_sample1, audio_sample2, ..., audio_sampleN]```
- fbank input, supports batch grouping. Shape is [batch, frames, dim], type is torch.Tensor.
- `output_dir`: None (default), if set, specifies the output path for the results.
- `**kwargs`(dict): Inference parameters related to the model, for example, `beam_size=10`, `decoding_ctc_weight=0.1`.
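As a minimal sketch of the non-path input types above (the model name and the `fs`/`batch_size` parameters are assumptions consistent with the descriptions, not a definitive API reference):
```python
# Sketch: numpy samples and batched list input for model.generate.
import soundfile
from funasr import AutoModel

model = AutoModel(model="paraformer-zh")

# a single utterance as numpy.ndarray (sampling rate must match the model, 16 kHz here)
audio, rate = soundfile.read("asr_example_zh.wav")
res = model.generate(input=audio, fs=rate)
print(res)

# batched input as a list of numpy arrays
res = model.generate(input=[audio, audio], fs=rate, batch_size=2)
print(res)
```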
### More Usage Introduction
#### Speech Recognition (Non-streaming)
```python
from funasr import AutoModel
# paraformer-zh is a multi-functional asr model
# use vad, punc, spk or not as you need
model = AutoModel(model="paraformer-zh",
vad_model="fsmn-vad",
vad_kwargs={"max_single_segment_time": 60000},
punc_model="ct-punc",
# spk_model="cam++"
)
wav_file = f"{model.model_path}/example/asr_example.wav"
res = model.generate(input=wav_file, batch_size_s=300, batch_size_threshold_s=60, hotword='魔搭')
print(res)
```
Notes:
- Typically, the input duration for models is limited to under 30 seconds. However, when combined with `vad_model`, support for audio input of any length is enabled, not limited to the paraformer model—any audio input model can be used.
- Parameters related to model can be directly specified in the definition of AutoModel; parameters related to `vad_model` can be set through `vad_kwargs`, which is a dict; similar parameters include `punc_kwargs` and `spk_kwargs`.
- `max_single_segment_time`: Denotes the maximum audio segmentation length for `vad_model`, measured in milliseconds (ms).
- `batch_size_s` represents the use of dynamic batching, where the total audio duration within a batch is measured in seconds (s).
- `batch_size_threshold_s`: Indicates that when the duration of an audio segment post-VAD segmentation exceeds the batch_size_threshold_s threshold, the batch size is set to 1, measured in seconds (s).
Recommendations:
When you input long audio and encounter Out Of Memory (OOM) issues, since memory usage tends to increase quadratically with audio length, consider the following three scenarios:
a) At the beginning of inference, memory usage primarily depends on `batch_size_s`. Appropriately reducing this value can decrease memory usage.
b) During the middle of inference, when encountering long audio segments cut by VAD and the total token count is less than `batch_size_s`, yet still facing OOM, you can appropriately reduce `batch_size_threshold_s`. If the threshold is exceeded, the batch size is forced to 1.
c) Towards the end of inference, if long audio segments cut by VAD have a total token count less than `batch_size_s` and exceed the `threshold` batch_size_threshold_s, forcing the batch size to 1 and still facing OOM, you may reduce `max_single_segment_time` to shorten the VAD audio segment length.
#### Speech Recognition (Streaming)
```python
from funasr import AutoModel
chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
model = AutoModel(model="paraformer-zh-streaming")
import soundfile
import os
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = chunk_size[1] * 960 # 600ms
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
print(res)
```
Note: `chunk_size` is the configuration for streaming latency. `[0,10,5]` indicates that the real-time display granularity is `10*60=600ms`, and the lookahead information is `5*60=300ms`. Each inference input is `600ms` (`16000*0.6=9600` sample points), and the output is the corresponding text. For the last speech segment, `is_final=True` must be set to force the output of the last words.
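A small model-free sketch of the arithmetic behind `chunk_size`, useful for checking latency settings:
```python
# Sketch: derive per-step duration / lookahead / stride from chunk_size.
# One chunk unit is 60 ms; at 16 kHz that is 960 samples.
def chunk_info(chunk_size, sample_rate=16000):
    step_ms = chunk_size[1] * 60             # e.g. 10 * 60 = 600 ms per inference
    lookahead_ms = chunk_size[2] * 60        # e.g. 5 * 60 = 300 ms future context
    stride = sample_rate * step_ms // 1000   # e.g. 16000 * 0.6 = 9600 samples
    return step_ms, lookahead_ms, stride

print(chunk_info([0, 10, 5]))  # (600, 300, 9600)
print(chunk_info([0, 8, 4]))   # (480, 240, 7680)
```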
#### Voice Activity Detection (Non-Streaming)
```python
from funasr import AutoModel
model = AutoModel(model="fsmn-vad")
wav_file = f"{model.model_path}/example/vad_example.wav"
res = model.generate(input=wav_file)
print(res)
```
Note: The output format of the VAD model is: `[[beg1, end1], [beg2, end2], ..., [begN, endN]]`, where `begN/endN` indicates the starting/ending point of the `N-th` valid audio segment, measured in milliseconds.
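If you want to cut the waveform by the returned millisecond timestamps, a minimal sketch (assuming the result structure `res[0]["value"]` described above, run right after the example) could look like:
```python
# Sketch: cut the waveform into voiced segments using the VAD timestamps (ms).
import soundfile

speech, rate = soundfile.read(wav_file)
for idx, (beg_ms, end_ms) in enumerate(res[0]["value"]):
    beg = int(beg_ms * rate / 1000)
    end = int(end_ms * rate / 1000)
    soundfile.write(f"segment_{idx}.wav", speech[beg:end], rate)
```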
#### Voice Activity Detection (Streaming)
```python
from funasr import AutoModel
chunk_size = 200 # ms
model = AutoModel(model="fsmn-vad")
import soundfile
wav_file = f"{model.model_path}/example/vad_example.wav"
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = int(chunk_size * sample_rate / 1000)
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
if len(res[0]["value"]):
print(res)
```
Note: The output format for the streaming VAD model can be one of four scenarios:
- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`: the same as the offline VAD output result mentioned above.
- `[[beg, -1]]`: indicates that only a starting point has been detected.
- `[[-1, end]]`: indicates that only an ending point has been detected.
- `[]`: indicates that neither a starting point nor an ending point has been detected.
The output is measured in milliseconds and represents the absolute time from the starting point.
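One possible way (a sketch, not the library's own API) to merge these four output cases into closed `[beg, end]` segments inside the streaming loop:
```python
# Sketch: merge streaming VAD outputs ([beg, end] / [beg, -1] / [-1, end] / [])
# into closed [beg, end] segments; times are absolute milliseconds.
open_beg = None
segments = []

def merge_vad(value):
    global open_beg
    for beg, end in value:
        if beg != -1 and end != -1:       # complete segment within one chunk
            segments.append([beg, end])
        elif beg != -1:                   # only a start point detected
            open_beg = beg
        elif open_beg is not None:        # only an end point: close the pending start
            segments.append([open_beg, end])
            open_beg = None

# usage inside the loop above: merge_vad(res[0]["value"])
```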
#### Punctuation Restoration
```python
from funasr import AutoModel
model = AutoModel(model="ct-punc")
res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
print(res)
```
#### Timestamp Prediction
```python
from funasr import AutoModel
model = AutoModel(model="fa-zh")
wav_file = f"{model.model_path}/example/asr_example.wav"
text_file = f"{model.model_path}/example/text.txt"
res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
print(res)
```
For more examples, refer to the [docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining).
<a name="Training"></a>
## Model Training and Testing
### Quick Start
Execute via command line (for quick testing, not recommended):
```shell
funasr-train ++model=paraformer-zh ++train_data_set_list=data/list/train.jsonl ++valid_data_set_list=data/list/val.jsonl ++output_dir="./outputs" &> log.txt &
```
Execute with Python code (supports multi-node and multi-GPU, recommended):
```shell
cd examples/industrial_data_pretraining/paraformer
bash finetune.sh
# "log_file: ./outputs/log.txt"
```
For the full script, refer to [finetune.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/finetune.sh).
### Detailed Parameter Description:
```shell
funasr/bin/train.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=20000 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=false \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
```
- `model` (str): The model name (the ID in the model repository, in which case the script automatically downloads the model to local storage), or the path to a model already downloaded locally.
- `train_data_set_list` (str): The path to the training data, typically in jsonl format; for details refer to the [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list).
- `valid_data_set_list` (str): The path to the validation data, also generally in jsonl format; for details refer to the [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list).
- `dataset_conf.batch_type` (str): `example` (default), the type of batch. `example` means batches are formed from a fixed number (`batch_size`) of samples; `length` or `token` means dynamic batching, where the total length or number of tokens of the batch equals `batch_size`.
- `dataset_conf.batch_size` (int): Used in conjunction with `batch_type`. When `batch_type=example`, it is the number of samples; when `batch_type=length`, it is the sample length, measured in fbank frames (1 frame = 10 ms) or in text tokens.
- `train_conf.max_epoch` (int): The total number of training epochs.
- `train_conf.log_interval` (int): The number of steps between log outputs.
- `train_conf.resume` (bool): Whether to enable checkpoint resuming for training.
- `train_conf.validate_interval` (int): The interval in steps for running validation during training.
- `train_conf.save_checkpoint_interval` (int): The interval in steps for saving the model during training.
- `train_conf.keep_nbest_models` (int): The maximum number of checkpoints to retain, sorted by validation accuracy from highest to lowest.
- `train_conf.avg_nbest_model` (int): Average over the top n models with the highest accuracy.
- `optim_conf.lr` (float): The learning rate.
- `output_dir` (str): The path for saving the model.
- `**kwargs` (dict): Any parameter in `config.yaml` can be specified directly here; for example, to filter out audio longer than 20 s: `dataset_conf.max_token_length=2000`, measured in fbank frames (1 frame = 10 ms) or in text tokens.
#### Multi-GPU Training
##### Single-Machine Multi-GPU Training
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py ${train_args}
```
--nnodes represents the total number of participating nodes, while --nproc_per_node indicates the number of processes running on each node.
##### Multi-Machine Multi-GPU Training
On the master node, assuming the IP is 192.168.1.1 and the port is 12345, and you're using 2 GPUs, you would run the following command:
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 0 --nproc_per_node ${gpu_num} --master_addr=192.168.1.1 --master_port=12345 \
../../../funasr/bin/train.py ${train_args}
```
On the worker node (assuming the IP is 192.168.1.2), you need to ensure that the MASTER_ADDR and MASTER_PORT environment variables are set to match those of the master node, and then run the same command:
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 1 --nproc_per_node ${gpu_num} --master_addr=192.168.1.1 --master_port=12345 \
../../../funasr/bin/train.py ${train_args}
```
--nnodes indicates the total number of nodes participating in the training, --node_rank represents the ID of the current node, and --nproc_per_node specifies the number of processes running on each node (usually corresponds to the number of GPUs).
#### Data Preparation
For the `jsonl` format, refer to the [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list).
The command `scp2jsonl` can be used to generate jsonl files from wav.scp and text.txt. The preparation process for wav.scp and text.txt is as follows:
`train_text.txt`
```bash
ID0012W0013 当客户风险承受能力评估依据发生变化时
ID0012W0014 所有只要处理 data 不管你是做 machine learning 做 deep learning
ID0012W0015 he tried to think how it could be
```
`train_wav.scp`
```bash
BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav
BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav
ID0012W0015 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav
```
`Command`
```shell
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="../../../data/list/train.jsonl"
```
(Optional, not required) If you need to parse from jsonl back to wav.scp and text.txt, you can use the following command:
```shell
# generate wav.scp and text.txt from train.jsonl and val.jsonl
jsonl2scp \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in="../../../data/list/train.jsonl"
```
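If the CLI tools are unavailable, a rough equivalent sketch (field names other than `source`/`target` are assumptions; check the officially generated files) is:
```python
# Sketch: build train.jsonl from train_wav.scp / train_text.txt.
# Field names other than "source"/"target" are assumptions; check the official output.
import json

def read_scp(path):
    table = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)
            if len(parts) == 2:
                table[parts[0]] = parts[1]
    return table

wav = read_scp("train_wav.scp")
txt = read_scp("train_text.txt")

with open("train.jsonl", "w", encoding="utf-8") as f:
    for key, path in wav.items():
        if key in txt:
            line = {"key": key, "source": path, "target": txt[key]}
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
```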
#### Training log
##### log.txt
```shell
tail log.txt
[2024-03-21 15:55:52,137][root][INFO] - train, rank: 3, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.327), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)], {'data_load': '0.000', 'forward_time': '0.315', 'backward_time': '0.555', 'optim_time': '0.076', 'total_time': '0.947'}, GPU, memory: usage: 3.830 GB, peak: 18.357 GB, cache: 20.910 GB, cache_peak: 20.910 GB
[2024-03-21 15:55:52,139][root][INFO] - train, rank: 1, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.334), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.285), ('acc', 0.823), ('loss_pre', 0.046), ('loss', 0.331), ('batch_size', 36)], {'data_load': '0.000', 'forward_time': '0.334', 'backward_time': '0.536', 'optim_time': '0.077', 'total_time': '0.948'}, GPU, memory: usage: 3.943 GB, peak: 18.291 GB, cache: 19.619 GB, cache_peak: 19.619 GB
```
- `rank`: the GPU id.
- `epoch`, `step`, `total step`: the current epoch, step, and total number of steps.
- `loss_avg_rank`: the average loss across all GPUs for the current step.
- `loss/ppl/acc_avg_epoch`: the overall average loss/perplexity/accuracy for the current epoch up to the current step. The value at the last step of an epoch is the total average for that epoch; the accuracy metric is recommended.
- `lr`: the learning rate for the current step.
- `[('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)]`: the detailed data for the current GPU.
- `total_time`: the total time taken for a single step.
- `GPU, memory`: the model used/peak memory, and the model+cache used/peak memory.
##### tensorboard
```bash
tensorboard --logdir /xxxx/FunASR/examples/industrial_data_pretraining/paraformer/outputs/log/tensorboard
```
Open http://localhost:6006/ in your browser.
### Testing the Trained Model
#### With `configuration.json` file
Assuming the training model path is: ./model_dir, if a configuration.json file has been generated in this directory, you only need to change the model name to the model path in the above model inference method.
For example, for shell inference:
```shell
python -m funasr.bin.inference ++model="./model_dir" ++input="${input}" ++output_dir="${output_dir}"
```
For Python inference:
```python
from funasr import AutoModel
model = AutoModel(model="./model_dir")
res = model.generate(input=wav_file)
print(res)
```
#### Without `configuration.json` file
If there is no configuration.json in the model path, you need to manually specify the exact configuration file path and the model path.
```shell
python -m funasr.bin.inference \
--config-path "${local_path}" \
--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"
```
Parameter Introduction
- `config-path`: the path to the `config.yaml` saved during the experiment, which can be found in the experiment's output directory.
- `config-name`: the name of the configuration file, usually `config.yaml`. Both YAML and JSON formats are supported, for example `config.json`.
- `init_param`: the model parameters to be tested, usually `model.pt`; you can choose a specific model file as needed.
- `tokenizer_conf.token_list`: the path to the vocabulary file, normally specified in `config.yaml`; it only needs to be given manually here when the path in `config.yaml` is incorrect.
- `frontend_conf.cmvn_file`: the CMVN (cepstral mean and variance normalization) file used when extracting fbank features from WAV files, normally specified in `config.yaml`; it only needs to be given manually here when the path in `config.yaml` is incorrect.
Other parameters are the same as mentioned above. A complete [example](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/infer_from_local.sh) can be found here.
<a name="Export"></a>
## Export ONNX
### Command-line usage
```shell
funasr-export ++model=paraformer ++quantize=false ++device=cpu
```
### Python
```python
from funasr import AutoModel
model = AutoModel(model="paraformer", device="cpu")
res = model.export(quantize=False)
```
### Test ONNX
```python
# pip3 install -U funasr-onnx
from funasr_onnx import Paraformer
model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1, quantize=True)
wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
result = model(wav_path)
print(result)
```
For more examples, refer to the [demo](https://github.com/alibaba-damo-academy/FunASR/tree/main/runtime/python/onnxruntime).

View File

@@ -0,0 +1,436 @@
(简体中文|[English](./README.md))
FunASR开源了大量在工业数据上预训练模型您可以在 [模型许可协议](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE)下自由使用、复制、修改和分享FunASR模型下面列举代表性的模型更多模型请参考 [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)。
<div align="center">
<h4>
<a href="#模型推理"> 模型推理 </a>
<a href="#模型训练与测试"> 模型训练与测试 </a>
<a href="#模型导出与测试"> 模型导出与测试 </a>
</h4>
</div>
<a name="模型推理"></a>
## 模型推理
### 快速使用
命令行方式调用:
```shell
funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
```
python代码调用推荐
```python
from funasr import AutoModel
model = AutoModel(model="paraformer-zh")
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav")
print(res)
```
### 接口说明
#### AutoModel 定义
```python
model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], hub=[str], **kwargs)
```
- `model`(str): [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo) 中的模型名称,或本地磁盘中的模型路径
- `device`(str): `cuda:0`（默认gpu0），使用 GPU 进行推理；如果为`cpu`，则使用 CPU 进行推理
- `ncpu`(int): `4` (默认),设置用于 CPU 内部操作并行性的线程数
- `output_dir`(str): `None` (默认),如果设置,输出结果的输出路径
- `batch_size`(int): `1` (默认),解码时的批处理,样本个数
- `hub`(str)`ms`默认从modelscope下载模型。如果为`hf`从huggingface下载模型。
- `**kwargs`(dict): 所有在`config.yaml`中参数均可以直接在此处指定例如vad模型中最大切割长度 `max_single_segment_time=6000` (毫秒)。
#### AutoModel 推理
```python
res = model.generate(input=[str], output_dir=[str])
```
- `input`: 要解码的输入,可以是:
- wav文件路径, 例如: asr_example.wav
- pcm文件路径, 例如: asr_example.pcm，此时需要指定音频采样率fs（默认为16000）
- 音频字节数流,例如:麦克风的字节数数据
- wav.scpkaldi 样式的 wav 列表 (`wav_id \t wav_path`), 例如:
```text
asr_example1 ./audios/asr_example1.wav
asr_example2 ./audios/asr_example2.wav
```
在这种输入 `wav.scp` 的情况下,必须设置 `output_dir` 以保存输出结果
- 音频采样点,例如:`audio, rate = soundfile.read("asr_example_zh.wav")`, 数据类型为 numpy.ndarray。支持batch输入类型为list
```[audio_sample1, audio_sample2, ..., audio_sampleN]```
- fbank输入，支持组batch，shape为[batch, frames, dim]，类型为torch.Tensor
- `output_dir`: None (默认),如果设置,输出结果的输出路径
- `**kwargs`(dict): 与模型相关的推理参数，例如：`beam_size=10`、`decoding_ctc_weight=0.1`。
### 更多用法介绍
#### 非实时语音识别
```python
from funasr import AutoModel
# paraformer-zh is a multi-functional asr model
# use vad, punc, spk or not as you need
model = AutoModel(model="paraformer-zh",
vad_model="fsmn-vad",
vad_kwargs={"max_single_segment_time": 60000},
punc_model="ct-punc",
# spk_model="cam++"
)
wav_file = f"{model.model_path}/example/asr_example.wav"
res = model.generate(input=wav_file, batch_size_s=300, batch_size_threshold_s=60, hotword='魔搭')
print(res)
```
注意:
- 通常模型输入限制时长30s以下组合`vad_model`后支持任意时长音频输入不局限于paraformer模型所有音频输入模型均可以。
- `model`相关的参数可以直接在`AutoModel`定义中直接指定;与`vad_model`相关参数可以通过`vad_kwargs`来指定类型为dict类似的有`punc_kwargs``spk_kwargs`
- `max_single_segment_time`: 表示`vad_model`最大切割音频时长, 单位是毫秒ms.
- `batch_size_s` 表示采用动态batchbatch中总音频时长单位为秒s。
- `batch_size_threshold_s`: 表示`vad_model`切割后音频片段时长超过 `batch_size_threshold_s`阈值时将batch_size数设置为1, 单位为秒s.
建议：当您输入为长音频、遇到OOM问题时（显存占用与音频时长呈平方关系增加），分为3种情况：
- a)推理起始阶段,显存主要取决于`batch_size_s`,适当减小该值,可以减少显存占用;
- b)推理中间阶段遇到VAD切割的长音频片段总token数小于`batch_size_s`仍然出现OOM可以适当减小`batch_size_threshold_s`超过阈值强制batch为1;
- c)推理快结束阶段遇到VAD切割的长音频片段总token数小于`batch_size_s`,且超过阈值`batch_size_threshold_s`强制batch为1仍然出现OOM可以适当减小`max_single_segment_time`使得VAD切割音频时长变短。
#### 实时语音识别
```python
from funasr import AutoModel
chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
model = AutoModel(model="paraformer-zh-streaming")
import soundfile
import os
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = chunk_size[1] * 960 # 600ms
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
print(res)
```
注：`chunk_size`为流式延时配置，`[0,10,5]`表示上屏实时出字粒度为`10*60=600ms`，未来信息为`5*60=300ms`。每次推理输入为`600ms`（采样点数为`16000*0.6=9600`），输出为对应文字，最后一个语音片段输入需要设置`is_final=True`来强制输出最后一个字。
#### 语音端点检测(非实时)
```python
from funasr import AutoModel
model = AutoModel(model="fsmn-vad")
wav_file = f"{model.model_path}/example/vad_example.wav"
res = model.generate(input=wav_file)
print(res)
```
VAD模型输出格式为`[[beg1, end1], [beg2, end2], .., [begN, endN]]`,其中`begN/endN`表示第`N`个有效音频片段的起始点/结束点,
单位为毫秒。
#### 语音端点检测(实时)
```python
from funasr import AutoModel
chunk_size = 200 # ms
model = AutoModel(model="fsmn-vad")
import soundfile
wav_file = f"{model.model_path}/example/vad_example.wav"
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = int(chunk_size * sample_rate / 1000)
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
if len(res[0]["value"]):
print(res)
```
流式VAD模型输出格式为4种情况
- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`同上离线VAD输出结果。
- `[[beg, -1]]`:表示只检测到起始点。
- `[[-1, end]]`:表示只检测到结束点。
- `[]`:表示既没有检测到起始点,也没有检测到结束点
输出结果单位为毫秒,从起始点开始的绝对时间。
#### 标点恢复
```python
from funasr import AutoModel
model = AutoModel(model="ct-punc")
res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
print(res)
```
#### 时间戳预测
```python
from funasr import AutoModel
model = AutoModel(model="fa-zh")
wav_file = f"{model.model_path}/example/asr_example.wav"
text_file = f"{model.model_path}/example/text.txt"
res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
print(res)
```
更多示例参考：[示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)。
<a name="模型训练与测试"></a>
## 模型训练与测试
### 快速开始
命令行执行(用于快速测试,不推荐):
```shell
funasr-train ++model=paraformer-zh ++train_data_set_list=data/list/train.jsonl ++valid_data_set_list=data/list/val.jsonl ++output_dir="./outputs" &> log.txt &
```
python代码执行可以多机多卡推荐
```shell
cd examples/industrial_data_pretraining/paraformer
bash finetune.sh
# "log_file: ./outputs/log.txt"
```
详细完整的脚本参考 [finetune.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/finetune.sh)
### 详细参数介绍
```shell
funasr/bin/train.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=20000 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=false \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
```
- `model`（str）：模型名字（模型仓库中的ID，此时脚本会自动下载模型到本地），或者本地已经下载好的模型路径。
- `train_data_set_list`str训练数据路径默认为jsonl格式具体参考[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list))。
- `valid_data_set_list`str验证数据路径默认为jsonl格式具体参考[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list))。
- `dataset_conf.batch_type`（str）：`example`（默认），batch的类型。`example`表示按照固定数目batch_size个样本组batch；`length` 或 `token` 表示动态组batch，batch总长度或者token数为batch_size。
- `dataset_conf.batch_size`int与 `batch_type` 搭配使用,当 `batch_type=example` 时,表示样本个数;当 `batch_type=length` 时表示样本中长度单位为fbank帧数1帧10ms或者文字token个数。
- `train_conf.max_epoch`int`100`默认训练总epoch数。
- `train_conf.log_interval`int`50`默认打印日志间隔step数。
- `train_conf.resume`（bool）：`True`（默认），是否开启断点重训。
- `train_conf.validate_interval`int`5000`默认训练中做验证测试的间隔step数。
- `train_conf.save_checkpoint_interval`int`5000`默认训练中模型保存间隔step数。
- `train_conf.avg_keep_nbest_models_type`str`acc`默认保留nbest的标准为acc越大越好。`loss`表示保留nbest的标准为loss越小越好
- `train_conf.keep_nbest_models`int`500`(默认),保留最大多少个模型参数,配合 `avg_keep_nbest_models_type` 按照验证集 acc/loss 保留最佳的n个模型其他删除节约存储空间。
- `train_conf.avg_nbest_model`int`10`(默认),保留最大多少个模型参数,配合 `avg_keep_nbest_models_type` 按照验证集 acc/loss 对最佳的n个模型平均。
- `train_conf.accum_grad`int`1`(默认),梯度累积功能。
- `train_conf.grad_clip`float`10.0`(默认),梯度截断功能。
- `train_conf.use_fp16`bool`False`默认开启fp16训练加快训练速度。
- `optim_conf.lr`float学习率。
- `output_dir`str模型保存路径。
- `**kwargs`(dict): 所有在`config.yaml`中参数均可以直接在此处指定例如过滤20s以上长音频`dataset_conf.max_token_length=2000`单位为音频fbank帧数1帧10ms或者文字token个数。
#### 多gpu训练
##### 单机多gpu训练
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py ${train_args}
```
--nnodes 表示参与的节点总数,--nproc_per_node 表示每个节点上运行的进程数
##### 多机多gpu训练
在主节点上假设IP为192.168.1.1端口为12345使用的是2个GPU则运行如下命令
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 0 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
../../../funasr/bin/train.py ${train_args}
```
在从节点上假设IP为192.168.1.2你需要确保MASTER_ADDR和MASTER_PORT环境变量与主节点设置的一致并运行同样的命令
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 1 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
../../../funasr/bin/train.py ${train_args}
```
--nnodes 表示参与的节点总数,--node_rank 表示当前节点id--nproc_per_node 表示每个节点上运行的进程数通常为gpu个数
#### 准备数据
`jsonl`格式可以参考([例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list))。
可以用指令 `scp2jsonl` 从wav.scp与text.txt生成。wav.scp与text.txt准备过程如下
`train_text.txt`
左边为数据唯一ID需与`train_wav.scp`中的`ID`一一对应
右边为音频文件标注文本,格式如下:
```bash
ID0012W0013 当客户风险承受能力评估依据发生变化时
ID0012W0014 所有只要处理 data 不管你是做 machine learning 做 deep learning
ID0012W0015 he tried to think how it could be
```
`train_wav.scp`
左边为数据唯一ID需与`train_text.txt`中的`ID`一一对应
右边为音频文件的路径,格式如下
```bash
BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav
BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav
ID0012W0015 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav
```
`生成指令`
```shell
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="../../../data/list/train.jsonl"
```
可选非必需如果需要从jsonl解析成wav.scp与text.txt可以使用指令
```shell
# generate wav.scp and text.txt from train.jsonl and val.jsonl
jsonl2scp \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in="../../../data/list/train.jsonl"
```
#### 查看训练日志
##### 查看实验log
```shell
tail log.txt
[2024-03-21 15:55:52,137][root][INFO] - train, rank: 3, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.327), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)], {'data_load': '0.000', 'forward_time': '0.315', 'backward_time': '0.555', 'optim_time': '0.076', 'total_time': '0.947'}, GPU, memory: usage: 3.830 GB, peak: 18.357 GB, cache: 20.910 GB, cache_peak: 20.910 GB
[2024-03-21 15:55:52,139][root][INFO] - train, rank: 1, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.334), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.285), ('acc', 0.823), ('loss_pre', 0.046), ('loss', 0.331), ('batch_size', 36)], {'data_load': '0.000', 'forward_time': '0.334', 'backward_time': '0.536', 'optim_time': '0.077', 'total_time': '0.948'}, GPU, memory: usage: 3.943 GB, peak: 18.291 GB, cache: 19.619 GB, cache_peak: 19.619 GB
```
指标解释:
- `rank`表示gpu id。
- `epoch`,`step`,`total step`表示当前epochstep总step。
- `loss_avg_rank`表示当前step所有gpu平均loss。
- `loss/ppl/acc_avg_epoch`表示当前epoch周期截止当前step数时总平均loss/ppl/acc。epoch结束时的最后一个step表示epoch总平均loss/ppl/acc推荐使用acc指标。
- `lr`当前step的学习率。
- `[('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)]`表示当前gpu id的具体数据。
- `total_time`表示单个step总耗时。
- `GPU, memory`:分别表示,模型使用/峰值显存,模型+缓存使用/峰值显存。
##### tensorboard可视化
```bash
tensorboard --logdir /xxxx/FunASR/examples/industrial_data_pretraining/paraformer/outputs/log/tensorboard
```
浏览器中打开http://localhost:6006/
### 训练后模型测试
#### 有configuration.json
假定训练模型路径为：./model_dir，如果该目录下有生成configuration.json，只需要将 [上述模型推理方法](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/README_zh.md#%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) 中模型名字修改为模型路径即可。
例如:
从shell推理
```shell
python -m funasr.bin.inference ++model="./model_dir" ++input="${input}" ++output_dir="${output_dir}"
```
从python推理
```python
from funasr import AutoModel
model = AutoModel(model="./model_dir")
res = model.generate(input=wav_file)
print(res)
```
#### 无configuration.json时
如果模型路径中无configuration.json时需要手动指定具体配置文件路径与模型路径
```shell
python -m funasr.bin.inference \
--config-path "${local_path}" \
--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"
```
参数介绍
- `config-path`:为实验中保存的 `config.yaml`,可以从实验输出目录中查找。
- `config-name`:配置文件名,一般为 `config.yaml`支持yaml格式与json格式例如 `config.json`
- `init_param`:需要测试的模型参数,一般为`model.pt`,可以自己选择具体的模型文件
- `tokenizer_conf.token_list`:词表文件路径,一般在 `config.yaml` 有指定,无需再手动指定,当 `config.yaml` 中路径不正确时,需要在此处手动指定。
- `frontend_conf.cmvn_file`wav提取fbank中用到的cmvn文件一般在 `config.yaml` 有指定,无需再手动指定,当 `config.yaml` 中路径不正确时,需要在此处手动指定。
其他参数同上,完整 [示例](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/infer_from_local.sh)
<a name="模型导出与测试"></a>
## 模型导出与测试
### 从命令行导出
```shell
funasr-export ++model=paraformer ++quantize=false
```
### 从Python导出
```python
from funasr import AutoModel
model = AutoModel(model="paraformer")
res = model.export(quantize=False)
```
### 测试ONNX
```python
# pip3 install -U funasr-onnx
from funasr_onnx import Paraformer
model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1, quantize=True)
wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
result = model(wav_path)
print(result)
```
更多例子请参考 [样例](https://github.com/alibaba-damo-academy/FunASR/tree/main/runtime/python/onnxruntime)

View File

@@ -0,0 +1,47 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
vad_kwargs={"max_single_segment_time": 60000},
punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
# spk_model="iic/speech_campplus_sv_zh-cn_16k-common",
)
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
cache={},
)
print(res)
""" call english model like below for detailed timestamps
# choose english paraformer model first
# iic/speech_paraformer_asr-en-16k-vocab4199-pytorch
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav",
cache={},
pred_timestamp=True,
return_raw_text=True,
sentence_timestamp=True,
en_post_proc=True,
)
"""
""" can not use currently
from funasr import AutoFrontend
frontend = AutoFrontend(model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
fbanks = frontend(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", batch_size=2)
for batch_idx, fbank_dict in enumerate(fbanks):
res = model.generate(**fbank_dict)
print(res)
"""

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
from funasr import AutoModel
model = AutoModel(
model="iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
)
res = model.export(type="torchscript", quantize=False)
# res = model.export(type="bladedisc", input=f"{model.model_path}/example/asr_example.wav")
print(res)
# # method2, inference from local path
# from funasr import AutoModel
#
# model = AutoModel(
# model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
# )
#
# res = model.export(type="onnx", quantize=False)
# print(res)

View File

@@ -0,0 +1,24 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
export HYDRA_FULL_ERROR=1
model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false
## method2, inference from local path
#model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
#
#python -m funasr.bin.export \
#++model=${model} \
#++type="onnx" \
#++quantize=false

View File

@@ -0,0 +1,82 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
workspace=`pwd`
# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically
model_name_or_model_dir="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
## option 2, download model by git
#local_path_root=${workspace}/modelscope_models
#mkdir -p ${local_path_root}/${model_name_or_model_dir}
#git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir}
#model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
# data dir, which contains: train.jsonl, val.jsonl
data_dir="../../../data/list"
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
scp2jsonl \
++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
# exp output dir
output_dir="./outputs"
log_file="${output_dir}/log.txt"
deepspeed_config=${workspace}/../../ds_stage1.json
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset="AudioDataset" \
++dataset_conf.index_ds="IndexDSJsonl" \
++dataset_conf.data_split_num=1 \
++dataset_conf.batch_sampler="BatchSampler" \
++dataset_conf.batch_size=6000 \
++dataset_conf.sort_size=1024 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=true \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}

View File

@@ -0,0 +1,20 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
# for more input types, please refer to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
output_dir="./outputs/debug"
model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
python -m funasr.bin.inference \
++model=${model} \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"

View File

@@ -0,0 +1,39 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method2, inference from local model
# for more input types, please refer to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
output_dir="./outputs/debug"
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
git lfs clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path}
device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
tokens="${local_path}/tokens.json"
cmvn_file="${local_path}/am.mvn"
config="config.yaml"
init_param="${local_path}/model.pt"
python -m funasr.bin.inference \
--config-path "${local_path}" \
--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"

View File

@@ -0,0 +1,436 @@
(简体中文|[English](./README.md))
FunASR开源了大量在工业数据上预训练模型您可以在 [模型许可协议](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE)下自由使用、复制、修改和分享FunASR模型下面列举代表性的模型更多模型请参考 [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)。
<div align="center">
<h4>
<a href="#模型推理"> 模型推理 </a>
<a href="#模型训练与测试"> 模型训练与测试 </a>
<a href="#模型导出与测试"> 模型导出与测试 </a>
</h4>
</div>
<a name="模型推理"></a>
## 模型推理
### 快速使用
命令行方式调用:
```shell
funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
```
python代码调用推荐
```python
from funasr import AutoModel
model = AutoModel(model="paraformer-zh")
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav")
print(res)
```
### 接口说明
#### AutoModel 定义
```python
model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], hub=[str], **kwargs)
```
- `model`(str): [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo) 中的模型名称,或本地磁盘中的模型路径
- `device`(str): `cuda:0`（默认gpu0），使用 GPU 进行推理；如果为`cpu`，则使用 CPU 进行推理
- `ncpu`(int): `4` (默认),设置用于 CPU 内部操作并行性的线程数
- `output_dir`(str): `None` (默认),如果设置,输出结果的输出路径
- `batch_size`(int): `1` (默认),解码时的批处理,样本个数
- `hub`(str)`ms`默认从modelscope下载模型。如果为`hf`从huggingface下载模型。
- `**kwargs`(dict): 所有在`config.yaml`中参数均可以直接在此处指定例如vad模型中最大切割长度 `max_single_segment_time=6000` (毫秒)。
#### AutoModel 推理
```python
res = model.generate(input=[str], output_dir=[str])
```
- `input`: 要解码的输入,可以是:
- wav文件路径, 例如: asr_example.wav
- pcm文件路径, 例如: asr_example.pcm，此时需要指定音频采样率fs（默认为16000）
- 音频字节数流,例如:麦克风的字节数数据
- wav.scpkaldi 样式的 wav 列表 (`wav_id \t wav_path`), 例如:
```text
asr_example1 ./audios/asr_example1.wav
asr_example2 ./audios/asr_example2.wav
```
在这种输入 `wav.scp` 的情况下,必须设置 `output_dir` 以保存输出结果
- 音频采样点,例如:`audio, rate = soundfile.read("asr_example_zh.wav")`, 数据类型为 numpy.ndarray。支持batch输入类型为list
```[audio_sample1, audio_sample2, ..., audio_sampleN]```
- fbank输入，支持组batch，shape为[batch, frames, dim]，类型为torch.Tensor
- `output_dir`: None (默认),如果设置,输出结果的输出路径
- `**kwargs`(dict): 与模型相关的推理参数，例如：`beam_size=10`、`decoding_ctc_weight=0.1`。
### 更多用法介绍
#### 非实时语音识别
```python
from funasr import AutoModel
# paraformer-zh is a multi-functional asr model
# use vad, punc, spk or not as you need
model = AutoModel(model="paraformer-zh",
vad_model="fsmn-vad",
vad_kwargs={"max_single_segment_time": 60000},
punc_model="ct-punc",
# spk_model="cam++"
)
wav_file = f"{model.model_path}/example/asr_example.wav"
res = model.generate(input=wav_file, batch_size_s=300, batch_size_threshold_s=60, hotword='魔搭')
print(res)
```
Notes:
- Typically the model's input is limited to under 30 s. When combined with a `vad_model`, audio input of arbitrary length is supported; this is not limited to the paraformer model — any audio-input model works.
- Parameters related to `model` can be specified directly in the `AutoModel` definition; parameters related to `vad_model` can be specified via `vad_kwargs` (a dict); `punc_kwargs` and `spk_kwargs` work the same way.
- `max_single_segment_time`: the maximum segment duration produced by `vad_model`, in milliseconds (ms).
- `batch_size_s`: dynamic batching; the total audio duration within a batch, in seconds (s).
- `batch_size_threshold_s`: when a VAD-segmented audio clip is longer than the `batch_size_threshold_s` threshold, the batch size is forced to 1; in seconds (s).
Recommendations: when you feed long audio and hit OOM (GPU memory usage grows quadratically with audio duration), there are three cases, sketched in the example after this list:
- a) Early in inference, memory usage is dominated by `batch_size_s`; reducing it appropriately lowers memory usage;
- b) Mid-way through inference, if OOM still occurs on long VAD-segmented clips even though the batch's total token count is below `batch_size_s`, reduce `batch_size_threshold_s` so that clips over the threshold are forced to a batch size of 1;
- c) Near the end of inference, if OOM occurs on long VAD-segmented clips whose total token count is below `batch_size_s` and that exceed `batch_size_threshold_s` (batch already forced to 1), reduce `max_single_segment_time` so that VAD produces shorter segments.
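A hedged sketch of the three knobs above (the values are illustrative, and `long_audio.wav` is a hypothetical input file):
```python
# Illustrative OOM-mitigation settings; the numbers are examples, not recommendations.
from funasr import AutoModel

model = AutoModel(
    model="paraformer-zh",
    vad_model="fsmn-vad",
    vad_kwargs={"max_single_segment_time": 30000},  # (c) shorter VAD segments
)
res = model.generate(
    input="long_audio.wav",      # hypothetical long recording
    batch_size_s=150,            # (a) smaller total batch duration
    batch_size_threshold_s=30,   # (b) force batch=1 for long clips sooner
)
print(res)
```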
#### Speech Recognition (Streaming)
```python
from funasr import AutoModel
chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
model = AutoModel(model="paraformer-zh-streaming")
import soundfile
import os
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = chunk_size[1] * 960 # 600ms
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
print(res)
```
Note: `chunk_size` configures the streaming latency. `[0,10,5]` means the real-time display granularity is `10*60=600ms`, with `5*60=300ms` of lookahead. Each inference call takes `600ms` of input (`16000*0.6=9600` samples) and outputs the corresponding text; for the last speech chunk, `is_final=True` must be set to force the final word to be output.
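For reference, a small sketch of how the chunk stride in samples follows from these latency configs:
```python
# Illustrative: chunk stride in samples for the two latency configs above.
fs = 16000
for chunk in ([0, 10, 5], [0, 8, 4]):    # 600 ms / 480 ms
    stride = chunk[1] * 60 * fs // 1000  # chunk[1] frames of 60 ms each, at 16 kHz
    print(chunk, stride)                 # -> 9600 and 7680 samples
```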
#### Voice Activity Detection (Non-streaming)
```python
from funasr import AutoModel
model = AutoModel(model="fsmn-vad")
wav_file = f"{model.model_path}/example/vad_example.wav"
res = model.generate(input=wav_file)
print(res)
```
The VAD model's output format is `[[beg1, end1], [beg2, end2], .., [begN, endN]]`, where `begN`/`endN` are the start/end points of the N-th valid audio segment, in milliseconds.
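A hedged sketch of cutting the audio with that output (assuming, as in the streaming example below, that the segment list is returned under `res[0]["value"]`):
```python
# Illustrative: slice the wav into the segments reported by the offline VAD.
import soundfile
from funasr import AutoModel

model = AutoModel(model="fsmn-vad")
wav_file = f"{model.model_path}/example/vad_example.wav"
speech, fs = soundfile.read(wav_file)
res = model.generate(input=wav_file)
for beg_ms, end_ms in res[0]["value"]:  # [[beg1, end1], ...], in ms
    segment = speech[int(beg_ms * fs / 1000): int(end_ms * fs / 1000)]
    print(beg_ms, end_ms, len(segment))
```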
#### Voice Activity Detection (Streaming)
```python
from funasr import AutoModel
chunk_size = 200 # ms
model = AutoModel(model="fsmn-vad")
import soundfile
wav_file = f"{model.model_path}/example/vad_example.wav"
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = int(chunk_size * sample_rate / 1000)
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
if len(res[0]["value"]):
print(res)
```
The streaming VAD model's output falls into four cases:
- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`: same as the offline VAD output above.
- `[[beg, -1]]`: only a start point was detected so far.
- `[[-1, end]]`: only an end point was detected.
- `[]`: neither a start point nor an end point was detected.
The output is in milliseconds, as absolute time from the start of the audio.
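A small sketch of dispatching on these four cases (illustrative only):
```python
# Illustrative handler for the four streaming-VAD output cases above.
def on_vad_value(value):
    if not value:                    # []: no boundary detected in this chunk
        return
    for beg, end in value:
        if beg != -1 and end != -1:  # complete segment
            print(f"segment: {beg}-{end} ms")
        elif end == -1:              # only a start point so far
            print(f"segment started at {beg} ms")
        else:                        # beg == -1: only an end point
            print(f"segment ended at {end} ms")
```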
#### Punctuation Restoration
```python
from funasr import AutoModel
model = AutoModel(model="ct-punc")
res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
print(res)
```
#### Timestamp Prediction
```python
from funasr import AutoModel
model = AutoModel(model="fa-zh")
wav_file = f"{model.model_path}/example/asr_example.wav"
text_file = f"{model.model_path}/example/text.txt"
res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
print(res)
```
More examples can be found [here](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)
<a name="model-training-and-testing"></a>
## Model Training and Testing
### Quick Start
Run from the command line (for quick testing; not recommended):
```shell
funasr-train ++model=paraformer-zh ++train_data_set_list=data/list/train.jsonl ++valid_data_set_list=data/list/val.jsonl ++output_dir="./outputs" &> log.txt &
```
Run via Python code (supports multi-node multi-GPU; recommended):
```shell
cd examples/industrial_data_pretraining/paraformer
bash finetune.sh
# "log_file: ./outputs/log.txt"
```
For the complete, detailed script, see [finetune.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/finetune.sh)
### Parameter Details
```shell
funasr/bin/train.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=20000 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=false \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
```
- `model` (str): model name (a model-zoo ID, in which case the script downloads the model locally for you automatically), or the path to a model already downloaded locally.
- `train_data_set_list` (str): training data path, jsonl format by default; see these [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list) for details.
- `valid_data_set_list` (str): validation data path, jsonl format by default; see these [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list) for details.
- `dataset_conf.batch_type` (str): `example` (default), the batch type. `example` groups a fixed number (`batch_size`) of samples per batch; `length` or `token` enables dynamic batching, where the batch's total length or token count equals `batch_size`.
- `dataset_conf.batch_size` (int): used together with `batch_type`. When `batch_type=example`, it is the number of samples; when `batch_type=length`, it is the sample length in fbank frames (1 frame = 10 ms) or in text token count.
- `train_conf.max_epoch` (int): `100` (default), total number of training epochs.
- `train_conf.log_interval` (int): `50` (default), logging interval, in steps.
- `train_conf.resume` (bool): `True` (default), whether to enable resuming training from a checkpoint.
- `train_conf.validate_interval` (int): `5000` (default), interval for running validation during training, in steps.
- `train_conf.save_checkpoint_interval` (int): `5000` (default), interval for saving model checkpoints during training, in steps.
- `train_conf.avg_keep_nbest_models_type` (str): `acc` (default), criterion for keeping the n-best models (higher acc is better); `loss` keeps the n-best by loss (lower is better).
- `train_conf.keep_nbest_models` (int): `500` (default), maximum number of checkpoints to keep. Together with `avg_keep_nbest_models_type`, the best n models by validation acc/loss are kept and the rest deleted to save storage.
- `train_conf.avg_nbest_model` (int): `10` (default), together with `avg_keep_nbest_models_type`, averages the best n models by validation acc/loss.
- `train_conf.accum_grad` (int): `1` (default), gradient accumulation.
- `train_conf.grad_clip` (float): `10.0` (default), gradient clipping.
- `train_conf.use_fp16` (bool): `False` (default), enable fp16 training to speed up training.
- `optim_conf.lr` (float): learning rate.
- `output_dir` (str): model save path.
- `**kwargs` (dict): any parameter from `config.yaml` can be specified directly here, e.g., `dataset_conf.max_token_length=2000` filters out audio longer than 20 s, measured in fbank frames (1 frame = 10 ms) or in text token count.
#### Multi-GPU Training
##### Single-node multi-GPU training
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py ${train_args}
```
--nnodes is the total number of participating nodes; --nproc_per_node is the number of processes run on each node.
##### Multi-node multi-GPU training
On the master node, assuming the IP is 192.168.1.1 and the port is 12345, with 2 GPUs in use, run:
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 0 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
../../../funasr/bin/train.py ${train_args}
```
On a worker node (assuming the IP is 192.168.1.2), make sure the MASTER_ADDR and MASTER_PORT environment variables match the master node's settings, then run the same command:
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 1 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
../../../funasr/bin/train.py ${train_args}
```
--nnodes is the total number of participating nodes; --node_rank is the id of the current node; --nproc_per_node is the number of processes run on each node (usually the number of GPUs).
#### Data Preparation
For the `jsonl` format, see these [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list).
The `scp2jsonl` command can generate it from a wav.scp and a text.txt. Prepare wav.scp and text.txt as follows:
`train_text.txt`
The left column is a unique data ID, which must correspond one-to-one with the `ID` in `train_wav.scp`;
the right column is the transcript of the audio file, formatted as follows:
```bash
ID0012W0013 当客户风险承受能力评估依据发生变化时
ID0012W0014 所有只要处理 data 不管你是做 machine learning 做 deep learning
ID0012W0015 he tried to think how it could be
```
`train_wav.scp`
The left column is a unique data ID, which must correspond one-to-one with the `ID` in `train_text.txt`;
the right column is the path to the audio file, formatted as follows:
```bash
BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav
BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav
ID0012W0015 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav
```
Generation command:
```shell
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="../../../data/list/train.jsonl"
```
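To sanity-check the generated list, a minimal sketch (the field names `key`/`source`/`target` follow the linked examples and may differ between versions):
```python
# Hedged sketch: inspect the first entry of the generated jsonl.
import json

with open("../../../data/list/train.jsonl") as f:
    item = json.loads(f.readline())
# field names assumed from the linked data/list examples
print(item.get("key"), item.get("source"), item.get("target"))
```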
(Optional) If you need to parse a jsonl back into wav.scp and text.txt, use:
```shell
# generate wav.scp and text.txt from train.jsonl and val.jsonl
jsonl2scp \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in="../../../data/list/train.jsonl"
```
#### Viewing Training Logs
##### Viewing the experiment log
```shell
tail log.txt
[2024-03-21 15:55:52,137][root][INFO] - train, rank: 3, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.327), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)], {'data_load': '0.000', 'forward_time': '0.315', 'backward_time': '0.555', 'optim_time': '0.076', 'total_time': '0.947'}, GPU, memory: usage: 3.830 GB, peak: 18.357 GB, cache: 20.910 GB, cache_peak: 20.910 GB
[2024-03-21 15:55:52,139][root][INFO] - train, rank: 1, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.334), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.285), ('acc', 0.823), ('loss_pre', 0.046), ('loss', 0.331), ('batch_size', 36)], {'data_load': '0.000', 'forward_time': '0.334', 'backward_time': '0.536', 'optim_time': '0.077', 'total_time': '0.948'}, GPU, memory: usage: 3.943 GB, peak: 18.291 GB, cache: 19.619 GB, cache_peak: 19.619 GB
```
Metric explanations:
- `rank`: the GPU id.
- `epoch`, `step`, `total step`: the current epoch, step, and total step count.
- `loss_avg_rank`: the current step's loss averaged over all GPUs.
- `loss/ppl/acc_avg_epoch`: the running average loss/ppl/acc over the current epoch up to the current step; the last step of an epoch gives the epoch's overall average loss/ppl/acc. The acc metric is recommended.
- `lr`: the learning rate at the current step.
- `[('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)]`: detailed data for the current GPU id.
- `total_time`: total wall-clock time per step.
- `GPU, memory`: respectively, model memory usage/peak and model+cache memory usage/peak.
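For example, a quick sketch to pull the most recent epoch-average accuracy values out of a log like the one above:
```python
# Illustrative: extract recent acc_avg_epoch values from the training log.
import re

with open("log.txt") as f:
    accs = re.findall(r"acc_avg_epoch: ([0-9.]+)", f.read())
print(accs[-5:])
```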
##### TensorBoard visualization
```bash
tensorboard --logdir /xxxx/FunASR/examples/industrial_data_pretraining/paraformer/outputs/log/tensorboard
```
Open in a browser: http://localhost:6006/
### Testing a Trained Model
#### With configuration.json
Suppose the trained model path is ./model_dir. If a configuration.json has been generated in that directory, you only need to replace the model name in the [inference methods above](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/README_zh.md#%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) with this model path.
For example:
Inference from the shell:
```shell
python -m funasr.bin.inference ++model="./model_dir" ++input="${input}" ++output_dir="${output_dir}"
```
Inference from Python:
```python
from funasr import AutoModel
model = AutoModel(model="./model_dir")
res = model.generate(input=wav_file)
print(res)
```
#### Without configuration.json
If there is no configuration.json in the model path, you need to manually specify the config file path and the model path:
```shell
python -m funasr.bin.inference \
--config-path "${local_path}" \
--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"
```
Parameter description:
- `config-path`: the `config.yaml` saved during the experiment; it can be found in the experiment output directory.
- `config-name`: the config file name, usually `config.yaml`; both yaml and json formats are supported, e.g., `config.json`.
- `init_param`: the model checkpoint to test, usually `model.pt`; you may choose any specific checkpoint file.
- `tokenizer_conf.token_list`: the vocabulary file path; it is usually specified in `config.yaml` and need not be given manually, unless the path in `config.yaml` is incorrect.
- `frontend_conf.cmvn_file`: the cmvn file used when extracting fbank features from wav; it is usually specified in `config.yaml` and need not be given manually, unless the path in `config.yaml` is incorrect.
Other parameters are the same as above; see the complete [example](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/infer_from_local.sh)
<a name="模型导出与测试"></a>
## 模型导出与测试
### 从命令行导出
```shell
funasr-export ++model=paraformer ++quantize=false
```
### Export from Python
```python
from funasr import AutoModel
model = AutoModel(model="paraformer")
res = model.export(quantize=False)
```
### Testing the ONNX Model
```python
# pip3 install -U funasr-onnx
from funasr_onnx import Paraformer
model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1, quantize=True)
wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
result = model(wav_path)
print(result)
```
For more examples, see the [samples](https://github.com/alibaba-damo-academy/FunASR/tree/main/runtime/python/onnxruntime)

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import os
from funasr import AutoModel
chunk_size = [0, 10, 5] # [0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4 # number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1 # number of encoder chunks to lookback for decoder cross-attention
model = AutoModel(model="iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online")
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
res = model.generate(
input=wav_file,
chunk_size=chunk_size,
encoder_chunk_look_back=encoder_chunk_look_back,
decoder_chunk_look_back=decoder_chunk_look_back,
)
print(res)
import soundfile
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = chunk_size[1] * 960  # 600 ms or 480 ms
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i * chunk_stride : (i + 1) * chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(
input=speech_chunk,
cache=cache,
is_final=is_final,
chunk_size=chunk_size,
encoder_chunk_look_back=encoder_chunk_look_back,
decoder_chunk_look_back=decoder_chunk_look_back,
)
print(res)

View File

@@ -0,0 +1,10 @@
model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
python funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
from funasr import AutoModel
model = AutoModel(
model="iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online",
)
res = model.export(type="onnx", quantize=False)
print(res)
# # method2, inference from local path
# from funasr import AutoModel
#
#
# model = AutoModel(
# model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
# )
#
# res = model.export(type="onnx", quantize=False)
# print(res)

View File

@@ -0,0 +1,24 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
export HYDRA_FULL_ERROR=1
model="iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false \
++device="cpu"
## method2, inference from local path
#model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
#
#python -m funasr.bin.export \
#++model=${model} \
#++type="onnx" \
#++quantize=false \
#++device="cpu" \
#++debug=false

View File

@@ -0,0 +1,83 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
workspace=`pwd`
# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically
model_name_or_model_dir="iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
## option 2, download model by git
#local_path_root=${workspace}/modelscope_models
#mkdir -p ${local_path_root}/${model_name_or_model_dir}
#git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir}
#model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
# data dir, which contains: train.jsonl, val.jsonl
data_dir="../../../data/list"
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
scp2jsonl \
++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
# exp output dir
output_dir="./outputs"
log_file="${output_dir}/log.txt"
deepspeed_config=${workspace}/../../ds_stage1.json
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset="AudioDataset" \
++dataset_conf.index_ds="IndexDSJsonl" \
++dataset_conf.data_split_num=1 \
++dataset_conf.batch_sampler="BatchSampler" \
++dataset_conf.batch_size=6000 \
++dataset_conf.sort_size=1024 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=true \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}

View File

@@ -0,0 +1,16 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U "funasr[llm]"
from funasr import AutoModel
model = AutoModel(model="Qwen-Audio")
audio_in = "https://github.com/QwenLM/Qwen-Audio/raw/main/assets/audio/1272-128104-0000.flac"
prompt = "<|startoftranscription|><|en|><|transcribe|><|en|><|notimestamps|><|wo_itn|>"
res = model.generate(input=audio_in, prompt=prompt)
print(res)

View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U "funasr[llm]"
from funasr import AutoModel
model = AutoModel(model="Qwen/Qwen-Audio-Chat")
audio_in = "https://github.com/QwenLM/Qwen-Audio/raw/main/assets/audio/1272-128104-0000.flac"
# 1st dialogue turn
prompt = "what does the person say?"
cache = {"history": None}
res = model.generate(input=audio_in, prompt=prompt, cache=cache)
print(res)
# 2nd dialogue turn
prompt = 'Find the start time and end time of the word "middle classes"'
res = model.generate(input=None, prompt=prompt, cache=cache)
print(res)

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U "funasr[llm]"
from funasr import AutoModel
model = AutoModel(
model="Qwen-Audio-Chat",
model_path="/nfs/zhifu.gzf/init_model/qwen/Qwen-Audio-Chat",
)
audio_in = (
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
)
# 1st dialogue turn
prompt = "what does the person say?"
cache = {"history": None}
res = model.generate(input=audio_in, prompt=prompt, cache=cache)
print(res)
# 2nd dialogue turn
prompt = 'Find the start time and end time of the word "middle classes"'
res = model.generate(input=None, prompt=prompt, cache=cache)
print(res)

View File

@@ -0,0 +1,19 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U "funasr[llm]"
from funasr import AutoModel
model = AutoModel(
model="Qwen-Audio",
model_path="/nfs/zhifu.gzf/init_model/qwen/Qwen-Audio",
)
audio_in = "https://github.com/QwenLM/Qwen-Audio/raw/main/assets/audio/1272-128104-0000.flac"
prompt = "<|startoftranscription|><|en|><|transcribe|><|en|><|notimestamps|><|wo_itn|>"
res = model.generate(input=audio_in, prompt=prompt)
print(res)

View File

@@ -0,0 +1,94 @@
# network architecture
model: SanmKWS
model_conf:
ctc_weight: 1.0
# encoder
encoder: SANMEncoder
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 320 # the number of units of position-wise feed forward
num_blocks: 6 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: pe
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
sanm_shfit: 0
selfattention_layer_type: sanm
# frontend related
frontend: WavFrontend
frontend_conf:
fs: 16000
window: hamming
n_mels: 40
frame_length: 25
frame_shift: 10
lfr_m: 7
lfr_n: 6
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 20
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
validate_interval: 50000
save_checkpoint_interval: 50000
avg_checkpoint_interval: 1000
log_interval: 50
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr
scheduler_conf:
warmup_steps: 10000
dataset: AudioDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
  batch_size: 96000 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
  max_token_length: 1600 # filter out samples if source_token_len+target_token_len > max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
tokenizer: CharTokenizer
tokenizer_conf:
unk_symbol: <unk>
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin # ctc_type: focalctc, builtin
reduce: true
ignore_nan_grad: true
normalize: null

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_sanm_kws_phone-xiaoyun-commands-offline",
keywords="小云小云",
output_dir="./outputs/debug",
device='cpu'
)
test_wav = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
res = model.generate(input=test_wav, cache={},)
print(res)

View File

@@ -0,0 +1,17 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
config_path="/home/pengteng.spt/source/FunASR_KWS/examples/industrial_data_pretraining/sanm_kws/conf"
config_path="/home/pengteng.spt/source/FunASR_KWS/examples/industrial_data_pretraining/sanm_kws/exp/20240914_xiaoyun_finetune_sanm_6e_320_256_feats_dim40_char_t2602_offline"
config_file="sanm_6e_320_256_fdim40_t2602.yaml"
config_file="config.yaml"
model_path="./modelscope_models_kws/speech_charctc_kws_phone-xiaoyun/funasr/finetune_sanm_6e_320_256_fdim40_t2602_online_xiaoyun_commands.pt"
python -m funasr.bin.export \
--config-path="${config_path}" \
--config-name="${config_file}" \
++init_param=${model_path} \
++type="onnx" \
++quantize=true

View File

@@ -0,0 +1,172 @@
#!/usr/bin/env bash
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'
set -e
set -u
set -o pipefail
. ./path.sh
workspace=`pwd`
CUDA_VISIBLE_DEVICES="0,1"
stage=2
stop_stage=3
inference_device="cpu" #"cpu"
inference_device="cuda" #"cpu"
inference_checkpoint="model.pt.avg10"
inference_scp="wav.scp"
inference_batch_size=32
nj=32
test_sets="test"
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically, unsupported currently
model_name_or_model_dir="iic/speech_sanm_kws_phone-xiaoyun-commands-offline"
## option 2, download model by git
local_path_root=${workspace}/modelscope_models
model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
if [ ! -d $model_name_or_model_dir ]; then
mkdir -p ${model_name_or_model_dir}
git clone https://www.modelscope.cn/iic/speech_sanm_kws_phone-xiaoyun-commands-offline.git ${model_name_or_model_dir}
fi
config=sanm_6e_320_256_fdim40_t2602.yaml
token_list=${model_name_or_model_dir}/tokens_2602.txt
lexicon_list=${model_name_or_model_dir}/lexicon.txt
cmvn_file=${model_name_or_model_dir}/am.mvn.dim40_l3r3
init_param="${model_name_or_model_dir}/basetrain_sanm_6e_320_256_fdim40_t2602_offline.pt"
# data prepare
# data dir, which contains: train.jsonl, val.jsonl
data_dir=../../data
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Generate audio json list"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/train_wav.scp''', '''${data_dir}/train_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/val_wav.scp''', '''${data_dir}/val_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
fi
# exp output dir
output_dir="${workspace}/exp/finetune_outputs"
# Training Stage
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: KWS Training"
mkdir -p ${output_dir}
current_time=$(date "+%Y-%m-%d_%H-%M")
log_file="${output_dir}/train.log.txt.${current_time}"
echo "log_file: ${log_file}"
echo "finetune use basetrain model: ${init_param}"
export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py \
--config-path "${workspace}/conf" \
--config-name "${config}" \
++init_param="${init_param}" \
++disable_update=true \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++tokenizer_conf.token_list="${token_list}" \
++tokenizer_conf.seg_dict="${lexicon_list}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++output_dir="${output_dir}" &> ${log_file}
fi
# Testing Stage
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Inference"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
if [ ${inference_device} == "cuda" ]; then
nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
else
inference_batch_size=1
CUDA_VISIBLE_DEVICES=""
for JOB in $(seq ${nj}); do
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
done
fi
for dset in ${test_sets}; do
inference_dir="${output_dir}/inference-${inference_checkpoint}/${dset}"
_logdir="${inference_dir}/logdir"
echo "inference_dir: ${inference_dir}"
mkdir -p "${_logdir}"
test_data_dir="${data_dir}/${dset}"
key_file=${test_data_dir}/${inference_scp}
split_scps=
for JOB in $(seq "${nj}"); do
split_scps+=" ${_logdir}/keys.${JOB}.scp"
done
$FUNASR_DIR/examples/aishell/paraformer/utils/split_scp.pl "${key_file}" ${split_scps}
gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
echo "${output_dir}"
export CUDA_VISIBLE_DEVICES=${gpuid}
python ../../../funasr/bin/inference.py \
--config-path="${output_dir}" \
--config-name="config.yaml" \
++init_param="${output_dir}/${inference_checkpoint}" \
++tokenizer_conf.token_list="${token_list}" \
++tokenizer_conf.seg_dict="${lexicon_list}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++keywords="\"${keywords_string}\"" \
++input="${_logdir}/keys.${JOB}.scp" \
++output_dir="${inference_dir}/${JOB}" \
++device="${inference_device}" \
++ncpu=1 \
++disable_log=true \
++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
}&
done
wait
for f in detect score; do
if [ -f "${inference_dir}/${JOB}/${f}" ]; then
for JOB in $(seq "${nj}"); do
cat "${inference_dir}/${JOB}/${f}"
done | sort -k1 >"${inference_dir}/${f}"
fi
done
python funasr/utils/compute_det_ctc.py \
--keywords ${keywords_string} \
--test_data ${test_data_dir}/wav.scp \
--trans_data ${test_data_dir}/text \
--score_file ${inference_dir}/detect \
--stats_dir ${inference_dir}
done
fi

View File

@@ -0,0 +1 @@
../../../funasr

View File

@@ -0,0 +1,20 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
model="iic/speech_sanm_kws_phone-xiaoyun-commands-offline"
# for more input type, please ref to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python funasr/bin/inference.py \
+model=${model} \
+input=${input} \
+output_dir="./outputs/debug" \
+device="cpu" \
++keywords="\"${keywords_string}\""

View File

@@ -0,0 +1,41 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method2, inference from local model
# for more input type, please ref to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
output_dir="./outputs/debug"
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_sanm_kws_phone-xiaoyun-commands-offline
git clone https://www.modelscope.cn/iic/speech_sanm_kws_phone-xiaoyun-commands-offline.git ${local_path}
device="cpu" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
config="inference_sanm_6e_320_256_fdim40_t2602_offline.yaml"
tokens="${local_path}/tokens_2602.txt"
seg_dict="${local_path}/lexicon.txt"
init_param="${local_path}/finetune_sanm_6e_320_256_fdim40_t2602_offline_xiaoyun_commands.pt"
cmvn_file="${local_path}/am.mvn.dim40_l3r3"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python -m funasr.bin.inference \
--config-path "${local_path}/" \
--config-name "${config}" \
++init_param="${init_param}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++tokenizer_conf.token_list="${tokens}" \
++tokenizer_conf.seg_dict="${seg_dict}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}" \
++keywords="\"${keywords_string}\""

View File

@@ -0,0 +1,5 @@
export FUNASR_DIR=$PWD/../../..
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH

View File

@@ -0,0 +1,109 @@
# network architecture
model: SanmKWSStreaming
model_conf:
ctc_weight: 1.0
# encoder
encoder: SANMEncoderChunkOpt
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 320 # the number of units of position-wise feed forward
num_blocks: 6 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: pe_online
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
sanm_shfit: 0
selfattention_layer_type: sanm
chunk_size:
- 16
- 20
stride:
- 8
- 10
pad_left:
- 4
- 5
encoder_att_look_back_factor:
- 0
- 0
decoder_att_look_back_factor:
- 0
- 0
# frontend related
frontend: WavFrontendOnline
frontend_conf:
fs: 16000
window: hamming
n_mels: 40
frame_length: 25
frame_shift: 10
lfr_m: 7
lfr_n: 6
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 20
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
validate_interval: 50000
save_checkpoint_interval: 50000
avg_checkpoint_interval: 1000
log_interval: 50
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr
scheduler_conf:
warmup_steps: 30000
dataset: AudioDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
  batch_size: 64000 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
  max_token_length: 1600 # filter out samples if source_token_len+target_token_len > max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
tokenizer: CharTokenizer
tokenizer_conf:
unk_symbol: <unk>
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin # ctc_type: focalctc, builtin
reduce: true
ignore_nan_grad: true
normalize: null

Some files were not shown because too many files have changed in this diff.