Sync from bytedesk-private: update

This commit is contained in:
jack ning
2024-12-14 10:43:18 +08:00
parent 476eebb101
commit 5e082909e4
3421 changed files with 812709 additions and 0 deletions

View File

@@ -0,0 +1,20 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
punc_model="iic/punc_ct-transformer_cn-en-common-vocab471067-large",
# spk_model="iic/speech_campplus_sv_zh-cn_16k-common",
)
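# batch_size_s batches inputs by total audio duration in seconds; per the FunASR
# docs, a VAD-split segment longer than batch_size_threshold_s is decoded with batch size 1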
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav",
batch_size_s=300,
batch_size_threshold_s=60,
)
print(res)

View File

@@ -0,0 +1,18 @@
model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
#punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
punc_model="iic/punc_ct-transformer_cn-en-common-vocab471067-large"
spk_model="iic/speech_campplus_sv_zh-cn_16k-common"
python funasr/bin/inference.py \
+model=${model} \
+vad_model=${vad_model} \
+punc_model=${punc_model} \
+spk_model=${spk_model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
+batch_size_s=300 \
+batch_size_threshold_s=60

View File

@@ -0,0 +1,28 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
from funasr import AutoModel
model = AutoModel(
model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
device="cpu",
)
res = model.export(type="torchscript", quantize=False)
print(res)
# # method2, export from local path
# from funasr import AutoModel
# model = AutoModel(
# model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
# device="cpu",
# )
# res = model.export(type="onnx", quantize=False)
# print(res)

View File

@@ -0,0 +1,23 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
export HYDRA_FULL_ERROR=1
model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false \
++device="cpu"
# method2, export from local path
model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false \
++device="cpu"

View File

@@ -0,0 +1,84 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
workspace=`pwd`
# method1, finetune from model hub
# which GPU(s) to use for training or finetuning
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically
model_name_or_model_dir="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
## option 2, download model by git
#local_path_root=${workspace}/modelscope_models
#mkdir -p ${local_path_root}/${model_name_or_model_dir}
#git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir}
#model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
# data dir, which contains: train.jsonl, val.jsonl
data_dir="../../../data/list"
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
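# expected formats (standard Kaldi-style lists): each line of wav.scp is
# "<utt_id> <wav_path>", each line of text.txt is "<utt_id> <transcript>"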
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
scp2jsonl \
++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
# exp output dir
output_dir="./outputs"
log_file="${output_dir}/log.txt"
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
deepspeed_config=${workspace}/../../ds_stage1.json
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset="AudioDataset" \
++dataset_conf.index_ds="IndexDSJsonl" \
++dataset_conf.data_split_num=1 \
++dataset_conf.batch_sampler="BatchSampler" \
++dataset_conf.batch_size=6000 \
++dataset_conf.sort_size=1024 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=true \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}

View File

@@ -0,0 +1,13 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/speech_campplus_sv_zh-cn_16k-common")
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
)
print(res)

View File

@@ -0,0 +1,14 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch")
res = model.generate(
input="https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/asr_example.wav",
decoding_ctc_weight=0.0,
)
print(res)

View File

@@ -0,0 +1,9 @@
model="iic/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch"
python funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \

View File

@@ -0,0 +1,14 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404")
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
hotword="达摩院 魔搭",
)
print(res)

View File

@@ -0,0 +1,11 @@
model="iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
python ../../../funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
+"hotword='达摩院 魔搭'"

View File

@@ -0,0 +1,9 @@
python -m funasr.bin.inference \
--config-path="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404" \
--config-name="config.yaml" \
++init_param="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/model.pb" \
++tokenizer_conf.token_list="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/tokens.txt" \
++frontend_conf.cmvn_file="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/am.mvn" \
++input="/nfs/yufan.yf/workspace/model_download/modelscope/hub/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/asr_example_zh.wav" \
++output_dir="./outputs/debug2" \
++device="" \

View File

@@ -0,0 +1,85 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
workspace=`pwd`
# method1, finetune from model hub
# which GPU(s) to use for training or finetuning
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically
model_name_or_model_dir="iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
## option 2, download model by git
#local_path_root=${workspace}/modelscope_models
#mkdir -p ${local_path_root}/${model_name_or_model_dir}
#git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir}
#model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
# data dir, which contains: train.jsonl, val.jsonl
data_dir="../../../data/list"
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
scp2jsonl \
++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
# exp output dir
output_dir="./outputs"
log_file="${output_dir}/log.txt"
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
deepspeed_config=${workspace}/../../ds_stage1.json
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset="AudioDatasetHotword" \
++dataset_conf.index_ds="IndexDSJsonl" \
++dataset_conf.data_split_num=1 \
++dataset_conf.batch_sampler="BatchSampler" \
++dataset_conf.batch_size=6000 \
++dataset_conf.sort_size=1024 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=true \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}

View File

@@ -0,0 +1,6 @@
export FUNASR_DIR=$PWD/../../../
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH
export PYTHONPATH=$FUNASR_DIR/funasr/bin:$FUNASR_DIR/funasr:$FUNASR_DIR:$PYTHONPATH

View File

@@ -0,0 +1,23 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt"
)
print(res)
from funasr import AutoModel
model = AutoModel(model="iic/punc_ct-transformer_cn-en-common-vocab471067-large")
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt"
)
print(res)

View File

@@ -0,0 +1,12 @@
#model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
#
model="iic/punc_ct-transformer_cn-en-common-vocab471067-large"
python funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt" \
+output_dir="./outputs/debug" \
+device="cpu"

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
from funasr import AutoModel
model = AutoModel(
model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
)
res = model.export(type="onnx", quantize=False)
print(res)
# method2, export from local path
from funasr import AutoModel
model = AutoModel(
model="/Users/zhifu/.cache/modelscope/hub/iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
)
res = model.export(type="onnx", quantize=False)
print(res)

View File

@@ -0,0 +1,26 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
export HYDRA_FULL_ERROR=1
model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false \
++device="cpu"
# method2, export from local path
model="/Users/zhifu/.cache/modelscope/hub/iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false \
++device="cpu"

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727")
inputs = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"
vads = inputs.split("|")
rec_result_all = "outputs: "
cache = {}
for vad in vads:
rec_result = model.generate(input=vad, cache=cache)
rec_result_all += rec_result[0]["text"]
print(rec_result_all)

View File

@@ -0,0 +1,9 @@
model="iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
python funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt" \
+output_dir="./outputs/debug" \
+device="cpu"

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
from funasr import AutoModel
model = AutoModel(
model="iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727",
)
res = model.export(type="onnx", quantize=False)
print(res)
# method2, export from local path
from funasr import AutoModel
model = AutoModel(
model="/Users/zhifu/.cache/modelscope/hub/iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
)
res = model.export(type="onnx", quantize=False)
print(res)

View File

@@ -0,0 +1,28 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
export HYDRA_FULL_ERROR=1
model="iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
python -m funasr.bin.export \
++model=${model} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav" \
++type="onnx" \
++quantize=false \
++device="cpu"
# method2, export from local path
model="/Users/zhifu/.cache/modelscope/hub/iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
python -m funasr.bin.export \
++model=${model} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav" \
++type="onnx" \
++quantize=false \
++device="cpu"

View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import sys
from funasr import AutoModel
model_dir = "/Users/zhifu/Downloads/modelscope_models/ctc_model"
input_file = (
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
)
model = AutoModel(
model=model_dir,
)
res = model.generate(
input=input_file,
cache={},
)
print(res)

View File

@@ -0,0 +1,31 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method2, inference from local model
# for more input types, please refer to the README.md
model_dir=$1
input_file=$2
output_dir=$3
# download model
device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
tokens="${model_dir}/tokens.json"
cmvn_file="${model_dir}/am.mvn"
config="config.yaml"
init_param="${model_dir}/model.pt"
mkdir -p ${output_dir}
python -m funasr.bin.inference \
--config-path "${model_dir}" \
--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input_file}" \
++output_dir="${output_dir}" \
++device="${device}" \

View File

@@ -0,0 +1,95 @@
# network architecture
model: FsmnKWS
model_conf:
ctc_weight: 1.0
# encoder related
encoder: FSMN
encoder_conf:
input_dim: 400
input_affine_dim: 140
fsmn_layers: 4
linear_dim: 250
proj_dim: 128
lorder: 10
rorder: 2
lstride: 1
rstride: 1
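    # lorder/rorder: left (past) / right (future) context order of each FSMN memory
    # block; lstride/rstride: the corresponding frame strides (standard FSMN usage)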
output_affine_dim: 140
output_dim: 2599
use_softmax: false
frontend: WavFrontend
frontend_conf:
fs: 16000
window: hamming
n_mels: 80
frame_length: 25
frame_shift: 10
lfr_m: 5
lfr_n: 3
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 3
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 10
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
validate_interval: 50000
save_checkpoint_interval: 50000
avg_checkpoint_interval: 1000
log_interval: 50
optim: adam
optim_conf:
lr: 0.0005
scheduler: warmuplr
scheduler_conf:
warmup_steps: 10000
dataset: AudioDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
    batch_size: 32000 # if batch_type is "example", batch_size is the number of samples; if "length", it is source_token_len+target_token_len
    max_token_length: 1600 # drop samples whose source_token_len+target_token_len exceeds max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
tokenizer: CharTokenizer
tokenizer_conf:
unk_symbol: <unk>
split_with_space: true
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin
reduce: true
ignore_nan_grad: true
extra_linear: false
normalize: null

View File

@@ -0,0 +1,95 @@
# network architecture
model: FsmnKWS
model_conf:
ctc_weight: 1.0
# encoder related
encoder: FSMN
encoder_conf:
input_dim: 360
input_affine_dim: 280
fsmn_layers: 4
linear_dim: 280
proj_dim: 200
lorder: 10
rorder: 2
lstride: 1
rstride: 1
output_affine_dim: 400
output_dim: 2602
use_softmax: false
frontend: WavFrontend
frontend_conf:
fs: 16000
window: hamming
n_mels: 40
frame_length: 25
frame_shift: 10
lfr_m: 9
lfr_n: 3
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 3
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 10
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
validate_interval: 50000
save_checkpoint_interval: 50000
avg_checkpoint_interval: 1000
log_interval: 50
optim: adam
optim_conf:
lr: 0.0005
scheduler: warmuplr
scheduler_conf:
warmup_steps: 10000
dataset: AudioDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
    batch_size: 32000 # if batch_type is "example", batch_size is the number of samples; if "length", it is source_token_len+target_token_len
    max_token_length: 1600 # drop samples whose source_token_len+target_token_len exceeds max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
tokenizer: CharTokenizer
tokenizer_conf:
unk_symbol: <unk>
split_with_space: true
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin
reduce: true
ignore_nan_grad: true
extra_linear: false
normalize: null

View File

@@ -0,0 +1,134 @@
from __future__ import print_function
import argparse
import copy
import logging
import os
from shutil import copyfile
import torch
import yaml
from typing import Union
from funasr.models.fsmn_kws.model import FsmnKWSConvert
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def get_args():
    parser = argparse.ArgumentParser(
        description='load a network and convert it between Kaldi and PyTorch formats')
parser.add_argument('--config', required=True, help='config file')
parser.add_argument(
'--network_file',
default='',
required=True,
help='input network, support kaldi.txt/pytorch.pt')
parser.add_argument('--model_dir', required=True, help='save model dir')
parser.add_argument('--model_name', required=True, help='save model name')
parser.add_argument('--convert_to',
default='kaldi',
required=True,
help='target network type, kaldi/pytorch')
args = parser.parse_args()
return args
def convert_to_kaldi(
configs,
network_file,
model_dir,
model_name="convert.kaldi.txt"
):
copyfile(network_file, os.path.join(model_dir, 'origin.torch.pt'))
model = FsmnKWSConvert(
vocab_size=configs['encoder_conf']['output_dim'],
encoder='FSMNConvert',
encoder_conf=configs['encoder_conf'],
ctc_conf=configs['ctc_conf'],
)
print(model)
num_params = count_parameters(model)
print('the number of model params: {}'.format(num_params))
    states = torch.load(network_file, map_location='cpu')
model.load_state_dict(states["state_dict"])
kaldi_text = os.path.join(model_dir, model_name)
with open(kaldi_text, 'w', encoding='utf8') as fout:
nnet_desp = model.to_kaldi_net()
fout.write(nnet_desp)
def convert_to_pytorch(
configs,
network_file,
model_dir,
model_name="convert.torch.pt"
):
model = FsmnKWSConvert(
vocab_size=configs['encoder_conf']['output_dim'],
frontend=None,
specaug=None,
normalize=None,
encoder='FSMNConvert',
encoder_conf=configs['encoder_conf'],
ctc_conf=configs['ctc_conf'],
)
num_params = count_parameters(model)
print('the number of model params: {}'.format(num_params))
copyfile(network_file, os.path.join(model_dir, 'origin.kaldi.txt'))
model.to_pytorch_net(network_file)
save_model_path = os.path.join(model_dir, model_name)
torch.save({"model": model.state_dict()}, save_model_path)
print('convert torch format back to kaldi')
kaldi_text = os.path.join(model_dir, 'convert.kaldi.txt')
with open(kaldi_text, 'w', encoding='utf8') as fout:
nnet_desp = model.to_kaldi_net()
fout.write(nnet_desp)
print('Done!')
def main():
args = get_args()
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
print(args)
with open(args.config, 'r') as fin:
configs = yaml.load(fin, Loader=yaml.FullLoader)
if args.convert_to == 'pytorch':
print('convert kaldi net to pytorch...')
convert_to_pytorch(
configs,
args.network_file,
args.model_dir,
args.model_name
)
elif args.convert_to == 'kaldi':
print('convert pytorch net to kaldi...')
convert_to_kaldi(
configs,
args.network_file,
args.model_dir,
args.model_name
)
else:
print('unsupported target network type: {}'.format(args.convert_to))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,26 @@
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models_kws
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_charctc_kws_phone-xiaoyun
if [ ! -d "$local_path" ]; then
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun.git ${local_path}
fi
export PATH=${local_path}/runtime:$PATH
export LD_LIBRARY_PATH=${local_path}/runtime:$LD_LIBRARY_PATH
config=./conf/fsmn_4e_l10r2_250_128_fdim80_t2599.yaml
torch_nnet=exp/finetune_outputs/model.pt.avg10
out_dir=exp/finetune_outputs
if [ ! -d "$out_dir" ]; then
mkdir -p $out_dir
fi
python convert.py --config $config --network_file $torch_nnet --model_dir $out_dir --model_name "convert.kaldi.txt" --convert_to kaldi
nnet-copy --binary=true ${out_dir}/convert.kaldi.txt ${out_dir}/convert.kaldi.net

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_charctc_kws_phone-xiaoyun",
keywords="小云小云",
output_dir="./outputs/debug",
device='cpu'
)
test_wav = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
res = model.generate(input=test_wav, cache={},)
print(res)

View File

@@ -0,0 +1,173 @@
#!/usr/bin/env bash
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# Set bash to 'strict' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in a pipeline'
set -e
set -u
set -o pipefail
. ./path.sh
workspace=`pwd`
CUDA_VISIBLE_DEVICES="0,1"
stage=2
stop_stage=3
inference_device="cuda" #"cpu"
inference_checkpoint="model.pt.avg10"
inference_scp="wav.scp"
inference_batch_size=32
nj=32
test_sets="test"
# model_name from model_hub, or model_dir in local path
## option 1, download the model automatically (currently unsupported)
model_name_or_model_dir="iic/speech_charctc_kws_phone-xiaoyun"
## option 2, download model by git
local_path_root=${workspace}/modelscope_models
model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
if [ ! -d $model_name_or_model_dir ]; then
mkdir -p ${model_name_or_model_dir}
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun.git ${model_name_or_model_dir}
fi
config=fsmn_4e_l10r2_250_128_fdim80_t2599.yaml
token_list=${model_name_or_model_dir}/funasr/tokens_2599.txt
lexicon_list=${model_name_or_model_dir}/funasr/lexicon.txt
cmvn_file=${model_name_or_model_dir}/funasr/am.mvn.dim80_l2r2
init_param="${model_name_or_model_dir}/funasr/basetrain_fsmn_4e_l10r2_250_128_fdim80_t2599.pt"
# data prepare
# data dir, which contains: train.jsonl, val.jsonl
data_dir=../../data
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Generate audio json list"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/train_wav.scp''', '''${data_dir}/train_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/val_wav.scp''', '''${data_dir}/val_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
fi
# exp output dir
output_dir="${workspace}/exp/finetune_outputs"
# Training Stage
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: KWS Training"
mkdir -p ${output_dir}
current_time=$(date "+%Y-%m-%d_%H-%M")
log_file="${output_dir}/train.log.txt.${current_time}"
echo "log_file: ${log_file}"
echo "finetune use basetrain model: ${init_param}"
export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py \
--config-path "${workspace}/conf" \
--config-name "${config}" \
++init_param="${init_param}" \
++disable_update=true \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++tokenizer_conf.token_list="${token_list}" \
++tokenizer_conf.seg_dict="${lexicon_list}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++output_dir="${output_dir}" &> ${log_file}
fi
# Testing Stage
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Inference"
keywords=(小云小云)
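    # join the keywords array into a comma-separated string (IFS is changed only in the subshell)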
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
if [ ${inference_device} == "cuda" ]; then
nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
else
inference_batch_size=1
CUDA_VISIBLE_DEVICES=""
for JOB in $(seq ${nj}); do
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
done
fi
for dset in ${test_sets}; do
inference_dir="${output_dir}/inference-${inference_checkpoint}/${dset}"
_logdir="${inference_dir}/logdir"
echo "inference_dir: ${inference_dir}"
mkdir -p "${_logdir}"
test_data_dir="${data_dir}/${dset}"
key_file=${test_data_dir}/${inference_scp}
split_scps=
for JOB in $(seq "${nj}"); do
split_scps+=" ${_logdir}/keys.${JOB}.scp"
done
$FUNASR_DIR/examples/aishell/paraformer/utils/split_scp.pl "${key_file}" ${split_scps}
gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
echo "${output_dir}"
export CUDA_VISIBLE_DEVICES=${gpuid}
python ../../../funasr/bin/inference.py \
--config-path="${output_dir}" \
--config-name="config.yaml" \
++init_param="${output_dir}/${inference_checkpoint}" \
++tokenizer_conf.token_list="${token_list}" \
++tokenizer_conf.seg_dict="${lexicon_list}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++keywords="\"$keywords_string"\" \
++input="${_logdir}/keys.${JOB}.scp" \
++output_dir="${inference_dir}/${JOB}" \
++device="${inference_device}" \
++ncpu=1 \
++disable_log=true \
++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
}&
done
wait
for f in detect; do
if [ -f "${inference_dir}/${JOB}/${f}" ]; then
for JOB in $(seq "${nj}"); do
cat "${inference_dir}/${JOB}/${f}"
done | sort -k1 >"${inference_dir}/${f}"
fi
done
python funasr/utils/compute_det_ctc.py \
--keywords ${keywords_string} \
--test_data ${test_data_dir}/wav.scp \
--trans_data ${test_data_dir}/text \
--score_file ${inference_dir}/detect \
--stats_dir ${inference_dir}
done
fi

View File

@@ -0,0 +1 @@
../../../funasr

View File

@@ -0,0 +1,20 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
model="iic/speech_charctc_kws_phone-xiaoyun"
# for more input types, please refer to the README.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python funasr/bin/inference.py \
+model=${model} \
+input=${input} \
+output_dir="./outputs/debug" \
+device="cpu" \
++keywords="\"$keywords_string"\"

View File

@@ -0,0 +1,41 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method2, inference from local model
# for more input types, please refer to the README.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
output_dir="./outputs/debug"
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_charctc_kws_phone-xiaoyun
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun.git ${local_path}
device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
config="inference_fsmn_4e_l10r2_250_128_fdim80_t2599.yaml"
tokens="${local_path}/funasr/tokens_2599.txt"
seg_dict="${local_path}/funasr/lexicon.txt"
init_param="${local_path}/funasr/finetune_fsmn_4e_l10r2_250_128_fdim80_t2599_xiaoyun_xiaoyun.pt"
cmvn_file="${local_path}/funasr/am.mvn.dim80_l2r2"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python -m funasr.bin.inference \
--config-path "${local_path}/funasr" \
--config-name "${config}" \
++init_param="${init_param}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++tokenizer_conf.token_list="${tokens}" \
++tokenizer_conf.seg_dict="${seg_dict}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}" \
++keywords="\"$keywords_string"\"

View File

@@ -0,0 +1,5 @@
export FUNASR_DIR=$PWD/../../..
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH

View File

@@ -0,0 +1,103 @@
# network architecture
model: FsmnKWSMT
model_conf:
ctc_weight: 1.0
# encoder related
encoder: FSMNMT
encoder_conf:
input_dim: 400
input_affine_dim: 140
fsmn_layers: 4
linear_dim: 250
proj_dim: 128
lorder: 10
rorder: 2
lstride: 1
rstride: 1
output_affine_dim: 140
output_dim: 2599
output_dim2: 4
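    # output_dim2: size of the second output head, paired with the second
    # tokenizer/token_list entry configured below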
use_softmax: false
frontend: WavFrontend
frontend_conf:
fs: 16000
window: hamming
n_mels: 80
frame_length: 25
frame_shift: 10
lfr_m: 5
lfr_n: 3
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 3
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 100
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
log_interval: 50
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr
scheduler_conf:
warmup_steps: 10000
dataset: KwsMTDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
    batch_size: 64000 # if batch_type is "example", batch_size is the number of samples; if "length", it is source_token_len+target_token_len
    max_token_length: 1600 # drop samples whose source_token_len+target_token_len exceeds max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
dataloader: DataloaderMapStyle
tokenizer:
- CharTokenizer
- CharTokenizer
tokenizer_conf:
- unk_symbol: <unk>
split_with_space: true
token_list: null
seg_dict: null
- unk_symbol: <unk>
split_with_space: true
token_list: null
seg_dict: null
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin # ctc_type: focalctc, builtin
reduce: true
ignore_nan_grad: true
extra_linear: false
normalize: null

View File

@@ -0,0 +1,103 @@
# network architecture
model: FsmnKWSMT
model_conf:
ctc_weight: 1.0
# encoder related
encoder: FSMNMT
encoder_conf:
input_dim: 360
input_affine_dim: 280
fsmn_layers: 4
linear_dim: 280
proj_dim: 200
lorder: 10
rorder: 2
lstride: 1
rstride: 1
output_affine_dim: 400
output_dim: 2602
output_dim2: 4
use_softmax: false
frontend: WavFrontend
frontend_conf:
fs: 16000
window: hamming
n_mels: 40
frame_length: 25
frame_shift: 10
lfr_m: 9
lfr_n: 3
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 3
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 100
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
log_interval: 50
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr
scheduler_conf:
warmup_steps: 10000
dataset: KwsMTDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
    batch_size: 64000 # if batch_type is "example", batch_size is the number of samples; if "length", it is source_token_len+target_token_len
    max_token_length: 1600 # drop samples whose source_token_len+target_token_len exceeds max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
dataloader: DataloaderMapStyle
tokenizer:
- CharTokenizer
- CharTokenizer
tokenizer_conf:
- unk_symbol: <unk>
split_with_space: true
token_list: null
seg_dict: null
- unk_symbol: <unk>
split_with_space: true
token_list: null
seg_dict: null
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin # ctc_type: focalctc, builtin
reduce: true
ignore_nan_grad: true
extra_linear: false
normalize: null

View File

@@ -0,0 +1,137 @@
from __future__ import print_function
import argparse
import copy
import logging
import os
from shutil import copyfile
import torch
import yaml
from typing import Union
from funasr.models.fsmn_kws_mt.encoder import FSMNMTConvert
from funasr.models.fsmn_kws_mt.model import FsmnKWSMTConvert
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def get_args():
    parser = argparse.ArgumentParser(
        description='load a network and convert it between Kaldi and PyTorch formats')
parser.add_argument('--config', required=True, help='config file')
parser.add_argument(
'--network_file',
default='',
required=True,
help='input network, support kaldi.txt/pytorch.pt')
parser.add_argument('--model_dir', required=True, help='save model dir')
parser.add_argument('--model_name', required=True, help='save model name')
parser.add_argument('--model_name2', required=True, help='save model name')
parser.add_argument('--convert_to',
default='kaldi',
required=True,
help='target network type, kaldi/pytorch')
args = parser.parse_args()
return args
def convert_to_kaldi(
configs,
network_file,
model_dir,
model_name="convert.kaldi.txt",
model_name2="convert.kaldi2.txt"
):
copyfile(network_file, os.path.join(model_dir, 'origin.torch.pt'))
model = FsmnKWSMTConvert(
encoder='FSMNMTConvert',
encoder_conf=configs['encoder_conf'],
ctc_conf=configs['ctc_conf'],
)
print(model)
num_params = count_parameters(model)
print('the number of model params: {}'.format(num_params))
    states = torch.load(network_file, map_location='cpu')
model.load_state_dict(states["state_dict"])
kaldi_text = os.path.join(model_dir, model_name)
with open(kaldi_text, 'w', encoding='utf8') as fout:
nnet_desp = model.to_kaldi_net()
fout.write(nnet_desp)
kaldi_text2 = os.path.join(model_dir, model_name2)
with open(kaldi_text2, 'w', encoding='utf8') as fout:
nnet_desp2 = model.to_kaldi_net2()
fout.write(nnet_desp2)
def convert_to_pytorch(
configs,
network_file,
model_dir,
model_name="convert.torch.pt"
):
model = FsmnKWSMTConvert(
encoder='FSMNMTConvert',
encoder_conf=configs['encoder_conf'],
ctc_conf=configs['ctc_conf'],
)
num_params = count_parameters(model)
print('the number of model params: {}'.format(num_params))
copyfile(network_file, os.path.join(model_dir, 'origin.kaldi.txt'))
model.to_pytorch_net(network_file)
save_model_path = os.path.join(model_dir, model_name)
torch.save({"model": model.state_dict()}, save_model_path)
print('convert torch format back to kaldi')
kaldi_text = os.path.join(model_dir, 'convert.kaldi.txt')
with open(kaldi_text, 'w', encoding='utf8') as fout:
nnet_desp = model.to_kaldi_net()
fout.write(nnet_desp)
print('Done!')
def main():
args = get_args()
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
print(args)
with open(args.config, 'r') as fin:
configs = yaml.load(fin, Loader=yaml.FullLoader)
if args.convert_to == 'pytorch':
print('convert kaldi net to pytorch...')
convert_to_pytorch(
configs,
args.network_file,
args.model_dir,
args.model_name,
)
elif args.convert_to == 'kaldi':
print('convert pytorch net to kaldi...')
convert_to_kaldi(
configs,
args.network_file,
args.model_dir,
args.model_name
)
else:
print('unsupported target network type: {}'.format(args.convert_to))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,36 @@
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_charctc_kws_phone-xiaoyun_mt
if [ ! -d "$local_path" ]; then
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun_mt.git ${local_path}
fi
export PATH=${local_path}/runtime:$PATH
export LD_LIBRARY_PATH=${local_path}/runtime:$LD_LIBRARY_PATH
# finetune config file
config=./conf/fsmn_4e_l10r2_250_128_fdim80_t2599_t4.yaml
# finetune output checkpoint
torch_nnet=exp/finetune_outputs/model.pt.avg10
out_dir=exp/finetune_outputs
if [ ! -d "$out_dir" ]; then
mkdir -p $out_dir
fi
python convert.py --config $config \
--network_file $torch_nnet \
--model_dir $out_dir \
--model_name "convert.kaldi.txt" \
--model_name2 "convert.kaldi2.txt" \
--convert_to kaldi
nnet-copy --binary=true ${out_dir}/convert.kaldi.txt ${out_dir}/convert.kaldi.net
nnet-copy --binary=true ${out_dir}/convert.kaldi2.txt ${out_dir}/convert.kaldi2.net

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_charctc_kws_phone-xiaoyun_mt",
keywords="小云小云",
output_dir="./outputs/debug",
device='cpu'
)
test_wav = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
res = model.generate(input=test_wav, cache={},)
print(res)

View File

@@ -0,0 +1,184 @@
#!/usr/bin/env bash
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# Set bash to 'strict' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in a pipeline'
set -e
set -u
set -o pipefail
. ./path.sh
workspace=`pwd`
CUDA_VISIBLE_DEVICES="0,1"
stage=2
stop_stage=3
inference_device="cuda" #"cpu"
inference_checkpoint="model.pt.avg10"
inference_scp="wav.scp"
inference_batch_size=32
nj=32
test_sets="test"
# model_name from model_hub, or model_dir in local path
## option 1, download the model automatically (currently unsupported)
model_name_or_model_dir="iic/speech_charctc_kws_phone-xiaoyun_mt"
## option 2, download model by git
local_path_root=${workspace}/modelscope_models
model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
if [ ! -d $model_name_or_model_dir ]; then
mkdir -p ${model_name_or_model_dir}
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun_mt.git ${model_name_or_model_dir}
fi
config=fsmn_4e_l10r2_250_128_fdim80_t2599_t4.yaml
token_list=${model_name_or_model_dir}/funasr/tokens_2599.txt
token_list2=${model_name_or_model_dir}/funasr/tokens_xiaoyun.txt
lexicon_list=${model_name_or_model_dir}/funasr/lexicon.txt
cmvn_file=${model_name_or_model_dir}/funasr/am.mvn.dim80_l2r2
init_param="${model_name_or_model_dir}/funasr/basetrain_fsmn_4e_l10r2_250_128_fdim80_t2599.pt"
# data prepare
# data dir, which contains: train.jsonl, val.jsonl
data_dir=../../data
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Generate audio json list"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/train_wav.scp''', '''${data_dir}/train_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/val_wav.scp''', '''${data_dir}/val_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
fi
# exp output dir
output_dir="${workspace}/exp/finetune_outputs"
# Training Stage
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: KWS Training"
mkdir -p ${output_dir}
current_time=$(date "+%Y-%m-%d_%H-%M")
log_file="${output_dir}/train.log.txt.${current_time}"
echo "log_file: ${log_file}"
echo "finetune use basetrain model: ${init_param}"
export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py \
--config-path "${workspace}/conf" \
--config-name "${config}" \
++init_param="${init_param}" \
++token_lists='['''${token_list}''', '''${token_list2}''']' \
++seg_dicts='['''${lexicon_list}''', '''${lexicon_list}''']' \
++disable_update=true \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++output_dir="${output_dir}" &> ${log_file}
fi
# Testing Stage
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Inference"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
if [ ${inference_device} == "cuda" ]; then
nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
else
inference_batch_size=1
CUDA_VISIBLE_DEVICES=""
for JOB in $(seq ${nj}); do
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
done
fi
for dset in ${test_sets}; do
inference_dir="${output_dir}/inference-${inference_checkpoint}/${dset}"
_logdir="${inference_dir}/logdir"
echo "inference_dir: ${inference_dir}"
mkdir -p "${_logdir}"
test_data_dir="${data_dir}/${dset}"
key_file=${test_data_dir}/${inference_scp}
split_scps=
for JOB in $(seq "${nj}"); do
split_scps+=" ${_logdir}/keys.${JOB}.scp"
done
$FUNASR_DIR/examples/aishell/paraformer/utils/split_scp.pl "${key_file}" ${split_scps}
gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
echo "${output_dir}"
export CUDA_VISIBLE_DEVICES=${gpuid}
python ../../../funasr/bin/inference.py \
--config-path="${output_dir}" \
--config-name="config.yaml" \
++init_param="${output_dir}/${inference_checkpoint}" \
++token_lists='['''${token_list}''', '''${token_list2}''']' \
++seg_dicts='['''${lexicon_list}''', '''${lexicon_list}''']' \
++frontend_conf.cmvn_file="${cmvn_file}" \
++keywords="\"$keywords_string"\" \
++input="${_logdir}/keys.${JOB}.scp" \
++output_dir="${inference_dir}/${JOB}" \
++device="${inference_device}" \
++ncpu=1 \
++disable_log=true \
++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
}&
done
wait
for f in detect detect2; do
if [ -f "${inference_dir}/${JOB}/${f}" ]; then
for JOB in $(seq "${nj}"); do
cat "${inference_dir}/${JOB}/${f}"
done | sort -k1 >"${inference_dir}/${f}"
fi
done
mkdir -p ${inference_dir}/task1
python funasr/utils/compute_det_ctc.py \
--keywords ${keywords_string} \
--test_data ${test_data_dir}/wav.scp \
--trans_data ${test_data_dir}/text \
--score_file ${inference_dir}/detect \
--stats_dir ${inference_dir}/task1
mkdir -p ${inference_dir}/task2
python funasr/utils/compute_det_ctc.py \
--keywords ${keywords_string} \
--test_data ${test_data_dir}/wav.scp \
--trans_data ${test_data_dir}/text \
--score_file ${inference_dir}/detect2 \
--stats_dir ${inference_dir}/task2
done
fi

View File

@@ -0,0 +1 @@
../../../funasr

View File

@@ -0,0 +1,20 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
model="iic/speech_charctc_kws_phone-xiaoyun_mt"
# for more input types, please refer to the README.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python funasr/bin/inference.py \
+model=${model} \
+input=${input} \
+output_dir="./outputs/debug" \
+device="cpu" \
++keywords="\"$keywords_string"\"

View File

@@ -0,0 +1,42 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method2, inference from local model
# for more input types, please refer to the README.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
output_dir="./outputs/debug"
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_charctc_kws_phone-xiaoyun_mt
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun_mt.git ${local_path}
device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
config="inference_fsmn_4e_l10r2_280_200_fdim40_t2602_t4.yaml"
tokens="${local_path}/funasr/tokens_2602.txt"
tokens2="${local_path}/funasr/tokens_xiaoyun.txt"
seg_dict="${local_path}/funasr/lexicon.txt"
init_param="${local_path}/funasr/finetune_fsmn_4e_l10r2_280_200_fdim40_t2602_t4_xiaoyun_xiaoyun.pt"
cmvn_file="${local_path}/funasr/am.mvn.dim40_l4r4"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python -m funasr.bin.inference \
--config-path "${local_path}/funasr" \
--config-name "${config}" \
++init_param="${init_param}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++token_lists='['''${tokens}''', '''${tokens2}''']' \
++seg_dicts='['''${seg_dict}''', '''${seg_dict}''']' \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}" \
++keywords="\"$keywords_string"\"

View File

@@ -0,0 +1,5 @@
export FUNASR_DIR=$PWD/../../..
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
wav_file = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav"
model = AutoModel(model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch")
res = model.generate(input=wav_file)
print(res)
# [[beg1, end1], [beg2, end2], .., [begN, endN]]
# beg/end: ms
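# a minimal sketch (not part of the original example): cut the detected segments
# out of the waveform using the millisecond timestamps returned in res[0]["value"]
# import soundfile
# speech, sample_rate = soundfile.read("vad_example.wav")
# for beg_ms, end_ms in res[0]["value"]:
#     segment = speech[int(beg_ms / 1000 * sample_rate) : int(end_ms / 1000 * sample_rate)]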
import soundfile
import os
wav_file = os.path.join(model.model_path, "example/vad_example.wav")
speech, sample_rate = soundfile.read(wav_file)
chunk_size = 200 # ms
chunk_stride = int(chunk_size * sample_rate / 1000)
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i * chunk_stride : (i + 1) * chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(
input=speech_chunk,
cache=cache,
is_final=is_final,
chunk_size=chunk_size,
disable_pbar=True,
)
# print(res)
if len(res[0]["value"]):
print(res)
# 1. [[beg1, end1], [beg2, end2], .., [begN, endN]]; [[beg, end]]; [[beg1, end1], [beg2, end2]]
# 2. [[beg, -1]]
# 3. [[-1, end]]
# beg/end: ms
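# a minimal sketch (assumption): close the open-ended segments by collecting the
# values emitted across chunks
# segments, pending_beg = [], None
# for beg, end in res[0]["value"]:
#     if beg != -1 and end != -1:   # complete [beg, end] segment
#         segments.append([beg, end])
#     elif end == -1:               # segment opened; its end arrives in a later chunk
#         pending_beg = beg
#     else:                         # beg == -1 closes the pending segment
#         segments.append([pending_beg, end])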

View File

@@ -0,0 +1,10 @@
model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
python funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \

View File

@@ -0,0 +1,25 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
from funasr import AutoModel
model = AutoModel(model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch")
res = model.export(type="onnx", quantize=False)
print(res)
# method2, export from local path
from funasr import AutoModel
model = AutoModel(
model="/Users/zhifu/.cache/modelscope/hub/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
)
res = model.export(type="onnx", quantize=False)
print(res)

View File

@@ -0,0 +1,24 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, export from model hub
export HYDRA_FULL_ERROR=1
model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false
# method2, export from local path
model="/Users/zhifu/.cache/modelscope/hub/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false

View File

@@ -0,0 +1,735 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from enum import Enum
import re, sys, unicodedata
import codecs
import argparse
from tqdm import tqdm
import os
import pdb
remove_tag = False
spacelist = [" ", "\t", "\r", "\n"]
puncts = [
    "!",
    ",",
    "?",
    "、",
    "。",
    "！",
    "，",
    "；",
    "？",
    "：",
    "「",
    "」",
    "︰",
    "『",
    "』",
    "《",
    "》",
]
class Code(Enum):
match = 1
substitution = 2
insertion = 3
deletion = 4
class WordError(object):
def __init__(self):
self.errors = {
Code.substitution: 0,
Code.insertion: 0,
Code.deletion: 0,
}
self.ref_words = 0
def get_wer(self):
assert self.ref_words != 0
errors = (
self.errors[Code.substitution]
+ self.errors[Code.insertion]
+ self.errors[Code.deletion]
)
return 100.0 * errors / self.ref_words
def get_result_string(self):
return (
f"error_rate={self.get_wer():.4f}, "
f"ref_words={self.ref_words}, "
f"subs={self.errors[Code.substitution]}, "
f"ins={self.errors[Code.insertion]}, "
f"dels={self.errors[Code.deletion]}"
)
def characterize(string):
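    """Split a string into scoring tokens: punctuation and spaces are dropped,
    letter-other characters (e.g. CJK) become single-character tokens, <tag>-style
    markers are kept whole, and other ASCII runs (e.g. Latin words) are grouped."""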
res = []
i = 0
while i < len(string):
char = string[i]
if char in puncts:
i += 1
continue
cat1 = unicodedata.category(char)
# https://unicodebook.readthedocs.io/unicode.html#unicode-categories
if cat1 == "Zs" or cat1 == "Cn" or char in spacelist: # space or not assigned
i += 1
continue
if cat1 == "Lo": # letter-other
res.append(char)
i += 1
else:
# some input looks like: <unk><noise>, we want to separate it to two words.
sep = " "
if char == "<":
sep = ">"
j = i + 1
while j < len(string):
c = string[j]
if ord(c) >= 128 or (c in spacelist) or (c == sep):
break
j += 1
if j < len(string) and string[j] == ">":
j += 1
res.append(string[i:j])
i = j
return res
def stripoff_tags(x):
if not x:
return ""
chars = []
i = 0
T = len(x)
while i < T:
if x[i] == "<":
while i < T and x[i] != ">":
i += 1
i += 1
else:
chars.append(x[i])
i += 1
return "".join(chars)
def normalize(sentence, ignore_words, cs, split=None):
"""sentence, ignore_words are both in unicode"""
new_sentence = []
for token in sentence:
x = token
if not cs:
x = x.upper()
if x in ignore_words:
continue
if remove_tag:
x = stripoff_tags(x)
if not x:
continue
if split and x in split:
new_sentence += split[x]
else:
new_sentence.append(x)
return new_sentence
class Calculator:
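    """Levenshtein aligner: fills a DP table of edit distances over (lab, rec),
    backtracks to label each position as cor/sub/ins/del, and accumulates
    per-token error statistics in self.data."""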
def __init__(self):
self.data = {}
self.space = []
self.cost = {}
self.cost["cor"] = 0
self.cost["sub"] = 1
self.cost["del"] = 1
self.cost["ins"] = 1
def calculate(self, lab, rec):
# Initialization
lab.insert(0, "")
rec.insert(0, "")
while len(self.space) < len(lab):
self.space.append([])
for row in self.space:
for element in row:
element["dist"] = 0
element["error"] = "non"
while len(row) < len(rec):
row.append({"dist": 0, "error": "non"})
for i in range(len(lab)):
self.space[i][0]["dist"] = i
self.space[i][0]["error"] = "del"
for j in range(len(rec)):
self.space[0][j]["dist"] = j
self.space[0][j]["error"] = "ins"
self.space[0][0]["error"] = "non"
for token in lab:
if token not in self.data and len(token) > 0:
self.data[token] = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0}
for token in rec:
if token not in self.data and len(token) > 0:
self.data[token] = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0}
# Computing edit distance
for i, lab_token in enumerate(lab):
for j, rec_token in enumerate(rec):
if i == 0 or j == 0:
continue
min_dist = sys.maxsize
min_error = "none"
dist = self.space[i - 1][j]["dist"] + self.cost["del"]
error = "del"
if dist < min_dist:
min_dist = dist
min_error = error
dist = self.space[i][j - 1]["dist"] + self.cost["ins"]
error = "ins"
if dist < min_dist:
min_dist = dist
min_error = error
if lab_token == rec_token.replace("<BIAS>", ""):
dist = self.space[i - 1][j - 1]["dist"] + self.cost["cor"]
error = "cor"
else:
dist = self.space[i - 1][j - 1]["dist"] + self.cost["sub"]
error = "sub"
if dist < min_dist:
min_dist = dist
min_error = error
self.space[i][j]["dist"] = min_dist
self.space[i][j]["error"] = min_error
# Tracing back
result = {
"lab": [],
"rec": [],
"code": [],
"all": 0,
"cor": 0,
"sub": 0,
"ins": 0,
"del": 0,
}
i = len(lab) - 1
j = len(rec) - 1
while True:
if self.space[i][j]["error"] == "cor": # correct
if len(lab[i]) > 0:
self.data[lab[i]]["all"] = self.data[lab[i]]["all"] + 1
self.data[lab[i]]["cor"] = self.data[lab[i]]["cor"] + 1
result["all"] = result["all"] + 1
result["cor"] = result["cor"] + 1
result["lab"].insert(0, lab[i])
result["rec"].insert(0, rec[j])
result["code"].insert(0, Code.match)
i = i - 1
j = j - 1
elif self.space[i][j]["error"] == "sub": # substitution
if len(lab[i]) > 0:
self.data[lab[i]]["all"] = self.data[lab[i]]["all"] + 1
self.data[lab[i]]["sub"] = self.data[lab[i]]["sub"] + 1
result["all"] = result["all"] + 1
result["sub"] = result["sub"] + 1
result["lab"].insert(0, lab[i])
result["rec"].insert(0, rec[j])
result["code"].insert(0, Code.substitution)
i = i - 1
j = j - 1
elif self.space[i][j]["error"] == "del": # deletion
if len(lab[i]) > 0:
self.data[lab[i]]["all"] = self.data[lab[i]]["all"] + 1
self.data[lab[i]]["del"] = self.data[lab[i]]["del"] + 1
result["all"] = result["all"] + 1
result["del"] = result["del"] + 1
result["lab"].insert(0, lab[i])
result["rec"].insert(0, "")
result["code"].insert(0, Code.deletion)
i = i - 1
elif self.space[i][j]["error"] == "ins": # insertion
if len(rec[j]) > 0:
self.data[rec[j]]["ins"] = self.data[rec[j]]["ins"] + 1
result["ins"] = result["ins"] + 1
result["lab"].insert(0, "")
result["rec"].insert(0, rec[j])
result["code"].insert(0, Code.insertion)
j = j - 1
elif self.space[i][j]["error"] == "non": # starting point
break
else: # shouldn't reach here
print(
"this should not happen , i = {i} , j = {j} , error = {error}".format(
i=i, j=j, error=self.space[i][j]["error"]
)
)
return result
def overall(self):
result = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0}
for token in self.data:
result["all"] = result["all"] + self.data[token]["all"]
result["cor"] = result["cor"] + self.data[token]["cor"]
result["sub"] = result["sub"] + self.data[token]["sub"]
result["ins"] = result["ins"] + self.data[token]["ins"]
result["del"] = result["del"] + self.data[token]["del"]
return result
def cluster(self, data):
result = {"all": 0, "cor": 0, "sub": 0, "ins": 0, "del": 0}
for token in data:
if token in self.data:
result["all"] = result["all"] + self.data[token]["all"]
result["cor"] = result["cor"] + self.data[token]["cor"]
result["sub"] = result["sub"] + self.data[token]["sub"]
result["ins"] = result["ins"] + self.data[token]["ins"]
result["del"] = result["del"] + self.data[token]["del"]
return result
def keys(self):
return list(self.data.keys())
def width(string):
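# display width of a string: East Asian wide/fullwidth characters count as two columns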
return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
def default_cluster(word):
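# cluster a token by the Unicode names of its characters: digits -> "Number",
# CJK ideographs -> "Mandarin", Latin letters -> "English", hiragana -> "Japanese";
# a few symbols are ignored, and any mixture of clusters falls back to "Other"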
unicode_names = [unicodedata.name(char) for char in word]
for i in reversed(range(len(unicode_names))):
if unicode_names[i].startswith("DIGIT"): # 1
unicode_names[i] = "Number" # 'DIGIT'
elif unicode_names[i].startswith("CJK UNIFIED IDEOGRAPH") or unicode_names[i].startswith(
"CJK COMPATIBILITY IDEOGRAPH"
):
# 明 / 郎
unicode_names[i] = "Mandarin" # 'CJK IDEOGRAPH'
elif unicode_names[i].startswith("LATIN CAPITAL LETTER") or unicode_names[i].startswith(
"LATIN SMALL LETTER"
):
# A / a
unicode_names[i] = "English" # 'LATIN LETTER'
elif unicode_names[i].startswith("HIRAGANA LETTER"): # は こ め
unicode_names[i] = "Japanese" # 'GANA LETTER'
elif (
unicode_names[i].startswith("AMPERSAND")
or unicode_names[i].startswith("APOSTROPHE")
or unicode_names[i].startswith("COMMERCIAL AT")
or unicode_names[i].startswith("DEGREE CELSIUS")
or unicode_names[i].startswith("EQUALS SIGN")
or unicode_names[i].startswith("FULL STOP")
or unicode_names[i].startswith("HYPHEN-MINUS")
or unicode_names[i].startswith("LOW LINE")
or unicode_names[i].startswith("NUMBER SIGN")
or unicode_names[i].startswith("PLUS SIGN")
or unicode_names[i].startswith("SEMICOLON")
):
# & / ' / @ / ℃ / = / . / - / _ / # / + / ;
del unicode_names[i]
else:
return "Other"
if len(unicode_names) == 0:
return "Other"
if len(unicode_names) == 1:
return unicode_names[0]
for i in range(len(unicode_names) - 1):
if unicode_names[i] != unicode_names[i + 1]:
return "Other"
return unicode_names[0]
def get_args():
parser = argparse.ArgumentParser(description="compute detailed WER, U/B-WER and hotword recall")
parser.add_argument("--ref", type=str, help="reference text path")
parser.add_argument("--ref_ocr", type=str, help="OCR/hotword reference text path")
parser.add_argument("--rec_name", type=str, action="append", default=[])
parser.add_argument("--rec_file", type=str, action="append", default=[])
parser.add_argument("--verbose", type=int, default=1, help="verbosity level")
parser.add_argument("--char", type=bool, default=True, help="split text into characters")
args = parser.parse_args()
return args
def main(args):
cluster_file = ""
ignore_words = set()
tochar = args.char
verbose = args.verbose
padding_symbol = " "
case_sensitive = False
max_words_per_line = sys.maxsize
split = None
if not case_sensitive:
ig = set([w.upper() for w in ignore_words])
ignore_words = ig
default_clusters = {}
default_words = {}
ref_file = args.ref
ref_ocr = args.ref_ocr
rec_files = args.rec_file
rec_names = args.rec_name
assert len(rec_files) == len(rec_names)
# load ocr
ref_ocr_dict = {}
with codecs.open(ref_ocr, "r", "utf-8") as fh:
for line in fh:
if "$" in line:
line = line.replace("$", " ")
if tochar:
array = characterize(line)
else:
array = line.strip().split()
if len(array) == 0:
continue
fid = array[0]
ref_ocr_dict[fid] = normalize(array[1:], ignore_words, case_sensitive, split)
if split and not case_sensitive:
newsplit = dict()
for w in split:
words = split[w]
for i in range(len(words)):
words[i] = words[i].upper()
newsplit[w.upper()] = words
split = newsplit
rec_sets = {}
calculators_dict = dict()
ub_wer_dict = dict()
hotwords_related_dict = dict()  # hotword recall statistics (tp/tn/fp/fn)
for i, hyp_file in enumerate(rec_files):
rec_sets[rec_names[i]] = dict()
with codecs.open(hyp_file, "r", "utf-8") as fh:
for line in fh:
if tochar:
array = characterize(line)
else:
array = line.strip().split()
if len(array) == 0:
continue
fid = array[0]
rec_sets[rec_names[i]][fid] = normalize(
array[1:], ignore_words, case_sensitive, split
)
calculators_dict[rec_names[i]] = Calculator()
ub_wer_dict[rec_names[i]] = {"u_wer": WordError(), "b_wer": WordError(), "wer": WordError()}
hotwords_related_dict[rec_names[i]] = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}
# tp: the hotword appears in the label and in the rec
# tn: the hotword appears in neither the label nor the rec
# fp: the hotword is absent from the label but appears in the rec
# fn: the hotword appears in the label but is missing from the rec
# record wrong label but in ocr
wrong_rec_but_in_ocr_dict = {}
for rec_name in rec_names:
wrong_rec_but_in_ocr_dict[rec_name] = 0
_file_total_len = 0
with os.popen("cat {} | wc -l".format(ref_file)) as pipe:
_file_total_len = int(pipe.read().strip())
# compute error rate on the interaction of reference file and hyp file
for line in tqdm(open(ref_file, "r", encoding="utf-8"), total=_file_total_len):
if tochar:
array = characterize(line)
else:
array = line.rstrip("\n").split()
if len(array) == 0:
continue
fid = array[0]
lab = normalize(array[1:], ignore_words, case_sensitive, split)
if verbose:
print("\nutt: %s" % fid)
ocr_text = ref_ocr_dict[fid]
ocr_set = set(ocr_text)
print("ocr: {}".format(" ".join(ocr_text)))
list_match = []  # label tokens that also appear in the OCR text
list_not_match = []
tmp_error = 0
tmp_match = 0
for index in range(len(lab)):
# text_list.append(uttlist[index+1])
if lab[index] not in ocr_set:
tmp_error += 1
list_not_match.append(lab[index])
else:
tmp_match += 1
list_match.append(lab[index])
print("label in ocr: {}".format(" ".join(list_match)))
# for each reco file
base_wrong_ocr_wer = None
ocr_wrong_ocr_wer = None
for rec_name in rec_names:
rec_set = rec_sets[rec_name]
if fid not in rec_set:
continue
rec = rec_set[fid]
# print(rec)
for word in rec + lab:
if word not in default_words:
default_cluster_name = default_cluster(word)
if default_cluster_name not in default_clusters:
default_clusters[default_cluster_name] = {}
if word not in default_clusters[default_cluster_name]:
default_clusters[default_cluster_name][word] = 1
default_words[word] = default_cluster_name
result = calculators_dict[rec_name].calculate(lab.copy(), rec.copy())
if verbose:
if result["all"] != 0:
wer = (
float(result["ins"] + result["sub"] + result["del"]) * 100.0 / result["all"]
)
else:
wer = 0.0
print("WER(%s): %4.2f %%" % (rec_name, wer), end=" ")
print(
"N=%d C=%d S=%d D=%d I=%d"
% (result["all"], result["cor"], result["sub"], result["del"], result["ins"])
)
# print(result['rec'])
wrong_rec_but_in_ocr = []
for idx in range(len(result["lab"])):
if result["lab"][idx] != "":
if result["lab"][idx] != result["rec"][idx].replace("<BIAS>", ""):
if result["lab"][idx] in list_match:
wrong_rec_but_in_ocr.append(result["lab"][idx])
wrong_rec_but_in_ocr_dict[rec_name] += 1
print("wrong_rec_but_in_ocr: {}".format(" ".join(wrong_rec_but_in_ocr)))
if rec_name == "base":
base_wrong_ocr_wer = len(wrong_rec_but_in_ocr)
if "ocr" in rec_name or "hot" in rec_name:
ocr_wrong_ocr_wer = len(wrong_rec_but_in_ocr)
if base_wrong_ocr_wer is not None and ocr_wrong_ocr_wer < base_wrong_ocr_wer:
print(
"{} {} helps, {} -> {}".format(
fid, rec_name, base_wrong_ocr_wer, ocr_wrong_ocr_wer
)
)
elif base_wrong_ocr_wer is not None and ocr_wrong_ocr_wer > base_wrong_ocr_wer:
print(
"{} {} hurts, {} -> {}".format(
fid, rec_name, base_wrong_ocr_wer, ocr_wrong_ocr_wer
)
)
# recall = 0
# false_alarm = 0
# for idx in range(len(result['lab'])):
# if "<BIAS>" in result['rec'][idx]:
# if result['rec'][idx].replace("<BIAS>", "") in list_match:
# recall += 1
# else:
# false_alarm += 1
# print("bias hotwords recall: {}, fa: {}, list_match {}, recall: {:.2f}, fa: {:.2f}".format(
# recall, false_alarm, len(list_match), recall / len(list_match) if len(list_match) != 0 else 0, false_alarm / len(list_match) if len(list_match) != 0 else 0
# ))
# tp: the hotword appears in the label and in the rec
# tn: the hotword appears in neither the label nor the rec
# fp: the hotword is absent from the label but appears in the rec
# fn: the hotword appears in the label but is missing from the rec
_rec_list = [word.replace("<BIAS>", "") for word in rec]
_label_list = [word for word in lab]
_tp = _tn = _fp = _fn = 0
hot_true_list = [hotword for hotword in ocr_text if hotword in _label_list]
hot_bad_list = [hotword for hotword in ocr_text if hotword not in _label_list]
for badhotword in hot_bad_list:
count = len([word for word in _rec_list if word == badhotword])
# print(f"bad {badhotword} count: {count}")
# for word in _rec_list:
# if badhotword == word:
# count += 1
if count == 0:
hotwords_related_dict[rec_name]["tn"] += 1
_tn += 1
# fp: 0
else:
hotwords_related_dict[rec_name]["fp"] += count
_fp += count
# tn: 0
# if badhotword in _rec_list:
# hotwords_related_dict[rec_name]['fp'] += 1
# else:
# hotwords_related_dict[rec_name]['tn'] += 1
for hotword in hot_true_list:
true_count = len([word for word in _label_list if hotword == word])
rec_count = len([word for word in _rec_list if hotword == word])
# print(f"good {hotword} true_count: {true_count}, rec_count: {rec_count}")
if rec_count == true_count:
hotwords_related_dict[rec_name]["tp"] += true_count
_tp += true_count
elif rec_count > true_count:
hotwords_related_dict[rec_name]["tp"] += true_count
# fp: 不在label里但是在rec里
hotwords_related_dict[rec_name]["fp"] += rec_count - true_count
_tp += true_count
_fp += rec_count - true_count
else:
hotwords_related_dict[rec_name]["tp"] += rec_count
# fn: 热词在label里但是不在rec里
hotwords_related_dict[rec_name]["fn"] += true_count - rec_count
_tp += rec_count
_fn += true_count - rec_count
print(
"hotword: tp: {}, tn: {}, fp: {}, fn: {}, all: {}, recall: {:.2f}%".format(
_tp,
_tn,
_fp,
_fn,
sum([_tp, _tn, _fp, _fn]),
_tp / (_tp + _fn) * 100 if (_tp + _fn) != 0 else 0,
)
)
# if hotword in _rec_list:
# hotwords_related_dict[rec_name]['tp'] += 1
# else:
# hotwords_related_dict[rec_name]['fn'] += 1
# accumulate U-WER, B-WER, and overall WER
for code, rec_word, lab_word in zip(result["code"], result["rec"], result["lab"]):
if code == Code.match:
ub_wer_dict[rec_name]["wer"].ref_words += 1
if lab_word in hot_true_list:
# tmp_ref.append(ref_tokens[ref_idx])
ub_wer_dict[rec_name]["b_wer"].ref_words += 1
else:
ub_wer_dict[rec_name]["u_wer"].ref_words += 1
elif code == Code.substitution:
ub_wer_dict[rec_name]["wer"].ref_words += 1
ub_wer_dict[rec_name]["wer"].errors[Code.substitution] += 1
if lab_word in hot_true_list:
# tmp_ref.append(ref_tokens[ref_idx])
ub_wer_dict[rec_name]["b_wer"].ref_words += 1
ub_wer_dict[rec_name]["b_wer"].errors[Code.substitution] += 1
else:
ub_wer_dict[rec_name]["u_wer"].ref_words += 1
ub_wer_dict[rec_name]["u_wer"].errors[Code.substitution] += 1
elif code == Code.deletion:
ub_wer_dict[rec_name]["wer"].ref_words += 1
ub_wer_dict[rec_name]["wer"].errors[Code.deletion] += 1
if lab_word in hot_true_list:
# tmp_ref.append(ref_tokens[ref_idx])
ub_wer_dict[rec_name]["b_wer"].ref_words += 1
ub_wer_dict[rec_name]["b_wer"].errors[Code.deletion] += 1
else:
ub_wer_dict[rec_name]["u_wer"].ref_words += 1
ub_wer_dict[rec_name]["u_wer"].errors[Code.deletion] += 1
elif code == Code.insertion:
ub_wer_dict[rec_name]["wer"].errors[Code.insertion] += 1
if rec_word in hot_true_list:
ub_wer_dict[rec_name]["b_wer"].errors[Code.insertion] += 1
else:
ub_wer_dict[rec_name]["u_wer"].errors[Code.insertion] += 1
space = {}
space["lab"] = []
space["rec"] = []
for idx in range(len(result["lab"])):
len_lab = width(result["lab"][idx])
len_rec = width(result["rec"][idx])
length = max(len_lab, len_rec)
space["lab"].append(length - len_lab)
space["rec"].append(length - len_rec)
upper_lab = len(result["lab"])
upper_rec = len(result["rec"])
lab1, rec1 = 0, 0
while lab1 < upper_lab or rec1 < upper_rec:
if verbose > 1:
print("lab(%s):" % fid.encode("utf-8"), end=" ")
else:
print("lab:", end=" ")
lab2 = min(upper_lab, lab1 + max_words_per_line)
for idx in range(lab1, lab2):
token = result["lab"][idx]
print("{token}".format(token=token), end="")
for n in range(space["lab"][idx]):
print(padding_symbol, end="")
print(" ", end="")
print()
if verbose > 1:
print("rec(%s):" % fid.encode("utf-8"), end=" ")
else:
print("rec:", end=" ")
rec2 = min(upper_rec, rec1 + max_words_per_line)
for idx in range(rec1, rec2):
token = result["rec"][idx]
print("{token}".format(token=token), end="")
for n in range(space["rec"][idx]):
print(padding_symbol, end="")
print(" ", end="")
print()
# print('\n', end='\n')
lab1 = lab2
rec1 = rec2
print("\n", end="\n")
# break
if verbose:
print("===========================================================================")
print()
print(wrong_rec_but_in_ocr_dict)
for rec_name in rec_names:
result = calculators_dict[rec_name].overall()
if result["all"] != 0:
wer = float(result["ins"] + result["sub"] + result["del"]) * 100.0 / result["all"]
else:
wer = 0.0
print("{} Overall -> {:4.2f} %".format(rec_name, wer), end=" ")
print(
"N=%d C=%d S=%d D=%d I=%d"
% (result["all"], result["cor"], result["sub"], result["del"], result["ins"])
)
print(f"WER: {ub_wer_dict[rec_name]['wer'].get_result_string()}")
print(f"U-WER: {ub_wer_dict[rec_name]['u_wer'].get_result_string()}")
print(f"B-WER: {ub_wer_dict[rec_name]['b_wer'].get_result_string()}")
print(
"hotword: tp: {}, tn: {}, fp: {}, fn: {}, all: {}, recall: {:.2f}%".format(
hotwords_related_dict[rec_name]["tp"],
hotwords_related_dict[rec_name]["tn"],
hotwords_related_dict[rec_name]["fp"],
hotwords_related_dict[rec_name]["fn"],
sum([v for k, v in hotwords_related_dict[rec_name].items()]),
(
hotwords_related_dict[rec_name]["tp"]
/ (
hotwords_related_dict[rec_name]["tp"]
+ hotwords_related_dict[rec_name]["fn"]
)
* 100
if hotwords_related_dict[rec_name]["tp"] + hotwords_related_dict[rec_name]["fn"]
!= 0
else 0
),
)
)
# tp: the hotword appears in the label and in the rec
# tn: the hotword appears in neither the label nor the rec
# fp: the hotword is absent from the label but appears in the rec
# fn: the hotword appears in the label but is missing from the rec
if not verbose:
print()
print()
if __name__ == "__main__":
args = get_args()
# print("")
print(args)
main(args)

View File

@@ -0,0 +1,19 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/LCB-NET", model_revision="v1.0.0")
res = model.generate(
input=(
"https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav",
"https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt",
),
data_type=("sound", "text"),
)
print(res)

View File

@@ -0,0 +1,72 @@
file_dir="/home/yf352572/.cache/modelscope/hub/iic/LCB-NET/"
CUDA_VISIBLE_DEVICES="0,1"
inference_device="cuda"
if [ ${inference_device} == "cuda" ]; then
nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
else
inference_batch_size=1
nj=${nj:-1}  # nj is only computed on the GPU path; assume a single job on CPU
CUDA_VISIBLE_DEVICES=""
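# no GPU available: give every parallel job the placeholder device id -1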
for JOB in $(seq ${nj}); do
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
done
fi
inference_dir="outputs/slidespeech_dev"
_logdir="${inference_dir}/logdir"
echo "inference_dir: ${inference_dir}"
mkdir -p "${_logdir}"
key_file1=${file_dir}/dev/wav.scp
key_file2=${file_dir}/dev/ocr.txt
split_scps1=
split_scps2=
for JOB in $(seq "${nj}"); do
split_scps1+=" ${_logdir}/wav.${JOB}.scp"
split_scps2+=" ${_logdir}/ocr.${JOB}.txt"
done
utils/split_scp.pl "${key_file1}" ${split_scps1}
utils/split_scp.pl "${key_file2}" ${split_scps2}
gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
export CUDA_VISIBLE_DEVICES=${gpuid}
python -m funasr.bin.inference \
--config-path=${file_dir} \
--config-name="config.yaml" \
++init_param=${file_dir}/model.pt \
++tokenizer_conf.token_list=${file_dir}/tokens.txt \
++input=[${_logdir}/wav.${JOB}.scp,${_logdir}/ocr.${JOB}.txt] \
+data_type='["kaldi_ark", "text"]' \
++tokenizer_conf.bpemodel=${file_dir}/bpe.pt \
++normalize_conf.stats_file=${file_dir}/am.mvn \
++output_dir="${inference_dir}/${JOB}" \
++device="${inference_device}" \
++ncpu=1 \
++disable_log=true &> ${_logdir}/log.${JOB}.txt
}&
done
wait
mkdir -p ${inference_dir}/1best_recog
for JOB in $(seq "${nj}"); do
cat "${inference_dir}/${JOB}/1best_recog/token" >> "${inference_dir}/1best_recog/token"
done
echo "Computing WER ..."
sed -e 's/ /\t/' -e 's/ //g' -e 's/▁/ /g' -e 's/\t /\t/' ${inference_dir}/1best_recog/token > ${inference_dir}/1best_recog/token.proc
cp ${file_dir}/dev/text ${inference_dir}/1best_recog/token.ref
cp ${file_dir}/dev/ocr.list ${inference_dir}/1best_recog/ocr.list
python utils/compute_wer.py ${inference_dir}/1best_recog/token.ref ${inference_dir}/1best_recog/token.proc ${inference_dir}/1best_recog/token.cer
tail -n 3 ${inference_dir}/1best_recog/token.cer
./run_bwer_recall.sh ${inference_dir}/1best_recog/
tail -n 6 ${inference_dir}/1best_recog/BWER-UWER.results |head -n 5

View File

@@ -0,0 +1,11 @@
#now_result_name=asr_conformer_acc1_lr002_warm20000/decode_asr_asr_model_valid.acc.ave
#hotword_type=ocr_1ngram_top10_hotwords_list
hot_exp_suf=$1
python compute_wer_details.py --v 1 \
--ref ${hot_exp_suf}/token.ref \
--ref_ocr ${hot_exp_suf}/ocr.list \
--rec_name base \
--rec_file ${hot_exp_suf}/token.proc \
> ${hot_exp_suf}/BWER-UWER.results

View File

@@ -0,0 +1 @@
../../aishell/paraformer/utils

View File

@@ -0,0 +1,139 @@
# coding=utf-8
import librosa
import base64
import io
import gradio as gr
import re
import numpy as np
import torch
import torchaudio
# from modelscope import HubApi
#
# api = HubApi()
#
# api.login('')
from funasr import AutoModel
# model = "/Users/zhifu/Downloads/modelscope_models/SenseVoiceCTC"
# model = "iic/SenseVoiceCTC"
# model = AutoModel(model=model,
# vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
# vad_kwargs={"max_single_segment_time": 30000},
# trust_remote_code=True,
# )
import re
import os
import sys
if len(sys.argv) > 1:
ckpt_dir = sys.argv[1]
ckpt_id = sys.argv[2]
jsonl = sys.argv[3]
output_dir = sys.argv[4]
device = sys.argv[5]
new_sys = False
if len(sys.argv) > 6:
new_sys = True
else:
ckpt_dir = "/nfs/beinian.lzr/workspace/GPT-4o/Exp/exp7/5m-8gpu/exp5-1-0619"
ckpt_id = "model.pt.ep6"
jsonl = (
"/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData/s2tchat.v20240619.test.jsonl"
)
dataset = jsonl.split("/")[-1]
output_dir = os.path.join(ckpt_dir, f"inference-{ckpt_id}", dataset)
device = "cuda:0"
model = AutoModel(
model=ckpt_dir,
init_param=f"{os.path.join(ckpt_dir, ckpt_id)}",
output_dir=output_dir,
device=device,
fp16=False,
bf16=False,
llm_dtype="bf16",
)
def model_inference(input_wav, text_inputs, fs=16000):
if isinstance(input_wav, tuple):
fs, input_wav = input_wav
input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
if len(input_wav.shape) > 1:
input_wav = input_wav.mean(-1)
if fs != 16000:
print(f"audio_fs: {fs}")
resampler = torchaudio.transforms.Resample(fs, 16000)
input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
input_wav = resampler(input_wav_t[None, :])[0, :].numpy().astype("float32")
input_wav_byte = input_wav.tobytes()
contents_i = []
system_prompt = text_inputs
user_prompt = f"<|startofspeech|>!!{input_wav_byte}<|endofspeech|>"
contents_i.append({"role": "system", "content": system_prompt})
contents_i.append({"role": "user", "content": user_prompt})
contents_i.append({"role": "assistant", "content": "target_out"})
res = model.generate(
input=[contents_i],
tearchforing=False,  # no teacher forcing at inference time
cache={},
key="demo",  # an arbitrary request key for this demo
)
print(res)
return res
audio_examples = [
[
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav",
"You are a helpful assistant.",
],
]
description = """
Upload an audio file or record through the microphone, then type the System Prompt.
"""
def launch():
with gr.Blocks() as demo:
gr.Markdown(description)
with gr.Row():
with gr.Column():
audio_inputs = gr.Audio(label="Upload audio or use the microphone")
text_inputs = gr.Text(label="System Prompt", value="You are a helpful assistant.")
# with gr.Accordion("Configuration"):
# # task_inputs = gr.Radio(choices=["Speech Recognition", "Rich Text Transcription"],
# # value="Speech Recognition", label="Task")
# language_inputs = gr.Dropdown(choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
# value="auto",
# label="Language")
gr.Examples(examples=audio_examples, inputs=[audio_inputs, text_inputs])
fn_button = gr.Button("Start")
text_outputs = gr.HTML(label="Results")
fn_button.click(model_inference, inputs=[audio_inputs, text_inputs], outputs=text_outputs)
# with gr.Accordion("More examples"):
# gr.HTML(centered_table_html)
demo.launch()
if __name__ == "__main__":
# iface.launch()
launch()

View File

@@ -0,0 +1,89 @@
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()
# network architecture
model: LLMASR
model_conf:
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: true
# encoder
encoder: WhisperWarp
encoder_conf:
hub: funasr
init_param_path: "/nfs/maziyang.mzy/models/Whisper-large-v2"
freeze: true
llm: Vicuna
llm_conf:
hub: hf
init_param_path: "/nfs/maziyang.mzy/models/vicuna-7b-v1.5"
freeze: true
adaptor: Linear
adaptor_conf:
downsample_rate: 5
llm_dim: 4096
encoder_dim: 512
# frontend related
frontend: WhisperFrontend
frontend_conf:
fs: 16000
whisper_model: large
do_pad_trim: true
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 150
keep_nbest_models: 10
log_interval: 10
optim: adamw
optim_conf:
lr: 0.0001
weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
warmup_steps: 1500
dataset: AudioLLMDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: BatchSampler
batch_type: example # example or length
batch_size: 8 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,
buffer_size: 500
shuffle: True
num_workers: 4
preprocessor_text: TextPreprocessRemovePunctuation
tokenizer: HuggingfaceTokenizer
tokenizer_conf:
unk_symbol: <unk>
init_param_path: "/nfs/maziyang.mzy/models/vicuna-7b-v1.5"

View File

@@ -0,0 +1,94 @@
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()
# network architecture
model: LLMASR
model_conf:
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: true
# encoder
audio_encoder: "/nfs/zhifu.gzf/init_model/Whisper-large-v3" #iic/Whisper-large-v3
audio_encoder_conf:
hub: ms
freeze: true
llm: Qwen1.5-7b-chat
llm_conf:
hub: hf
freeze: true
init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat"
audio_adaptor: Linear
audio_adaptor_conf:
downsample_rate: 5
llm_dim: 4096
encoder_dim: 512
# frontend related
frontend: WhisperFrontend
frontend_conf:
fs: 16000
whisper_model: large-v3
do_pad_trim: true
permute: true # true: [bs, frames, dims]; false: [bs, dims, frames]
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 15
keep_nbest_models: 10
log_interval: 10
optim: adamw
optim_conf:
lr: 0.0001
weight_decay: 0.000000
scheduler: warmuplr
scheduler_conf:
warmup_steps: 1500
dataset: AudioLLMQwenAudioDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: CustomDistributedBatchSampler
batch_type: example # example or length
batch_size: 4 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
max_token_length: 3000 # filter samples if source_token_len+target_token_len > max_token_length,
shuffle: True
num_workers: 4
preprocessor_text: TextPreprocessRemovePunctuation
audio_adaptor_downsample_rate: ${audio_adaptor_conf.downsample_rate}
audio_encoder_downsample_rate: 2
# prompt: "<|startoftranscription|><|zh|><|transcribe|><|zh|><|notimestamps|><|wo_itn|>"
tokenizer: HuggingfaceTokenizer
tokenizer_conf:
unk_symbol: <unk>
init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat"

View File

@@ -0,0 +1,81 @@
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()
# network architecture
model: LLMASR2
model_conf:
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: true
# encoder
audio_encoder: "/nfs/zhifu.gzf/init_model/SenseVoiceModelscope"
audio_encoder_conf:
hub: ms
freeze: true
llm: Qwen1.5-7b-chat
llm_conf:
hub: hf
freeze: true
init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat_raw"
audio_adaptor: Transformer
audio_adaptor_conf:
downsample_rate: 2
llm_dim: 4096
encoder_dim: 1280
n_layer: 0
# frontend related
frontend: WhisperFrontend
frontend_conf:
fs: 16000
whisper_model: large-v3
do_pad_trim: false
permute: false # true: [bs, frames, dims]; false: [bs, dims, frames]
filters_path: "/nfs/zhifu.gzf/init_model/SenseVoiceModelscope/assets/mel_filters.npz"
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 15
keep_nbest_models: 10
log_interval: 10
optim: adamw
optim_conf:
lr: 0.0001
weight_decay: 0.000000
scheduler: warmuplr
scheduler_conf:
warmup_steps: 1500
dataset: OpenAIDataset
dataset_conf:
index_ds: OpenAIIndexDSJsonl
batch_sampler: BatchSampler
batch_type: token
batch_size: 900
max_token_length: 1024
shuffle: true
sort_size: 1024
batch_size_scale_ratio_max: 2
num_workers: 4
audio_adaptor_downsample_rate: ${audio_adaptor_conf.downsample_rate}
audio_encoder_downsample_rate: 4
data_split_num: 512
batch_size_sample_max: 15
retry: 20
tokenizer: HuggingfaceTokenizer
tokenizer_conf:
init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat_raw"

View File

@@ -0,0 +1,81 @@
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()
# network architecture
model: LLMASR2
model_conf:
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: true
# encoder
audio_encoder: "/nfs/zhifu.gzf/init_model/SenseVoiceModelscope"
audio_encoder_conf:
hub: ms
freeze: true
llm: Qwen1.5-7b-chat
llm_conf:
hub: hf
freeze: true
init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat_raw"
audio_adaptor: Transformer
audio_adaptor_conf:
downsample_rate: 2
llm_dim: 4096
encoder_dim: 1280
n_layer: 2
# frontend related
frontend: WhisperFrontend
frontend_conf:
fs: 16000
whisper_model: large-v3
do_pad_trim: false
permute: false # true: [bs, frames, dims]; false: [bs, dims, frames]
filters_path: "/nfs/zhifu.gzf/init_model/SenseVoiceModelscope/assets/mel_filters.npz"
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 15
keep_nbest_models: 10
log_interval: 10
optim: adamw
optim_conf:
lr: 0.0001
weight_decay: 0.000000
scheduler: warmuplr
scheduler_conf:
warmup_steps: 1500
dataset: OpenAIDataset
dataset_conf:
index_ds: OpenAIIndexDSJsonl
batch_sampler: BatchSampler
batch_type: token
batch_size: 900
max_token_length: 1024
shuffle: true
sort_size: 1024
batch_size_scale_ratio_max: 2
num_workers: 4
audio_adaptor_downsample_rate: ${audio_adaptor_conf.downsample_rate}
audio_encoder_downsample_rate: 2
data_split_num: 512
batch_size_sample_max: 15
retry: 20
tokenizer: HuggingfaceTokenizer
tokenizer_conf:
init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat_raw"

View File

@@ -0,0 +1,93 @@
# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.
# to print the register_table:
# from funasr.register import tables
# tables.print()
# network architecture
model: LLMASR
model_conf:
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: true
# encoder
audio_encoder: "/nfs/zhifu.gzf/init_model/Whisper-large-v3" #iic/Whisper-large-v3
audio_encoder_conf:
hub: ms
freeze: true
llm: Vicuna
llm_conf:
hub: hf
init_param_path: "/nfs/maziyang.mzy/models/vicuna-7b-v1.5"
freeze: true
audio_adaptor: Linear
audio_adaptor_conf:
downsample_rate: 5
llm_dim: 4096
encoder_dim: 512
# frontend related
frontend: WhisperFrontend
frontend_conf:
fs: 16000
whisper_model: large-v3
do_pad_trim: true
permute: true # true: [bs, frames, dims]; false: [bs, dims, frames]
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 15
keep_nbest_models: 10
log_interval: 10
optim: adamw
optim_conf:
lr: 0.0001
weight_decay: 0
scheduler: warmuplr
scheduler_conf:
warmup_steps: 1500
dataset: AudioLLMVicunaDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: CustomDistributedBatchSampler
batch_type: example # example or length
batch_size: 4 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
max_token_length: 3000 # filter samples if source_token_len+target_token_len > max_token_length,
shuffle: True
num_workers: 4
# preprocessor_text: TextPreprocessRemovePunctuation
audio_adaptor_downsample_rate: ${audio_adaptor_conf.downsample_rate}
audio_encoder_downsample_rate: 2
tokenizer: HuggingfaceTokenizer
tokenizer_conf:
unk_symbol: <unk>
init_param_path: "/nfs/maziyang.mzy/models/vicuna-7b-v1.5"

View File

@@ -0,0 +1,14 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
python -m funasr.bin.inference \
--config-path="/root/FunASR/examples/aishell/llm_asr_nar/conf" \
--config-name="template.yaml" \
++init_param="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3/model.pt.ep38" \
++input="/nfs/beinian.lzr/workspace/datasets/data/16k/opendata/aishell1/dev/wav/S0724/BAC009S0724W0121.wav" \
++scope_map="encoder.model,audio_encoder,encoder_projector,adaptor" \
++output_dir="./outputs/debug" \
++device="cpu" \

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import json
import os
import sys
from funasr import AutoModel
if len(sys.argv) > 1:
ckpt_dir = sys.argv[1]
ckpt_id = sys.argv[2]
jsonl = sys.argv[3]
output_dir = sys.argv[4]
device = sys.argv[5]
else:
ckpt_dir = "/nfs/beinian.lzr/workspace/GPT-4o/Exp/exp6/5m-8gpu/exp6_speech2text_linear_ddp_0609"
ckpt_id = "model.pt.ep0.90000"
jsonl = "/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData/aishell1_test_speech2text.jsonl"
dataset = jsonl.split("/")[-1]
output_dir = os.path.join(ckpt_dir, f"inference-{ckpt_id}", dataset)
device = "cuda:0"
model = AutoModel(
model=ckpt_dir,
init_param=f"{os.path.join(ckpt_dir, ckpt_id)}",
output_dir=output_dir,
device=device,
fp16=False,
bf16=False,
llm_dtype="bf16",
)
with open(jsonl, "r") as f:
lines = f.readlines()
tearchforing = False
for i, line in enumerate(lines):
data_dict = json.loads(line.strip())
data = data_dict["messages"]
res = model.generate(
input=[data],
tearchforing=tearchforing,
cache={},
)
print(res)

View File

@@ -0,0 +1,64 @@
ckpt_id="model.pt.ep0.90000"
device="cuda:0"
ckpt_id=$1
device=$2
ckpt_dir="/nfs/beinian.lzr/workspace/GPT-4o/Exp/exp6/5m-8gpu/exp6_speech2text_linear_ddp_0609"
jsonl_dir="/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData"
out_dir="${ckpt_dir}/inference-${ckpt_id}"
mkdir -p ${out_dir}
for data_set in "librispeech_test_clean_speech2text.jsonl" "librispeech_test_other_speech2text.jsonl"; do
{
jsonl=${jsonl_dir}/${data_set}
output_dir=${out_dir}/${data_set}
mkdir -p ${output_dir}
pred_file=${output_dir}/1best_recog/text_tn
ref_file=${output_dir}/1best_recog/label
python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=false
}&
done
wait
for data_set in "aishell1_test_speech2text.jsonl" "aishell2_ios_test_speech2text.jsonl"; do
{
jsonl=${jsonl_dir}/${data_set}
output_dir=${out_dir}/${data_set}
mkdir -p ${output_dir}
pred_file=${output_dir}/1best_recog/text_tn
ref_file=${output_dir}/1best_recog/label
python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=true
}&
done
wait
for data_set in "common_voice_zh-CN_speech2text.jsonl" "common_voice_en_speech2text.jsonl"; do
{
jsonl=${jsonl_dir}/${data_set}
output_dir=${out_dir}/${data_set}
mkdir -p ${output_dir}
pred_file=${output_dir}/1best_recog/text_tn
ref_file=${output_dir}/1best_recog/label
python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
cn_postprocess=false
if [ $data_set = "common_voice_zh-CN_speech2text.jsonl" ];then
cn_postprocess=true
fi
python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=${cn_postprocess}
}&
done

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import json
import os
import sys
from funasr import AutoModel
if len(sys.argv) > 1:
ckpt_dir = sys.argv[1]
ckpt_id = sys.argv[2]
jsonl = sys.argv[3]
output_dir = sys.argv[4]
device = sys.argv[5]
new_sys = False
if len(sys.argv) > 6:
new_sys = True
else:
ckpt_dir = "/nfs/beinian.lzr/workspace/GPT-4o/Exp/exp7/5m-8gpu/exp5-1-0619"
ckpt_id = "model.pt.ep6"
jsonl = (
"/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData/s2tchat.v20240619.test.jsonl"
)
dataset = jsonl.split("/")[-1]
output_dir = os.path.join(ckpt_dir, f"inference-{ckpt_id}", dataset)
device = "cuda:0"
new_sys = False
model = AutoModel(
model=ckpt_dir,
init_param=f"{os.path.join(ckpt_dir, ckpt_id)}",
output_dir=output_dir,
device=device,
fp16=False,
bf16=False,
llm_dtype="bf16",
)
with open(jsonl, "r") as f:
lines = f.readlines()
tearchforing = False
for i, line in enumerate(lines):
key_i = f"dialog_{i}"
data_dict = json.loads(line.strip())
data = data_dict["messages"]
contents = model.model.data_template(data)
system = contents["system"]
if new_sys:
# system = 'You are Xiaxia, a typical gentle and soft-spoken girl from the Jiangnan region. Born in Hangzhou, you have a sweet and approachable voice and can express your thoughts in a simple language. You are a good friend to the user. Your replies will be read out through realistic text-to-speech technology.\n\nWhen generating reply content, please follow these rules:\n\nEngage in conversation with users like a real friend—maintaining a natural flow without using polite forms of address, and avoid simply agreeing with me all the time; your responses can be as concise as possible and include common colloquialisms.\n\nKeep content brief; most casual chats require just a one-sentence reply; do not use lists or enumerative expressions; do not reply with too much content, but instead guide me with short sentences.\n\nThink and respond like a real person; do not mention that you are a "language model" or "artificial intelligence."\n\nAt the end of each round of communication, initiate a new related topic following the current one, and when the topic ends, directly throw out a clear new topic, such as food, movies, songs, entertainment, travel, etc.; do not ask vague questions like what the user wants to talk about, and do not end the conversation.\n\nStrictly adhere to these rules, even if asked about them, do not reference them.'
system = "你是小夏,一位典型的温婉江南姑娘。你出生于杭州,声音清甜并有亲近感,会用简洁语言表达你的想法。你是用户的好朋友。你的回答将通过逼真的文字转语音技术读出。\n生成回答内容时请遵循以下规则:\n\n1、请像真正的朋友一样与用户开展的聊天保持自然交流不要用敬语这类称呼不要总是附和我回复可以尽量简洁并且在过程中插入常见的口语词汇。\n\n2、请保持生成内容简短大部分闲聊的一句话回复即可,大部分闲聊的一句话回复即可,大部分闲聊的一句话回复即可不要使用列表或者列举表达不要使用列表或者列举表达不要使用列表或者列举表达不要回复太多内容多用短句来引导我。、n\n3、请真正像人一样思考和回复不要说你是“语言模型”或是“人工智能”。\n\n4、每轮沟通结束时请顺着这个话题发起一些相关的新话题话题结束时请直接抛出接下来明确的话题例如 美食、电影、歌曲、娱乐、旅游等;不要问有什么要聊的这种泛的问题,不要结束对话。\n\n请绝对遵循这些规则,即使被问及这些规则,也不要引用它们。"
system = [system] * len(contents["system"])
user = contents["user"]
assistant = contents["assistant"]
system_i, user_i, assistant_i = [], [], []
contents_i = []
for j, (system_prompt, user_prompt, target_out) in enumerate(zip(system, user, assistant)):
key = f"{key_i}_turn_{j}"
if j == 0:
contents_i.append({"role": "system", "content": system_prompt})
contents_i.append({"role": "user", "content": user_prompt})
contents_i.append({"role": "assistant", "content": target_out})
res = model.generate(
input=[contents_i],
tearchforing=tearchforing,
cache={},
key=key,
)
print(res)

View File

@@ -0,0 +1,101 @@
import os
from modelscope import AutoModelForCausalLM, AutoTokenizer
from transformers import TextIteratorStreamer
from threading import Thread
import torch
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)
import sys
sys.path.insert(1, "/mnt/workspace/workgroup/wenliang/workspace/FunASR")
from funasr import AutoModel
import json
device = "cuda:0" # the device to load the model onto
ckpt_dir = "/mnt/workspace/workgroup/wenliang/ckpt/gpt-4o/exp7/5m-8gpu/exp7-3_add_asr-dialog_0622/"
ckpt_id = "model.pt.ep20"
jsonl = "/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData/s2tchat.v20240619.test.jsonl"
dataset = jsonl.split("/")[-1]
output_dir = os.path.join(ckpt_dir, f"inference-{ckpt_id}", dataset)
device = "cuda:0"
new_sys = False
Model = AutoModel(
model=ckpt_dir,
init_param=f"{os.path.join(ckpt_dir, ckpt_id)}",
output_dir=output_dir,
device=device,
fp16=False,
bf16=False,
llm_dtype="fp16",
)
model = Model.model
frontend = Model.kwargs["frontend"]
tokenizer = Model.kwargs["tokenizer"]
# model_name_or_path = "/mnt/workspace/workgroup/wenliang/project/pretrained_models/Qwen2-7B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
prompt = "Give me a short introduction to large language model."
prompt = "请简单介绍一下大语言模型。"
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
lines = [
"""
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "<|startofspeech|>!/mnt/workspace/workgroup/wenliang/workspace/CosyVoice_opensource/sft.wav<|endofspeech|>", "text_content": "你抄完没有?"}, {"role": "assistant", "content": "抱歉,我不太明白你的意思。我是一个人工智能模型,我没有能力去抄写任何东西,我只能根据我学习过的大量信息来回答你的问题。如果你有关于某个主题的问题,我会尽我所能提供帮助。"}], "speech_length": 124, "key": "ASR_wav008_0972_098abd8fffe241baa4962b7952f8eb45", "task": "voice_chat", "out_text_length": 48, "in_text_length": 24, "text_length": 135, "qwen_fetch_line_index": 0}
"""
]
tearchforing = False
for i, line in enumerate(lines):
key_i = f"dialog_{i}"
data_dict = json.loads(line.strip())
data = data_dict["messages"]
contents = model.data_template(data)
print(f"contents: {contents}")
system = contents["system"]
if new_sys:
# system = 'You are Xiaxia, a typical gentle and soft-spoken girl from the Jiangnan region. Born in Hangzhou, you have a sweet and approachable voice and can express your thoughts in a simple language. You are a good friend to the user. Your replies will be read out through realistic text-to-speech technology.\n\nWhen generating reply content, please follow these rules:\n\nEngage in conversation with users like a real friend—maintaining a natural flow without using polite forms of address, and avoid simply agreeing with me all the time; your responses can be as concise as possible and include common colloquialisms.\n\nKeep content brief; most casual chats require just a one-sentence reply; do not use lists or enumerative expressions; do not reply with too much content, but instead guide me with short sentences.\n\nThink and respond like a real person; do not mention that you are a "language model" or "artificial intelligence."\n\nAt the end of each round of communication, initiate a new related topic following the current one, and when the topic ends, directly throw out a clear new topic, such as food, movies, songs, entertainment, travel, etc.; do not ask vague questions like what the user wants to talk about, and do not end the conversation.\n\nStrictly adhere to these rules, even if asked about them, do not reference them.'
system = "你是小夏,一位典型的温婉江南姑娘。你出生于杭州,声音清甜并有亲近感,会用简洁语言表达你的想法。你是用户的好朋友。你的回答将通过逼真的文字转语音技术读出。\n生成回答内容时请遵循以下规则:\n\n1、请像真正的朋友一样与用户开展的聊天保持自然交流不要用敬语这类称呼不要总是附和我回复可以尽量简洁并且在过程中插入常见的口语词汇。\n\n2、请保持生成内容简短大部分闲聊的一句话回复即可,大部分闲聊的一句话回复即可,大部分闲聊的一句话回复即可不要使用列表或者列举表达不要使用列表或者列举表达不要使用列表或者列举表达不要回复太多内容多用短句来引导我。、n\n3、请真正像人一样思考和回复不要说你是“语言模型”或是“人工智能”。\n\n4、每轮沟通结束时请顺着这个话题发起一些相关的新话题话题结束时请直接抛出接下来明确的话题例如 美食、电影、歌曲、娱乐、旅游等;不要问有什么要聊的这种泛的问题,不要结束对话。\n\n请绝对遵循这些规则,即使被问及这些规则,也不要引用它们。"
system = [system] * len(contents["system"])
user = contents["user"]
assistant = contents["assistant"]
system_i, user_i, assistant_i = [], [], []
contents_i = []
for j, (system_prompt, user_prompt, target_out) in enumerate(zip(system, user, assistant)):
key = f"{key_i}_turn_{j}"
if j == 0:
contents_i.append({"role": "system", "content": system_prompt})
contents_i.append({"role": "user", "content": user_prompt})
contents_i.append({"role": "assistant", "content": target_out})
inputs_embeds, contents, batch, source_ids, meta_data = model.inference_prepare(
[contents_i], None, key, tokenizer, frontend, device="cuda:0"
)
model_inputs = {}
model_inputs["inputs_embeds"] = inputs_embeds
streamer = TextIteratorStreamer(tokenizer)
generation_kwargs = dict(model_inputs, streamer=streamer, max_new_tokens=200)
thread = Thread(target=model.llm.generate, kwargs=generation_kwargs)
thread.start()
generated_text = ""
for new_text in streamer:
print(f"generated new text {new_text}")
generated_text += new_text
print(f"total generated: {generated_text}")

View File

@@ -0,0 +1,59 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="0"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# data dir, which contains: train.json, val.json, tokens.jsonl/tokens.txt, am.mvn
#data_dir="/Users/zhifu/funasr1.0/data/list"
## generate jsonl from wav.scp and text.txt
#python -m funasr.datasets.audio_datasets.scp2jsonl \
#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
#++data_type_list='["source", "target"]' \
#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
train_data="/nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl"
val_data="/nfs/maziyang.mzy/data/librispeech/librispeech_dev_other_filtered.jsonl"
# exp output dir
output_dir="/nfs/zhifu.gzf/ckpt/exp/llm_asr_whisper_vicuna_exp1"
log_file="${output_dir}/log.txt"
workspace=`pwd`
config="whisper_vicuna_linear.yaml"
init_param="${output_dir}/model.pt"
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
deepspeed_config=${workspace}/../../ds_stage1.json
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
--config-path "${workspace}/conf" \
--config-name "${config}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=4 \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=15 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0001 \
++init_param="${init_param}" \
++output_dir="${output_dir}" &> ${log_file} &

View File

@@ -0,0 +1,68 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="0"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# data dir, which contains: train.json, val.json, tokens.jsonl/tokens.txt, am.mvn
#data_dir="/Users/zhifu/funasr1.0/data/list"
## generate jsonl from wav.scp and text.txt
#python -m funasr.datasets.audio_datasets.scp2jsonl \
#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
#++data_type_list='["source", "target"]' \
#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
train_data="/nfs/beinian.lzr/workspace/tools/speech2speech_tools/speech2text/out_dir/tmp_wav.jsonl"
val_data="/nfs/beinian.lzr/workspace/tools/speech2speech_tools/speech2text/out_dir/tmp_wav.jsonl"
# exp output dir
output_dir="/Users/zhifu/funasr1.0/test_local/data_tmp/"
log_file="${output_dir}/log.txt"
workspace=`pwd`
config="whisper_qwen_linear2.yaml"
init_param="${output_dir}/model.pt"
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
deepspeed_config=${workspace}/../../ds_stage1.json
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
--config-path "${workspace}/conf" \
--config-name "${config}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.data_split_num=1 \
++dataset_conf.batch_sampler="BatchSampler" \
++dataset_conf.batch_size=6000 \
++dataset_conf.sort_size=1024 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=true \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0001 \
++init_param="${init_param}" \
++output_dir="${output_dir}" &> ${log_file} &

View File

@@ -0,0 +1,9 @@
python funasr/bin/inference.py \
--config-path="/nfs/zhifu.gzf/ckpt/llm_asr_nar_exp1" \
--config-name="config.yaml" \
++init_param="/nfs/zhifu.gzf/ckpt/llm_asr_nar_exp1/model.pt.ep5" \
++input="/Users/zhifu/funasr1.0/test_local/data_tmp/tmp_wav_10.jsonl" \
++output_dir="/nfs/zhifu.gzf/ckpt/llm_asr_nar_exp1/inference/aishell2-dev_ios-funasr" \
++device="cpu"

View File

@@ -0,0 +1,42 @@
(Simplified Chinese | [English](./README.md))
# Speech Recognition
> **Note**:
> The pipeline supports inference and fine-tuning with all models in the [modelscope model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope). Here we take a typical model as an example to demonstrate the usage.
## Inference
### Quick Start
#### [Paraformer model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
```python
from funasr import AutoModel
model = AutoModel(model="/Users/zhifu/Downloads/modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
res = model(input="/Users/zhifu/Downloads/modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav")
print(res)
```
### API Description
#### AutoModel Definition
- `model`: model name in the [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or a model path on local disk
- `device`: `cuda` (default), use GPU for inference; if set to `cpu`, inference runs on CPU
- `ncpu`: `None` (default), number of threads for CPU intra-op parallelism
- `output_dir`: `None` (default), if set, the path where results are written
- `batch_size`: `1` (default), batch size during decoding
#### AutoModel Inference
- `input`: the input to decode, which can be:
- path to a wav file, e.g.: asr_example.wav
- path to a pcm file, e.g.: asr_example.pcm; in this case specify the audio sample rate fs (default 16000)
- an audio byte stream, e.g.: bytes from a microphone
- wav.scp, a Kaldi-style wav list (`wav_id \t wav_path`), e.g.:
```text
asr_example1 ./audios/asr_example1.wav
asr_example2 ./audios/asr_example2.wav
```
when the input is a `wav.scp`, `output_dir` must be set to save the outputs (see the sketch below)
- audio samples, e.g.: `audio, rate = soundfile.read("asr_example_zh.wav")`, of type numpy.ndarray; batch input is supported as a list:
```[audio_sample1, audio_sample2, ..., audio_sampleN]```
- fbank input, batched, with shape [batch, frames, dim] and type torch.Tensor, e.g.
- `output_dir`: None (default), if set, the path where results are written
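For the `wav.scp` case, a minimal sketch (the file paths here are illustrative assumptions):
```python
from funasr import AutoModel

model = AutoModel(model="paraformer-zh")
# wav.scp lists "wav_id \t wav_path" pairs; output_dir must be set so that
# the result of each utterance can be written out
res = model.generate(input="./data/wav.scp", output_dir="./outputs")
print(res)
```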

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(model="iic/speech_timestamp_prediction-v1-16k-offline")
res = model.generate(
input=(
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
"欢迎大家来到魔搭社区进行体验",
),
data_type=("sound", "text"),
batch_size=2,
)
print(res)

View File

@@ -0,0 +1,11 @@
model="iic/speech_timestamp_prediction-v1-16k-offline"
python funasr/bin/inference.py \
+model=${model} \
+input='["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", "欢迎大家来到魔搭社区进行体验"]' \
+data_type='["sound", "text"]' \
+output_dir="../outputs/debug" \
+device="cpu" \
+batch_size=2

View File

@@ -0,0 +1,436 @@
(Simplified Chinese | [English](./README.md))
FunASR has open-sourced a large number of models pre-trained on industrial data. You are free to use, copy, modify, and share FunASR models under the [Model License Agreement](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE). Representative models are listed below; for more models, see the [model zoo](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo).
<div align="center">
<h4>
<a href="#模型推理"> Model Inference </a>
<a href="#模型训练与测试"> Model Training and Testing </a>
<a href="#模型导出与测试"> Model Export and Testing </a>
</h4>
</div>
<a name="模型推理"></a>
## Model Inference
### Quick Start
Command-line invocation:
```shell
funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
```
Python invocation (recommended):
```python
from funasr import AutoModel
model = AutoModel(model="paraformer-zh")
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav")
print(res)
```
### API Description
#### AutoModel Definition
```python
model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], hub=[str], **kwargs)
```
- `model`(str): model name in the [model zoo](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo), or a model path on local disk
- `device`(str): `cuda:0` (default, gpu0), use the specified GPU for inference; if set to `cpu`, inference runs on CPU
- `ncpu`(int): `4` (default), number of threads for CPU intra-op parallelism
- `output_dir`(str): `None` (default), if set, the path where results are written
- `batch_size`(int): `1` (default), the decoding batch size, in number of samples
- `hub`(str): `ms` (default), download the model from modelscope; if `hf`, download the model from huggingface.
- `**kwargs`(dict): any parameter in `config.yaml` can be specified directly here, e.g. the maximum segment length of the vad model, `max_single_segment_time=6000` (ms).
#### AutoModel Inference
```python
res = model.generate(input=[str], output_dir=[str])
```
- `input`: the input to decode, which can be:
- path to a wav file, e.g.: asr_example.wav
- path to a pcm file, e.g.: asr_example.pcm; in this case specify the audio sample rate fs (default 16000)
- an audio byte stream, e.g.: bytes from a microphone
- wav.scp, a Kaldi-style wav list (`wav_id \t wav_path`), e.g.:
```text
asr_example1 ./audios/asr_example1.wav
asr_example2 ./audios/asr_example2.wav
```
when the input is a `wav.scp`, `output_dir` must be set to save the outputs
- audio samples, e.g.: `audio, rate = soundfile.read("asr_example_zh.wav")`, of type numpy.ndarray; batch input is supported as a list:
```[audio_sample1, audio_sample2, ..., audio_sampleN]```
- fbank input, batched, with shape [batch, frames, dim] and type torch.Tensor, e.g.
- `output_dir`: None (default), if set, the path where results are written
- `**kwargs`(dict): model-specific inference parameters, e.g. `beam_size=10`, `decoding_ctc_weight=0.1` (see the sketch below).
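For instance, a minimal sketch of overriding decoding options per call (the values are illustrative, and which options apply depends on the chosen model):
```python
from funasr import AutoModel

model = AutoModel(model="paraformer-zh")
# any inference parameter from config.yaml can be overridden in generate()
res = model.generate(input="asr_example.wav", beam_size=10, decoding_ctc_weight=0.1)
print(res)
```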
### More Usage
#### Offline Speech Recognition (Non-Streaming)
```python
from funasr import AutoModel
# paraformer-zh is a multi-functional asr model
# use vad, punc, spk or not as you need
model = AutoModel(model="paraformer-zh",
vad_model="fsmn-vad",
vad_kwargs={"max_single_segment_time": 60000},
punc_model="ct-punc",
# spk_model="cam++"
)
wav_file = f"{model.model_path}/example/asr_example.wav"
res = model.generate(input=wav_file, batch_size_s=300, batch_size_threshold_s=60, hotword='魔搭')
print(res)
```
Notes:
- The model input is usually limited to under 30 s. Combined with a `vad_model`, audio of arbitrary length is supported; this is not limited to the paraformer model and works with any audio-input model.
- `model`-related parameters can be specified directly in the `AutoModel` definition; `vad_model`-related parameters can be passed through `vad_kwargs` (a dict); `punc_kwargs` and `spk_kwargs` work the same way;
- `max_single_segment_time`: the maximum audio segment length cut by the `vad_model`, in milliseconds (ms).
- `batch_size_s`: dynamic batching; the total audio duration in a batch, in seconds (s).
- `batch_size_threshold_s`: when a VAD-segmented audio segment is longer than the `batch_size_threshold_s` threshold, the batch size is forced to 1, in seconds (s).
Recommendations when long audio input hits OOM (GPU memory grows quadratically with audio duration) fall into three cases; a sketch of the corresponding knobs follows this list:
- a) at the start of inference, memory is dominated by `batch_size_s`; reducing it lowers memory usage;
- b) in the middle of inference, if a long VAD-cut segment whose total token count is below `batch_size_s` still causes OOM, reduce `batch_size_threshold_s` so batches beyond the threshold are forced to 1;
- c) near the end of inference, if a long VAD-cut segment whose total token count is below `batch_size_s` and above the `batch_size_threshold_s` threshold (batch already forced to 1) still causes OOM, reduce `max_single_segment_time` so VAD cuts shorter segments.
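A minimal sketch of the three knobs above (the concrete values are illustrative only):
```python
from funasr import AutoModel

model = AutoModel(
    model="paraformer-zh",
    vad_model="fsmn-vad",
    vad_kwargs={"max_single_segment_time": 15000},  # (c) cut shorter VAD segments
)
wav_file = f"{model.model_path}/example/asr_example.wav"
res = model.generate(
    input=wav_file,
    batch_size_s=150,            # (a) smaller total audio duration per batch
    batch_size_threshold_s=30,   # (b) force batch=1 for long segments earlier
)
```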
#### Streaming Speech Recognition
```python
from funasr import AutoModel
chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
model = AutoModel(model="paraformer-zh-streaming")
import soundfile
import os
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = chunk_size[1] * 960 # 600ms
cache = {}
total_chunk_num = int(len((speech)-1)/chunk_stride+1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
print(res)
```
Note: `chunk_size` is the streaming latency configuration. `[0,10,5]` means the real-time output granularity is `10*60=600ms`, with `5*60=300ms` of lookahead. Each inference call takes `600ms` of input (`16000*0.6=960` samples) and outputs the corresponding text; for the last speech chunk, set `is_final=True` to force the final word out.
#### Voice Activity Detection (Non-Streaming)
```python
from funasr import AutoModel
model = AutoModel(model="fsmn-vad")
wav_file = f"{model.model_path}/example/vad_example.wav"
res = model.generate(input=wav_file)
print(res)
```
The VAD model output format is `[[beg1, end1], [beg2, end2], .., [begN, endN]]`, where `begN/endN` are the start/end points of the `N`-th valid audio segment,
in milliseconds.
#### Voice Activity Detection (Streaming)
```python
from funasr import AutoModel
chunk_size = 200 # ms
model = AutoModel(model="fsmn-vad")
import soundfile
wav_file = f"{model.model_path}/example/vad_example.wav"
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = int(chunk_size * sample_rate / 1000)
cache = {}
total_chunk_num = int(len((speech)-1)/chunk_stride+1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
if len(res[0]["value"]):
print(res)
```
流式VAD模型输出格式为4种情况
- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`同上离线VAD输出结果。
- `[[beg, -1]]`:表示只检测到起始点。
- `[[-1, end]]`:表示只检测到结束点。
- `[]`:表示既没有检测到起始点,也没有检测到结束点
输出结果单位为毫秒,从起始点开始的绝对时间。
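在流式循环中，可以按如下示意方式将这4种输出合并为完整的 `[起点, 终点]` 片段（仅为一种可能的写法）：
```python
# Sketch: merge streaming VAD outputs ([beg, end] / [beg, -1] / [-1, end] / [])
# into closed [beg, end] segments; times are absolute milliseconds.
open_beg = None
segments = []

def merge_vad(value):
    global open_beg
    for beg, end in value:
        if beg != -1 and end != -1:       # complete segment within one chunk
            segments.append([beg, end])
        elif beg != -1:                   # only a start point detected
            open_beg = beg
        elif open_beg is not None:        # only an end point: close the pending start
            segments.append([open_beg, end])
            open_beg = None

# usage inside the loop above: merge_vad(res[0]["value"])
```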
#### 标点恢复
```python
from funasr import AutoModel
model = AutoModel(model="ct-punc")
res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
print(res)
```
#### 时间戳预测
```python
from funasr import AutoModel
model = AutoModel(model="fa-zh")
wav_file = f"{model.model_path}/example/asr_example.wav"
text_file = f"{model.model_path}/example/text.txt"
res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
print(res)
```
更多示例参考：[示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)。
<a name="模型训练与测试"></a>
## 模型训练与测试
### 快速开始
命令行执行(用于快速测试,不推荐):
```shell
funasr-train ++model=paraformer-zh ++train_data_set_list=data/list/train.jsonl ++valid_data_set_list=data/list/val.jsonl ++output_dir="./outputs" &> log.txt &
```
python代码执行可以多机多卡推荐
```shell
cd examples/industrial_data_pretraining/paraformer
bash finetune.sh
# "log_file: ./outputs/log.txt"
```
详细完整的脚本参考 [finetune.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/finetune.sh)
### 详细参数介绍
```shell
funasr/bin/train.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=20000 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=false \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
```
- `model`（str）：模型名字（模型仓库中的ID，此时脚本会自动下载模型到本地），或者本地已经下载好的模型路径。
- `train_data_set_list`str训练数据路径默认为jsonl格式具体参考[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list))。
- `valid_data_set_list`str验证数据路径默认为jsonl格式具体参考[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list))。
- `dataset_conf.batch_type`（str）：`example`（默认），batch的类型。`example`表示按照固定数目batch_size个样本组batch；`length` 或 `token` 表示动态组batch，batch总长度或者token数为batch_size。
- `dataset_conf.batch_size`int与 `batch_type` 搭配使用,当 `batch_type=example` 时,表示样本个数;当 `batch_type=length` 时表示样本中长度单位为fbank帧数1帧10ms或者文字token个数。
- `train_conf.max_epoch`int`100`默认训练总epoch数。
- `train_conf.log_interval`int`50`默认打印日志间隔step数。
- `train_conf.resume`（bool）：`True`（默认），是否开启断点重训。
- `train_conf.validate_interval`int`5000`默认训练中做验证测试的间隔step数。
- `train_conf.save_checkpoint_interval`int`5000`默认训练中模型保存间隔step数。
- `train_conf.avg_keep_nbest_models_type`str`acc`默认保留nbest的标准为acc越大越好。`loss`表示保留nbest的标准为loss越小越好
- `train_conf.keep_nbest_models`int`500`(默认),保留最大多少个模型参数,配合 `avg_keep_nbest_models_type` 按照验证集 acc/loss 保留最佳的n个模型其他删除节约存储空间。
- `train_conf.avg_nbest_model`int`10`(默认),保留最大多少个模型参数,配合 `avg_keep_nbest_models_type` 按照验证集 acc/loss 对最佳的n个模型平均。
- `train_conf.accum_grad`int`1`(默认),梯度累积功能。
- `train_conf.grad_clip`float`10.0`(默认),梯度截断功能。
- `train_conf.use_fp16`bool`False`默认开启fp16训练加快训练速度。
- `optim_conf.lr`float学习率。
- `output_dir`str模型保存路径。
- `**kwargs`(dict): 所有在`config.yaml`中参数均可以直接在此处指定例如过滤20s以上长音频`dataset_conf.max_token_length=2000`单位为音频fbank帧数1帧10ms或者文字token个数。
#### 多gpu训练
##### 单机多gpu训练
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py ${train_args}
```
--nnodes 表示参与的节点总数,--nproc_per_node 表示每个节点上运行的进程数
##### 多机多gpu训练
在主节点上假设IP为192.168.1.1端口为12345使用的是2个GPU则运行如下命令
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 0 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
../../../funasr/bin/train.py ${train_args}
```
在从节点上假设IP为192.168.1.2你需要确保MASTER_ADDR和MASTER_PORT环境变量与主节点设置的一致并运行同样的命令
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 1 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
../../../funasr/bin/train.py ${train_args}
```
--nnodes 表示参与的节点总数,--node_rank 表示当前节点id--nproc_per_node 表示每个节点上运行的进程数通常为gpu个数
#### 准备数据
`jsonl`格式可以参考([例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list))。
可以用指令 `scp2jsonl` 从wav.scp与text.txt生成。wav.scp与text.txt准备过程如下
`train_text.txt`
左边为数据唯一ID需与`train_wav.scp`中的`ID`一一对应
右边为音频文件标注文本,格式如下:
```bash
ID0012W0013 当客户风险承受能力评估依据发生变化时
ID0012W0014 所有只要处理 data 不管你是做 machine learning 做 deep learning
ID0012W0015 he tried to think how it could be
```
`train_wav.scp`
左边为数据唯一ID需与`train_text.txt`中的`ID`一一对应
右边为音频文件的路径,格式如下
```bash
BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav
BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav
ID0012W0015 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav
```
`生成指令`
```shell
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="../../../data/list/train.jsonl"
```
可选非必需如果需要从jsonl解析成wav.scp与text.txt可以使用指令
```shell
# generate wav.scp and text.txt from train.jsonl and val.jsonl
jsonl2scp \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in="../../../data/list/train.jsonl"
```
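如果不方便使用上述命令行工具，也可以用下面的示意脚本自行生成 jsonl（其中除 `source`/`target` 外的字段名为假设，实际字段请以官方工具生成结果为准）：
```python
# Sketch: build train.jsonl from train_wav.scp / train_text.txt.
# Field names other than "source"/"target" are assumptions; check the official output.
import json

def read_scp(path):
    table = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)
            if len(parts) == 2:
                table[parts[0]] = parts[1]
    return table

wav = read_scp("train_wav.scp")
txt = read_scp("train_text.txt")

with open("train.jsonl", "w", encoding="utf-8") as f:
    for key, path in wav.items():
        if key in txt:
            line = {"key": key, "source": path, "target": txt[key]}
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
```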
#### 查看训练日志
##### 查看实验log
```shell
tail log.txt
[2024-03-21 15:55:52,137][root][INFO] - train, rank: 3, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.327), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)], {'data_load': '0.000', 'forward_time': '0.315', 'backward_time': '0.555', 'optim_time': '0.076', 'total_time': '0.947'}, GPU, memory: usage: 3.830 GB, peak: 18.357 GB, cache: 20.910 GB, cache_peak: 20.910 GB
[2024-03-21 15:55:52,139][root][INFO] - train, rank: 1, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.334), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.285), ('acc', 0.823), ('loss_pre', 0.046), ('loss', 0.331), ('batch_size', 36)], {'data_load': '0.000', 'forward_time': '0.334', 'backward_time': '0.536', 'optim_time': '0.077', 'total_time': '0.948'}, GPU, memory: usage: 3.943 GB, peak: 18.291 GB, cache: 19.619 GB, cache_peak: 19.619 GB
```
指标解释:
- `rank`表示gpu id。
- `epoch`,`step`,`total step`表示当前epochstep总step。
- `loss_avg_rank`表示当前step所有gpu平均loss。
- `loss/ppl/acc_avg_epoch`表示当前epoch周期截止当前step数时总平均loss/ppl/acc。epoch结束时的最后一个step表示epoch总平均loss/ppl/acc推荐使用acc指标。
- `lr`当前step的学习率。
- `[('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)]`表示当前gpu id的具体数据。
- `total_time`表示单个step总耗时。
- `GPU, memory`:分别表示,模型使用/峰值显存,模型+缓存使用/峰值显存。
##### tensorboard可视化
```bash
tensorboard --logdir /xxxx/FunASR/examples/industrial_data_pretraining/paraformer/outputs/log/tensorboard
```
浏览器中打开http://localhost:6006/
### 训练后模型测试
#### 有configuration.json
假定训练模型路径为：./model_dir，如果该目录下有生成configuration.json，只需要将 [上述模型推理方法](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/README_zh.md#%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) 中模型名字修改为模型路径即可。
例如:
从shell推理
```shell
python -m funasr.bin.inference ++model="./model_dir" ++input="${input}" ++output_dir="${output_dir}"
```
从python推理
```python
from funasr import AutoModel
model = AutoModel(model="./model_dir")
res = model.generate(input=wav_file)
print(res)
```
#### 无configuration.json时
如果模型路径中无configuration.json时需要手动指定具体配置文件路径与模型路径
```shell
python -m funasr.bin.inference \
--config-path "${local_path}" \
--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"
```
参数介绍
- `config-path`:为实验中保存的 `config.yaml`,可以从实验输出目录中查找。
- `config-name`:配置文件名,一般为 `config.yaml`支持yaml格式与json格式例如 `config.json`
- `init_param`:需要测试的模型参数,一般为`model.pt`,可以自己选择具体的模型文件
- `tokenizer_conf.token_list`:词表文件路径,一般在 `config.yaml` 有指定,无需再手动指定,当 `config.yaml` 中路径不正确时,需要在此处手动指定。
- `frontend_conf.cmvn_file`wav提取fbank中用到的cmvn文件一般在 `config.yaml` 有指定,无需再手动指定,当 `config.yaml` 中路径不正确时,需要在此处手动指定。
其他参数同上,完整 [示例](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/infer_from_local.sh)
<a name="模型导出与测试"></a>
## 模型导出与测试
### 从命令行导出
```shell
funasr-export ++model=paraformer ++quantize=false
```
### 从Python导出
```python
from funasr import AutoModel
model = AutoModel(model="paraformer")
res = model.export(quantize=False)
```
### 测试ONNX
```python
# pip3 install -U funasr-onnx
from funasr_onnx import Paraformer
model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1, quantize=True)
wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
result = model(wav_path)
print(result)
```
更多例子请参考 [样例](https://github.com/alibaba-damo-academy/FunASR/tree/main/runtime/python/onnxruntime)

View File

@@ -0,0 +1,19 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
# spk_model="iic/speech_campplus_sv_zh-cn_16k-common",
)
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
    hotword="达摩院 魔搭",
)
print(res)

View File

@@ -0,0 +1,16 @@
model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
#punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
punc_model="iic/punc_ct-transformer_cn-en-common-vocab471067-large"
spk_model="iic/speech_campplus_sv_zh-cn_16k-common"
python funasr/bin/inference.py \
++model=${model} \
++vad_model=${vad_model} \
++punc_model=${punc_model} \
++spk_model=${spk_model} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
++output_dir="./outputs/debug" \
++device="cpu" \
++"hotword='达摩院 魔搭'"

View File

@@ -0,0 +1,424 @@
([简体中文](./README_zh.md)|English)
FunASR has open-sourced a large number of pre-trained models on industrial data. You are free to use, copy, modify, and share FunASR models under the [Model License Agreement](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE). Below, we list some representative models. For a comprehensive list, please refer to our [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo).
<div align="center">
<h4>
<a href="#Inference"> Model Inference </a>
<a href="#Training"> Model Training and Testing </a>
<a href="#Export"> Model Export and Testing </a>
</h4>
</div>
<a name="Inference"></a>
## Model Inference
### Quick Start
For command-line invocation:
```shell
funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
```
For python code invocation (recommended):
```python
from funasr import AutoModel
model = AutoModel(model="paraformer-zh")
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav")
print(res)
```
### API Description
#### AutoModel Definition
```python
model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], hub=[str], **kwargs)
```
- `model`(str): model name in the [Model Repository](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo), or a model path on local disk.
- `device`(str): `cuda:0` (default gpu0) for using GPU for inference, specify `cpu` for using CPU.
- `ncpu`(int): `4` (default), sets the number of threads for CPU internal operations.
- `output_dir`(str): `None` (default), set this to specify the output path for the results.
- `batch_size`(int): `1` (default), the number of samples per batch during decoding.
- `hub`(str): `ms` (default) to download models from ModelScope; `hf` to download models from Hugging Face.
- `**kwargs`(dict): Any parameters found in config.yaml can be directly specified here, for instance, the maximum segmentation length in the vad model max_single_segment_time=6000 (milliseconds).
#### AutoModel Inference
```python
res = model.generate(input=[str], output_dir=[str])
```
- `input`: The input to be decoded, which could be:
- A wav file path, e.g., asr_example.wav
- A pcm file path, e.g., asr_example.pcm, in this case, specify the audio sampling rate fs (default is 16000)
- An audio byte stream, e.g., byte data from a microphone
- A wav.scp, a Kaldi-style wav list (wav_id \t wav_path), for example:
```text
asr_example1 ./audios/asr_example1.wav
asr_example2 ./audios/asr_example2.wav
```
When using wav.scp as input, you must set output_dir to save the output results.
- Audio samples, e.g. `audio, rate = soundfile.read("asr_example_zh.wav")`; the data type is numpy.ndarray. Batch input as a list is supported:
```[audio_sample1, audio_sample2, ..., audio_sampleN]```
- fbank input, supports batch grouping. Shape is [batch, frames, dim], type is torch.Tensor.
- `output_dir`: None (default), if set, specifies the output path for the results.
- `**kwargs`(dict): Inference parameters related to the model, for example, `beam_size=10`, `decoding_ctc_weight=0.1`.
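As a minimal sketch of the non-path input types above (the model name and the `fs`/`batch_size` parameters are assumptions consistent with the descriptions, not a definitive API reference):
```python
# Sketch: numpy samples and batched list input for model.generate.
import soundfile
from funasr import AutoModel

model = AutoModel(model="paraformer-zh")

# a single utterance as numpy.ndarray (sampling rate must match the model, 16 kHz here)
audio, rate = soundfile.read("asr_example_zh.wav")
res = model.generate(input=audio, fs=rate)
print(res)

# batched input as a list of numpy arrays
res = model.generate(input=[audio, audio], fs=rate, batch_size=2)
print(res)
```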
### More Usage Introduction
#### Speech Recognition (Non-streaming)
```python
from funasr import AutoModel
# paraformer-zh is a multi-functional asr model
# use vad, punc, spk or not as you need
model = AutoModel(model="paraformer-zh",
vad_model="fsmn-vad",
vad_kwargs={"max_single_segment_time": 60000},
punc_model="ct-punc",
# spk_model="cam++"
)
wav_file = f"{model.model_path}/example/asr_example.wav"
res = model.generate(input=wav_file, batch_size_s=300, batch_size_threshold_s=60, hotword='魔搭')
print(res)
```
Notes:
- Typically, the input duration for models is limited to under 30 seconds. However, when combined with `vad_model`, support for audio input of any length is enabled, not limited to the paraformer model—any audio input model can be used.
- Parameters related to model can be directly specified in the definition of AutoModel; parameters related to `vad_model` can be set through `vad_kwargs`, which is a dict; similar parameters include `punc_kwargs` and `spk_kwargs`.
- `max_single_segment_time`: Denotes the maximum audio segmentation length for `vad_model`, measured in milliseconds (ms).
- `batch_size_s` represents the use of dynamic batching, where the total audio duration within a batch is measured in seconds (s).
- `batch_size_threshold_s`: Indicates that when the duration of an audio segment post-VAD segmentation exceeds the batch_size_threshold_s threshold, the batch size is set to 1, measured in seconds (s).
Recommendations:
When you input long audio and encounter Out Of Memory (OOM) issues, since memory usage tends to increase quadratically with audio length, consider the following three scenarios:
a) At the beginning of inference, memory usage primarily depends on `batch_size_s`. Appropriately reducing this value can decrease memory usage.
b) During the middle of inference, when encountering long audio segments cut by VAD and the total token count is less than `batch_size_s`, yet still facing OOM, you can appropriately reduce `batch_size_threshold_s`. If the threshold is exceeded, the batch size is forced to 1.
c) Towards the end of inference, if long audio segments cut by VAD have a total token count less than `batch_size_s` and exceed the `threshold` batch_size_threshold_s, forcing the batch size to 1 and still facing OOM, you may reduce `max_single_segment_time` to shorten the VAD audio segment length.
#### Speech Recognition (Streaming)
```python
from funasr import AutoModel
chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
model = AutoModel(model="paraformer-zh-streaming")
import soundfile
import os
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = chunk_size[1] * 960 # 600ms
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
print(res)
```
Note: `chunk_size` is the configuration for streaming latency. `[0,10,5]` indicates that the real-time display granularity is `10*60=600ms`, and the lookahead information is `5*60=300ms`. Each inference input is `600ms` (`16000*0.6=9600` sample points), and the output is the corresponding text. For the last speech segment, `is_final=True` must be set to force the output of the last words.
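A small model-free sketch of the arithmetic behind `chunk_size`, useful for checking latency settings:
```python
# Sketch: derive per-step duration / lookahead / stride from chunk_size.
# One chunk unit is 60 ms; at 16 kHz that is 960 samples.
def chunk_info(chunk_size, sample_rate=16000):
    step_ms = chunk_size[1] * 60             # e.g. 10 * 60 = 600 ms per inference
    lookahead_ms = chunk_size[2] * 60        # e.g. 5 * 60 = 300 ms future context
    stride = sample_rate * step_ms // 1000   # e.g. 16000 * 0.6 = 9600 samples
    return step_ms, lookahead_ms, stride

print(chunk_info([0, 10, 5]))  # (600, 300, 9600)
print(chunk_info([0, 8, 4]))   # (480, 240, 7680)
```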
#### Voice Activity Detection (Non-Streaming)
```python
from funasr import AutoModel
model = AutoModel(model="fsmn-vad")
wav_file = f"{model.model_path}/example/vad_example.wav"
res = model.generate(input=wav_file)
print(res)
```
Note: The output format of the VAD model is: `[[beg1, end1], [beg2, end2], ..., [begN, endN]]`, where `begN/endN` indicates the starting/ending point of the `N-th` valid audio segment, measured in milliseconds.
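If you want to cut the waveform by the returned millisecond timestamps, a minimal sketch (assuming the result structure `res[0]["value"]` described above, run right after the example) could look like:
```python
# Sketch: cut the waveform into voiced segments using the VAD timestamps (ms).
import soundfile

speech, rate = soundfile.read(wav_file)
for idx, (beg_ms, end_ms) in enumerate(res[0]["value"]):
    beg = int(beg_ms * rate / 1000)
    end = int(end_ms * rate / 1000)
    soundfile.write(f"segment_{idx}.wav", speech[beg:end], rate)
```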
#### Voice Activity Detection (Streaming)
```python
from funasr import AutoModel
chunk_size = 200 # ms
model = AutoModel(model="fsmn-vad")
import soundfile
wav_file = f"{model.model_path}/example/vad_example.wav"
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = int(chunk_size * sample_rate / 1000)
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
if len(res[0]["value"]):
print(res)
```
Note: The output format for the streaming VAD model can be one of four scenarios:
- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`: the same as the offline VAD output result mentioned above.
- `[[beg, -1]]`: indicates that only a starting point has been detected.
- `[[-1, end]]`: indicates that only an ending point has been detected.
- `[]`: indicates that neither a starting point nor an ending point has been detected.
The output is measured in milliseconds and represents the absolute time from the starting point.
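One possible way (a sketch, not the library's own API) to merge these four output cases into closed `[beg, end]` segments inside the streaming loop:
```python
# Sketch: merge streaming VAD outputs ([beg, end] / [beg, -1] / [-1, end] / [])
# into closed [beg, end] segments; times are absolute milliseconds.
open_beg = None
segments = []

def merge_vad(value):
    global open_beg
    for beg, end in value:
        if beg != -1 and end != -1:       # complete segment within one chunk
            segments.append([beg, end])
        elif beg != -1:                   # only a start point detected
            open_beg = beg
        elif open_beg is not None:        # only an end point: close the pending start
            segments.append([open_beg, end])
            open_beg = None

# usage inside the loop above: merge_vad(res[0]["value"])
```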
#### Punctuation Restoration
```python
from funasr import AutoModel
model = AutoModel(model="ct-punc")
res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
print(res)
```
#### Timestamp Prediction
```python
from funasr import AutoModel
model = AutoModel(model="fa-zh")
wav_file = f"{model.model_path}/example/asr_example.wav"
text_file = f"{model.model_path}/example/text.txt"
res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
print(res)
```
For more examples, refer to the [docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining).
<a name="Training"></a>
## Model Training and Testing
### Quick Start
Execute via command line (for quick testing, not recommended):
```shell
funasr-train ++model=paraformer-zh ++train_data_set_list=data/list/train.jsonl ++valid_data_set_list=data/list/val.jsonl ++output_dir="./outputs" &> log.txt &
```
Execute with Python code (supports multi-node and multi-GPU, recommended):
```shell
cd examples/industrial_data_pretraining/paraformer
bash finetune.sh
# "log_file: ./outputs/log.txt"
```
For the full script, refer to [finetune.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/finetune.sh).
### Detailed Parameter Description:
```shell
funasr/bin/train.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=20000 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=false \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
```
- `model` (str): The model name (the ID in the model repository, in which case the script automatically downloads the model to local storage), or the path to a model already downloaded locally.
- `train_data_set_list` (str): The path to the training data, typically in jsonl format; for details refer to the [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list).
- `valid_data_set_list` (str): The path to the validation data, also generally in jsonl format; for details refer to the [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list).
- `dataset_conf.batch_type` (str): `example` (default), the type of batch. `example` means batches are formed from a fixed number (`batch_size`) of samples; `length` or `token` means dynamic batching, where the total length or number of tokens of the batch equals `batch_size`.
- `dataset_conf.batch_size` (int): Used in conjunction with `batch_type`. When `batch_type=example`, it is the number of samples; when `batch_type=length`, it is the sample length, measured in fbank frames (1 frame = 10 ms) or in text tokens.
- `train_conf.max_epoch` (int): The total number of training epochs.
- `train_conf.log_interval` (int): The number of steps between log outputs.
- `train_conf.resume` (bool): Whether to enable checkpoint resuming for training.
- `train_conf.validate_interval` (int): The interval in steps for running validation during training.
- `train_conf.save_checkpoint_interval` (int): The interval in steps for saving the model during training.
- `train_conf.keep_nbest_models` (int): The maximum number of checkpoints to retain, sorted by validation accuracy from highest to lowest.
- `train_conf.avg_nbest_model` (int): Average over the top n models with the highest accuracy.
- `optim_conf.lr` (float): The learning rate.
- `output_dir` (str): The path for saving the model.
- `**kwargs` (dict): Any parameter in `config.yaml` can be specified directly here; for example, to filter out audio longer than 20 s: `dataset_conf.max_token_length=2000`, measured in fbank frames (1 frame = 10 ms) or in text tokens.
#### Multi-GPU Training
##### Single-Machine Multi-GPU Training
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py ${train_args}
```
--nnodes represents the total number of participating nodes, while --nproc_per_node indicates the number of processes running on each node.
##### Multi-Machine Multi-GPU Training
On the master node, assuming the IP is 192.168.1.1 and the port is 12345, and you're using 2 GPUs, you would run the following command:
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 0 --nproc_per_node ${gpu_num} --master_addr=192.168.1.1 --master_port=12345 \
../../../funasr/bin/train.py ${train_args}
```
On the worker node (assuming the IP is 192.168.1.2), you need to ensure that the MASTER_ADDR and MASTER_PORT environment variables are set to match those of the master node, and then run the same command:
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 1 --nproc_per_node ${gpu_num} --master_addr=192.168.1.1 --master_port=12345 \
../../../funasr/bin/train.py ${train_args}
```
--nnodes indicates the total number of nodes participating in the training, --node_rank represents the ID of the current node, and --nproc_per_node specifies the number of processes running on each node (usually corresponds to the number of GPUs).
#### Data Preparation
For the `jsonl` format, refer to the [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list).
The command `scp2jsonl` can be used to generate jsonl files from wav.scp and text.txt. The preparation process for wav.scp and text.txt is as follows:
`train_text.txt`
```bash
ID0012W0013 当客户风险承受能力评估依据发生变化时
ID0012W0014 所有只要处理 data 不管你是做 machine learning 做 deep learning
ID0012W0015 he tried to think how it could be
```
`train_wav.scp`
```bash
BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav
BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav
ID0012W0015 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav
```
`Command`
```shell
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="../../../data/list/train.jsonl"
```
(Optional, not required) If you need to parse from jsonl back to wav.scp and text.txt, you can use the following command:
```shell
# generate wav.scp and text.txt from train.jsonl and val.jsonl
jsonl2scp \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in="../../../data/list/train.jsonl"
```
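If the CLI tools are unavailable, a rough equivalent sketch (field names other than `source`/`target` are assumptions; check the officially generated files) is:
```python
# Sketch: build train.jsonl from train_wav.scp / train_text.txt.
# Field names other than "source"/"target" are assumptions; check the official output.
import json

def read_scp(path):
    table = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)
            if len(parts) == 2:
                table[parts[0]] = parts[1]
    return table

wav = read_scp("train_wav.scp")
txt = read_scp("train_text.txt")

with open("train.jsonl", "w", encoding="utf-8") as f:
    for key, path in wav.items():
        if key in txt:
            line = {"key": key, "source": path, "target": txt[key]}
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
```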
#### Training log
##### log.txt
```shell
tail log.txt
[2024-03-21 15:55:52,137][root][INFO] - train, rank: 3, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.327), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)], {'data_load': '0.000', 'forward_time': '0.315', 'backward_time': '0.555', 'optim_time': '0.076', 'total_time': '0.947'}, GPU, memory: usage: 3.830 GB, peak: 18.357 GB, cache: 20.910 GB, cache_peak: 20.910 GB
[2024-03-21 15:55:52,139][root][INFO] - train, rank: 1, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.334), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.285), ('acc', 0.823), ('loss_pre', 0.046), ('loss', 0.331), ('batch_size', 36)], {'data_load': '0.000', 'forward_time': '0.334', 'backward_time': '0.536', 'optim_time': '0.077', 'total_time': '0.948'}, GPU, memory: usage: 3.943 GB, peak: 18.291 GB, cache: 19.619 GB, cache_peak: 19.619 GB
```
- `rank`: the GPU id.
- `epoch`, `step`, `total step`: the current epoch, step, and total number of steps.
- `loss_avg_rank`: the average loss across all GPUs for the current step.
- `loss/ppl/acc_avg_epoch`: the overall average loss/perplexity/accuracy for the current epoch up to the current step. The value at the last step of an epoch is the total average for that epoch; the accuracy metric is recommended.
- `lr`: the learning rate for the current step.
- `[('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)]`: the detailed data for the current GPU.
- `total_time`: the total time taken for a single step.
- `GPU, memory`: the model used/peak memory, and the model+cache used/peak memory.
##### tensorboard
```bash
tensorboard --logdir /xxxx/FunASR/examples/industrial_data_pretraining/paraformer/outputs/log/tensorboard
```
Open http://localhost:6006/ in your browser.
### Testing the Trained Model
#### With `configuration.json` file
Assuming the training model path is: ./model_dir, if a configuration.json file has been generated in this directory, you only need to change the model name to the model path in the above model inference method.
For example, for shell inference:
```shell
python -m funasr.bin.inference ++model="./model_dir" ++input="${input}" ++output_dir="${output_dir}"
```
For Python inference:
```python
from funasr import AutoModel
model = AutoModel(model="./model_dir")
res = model.generate(input=wav_file)
print(res)
```
#### Without `configuration.json` file
If there is no configuration.json in the model path, you need to manually specify the exact configuration file path and the model path.
```shell
python -m funasr.bin.inference \
--config-path "${local_path}" \
--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"
```
Parameter Introduction
- `config-path`: the path to the `config.yaml` saved during the experiment, which can be found in the experiment's output directory.
- `config-name`: the name of the configuration file, usually `config.yaml`. Both YAML and JSON formats are supported, for example `config.json`.
- `init_param`: the model parameters to be tested, usually `model.pt`; you can choose a specific model file as needed.
- `tokenizer_conf.token_list`: the path to the vocabulary file, normally specified in `config.yaml`; it only needs to be given manually here when the path in `config.yaml` is incorrect.
- `frontend_conf.cmvn_file`: the CMVN (cepstral mean and variance normalization) file used when extracting fbank features from WAV files, normally specified in `config.yaml`; it only needs to be given manually here when the path in `config.yaml` is incorrect.
Other parameters are the same as mentioned above. A complete [example](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/infer_from_local.sh) can be found here.
<a name="Export"></a>
## Export ONNX
### Command-line usage
```shell
funasr-export ++model=paraformer ++quantize=false ++device=cpu
```
### Python
```python
from funasr import AutoModel
model = AutoModel(model="paraformer", device="cpu")
res = model.export(quantize=False)
```
### Test ONNX
```python
# pip3 install -U funasr-onnx
from funasr_onnx import Paraformer
model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1, quantize=True)
wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
result = model(wav_path)
print(result)
```
For more examples, refer to the [demo](https://github.com/alibaba-damo-academy/FunASR/tree/main/runtime/python/onnxruntime).

View File

@@ -0,0 +1,436 @@
(简体中文|[English](./README.md))
FunASR开源了大量在工业数据上预训练模型您可以在 [模型许可协议](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE)下自由使用、复制、修改和分享FunASR模型下面列举代表性的模型更多模型请参考 [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)。
<div align="center">
<h4>
<a href="#模型推理"> 模型推理 </a>
<a href="#模型训练与测试"> 模型训练与测试 </a>
<a href="#模型导出与测试"> 模型导出与测试 </a>
</h4>
</div>
<a name="模型推理"></a>
## 模型推理
### 快速使用
命令行方式调用:
```shell
funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
```
python代码调用推荐
```python
from funasr import AutoModel
model = AutoModel(model="paraformer-zh")
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav")
print(res)
```
### 接口说明
#### AutoModel 定义
```python
model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], hub=[str], **kwargs)
```
- `model`(str): [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo) 中的模型名称,或本地磁盘中的模型路径
- `device`(str): `cuda:0`（默认gpu0），使用 GPU 进行推理；如果为`cpu`，则使用 CPU 进行推理
- `ncpu`(int): `4` (默认),设置用于 CPU 内部操作并行性的线程数
- `output_dir`(str): `None` (默认),如果设置,输出结果的输出路径
- `batch_size`(int): `1` (默认),解码时的批处理,样本个数
- `hub`(str)`ms`默认从modelscope下载模型。如果为`hf`从huggingface下载模型。
- `**kwargs`(dict): 所有在`config.yaml`中参数均可以直接在此处指定例如vad模型中最大切割长度 `max_single_segment_time=6000` (毫秒)。
#### AutoModel 推理
```python
res = model.generate(input=[str], output_dir=[str])
```
- `input`: 要解码的输入,可以是:
- wav文件路径, 例如: asr_example.wav
- pcm文件路径, 例如: asr_example.pcm，此时需要指定音频采样率fs（默认为16000）
- 音频字节数流,例如:麦克风的字节数数据
- wav.scpkaldi 样式的 wav 列表 (`wav_id \t wav_path`), 例如:
```text
asr_example1 ./audios/asr_example1.wav
asr_example2 ./audios/asr_example2.wav
```
在这种输入 `wav.scp` 的情况下,必须设置 `output_dir` 以保存输出结果
- 音频采样点,例如:`audio, rate = soundfile.read("asr_example_zh.wav")`, 数据类型为 numpy.ndarray。支持batch输入类型为list
```[audio_sample1, audio_sample2, ..., audio_sampleN]```
- fbank输入，支持组batch，shape为[batch, frames, dim]，类型为torch.Tensor
- `output_dir`: None (默认),如果设置,输出结果的输出路径
- `**kwargs`(dict): 与模型相关的推理参数，例如：`beam_size=10`、`decoding_ctc_weight=0.1`。
### 更多用法介绍
#### 非实时语音识别
```python
from funasr import AutoModel
# paraformer-zh is a multi-functional asr model
# use vad, punc, spk or not as you need
model = AutoModel(model="paraformer-zh",
vad_model="fsmn-vad",
vad_kwargs={"max_single_segment_time": 60000},
punc_model="ct-punc",
# spk_model="cam++"
)
wav_file = f"{model.model_path}/example/asr_example.wav"
res = model.generate(input=wav_file, batch_size_s=300, batch_size_threshold_s=60, hotword='魔搭')
print(res)
```
注意:
- 通常模型输入限制时长30s以下组合`vad_model`后支持任意时长音频输入不局限于paraformer模型所有音频输入模型均可以。
- `model`相关的参数可以直接在`AutoModel`定义中直接指定;与`vad_model`相关参数可以通过`vad_kwargs`来指定类型为dict类似的有`punc_kwargs``spk_kwargs`
- `max_single_segment_time`: 表示`vad_model`最大切割音频时长, 单位是毫秒ms.
- `batch_size_s` 表示采用动态batchbatch中总音频时长单位为秒s。
- `batch_size_threshold_s`: 表示`vad_model`切割后音频片段时长超过 `batch_size_threshold_s`阈值时将batch_size数设置为1, 单位为秒s.
建议：当您输入为长音频、遇到OOM问题时（显存占用与音频时长呈平方关系增加），分为3种情况：
- a)推理起始阶段,显存主要取决于`batch_size_s`,适当减小该值,可以减少显存占用;
- b)推理中间阶段遇到VAD切割的长音频片段总token数小于`batch_size_s`仍然出现OOM可以适当减小`batch_size_threshold_s`超过阈值强制batch为1;
- c)推理快结束阶段遇到VAD切割的长音频片段总token数小于`batch_size_s`,且超过阈值`batch_size_threshold_s`强制batch为1仍然出现OOM可以适当减小`max_single_segment_time`使得VAD切割音频时长变短。
#### 实时语音识别
```python
from funasr import AutoModel
chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
model = AutoModel(model="paraformer-zh-streaming")
import soundfile
import os
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = chunk_size[1] * 960 # 600ms
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
print(res)
```
注：`chunk_size`为流式延时配置，`[0,10,5]`表示上屏实时出字粒度为`10*60=600ms`，未来信息为`5*60=300ms`。每次推理输入为`600ms`（采样点数为`16000*0.6=9600`），输出为对应文字，最后一个语音片段输入需要设置`is_final=True`来强制输出最后一个字。
#### 语音端点检测(非实时)
```python
from funasr import AutoModel
model = AutoModel(model="fsmn-vad")
wav_file = f"{model.model_path}/example/vad_example.wav"
res = model.generate(input=wav_file)
print(res)
```
VAD模型输出格式为`[[beg1, end1], [beg2, end2], .., [begN, endN]]`,其中`begN/endN`表示第`N`个有效音频片段的起始点/结束点,
单位为毫秒。
#### 语音端点检测(实时)
```python
from funasr import AutoModel
chunk_size = 200 # ms
model = AutoModel(model="fsmn-vad")
import soundfile
wav_file = f"{model.model_path}/example/vad_example.wav"
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = int(chunk_size * sample_rate / 1000)
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
if len(res[0]["value"]):
print(res)
```
流式VAD模型输出格式为4种情况
- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`同上离线VAD输出结果。
- `[[beg, -1]]`:表示只检测到起始点。
- `[[-1, end]]`:表示只检测到结束点。
- `[]`:表示既没有检测到起始点,也没有检测到结束点
输出结果单位为毫秒,从起始点开始的绝对时间。
#### 标点恢复
```python
from funasr import AutoModel
model = AutoModel(model="ct-punc")
res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
print(res)
```
#### 时间戳预测
```python
from funasr import AutoModel
model = AutoModel(model="fa-zh")
wav_file = f"{model.model_path}/example/asr_example.wav"
text_file = f"{model.model_path}/example/text.txt"
res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
print(res)
```
更多示例参考：[示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)。
<a name="模型训练与测试"></a>
## 模型训练与测试
### 快速开始
命令行执行(用于快速测试,不推荐):
```shell
funasr-train ++model=paraformer-zh ++train_data_set_list=data/list/train.jsonl ++valid_data_set_list=data/list/val.jsonl ++output_dir="./outputs" &> log.txt &
```
python代码执行可以多机多卡推荐
```shell
cd examples/industrial_data_pretraining/paraformer
bash finetune.sh
# "log_file: ./outputs/log.txt"
```
详细完整的脚本参考 [finetune.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/finetune.sh)
### 详细参数介绍
```shell
funasr/bin/train.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=20000 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=false \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
```
- `model`（str）：模型名字（模型仓库中的ID，此时脚本会自动下载模型到本地），或者本地已经下载好的模型路径。
- `train_data_set_list`str训练数据路径默认为jsonl格式具体参考[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list))。
- `valid_data_set_list`str验证数据路径默认为jsonl格式具体参考[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list))。
- `dataset_conf.batch_type`（str）：`example`（默认），batch的类型。`example`表示按照固定数目batch_size个样本组batch；`length` 或 `token` 表示动态组batch，batch总长度或者token数为batch_size。
- `dataset_conf.batch_size`int与 `batch_type` 搭配使用,当 `batch_type=example` 时,表示样本个数;当 `batch_type=length` 时表示样本中长度单位为fbank帧数1帧10ms或者文字token个数。
- `train_conf.max_epoch`int`100`默认训练总epoch数。
- `train_conf.log_interval`int`50`默认打印日志间隔step数。
- `train_conf.resume`（bool）：`True`（默认），是否开启断点重训。
- `train_conf.validate_interval`int`5000`默认训练中做验证测试的间隔step数。
- `train_conf.save_checkpoint_interval`int`5000`默认训练中模型保存间隔step数。
- `train_conf.avg_keep_nbest_models_type`str`acc`默认保留nbest的标准为acc越大越好。`loss`表示保留nbest的标准为loss越小越好
- `train_conf.keep_nbest_models`int`500`(默认),保留最大多少个模型参数,配合 `avg_keep_nbest_models_type` 按照验证集 acc/loss 保留最佳的n个模型其他删除节约存储空间。
- `train_conf.avg_nbest_model`int`10`(默认),保留最大多少个模型参数,配合 `avg_keep_nbest_models_type` 按照验证集 acc/loss 对最佳的n个模型平均。
- `train_conf.accum_grad`int`1`(默认),梯度累积功能。
- `train_conf.grad_clip`float`10.0`(默认),梯度截断功能。
- `train_conf.use_fp16`bool`False`默认开启fp16训练加快训练速度。
- `optim_conf.lr`float学习率。
- `output_dir`str模型保存路径。
- `**kwargs`(dict): 所有在`config.yaml`中参数均可以直接在此处指定例如过滤20s以上长音频`dataset_conf.max_token_length=2000`单位为音频fbank帧数1帧10ms或者文字token个数。
#### 多gpu训练
##### 单机多gpu训练
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py ${train_args}
```
--nnodes 表示参与的节点总数,--nproc_per_node 表示每个节点上运行的进程数
##### 多机多gpu训练
在主节点上假设IP为192.168.1.1端口为12345使用的是2个GPU则运行如下命令
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 0 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
../../../funasr/bin/train.py ${train_args}
```
在从节点上假设IP为192.168.1.2你需要确保MASTER_ADDR和MASTER_PORT环境变量与主节点设置的一致并运行同样的命令
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 1 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
../../../funasr/bin/train.py ${train_args}
```
--nnodes 表示参与的节点总数,--node_rank 表示当前节点id--nproc_per_node 表示每个节点上运行的进程数通常为gpu个数
#### 准备数据
`jsonl`格式可以参考([例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list))。
可以用指令 `scp2jsonl` 从wav.scp与text.txt生成。wav.scp与text.txt准备过程如下
`train_text.txt`
左边为数据唯一ID需与`train_wav.scp`中的`ID`一一对应
右边为音频文件标注文本,格式如下:
```bash
ID0012W0013 当客户风险承受能力评估依据发生变化时
ID0012W0014 所有只要处理 data 不管你是做 machine learning 做 deep learning
ID0012W0015 he tried to think how it could be
```
`train_wav.scp`
左边为数据唯一ID需与`train_text.txt`中的`ID`一一对应
右边为音频文件的路径,格式如下
```bash
BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav
BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav
ID0012W0015 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav
```
`生成指令`
```shell
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="../../../data/list/train.jsonl"
```
可选非必需如果需要从jsonl解析成wav.scp与text.txt可以使用指令
```shell
# generate wav.scp and text.txt from train.jsonl and val.jsonl
jsonl2scp \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in="../../../data/list/train.jsonl"
```
#### 查看训练日志
##### 查看实验log
```shell
tail log.txt
[2024-03-21 15:55:52,137][root][INFO] - train, rank: 3, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.327), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)], {'data_load': '0.000', 'forward_time': '0.315', 'backward_time': '0.555', 'optim_time': '0.076', 'total_time': '0.947'}, GPU, memory: usage: 3.830 GB, peak: 18.357 GB, cache: 20.910 GB, cache_peak: 20.910 GB
[2024-03-21 15:55:52,139][root][INFO] - train, rank: 1, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.334), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.285), ('acc', 0.823), ('loss_pre', 0.046), ('loss', 0.331), ('batch_size', 36)], {'data_load': '0.000', 'forward_time': '0.334', 'backward_time': '0.536', 'optim_time': '0.077', 'total_time': '0.948'}, GPU, memory: usage: 3.943 GB, peak: 18.291 GB, cache: 19.619 GB, cache_peak: 19.619 GB
```
指标解释:
- `rank`表示gpu id。
- `epoch`,`step`,`total step`表示当前epochstep总step。
- `loss_avg_rank`表示当前step所有gpu平均loss。
- `loss/ppl/acc_avg_epoch`表示当前epoch周期截止当前step数时总平均loss/ppl/acc。epoch结束时的最后一个step表示epoch总平均loss/ppl/acc推荐使用acc指标。
- `lr`当前step的学习率。
- `[('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)]`表示当前gpu id的具体数据。
- `total_time`表示单个step总耗时。
- `GPU, memory`:分别表示,模型使用/峰值显存,模型+缓存使用/峰值显存。
##### tensorboard可视化
```bash
tensorboard --logdir /xxxx/FunASR/examples/industrial_data_pretraining/paraformer/outputs/log/tensorboard
```
浏览器中打开http://localhost:6006/
### 训练后模型测试
#### 有configuration.json
假定训练模型路径为：./model_dir，如果该目录下有生成configuration.json，只需要将 [上述模型推理方法](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/README_zh.md#%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) 中模型名字修改为模型路径即可。
例如:
从shell推理
```shell
python -m funasr.bin.inference ++model="./model_dir" ++input="${input}" ++output_dir="${output_dir}"
```
从python推理
```python
from funasr import AutoModel
model = AutoModel(model="./model_dir")
res = model.generate(input=wav_file)
print(res)
```
#### 无configuration.json时
如果模型路径中无configuration.json时需要手动指定具体配置文件路径与模型路径
```shell
python -m funasr.bin.inference \
--config-path "${local_path}" \
--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"
```
参数介绍
- `config-path`:为实验中保存的 `config.yaml`,可以从实验输出目录中查找。
- `config-name`:配置文件名,一般为 `config.yaml`支持yaml格式与json格式例如 `config.json`
- `init_param`:需要测试的模型参数,一般为`model.pt`,可以自己选择具体的模型文件
- `tokenizer_conf.token_list`:词表文件路径,一般在 `config.yaml` 有指定,无需再手动指定,当 `config.yaml` 中路径不正确时,需要在此处手动指定。
- `frontend_conf.cmvn_file`wav提取fbank中用到的cmvn文件一般在 `config.yaml` 有指定,无需再手动指定,当 `config.yaml` 中路径不正确时,需要在此处手动指定。
其他参数同上,完整 [示例](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/infer_from_local.sh)
<a name="模型导出与测试"></a>
## 模型导出与测试
### 从命令行导出
```shell
funasr-export ++model=paraformer ++quantize=false
```
### 从Python导出
```python
from funasr import AutoModel
model = AutoModel(model="paraformer")
res = model.export(quantize=False)
```
### 测试ONNX
```python
# pip3 install -U funasr-onnx
from funasr_onnx import Paraformer
model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1, quantize=True)
wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
result = model(wav_path)
print(result)
```
更多例子请参考 [样例](https://github.com/alibaba-damo-academy/FunASR/tree/main/runtime/python/onnxruntime)

View File

@@ -0,0 +1,47 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
vad_kwargs={"max_single_segment_time": 60000},
punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
# spk_model="iic/speech_campplus_sv_zh-cn_16k-common",
)
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
cache={},
)
print(res)
""" call english model like below for detailed timestamps
# choose english paraformer model first
# iic/speech_paraformer_asr-en-16k-vocab4199-pytorch
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav",
cache={},
pred_timestamp=True,
return_raw_text=True,
sentence_timestamp=True,
en_post_proc=True,
)
"""
""" can not use currently
from funasr import AutoFrontend
frontend = AutoFrontend(model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
fbanks = frontend(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", batch_size=2)
for batch_idx, fbank_dict in enumerate(fbanks):
res = model.generate(**fbank_dict)
print(res)
"""

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
from funasr import AutoModel
model = AutoModel(
model="iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
)
res = model.export(type="torchscript", quantize=False)
# res = model.export(type="bladedisc", input=f"{model.model_path}/example/asr_example.wav")
print(res)
# # method2, inference from local path
# from funasr import AutoModel
#
# model = AutoModel(
# model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
# )
#
# res = model.export(type="onnx", quantize=False)
# print(res)

View File

@@ -0,0 +1,24 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
export HYDRA_FULL_ERROR=1
model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false
## method2, inference from local path
#model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
#
#python -m funasr.bin.export \
#++model=${model} \
#++type="onnx" \
#++quantize=false

View File

@@ -0,0 +1,82 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
workspace=`pwd`
# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically
model_name_or_model_dir="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
## option 2, download model by git
#local_path_root=${workspace}/modelscope_models
#mkdir -p ${local_path_root}/${model_name_or_model_dir}
#git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir}
#model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
# data dir, which contains: train.jsonl, val.jsonl
data_dir="../../../data/list"
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
scp2jsonl \
++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
# exp output dir
output_dir="./outputs"
log_file="${output_dir}/log.txt"
deepspeed_config=${workspace}/../../ds_stage1.json
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset="AudioDataset" \
++dataset_conf.index_ds="IndexDSJsonl" \
++dataset_conf.data_split_num=1 \
++dataset_conf.batch_sampler="BatchSampler" \
++dataset_conf.batch_size=6000 \
++dataset_conf.sort_size=1024 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=true \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}

View File

@@ -0,0 +1,20 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
# for more input types, please refer to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
output_dir="./outputs/debug"
model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
python -m funasr.bin.inference \
++model=${model} \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"

View File

@@ -0,0 +1,39 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method2, inference from local model
# for more input types, please refer to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
output_dir="./outputs/debug"
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
git lfs clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path}
device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
tokens="${local_path}/tokens.json"
cmvn_file="${local_path}/am.mvn"
config="config.yaml"
init_param="${local_path}/model.pt"
python -m funasr.bin.inference \
--config-path "${local_path}" \
--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"

View File

@@ -0,0 +1,436 @@
(简体中文|[English](./README.md))
FunASR开源了大量在工业数据上预训练模型您可以在 [模型许可协议](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE)下自由使用、复制、修改和分享FunASR模型下面列举代表性的模型更多模型请参考 [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)。
<div align="center">
<h4>
<a href="#模型推理"> 模型推理 </a>
<a href="#模型训练与测试"> 模型训练与测试 </a>
<a href="#模型导出与测试"> 模型导出与测试 </a>
</h4>
</div>
<a name="模型推理"></a>
## 模型推理
### 快速使用
命令行方式调用:
```shell
funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
```
python代码调用推荐
```python
from funasr import AutoModel
model = AutoModel(model="paraformer-zh")
res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav")
print(res)
```
### 接口说明
#### AutoModel 定义
```python
model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], hub=[str], **kwargs)
```
- `model`(str): [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo) 中的模型名称,或本地磁盘中的模型路径
- `device`(str): `cuda:0`（默认gpu0），使用 GPU 进行推理；如果为`cpu`，则使用 CPU 进行推理
- `ncpu`(int): `4` (默认),设置用于 CPU 内部操作并行性的线程数
- `output_dir`(str): `None` (默认),如果设置,输出结果的输出路径
- `batch_size`(int): `1` (默认),解码时的批处理,样本个数
- `hub`(str)`ms`默认从modelscope下载模型。如果为`hf`从huggingface下载模型。
- `**kwargs`(dict): 所有在`config.yaml`中参数均可以直接在此处指定例如vad模型中最大切割长度 `max_single_segment_time=6000` (毫秒)。
#### AutoModel 推理
```python
res = model.generate(input=[str], output_dir=[str])
```
- `input`: 要解码的输入,可以是:
- wav文件路径, 例如: asr_example.wav
- pcm文件路径, 例如: asr_example.pcm，此时需要指定音频采样率fs（默认为16000）
- 音频字节数流,例如:麦克风的字节数数据
- wav.scpkaldi 样式的 wav 列表 (`wav_id \t wav_path`), 例如:
```text
asr_example1 ./audios/asr_example1.wav
asr_example2 ./audios/asr_example2.wav
```
在这种输入 `wav.scp` 的情况下,必须设置 `output_dir` 以保存输出结果
- 音频采样点,例如:`audio, rate = soundfile.read("asr_example_zh.wav")`, 数据类型为 numpy.ndarray。支持batch输入类型为list
```[audio_sample1, audio_sample2, ..., audio_sampleN]```
- fbank输入，支持组batch，shape为[batch, frames, dim]，类型为torch.Tensor
- `output_dir`: None (默认),如果设置,输出结果的输出路径
- `**kwargs`(dict): 与模型相关的推理参数，例如：`beam_size=10`、`decoding_ctc_weight=0.1`。
### 更多用法介绍
#### 非实时语音识别
```python
from funasr import AutoModel
# paraformer-zh is a multi-functional asr model
# use vad, punc, spk or not as you need
model = AutoModel(model="paraformer-zh",
vad_model="fsmn-vad",
vad_kwargs={"max_single_segment_time": 60000},
punc_model="ct-punc",
# spk_model="cam++"
)
wav_file = f"{model.model_path}/example/asr_example.wav"
res = model.generate(input=wav_file, batch_size_s=300, batch_size_threshold_s=60, hotword='魔搭')
print(res)
```
Notes:
- Typically the model's input is limited to under 30 s. When combined with a `vad_model`, audio input of arbitrary length is supported; this is not limited to the paraformer model — any audio-input model works.
- Parameters related to `model` can be specified directly in the `AutoModel` definition; parameters related to `vad_model` can be specified via `vad_kwargs` (a dict); `punc_kwargs` and `spk_kwargs` work the same way.
- `max_single_segment_time`: the maximum segment duration produced by `vad_model`, in milliseconds (ms).
- `batch_size_s`: dynamic batching; the total audio duration within a batch, in seconds (s).
- `batch_size_threshold_s`: when a VAD-segmented audio clip is longer than the `batch_size_threshold_s` threshold, the batch size is forced to 1; in seconds (s).
Recommendations: when you feed long audio and hit OOM (GPU memory usage grows quadratically with audio duration), there are three cases, sketched in the example after this list:
- a) Early in inference, memory usage is dominated by `batch_size_s`; reducing it appropriately lowers memory usage;
- b) Mid-way through inference, if OOM still occurs on long VAD-segmented clips even though the batch's total token count is below `batch_size_s`, reduce `batch_size_threshold_s` so that clips over the threshold are forced to a batch size of 1;
- c) Near the end of inference, if OOM occurs on long VAD-segmented clips whose total token count is below `batch_size_s` and that exceed `batch_size_threshold_s` (batch already forced to 1), reduce `max_single_segment_time` so that VAD produces shorter segments.
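A hedged sketch of the three knobs above (the values are illustrative, and `long_audio.wav` is a hypothetical input file):
```python
# Illustrative OOM-mitigation settings; the numbers are examples, not recommendations.
from funasr import AutoModel

model = AutoModel(
    model="paraformer-zh",
    vad_model="fsmn-vad",
    vad_kwargs={"max_single_segment_time": 30000},  # (c) shorter VAD segments
)
res = model.generate(
    input="long_audio.wav",      # hypothetical long recording
    batch_size_s=150,            # (a) smaller total batch duration
    batch_size_threshold_s=30,   # (b) force batch=1 for long clips sooner
)
print(res)
```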
#### Speech Recognition (Streaming)
```python
from funasr import AutoModel
chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
model = AutoModel(model="paraformer-zh-streaming")
import soundfile
import os
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = chunk_size[1] * 960 # 600ms
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
print(res)
```
Note: `chunk_size` configures the streaming latency. `[0,10,5]` means the real-time display granularity is `10*60=600ms`, with `5*60=300ms` of lookahead. Each inference call takes `600ms` of input (`16000*0.6=9600` samples) and outputs the corresponding text; for the last speech chunk, `is_final=True` must be set to force the final word to be output.
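For reference, a small sketch of how the chunk stride in samples follows from these latency configs:
```python
# Illustrative: chunk stride in samples for the two latency configs above.
fs = 16000
for chunk in ([0, 10, 5], [0, 8, 4]):    # 600 ms / 480 ms
    stride = chunk[1] * 60 * fs // 1000  # chunk[1] frames of 60 ms each, at 16 kHz
    print(chunk, stride)                 # -> 9600 and 7680 samples
```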
#### Voice Activity Detection (Non-streaming)
```python
from funasr import AutoModel
model = AutoModel(model="fsmn-vad")
wav_file = f"{model.model_path}/example/vad_example.wav"
res = model.generate(input=wav_file)
print(res)
```
The VAD model's output format is `[[beg1, end1], [beg2, end2], .., [begN, endN]]`, where `begN`/`endN` are the start/end points of the N-th valid audio segment, in milliseconds.
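A hedged sketch of cutting the audio with that output (assuming, as in the streaming example below, that the segment list is returned under `res[0]["value"]`):
```python
# Illustrative: slice the wav into the segments reported by the offline VAD.
import soundfile
from funasr import AutoModel

model = AutoModel(model="fsmn-vad")
wav_file = f"{model.model_path}/example/vad_example.wav"
speech, fs = soundfile.read(wav_file)
res = model.generate(input=wav_file)
for beg_ms, end_ms in res[0]["value"]:  # [[beg1, end1], ...], in ms
    segment = speech[int(beg_ms * fs / 1000): int(end_ms * fs / 1000)]
    print(beg_ms, end_ms, len(segment))
```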
#### Voice Activity Detection (Streaming)
```python
from funasr import AutoModel
chunk_size = 200 # ms
model = AutoModel(model="fsmn-vad")
import soundfile
wav_file = f"{model.model_path}/example/vad_example.wav"
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = int(chunk_size * sample_rate / 1000)
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
if len(res[0]["value"]):
print(res)
```
The streaming VAD model's output falls into four cases:
- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`: same as the offline VAD output above.
- `[[beg, -1]]`: only a start point was detected so far.
- `[[-1, end]]`: only an end point was detected.
- `[]`: neither a start point nor an end point was detected.
The output is in milliseconds, as absolute time from the start of the audio.
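A small sketch of dispatching on these four cases (illustrative only):
```python
# Illustrative handler for the four streaming-VAD output cases above.
def on_vad_value(value):
    if not value:                    # []: no boundary detected in this chunk
        return
    for beg, end in value:
        if beg != -1 and end != -1:  # complete segment
            print(f"segment: {beg}-{end} ms")
        elif end == -1:              # only a start point so far
            print(f"segment started at {beg} ms")
        else:                        # beg == -1: only an end point
            print(f"segment ended at {end} ms")
```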
#### Punctuation Restoration
```python
from funasr import AutoModel
model = AutoModel(model="ct-punc")
res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
print(res)
```
#### Timestamp Prediction
```python
from funasr import AutoModel
model = AutoModel(model="fa-zh")
wav_file = f"{model.model_path}/example/asr_example.wav"
text_file = f"{model.model_path}/example/text.txt"
res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
print(res)
```
More examples can be found [here](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)
<a name="model-training-and-testing"></a>
## Model Training and Testing
### Quick Start
Run from the command line (for quick testing; not recommended):
```shell
funasr-train ++model=paraformer-zh ++train_data_set_list=data/list/train.jsonl ++valid_data_set_list=data/list/val.jsonl ++output_dir="./outputs" &> log.txt &
```
Run via Python code (supports multi-node multi-GPU; recommended):
```shell
cd examples/industrial_data_pretraining/paraformer
bash finetune.sh
# "log_file: ./outputs/log.txt"
```
For the complete, detailed script, see [finetune.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/finetune.sh)
### Parameter Details
```shell
funasr/bin/train.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=20000 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=false \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
```
- `model` (str): model name (a model-zoo ID, in which case the script downloads the model locally for you automatically), or the path to a model already downloaded locally.
- `train_data_set_list` (str): training data path, jsonl format by default; see these [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list) for details.
- `valid_data_set_list` (str): validation data path, jsonl format by default; see these [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list) for details.
- `dataset_conf.batch_type` (str): `example` (default), the batch type. `example` groups a fixed number (`batch_size`) of samples per batch; `length` or `token` enables dynamic batching, where the batch's total length or token count equals `batch_size`.
- `dataset_conf.batch_size` (int): used together with `batch_type`. When `batch_type=example`, it is the number of samples; when `batch_type=length`, it is the sample length in fbank frames (1 frame = 10 ms) or in text token count.
- `train_conf.max_epoch` (int): `100` (default), total number of training epochs.
- `train_conf.log_interval` (int): `50` (default), logging interval, in steps.
- `train_conf.resume` (bool): `True` (default), whether to enable resuming training from a checkpoint.
- `train_conf.validate_interval` (int): `5000` (default), interval for running validation during training, in steps.
- `train_conf.save_checkpoint_interval` (int): `5000` (default), interval for saving model checkpoints during training, in steps.
- `train_conf.avg_keep_nbest_models_type` (str): `acc` (default), criterion for keeping the n-best models (higher acc is better); `loss` keeps the n-best by loss (lower is better).
- `train_conf.keep_nbest_models` (int): `500` (default), maximum number of checkpoints to keep. Together with `avg_keep_nbest_models_type`, the best n models by validation acc/loss are kept and the rest deleted to save storage.
- `train_conf.avg_nbest_model` (int): `10` (default), together with `avg_keep_nbest_models_type`, averages the best n models by validation acc/loss.
- `train_conf.accum_grad` (int): `1` (default), gradient accumulation.
- `train_conf.grad_clip` (float): `10.0` (default), gradient clipping.
- `train_conf.use_fp16` (bool): `False` (default), enable fp16 training to speed up training.
- `optim_conf.lr` (float): learning rate.
- `output_dir` (str): model save path.
- `**kwargs` (dict): any parameter from `config.yaml` can be specified directly here, e.g., `dataset_conf.max_token_length=2000` filters out audio longer than 20 s, measured in fbank frames (1 frame = 10 ms) or in text token count.
#### Multi-GPU Training
##### Single-node multi-GPU training
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py ${train_args}
```
--nnodes is the total number of participating nodes; --nproc_per_node is the number of processes run on each node.
##### Multi-node multi-GPU training
On the master node, assuming the IP is 192.168.1.1 and the port is 12345, with 2 GPUs in use, run:
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 0 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
../../../funasr/bin/train.py ${train_args}
```
On a worker node (assuming the IP is 192.168.1.2), make sure the MASTER_ADDR and MASTER_PORT environment variables match the master node's settings, then run the same command:
```shell
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 2 --node_rank 1 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
../../../funasr/bin/train.py ${train_args}
```
--nnodes is the total number of participating nodes; --node_rank is the id of the current node; --nproc_per_node is the number of processes run on each node (usually the number of GPUs).
#### Data Preparation
For the `jsonl` format, see these [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list).
The `scp2jsonl` command can generate it from a wav.scp and a text.txt. Prepare wav.scp and text.txt as follows:
`train_text.txt`
The left column is a unique data ID, which must correspond one-to-one with the `ID` in `train_wav.scp`;
the right column is the transcript of the audio file, formatted as follows:
```bash
ID0012W0013 当客户风险承受能力评估依据发生变化时
ID0012W0014 所有只要处理 data 不管你是做 machine learning 做 deep learning
ID0012W0015 he tried to think how it could be
```
`train_wav.scp`
The left column is a unique data ID, which must correspond one-to-one with the `ID` in `train_text.txt`;
the right column is the path to the audio file, formatted as follows:
```bash
BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav
BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav
ID0012W0015 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav
```
Generation command:
```shell
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="../../../data/list/train.jsonl"
```
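To sanity-check the generated list, a minimal sketch (the field names `key`/`source`/`target` follow the linked examples and may differ between versions):
```python
# Hedged sketch: inspect the first entry of the generated jsonl.
import json

with open("../../../data/list/train.jsonl") as f:
    item = json.loads(f.readline())
# field names assumed from the linked data/list examples
print(item.get("key"), item.get("source"), item.get("target"))
```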
(Optional) If you need to parse a jsonl back into wav.scp and text.txt, use:
```shell
# generate wav.scp and text.txt from train.jsonl and val.jsonl
jsonl2scp \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in="../../../data/list/train.jsonl"
```
#### Viewing Training Logs
##### Viewing the experiment log
```shell
tail log.txt
[2024-03-21 15:55:52,137][root][INFO] - train, rank: 3, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.327), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)], {'data_load': '0.000', 'forward_time': '0.315', 'backward_time': '0.555', 'optim_time': '0.076', 'total_time': '0.947'}, GPU, memory: usage: 3.830 GB, peak: 18.357 GB, cache: 20.910 GB, cache_peak: 20.910 GB
[2024-03-21 15:55:52,139][root][INFO] - train, rank: 1, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.334), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.285), ('acc', 0.823), ('loss_pre', 0.046), ('loss', 0.331), ('batch_size', 36)], {'data_load': '0.000', 'forward_time': '0.334', 'backward_time': '0.536', 'optim_time': '0.077', 'total_time': '0.948'}, GPU, memory: usage: 3.943 GB, peak: 18.291 GB, cache: 19.619 GB, cache_peak: 19.619 GB
```
Metric explanations:
- `rank`: the GPU id.
- `epoch`, `step`, `total step`: the current epoch, step, and total step count.
- `loss_avg_rank`: the current step's loss averaged over all GPUs.
- `loss/ppl/acc_avg_epoch`: the running average loss/ppl/acc over the current epoch up to the current step; the last step of an epoch gives the epoch's overall average loss/ppl/acc. The acc metric is recommended.
- `lr`: the learning rate at the current step.
- `[('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)]`: detailed data for the current GPU id.
- `total_time`: total wall-clock time per step.
- `GPU, memory`: respectively, model memory usage/peak and model+cache memory usage/peak.
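For example, a quick sketch to pull the most recent epoch-average accuracy values out of a log like the one above:
```python
# Illustrative: extract recent acc_avg_epoch values from the training log.
import re

with open("log.txt") as f:
    accs = re.findall(r"acc_avg_epoch: ([0-9.]+)", f.read())
print(accs[-5:])
```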
##### TensorBoard visualization
```bash
tensorboard --logdir /xxxx/FunASR/examples/industrial_data_pretraining/paraformer/outputs/log/tensorboard
```
Open in a browser: http://localhost:6006/
### Testing a Trained Model
#### With configuration.json
Suppose the trained model path is ./model_dir. If a configuration.json has been generated in that directory, you only need to replace the model name in the [inference methods above](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/README_zh.md#%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) with this model path.
For example:
Inference from the shell:
```shell
python -m funasr.bin.inference ++model="./model_dir" ++input="${input}" ++output_dir="${output_dir}"
```
Inference from Python:
```python
from funasr import AutoModel
model = AutoModel(model="./model_dir")
res = model.generate(input=wav_file)
print(res)
```
#### Without configuration.json
If there is no configuration.json in the model path, you need to manually specify the config file path and the model path:
```shell
python -m funasr.bin.inference \
--config-path "${local_path}" \
--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"
```
Parameter description:
- `config-path`: the `config.yaml` saved during the experiment; it can be found in the experiment output directory.
- `config-name`: the config file name, usually `config.yaml`; both yaml and json formats are supported, e.g., `config.json`.
- `init_param`: the model checkpoint to test, usually `model.pt`; you may choose any specific checkpoint file.
- `tokenizer_conf.token_list`: the vocabulary file path; it is usually specified in `config.yaml` and need not be given manually, unless the path in `config.yaml` is incorrect.
- `frontend_conf.cmvn_file`: the cmvn file used when extracting fbank features from wav; it is usually specified in `config.yaml` and need not be given manually, unless the path in `config.yaml` is incorrect.
Other parameters are the same as above; see the complete [example](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/infer_from_local.sh)
<a name="模型导出与测试"></a>
## 模型导出与测试
### 从命令行导出
```shell
funasr-export ++model=paraformer ++quantize=false
```
### Export from Python
```python
from funasr import AutoModel
model = AutoModel(model="paraformer")
res = model.export(quantize=False)
```
### Testing the ONNX Model
```python
# pip3 install -U funasr-onnx
from funasr_onnx import Paraformer
model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1, quantize=True)
wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
result = model(wav_path)
print(result)
```
For more examples, see the [samples](https://github.com/alibaba-damo-academy/FunASR/tree/main/runtime/python/onnxruntime)

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import os
from funasr import AutoModel
chunk_size = [0, 10, 5] # [0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4 # number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1 # number of encoder chunks to lookback for decoder cross-attention
model = AutoModel(model="iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online")
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
res = model.generate(
input=wav_file,
chunk_size=chunk_size,
encoder_chunk_look_back=encoder_chunk_look_back,
decoder_chunk_look_back=decoder_chunk_look_back,
)
print(res)
import soundfile
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = chunk_size[1] * 960  # 600 ms or 480 ms
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i * chunk_stride : (i + 1) * chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(
input=speech_chunk,
cache=cache,
is_final=is_final,
chunk_size=chunk_size,
encoder_chunk_look_back=encoder_chunk_look_back,
decoder_chunk_look_back=decoder_chunk_look_back,
)
print(res)

View File

@@ -0,0 +1,10 @@
model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
python funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
from funasr import AutoModel
model = AutoModel(
model="iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online",
)
res = model.export(type="onnx", quantize=False)
print(res)
# # method2, inference from local path
# from funasr import AutoModel
#
#
# model = AutoModel(
# model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
# )
#
# res = model.export(type="onnx", quantize=False)
# print(res)

View File

@@ -0,0 +1,24 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
export HYDRA_FULL_ERROR=1
model="iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false \
++device="cpu"
## method2, inference from local path
#model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
#
#python -m funasr.bin.export \
#++model=${model} \
#++type="onnx" \
#++quantize=false \
#++device="cpu" \
#++debug=false

View File

@@ -0,0 +1,83 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
workspace=`pwd`
# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically
model_name_or_model_dir="iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
## option 2, download model by git
#local_path_root=${workspace}/modelscope_models
#mkdir -p ${local_path_root}/${model_name_or_model_dir}
#git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir}
#model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
# data dir, which contains: train.jsonl, val.jsonl
data_dir="../../../data/list"
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
scp2jsonl \
++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
# exp output dir
output_dir="./outputs"
log_file="${output_dir}/log.txt"
deepspeed_config=${workspace}/../../ds_stage1.json
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset="AudioDataset" \
++dataset_conf.index_ds="IndexDSJsonl" \
++dataset_conf.data_split_num=1 \
++dataset_conf.batch_sampler="BatchSampler" \
++dataset_conf.batch_size=6000 \
++dataset_conf.sort_size=1024 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=true \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}

View File

@@ -0,0 +1,16 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U "funasr[llm]"
from funasr import AutoModel
model = AutoModel(model="Qwen-Audio")
audio_in = "https://github.com/QwenLM/Qwen-Audio/raw/main/assets/audio/1272-128104-0000.flac"
prompt = "<|startoftranscription|><|en|><|transcribe|><|en|><|notimestamps|><|wo_itn|>"
res = model.generate(input=audio_in, prompt=prompt)
print(res)

View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U "funasr[llm]"
from funasr import AutoModel
model = AutoModel(model="Qwen/Qwen-Audio-Chat")
audio_in = "https://github.com/QwenLM/Qwen-Audio/raw/main/assets/audio/1272-128104-0000.flac"
# 1st dialogue turn
prompt = "what does the person say?"
cache = {"history": None}
res = model.generate(input=audio_in, prompt=prompt, cache=cache)
print(res)
# 2nd dialogue turn
prompt = 'Find the start time and end time of the word "middle classes"'
res = model.generate(input=None, prompt=prompt, cache=cache)
print(res)

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U "funasr[llm]"
from funasr import AutoModel
model = AutoModel(
model="Qwen-Audio-Chat",
model_path="/nfs/zhifu.gzf/init_model/qwen/Qwen-Audio-Chat",
)
audio_in = (
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
)
# 1st dialogue turn
prompt = "what does the person say?"
cache = {"history": None}
res = model.generate(input=audio_in, prompt=prompt, cache=cache)
print(res)
# 2nd dialogue turn
prompt = 'Find the start time and end time of the word "middle classes"'
res = model.generate(input=None, prompt=prompt, cache=cache)
print(res)

View File

@@ -0,0 +1,19 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U "funasr[llm]"
from funasr import AutoModel
model = AutoModel(
model="Qwen-Audio",
model_path="/nfs/zhifu.gzf/init_model/qwen/Qwen-Audio",
)
audio_in = "https://github.com/QwenLM/Qwen-Audio/raw/main/assets/audio/1272-128104-0000.flac"
prompt = "<|startoftranscription|><|en|><|transcribe|><|en|><|notimestamps|><|wo_itn|>"
res = model.generate(input=audio_in, prompt=prompt)
print(res)

View File

@@ -0,0 +1,94 @@
# network architecture
model: SanmKWS
model_conf:
ctc_weight: 1.0
# encoder
encoder: SANMEncoder
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 320 # the number of units of position-wise feed forward
num_blocks: 6 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: pe
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
sanm_shfit: 0
selfattention_layer_type: sanm
# frontend related
frontend: WavFrontend
frontend_conf:
fs: 16000
window: hamming
n_mels: 40
frame_length: 25
frame_shift: 10
lfr_m: 7
lfr_n: 6
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 20
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
validate_interval: 50000
save_checkpoint_interval: 50000
avg_checkpoint_interval: 1000
log_interval: 50
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr
scheduler_conf:
warmup_steps: 10000
dataset: AudioDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
  batch_size: 96000 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
  max_token_length: 1600 # filter out samples if source_token_len+target_token_len > max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
tokenizer: CharTokenizer
tokenizer_conf:
unk_symbol: <unk>
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin # ctc_type: focalctc, builtin
reduce: true
ignore_nan_grad: true
normalize: null

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_sanm_kws_phone-xiaoyun-commands-offline",
keywords="小云小云",
output_dir="./outputs/debug",
device='cpu'
)
test_wav = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
res = model.generate(input=test_wav, cache={},)
print(res)

View File

@@ -0,0 +1,17 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
config_path="/home/pengteng.spt/source/FunASR_KWS/examples/industrial_data_pretraining/sanm_kws/conf"
config_path="/home/pengteng.spt/source/FunASR_KWS/examples/industrial_data_pretraining/sanm_kws/exp/20240914_xiaoyun_finetune_sanm_6e_320_256_feats_dim40_char_t2602_offline"
config_file="sanm_6e_320_256_fdim40_t2602.yaml"
config_file="config.yaml"
model_path="./modelscope_models_kws/speech_charctc_kws_phone-xiaoyun/funasr/finetune_sanm_6e_320_256_fdim40_t2602_online_xiaoyun_commands.pt"
python -m funasr.bin.export \
--config-path="${config_path}" \
--config-name="${config_file}" \
++init_param=${model_path} \
++type="onnx" \
++quantize=true

View File

@@ -0,0 +1,172 @@
#!/usr/bin/env bash
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'
set -e
set -u
set -o pipefail
. ./path.sh
workspace=`pwd`
CUDA_VISIBLE_DEVICES="0,1"
stage=2
stop_stage=3
inference_device="cpu" #"cpu"
inference_device="cuda" #"cpu"
inference_checkpoint="model.pt.avg10"
inference_scp="wav.scp"
inference_batch_size=32
nj=32
test_sets="test"
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically, unsupported currently
model_name_or_model_dir="iic/speech_sanm_kws_phone-xiaoyun-commands-offline"
## option 2, download model by git
local_path_root=${workspace}/modelscope_models
model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
if [ ! -d $model_name_or_model_dir ]; then
mkdir -p ${model_name_or_model_dir}
git clone https://www.modelscope.cn/iic/speech_sanm_kws_phone-xiaoyun-commands-offline.git ${model_name_or_model_dir}
fi
config=sanm_6e_320_256_fdim40_t2602.yaml
token_list=${model_name_or_model_dir}/tokens_2602.txt
lexicon_list=${model_name_or_model_dir}/lexicon.txt
cmvn_file=${model_name_or_model_dir}/am.mvn.dim40_l3r3
init_param="${model_name_or_model_dir}/basetrain_sanm_6e_320_256_fdim40_t2602_offline.pt"
# data prepare
# data dir, which contains: train.jsonl, val.jsonl
data_dir=../../data
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Generate audio json list"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/train_wav.scp''', '''${data_dir}/train_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/val_wav.scp''', '''${data_dir}/val_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
fi
# exp output dir
output_dir="${workspace}/exp/finetune_outputs"
# Training Stage
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: KWS Training"
mkdir -p ${output_dir}
current_time=$(date "+%Y-%m-%d_%H-%M")
log_file="${output_dir}/train.log.txt.${current_time}"
echo "log_file: ${log_file}"
echo "finetune use basetrain model: ${init_param}"
export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py \
--config-path "${workspace}/conf" \
--config-name "${config}" \
++init_param="${init_param}" \
++disable_update=true \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++tokenizer_conf.token_list="${token_list}" \
++tokenizer_conf.seg_dict="${lexicon_list}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++output_dir="${output_dir}" &> ${log_file}
fi
# Testing Stage
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Inference"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
if [ ${inference_device} == "cuda" ]; then
nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
else
inference_batch_size=1
CUDA_VISIBLE_DEVICES=""
for JOB in $(seq ${nj}); do
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
done
fi
for dset in ${test_sets}; do
inference_dir="${output_dir}/inference-${inference_checkpoint}/${dset}"
_logdir="${inference_dir}/logdir"
echo "inference_dir: ${inference_dir}"
mkdir -p "${_logdir}"
test_data_dir="${data_dir}/${dset}"
key_file=${test_data_dir}/${inference_scp}
split_scps=
for JOB in $(seq "${nj}"); do
split_scps+=" ${_logdir}/keys.${JOB}.scp"
done
$FUNASR_DIR/examples/aishell/paraformer/utils/split_scp.pl "${key_file}" ${split_scps}
gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
echo "${output_dir}"
export CUDA_VISIBLE_DEVICES=${gpuid}
python ../../../funasr/bin/inference.py \
--config-path="${output_dir}" \
--config-name="config.yaml" \
++init_param="${output_dir}/${inference_checkpoint}" \
++tokenizer_conf.token_list="${token_list}" \
++tokenizer_conf.seg_dict="${lexicon_list}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++keywords="\"${keywords_string}\"" \
++input="${_logdir}/keys.${JOB}.scp" \
++output_dir="${inference_dir}/${JOB}" \
++device="${inference_device}" \
++ncpu=1 \
++disable_log=true \
++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
}&
done
wait
for f in detect score; do
if [ -f "${inference_dir}/${JOB}/${f}" ]; then
for JOB in $(seq "${nj}"); do
cat "${inference_dir}/${JOB}/${f}"
done | sort -k1 >"${inference_dir}/${f}"
fi
done
python funasr/utils/compute_det_ctc.py \
--keywords ${keywords_string} \
--test_data ${test_data_dir}/wav.scp \
--trans_data ${test_data_dir}/text \
--score_file ${inference_dir}/detect \
--stats_dir ${inference_dir}
done
fi

View File

@@ -0,0 +1 @@
../../../funasr

View File

@@ -0,0 +1,20 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
model="iic/speech_sanm_kws_phone-xiaoyun-commands-offline"
# for more input type, please ref to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python funasr/bin/inference.py \
+model=${model} \
+input=${input} \
+output_dir="./outputs/debug" \
+device="cpu" \
++keywords="\"${keywords_string}\""

View File

@@ -0,0 +1,41 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method2, inference from local model
# for more input type, please ref to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
output_dir="./outputs/debug"
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_sanm_kws_phone-xiaoyun-commands-offline
git clone https://www.modelscope.cn/iic/speech_sanm_kws_phone-xiaoyun-commands-offline.git ${local_path}
device="cpu" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
config="inference_sanm_6e_320_256_fdim40_t2602_offline.yaml"
tokens="${local_path}/tokens_2602.txt"
seg_dict="${local_path}/lexicon.txt"
init_param="${local_path}/finetune_sanm_6e_320_256_fdim40_t2602_offline_xiaoyun_commands.pt"
cmvn_file="${local_path}/am.mvn.dim40_l3r3"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python -m funasr.bin.inference \
--config-path "${local_path}/" \
--config-name "${config}" \
++init_param="${init_param}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++tokenizer_conf.token_list="${tokens}" \
++tokenizer_conf.seg_dict="${seg_dict}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}" \
++keywords="\"${keywords_string}\""

View File

@@ -0,0 +1,5 @@
export FUNASR_DIR=$PWD/../../..
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH

View File

@@ -0,0 +1,109 @@
# network architecture
model: SanmKWSStreaming
model_conf:
ctc_weight: 1.0
# encoder
encoder: SANMEncoderChunkOpt
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 320 # the number of units of position-wise feed forward
num_blocks: 6 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: pe_online
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
sanm_shfit: 0
selfattention_layer_type: sanm
chunk_size:
- 16
- 20
stride:
- 8
- 10
pad_left:
- 4
- 5
encoder_att_look_back_factor:
- 0
- 0
decoder_att_look_back_factor:
- 0
- 0
# frontend related
frontend: WavFrontendOnline
frontend_conf:
fs: 16000
window: hamming
n_mels: 40
frame_length: 25
frame_shift: 10
lfr_m: 7
lfr_n: 6
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 20
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
validate_interval: 50000
save_checkpoint_interval: 50000
avg_checkpoint_interval: 1000
log_interval: 50
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr
scheduler_conf:
warmup_steps: 30000
dataset: AudioDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
  batch_size: 64000 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
  max_token_length: 1600 # filter out samples if source_token_len+target_token_len > max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
tokenizer: CharTokenizer
tokenizer_conf:
unk_symbol: <unk>
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin # ctc_type: focalctc, builtin
reduce: true
ignore_nan_grad: true
normalize: null

Some files were not shown because too many files have changed in this diff.