Sync from bytedesk-private: update

2026-05-14 11:18:02 +00:00 · 2024-12-14 10:43:18 +08:00
parent 476eebb101
commit 5e082909e4
3421 changed files with 812709 additions and 0 deletions
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/README.md
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/README.md
@@ -0,0 +1,14 @@
+# Branchformer Result
+
+## Training Config
+- Feature info: using raw speech, extracting 80 dims fbank online, global cmvn, speed perturb(0.9, 1.0, 1.1), specaugment
+- Train info: lr 0.001, batch_size 10000, 4 gpu(Tesla V100), acc_grad 1, 180 epochs
+- Train config: conf/train_asr_branchformer.yaml
+- LM config: LM was not used
+
+## Results (CER)
+
+|   testset   | CER(%)  |
+|:-----------:|:-------:|
+|     dev     |  4.15   |
+|    test     |  4.51   |
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/conf/branchformer_12e_6d_2048_256.yaml
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/conf/branchformer_12e_6d_2048_256.yaml
@@ -0,0 +1,122 @@
+# This is an example that demonstrates how to configure a model file.
+# You can modify the configuration according to your own requirements.
+
+# to print the register_table:
+# from funasr.register import tables
+# tables.print()
+
+# network architecture
+model: Branchformer
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+# encoder
+encoder: BranchformerEncoder
+encoder_conf:
+    output_size: 256
+    use_attn: true
+    attention_heads: 4
+    attention_layer_type: rel_selfattn
+    pos_enc_layer_type: rel_pos
+    rel_pos_type: latest
+    use_cgmlp: true
+    cgmlp_linear_units: 2048
+    cgmlp_conv_kernel: 31
+    use_linear_after_conv: false
+    gate_activation: identity
+    merge_method: concat
+    cgmlp_weight: 0.5               # used only if merge_method is "fixed_ave"
+    attn_branch_drop_rate: 0.0      # used only if merge_method is "learned_ave"
+    num_blocks: 24
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d
+    stochastic_depth_rate: 0.0
+
+# decoder
+decoder: TransformerDecoder
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.
+    src_attention_dropout_rate: 0.
+
+
+# frontend related
+frontend: WavFrontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    dither: 0.0
+    lfr_m: 1
+    lfr_n: 1
+
+specaug: SpecAug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 40
+    num_time_mask: 2
+
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 180
+  keep_nbest_models: 10
+  avg_keep_nbest_models_type: acc
+  log_interval: 50
+
+optim: adam
+optim_conf:
+   lr: 0.001
+   weight_decay: 0.000001
+scheduler: warmuplr
+scheduler_conf:
+   warmup_steps: 35000
+
+dataset: AudioDataset
+dataset_conf:
+    index_ds: IndexDSJsonl
+    batch_sampler: EspnetStyleBatchSampler
+    batch_type: length # example or length
+    batch_size: 10000 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
+    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,
+    buffer_size: 1024
+    shuffle: True
+    num_workers: 4
+    preprocessor_speech: SpeechPreprocessSpeedPerturb
+    preprocessor_speech_conf:
+      speed_perturb: [0.9, 1.0, 1.1]
+
+tokenizer: CharTokenizer
+tokenizer_conf:
+  unk_symbol: <unk>
+
+
+ctc_conf:
+    dropout_rate: 0.0
+    ctc_type: builtin
+    reduce: true
+    ignore_nan_grad: true
+normalize: null
+
+beam_size: 10
+decoding_ctc_weight: 0.4
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/demo_infer.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/demo_infer.sh
@@ -0,0 +1 @@
+../paraformer/demo_infer.sh
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/demo_train_or_finetune.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/demo_train_or_finetune.sh
@@ -0,0 +1 @@
+../paraformer/demo_train_or_finetune.sh
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/infer.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/infer.sh
@@ -0,0 +1,12 @@
+
+
+python -m funasr.bin.inference \
+--config-path="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3" \
+--config-name="config.yaml" \
++init_param="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3/model.pt.ep38" \
++tokenizer_conf.token_list="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/zh_token_list/char/tokens.txt" \
++frontend_conf.cmvn_file="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/train/am.mvn" \
++input="/mnt/nfs/zhifu.gzf/data/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav" \
++output_dir="./outputs/debug" \
++device="cuda:0" \
+
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/local/aishell_data_prep.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/local/aishell_data_prep.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+#. ./path.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <audio-path> <text-path> <output-path>"
+  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
+  exit 1;
+fi
+
+aishell_audio_dir=$1
+aishell_text=$2/aishell_transcript_v0.8.txt
+output_dir=$3
+
+train_dir=$output_dir/data/local/train
+dev_dir=$output_dir/data/local/dev
+test_dir=$output_dir/data/local/test
+tmp_dir=$output_dir/data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
+  echo "Error: $0 requires two directory arguments"
+  exit 1;
+fi
+
+# find wav audio file for train, dev and test resp.
+find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=`cat $tmp_dir/wav.flist | wc -l`
+[ $n -ne 141925 ] && \
+  echo Warning: expected 141925 data data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+
+# Transcriptions preparation
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+done
+
+mkdir -p $output_dir/data/train $output_dir/data/dev $output_dir/data/test
+
+for f in wav.scp text; do
+  cp $train_dir/$f $output_dir/data/train/$f || exit 1;
+  cp $dev_dir/$f $output_dir/data/dev/$f || exit 1;
+  cp $test_dir/$f $output_dir/data/test/$f || exit 1;
+done
+
+echo "$0: AISHELL data preparation succeeded"
+exit 0;
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/local/download_and_untar.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/local/download_and_untar.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
+#             2017  Xingyu Na
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="data_aishell resource_aishell"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+# sizes of the archive files in bytes.
+sizes="15582913665 1246920"
+
+if [ -f $data/$part.tgz ]; then
+  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tgz
+  else
+    echo "$data/$part.tgz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tgz ]; then
+  if ! command -v wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tgz
+  echo "$0: downloading data from $full_url.  This may take some time, please be patient."
+
+  cd $data || exit 1
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+cd $data || exit 1
+
+if ! tar -xvzf $part.tgz; then
+  echo "$0: error un-tarring archive $data/$part.tgz"
+  exit 1;
+fi
+
+touch $data/$part/.complete
+
+if [ $part == "data_aishell" ]; then
+  cd $data/$part/wav || exit 1
+  for wav in ./*.tar.gz; do
+    echo "Extracting wav from $wav"
+    tar -zxf $wav && rm $wav
+  done
+fi
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
+  rm $data/$part.tgz
+fi
+
+exit 0;
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/run.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/run.sh
@@ -0,0 +1,203 @@
+#!/usr/bin/env bash
+
+
+CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+# general configuration
+feats_dir="../DATA" #feature output dictionary
+exp_dir=`pwd`
+lang=zh
+token_type=char
+stage=0
+stop_stage=5
+
+# feature configuration
+nj=32
+
+inference_device="cuda" #"cpu"
+inference_checkpoint="model.pt.avg10"
+inference_scp="wav.scp"
+inference_batch_size=1
+
+# data
+raw_data=../raw_data
+data_url=www.openslr.org/resources/33
+
+# exp tag
+tag="exp1"
+workspace=`pwd`
+
+master_port=12345
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train
+valid_set=dev
+test_sets="dev test"
+
+config=branchformer_12e_6d_2048_256.yaml
+model_dir="baseline_$(basename "${config}" .yaml)_${lang}_${token_type}_${tag}"
+
+
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    mkdir -p ${raw_data}
+    local/download_and_untar.sh ${raw_data} ${data_url} data_aishell
+    local/download_and_untar.sh ${raw_data} ${data_url} resource_aishell
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Data preparation
+    local/aishell_data_prep.sh ${raw_data}/data_aishell/wav ${raw_data}/data_aishell/transcript ${feats_dir}
+    for x in train dev test; do
+        cp ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org
+        paste -d " " <(cut -f 1 -d" " ${feats_dir}/data/${x}/text.org) <(cut -f 2- -d" " ${feats_dir}/data/${x}/text.org | tr -d " ") \
+            > ${feats_dir}/data/${x}/text
+        utils/text2token.py -n 1 -s 1 ${feats_dir}/data/${x}/text > ${feats_dir}/data/${x}/text.org
+        mv ${feats_dir}/data/${x}/text.org ${feats_dir}/data/${x}/text
+
+        # convert wav.scp text to jsonl
+        scp_file_list_arg="++scp_file_list='[\"${feats_dir}/data/${x}/wav.scp\",\"${feats_dir}/data/${x}/text\"]'"
+        python ../../../funasr/datasets/audio_datasets/scp2jsonl.py \
+        ++data_type_list='["source", "target"]' \
+        ++jsonl_file_out=${feats_dir}/data/${x}/audio_datasets.jsonl \
+        ${scp_file_list_arg}
+    done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Feature and CMVN Generation"
+    python ../../../funasr/bin/compute_audio_cmvn.py \
+    --config-path "${workspace}/conf" \
+    --config-name "${config}" \
+    ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+    ++cmvn_file="${feats_dir}/data/${train_set}/cmvn.json" \
+
+fi
+
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: Dictionary Preparation"
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
+
+    echo "make a dictionary"
+    echo "<blank>" > ${token_list}
+    echo "<s>" >> ${token_list}
+    echo "</s>" >> ${token_list}
+    utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/$train_set/text | cut -f 2- -d" " | tr " " "\n" \
+        | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
+    echo "<unk>" >> ${token_list}
+fi
+
+# LM Training Stage
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+  echo "stage 4: ASR Training"
+
+  mkdir -p ${exp_dir}/exp/${model_dir}
+  current_time=$(date "+%Y-%m-%d_%H-%M")
+  log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}"
+  echo "log_file: ${log_file}"
+
+  export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
+  gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  torchrun \
+  --nnodes 1 \
+  --nproc_per_node ${gpu_num} \
+  --master_port ${master_port} \
+  ../../../funasr/bin/train.py \
+  --config-path "${workspace}/conf" \
+  --config-name "${config}" \
+  ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+  ++valid_data_set_list="${feats_dir}/data/${valid_set}/audio_datasets.jsonl" \
+  ++tokenizer_conf.token_list="${token_list}" \
+  ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
+  ++output_dir="${exp_dir}/exp/${model_dir}" &> ${log_file}
+fi
+
+
+
+# Testing Stage
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+  echo "stage 5: Inference"
+
+  if [ ${inference_device} == "cuda" ]; then
+      nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  else
+      inference_batch_size=1
+      CUDA_VISIBLE_DEVICES=""
+      for JOB in $(seq ${nj}); do
+          CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
+      done
+  fi
+
+  for dset in ${test_sets}; do
+
+    inference_dir="${exp_dir}/exp/${model_dir}/inference-${inference_checkpoint}/${dset}"
+    _logdir="${inference_dir}/logdir"
+    echo "inference_dir: ${inference_dir}"
+
+    mkdir -p "${_logdir}"
+    data_dir="${feats_dir}/data/${dset}"
+    key_file=${data_dir}/${inference_scp}
+
+    split_scps=
+    for JOB in $(seq "${nj}"); do
+        split_scps+=" ${_logdir}/keys.${JOB}.scp"
+    done
+    utils/split_scp.pl "${key_file}" ${split_scps}
+
+    gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+          id=$((JOB-1))
+          gpuid=${gpuid_list_array[$id]}
+
+          export CUDA_VISIBLE_DEVICES=${gpuid}
+          python ../../../funasr/bin/inference.py \
+          --config-path="${exp_dir}/exp/${model_dir}" \
+          --config-name="config.yaml" \
+          ++init_param="${exp_dir}/exp/${model_dir}/${inference_checkpoint}" \
+          ++tokenizer_conf.token_list="${token_list}" \
+          ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
+          ++input="${_logdir}/keys.${JOB}.scp" \
+          ++output_dir="${inference_dir}/${JOB}" \
+          ++device="${inference_device}" \
+          ++ncpu=1 \
+          ++disable_log=true \
+          ++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
+        }&
+
+    done
+    wait
+
+    mkdir -p ${inference_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then
+          for JOB in $(seq "${nj}"); do
+              cat "${inference_dir}/${JOB}/1best_recog/${f}"
+          done | sort -k1 >"${inference_dir}/1best_recog/${f}"
+        fi
+    done
+
+    echo "Computing WER ..."
+    python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc
+    python utils/postprocess_text_zh.py  ${data_dir}/text ${inference_dir}/1best_recog/text.ref
+    python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer
+    tail -n 3 ${inference_dir}/1best_recog/text.cer
+  done
+
+fi
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/utils
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/utils
@@ -0,0 +1 @@
+../paraformer/utils
--- a/modules/python/vendors/FunASR/examples/aishell/conformer/README.md
+++ b/modules/python/vendors/FunASR/examples/aishell/conformer/README.md
@@ -0,0 +1,16 @@
+
+# Conformer Result
+
+## Training Config
+- Feature info: using 80 dims fbank, global cmvn, speed perturb(0.9, 1.0, 1.1), specaugment
+- Train info: lr 5e-4, batch_size 25000, 2 gpu(Tesla V100), acc_grad 1, 50 epochs
+- Train config: conf/train_asr_transformer.yaml
+- LM config: LM was not used
+- Model size: 46M
+
+## Results (CER)
+
+|   testset   | CER(%)  |
+|:-----------:|:-------:|
+|     dev     |  4.42   |
+|    test     |  4.87   |
--- a/modules/python/vendors/FunASR/examples/aishell/conformer/conf/conformer_12e_6d_2048_256.yaml
+++ b/modules/python/vendors/FunASR/examples/aishell/conformer/conf/conformer_12e_6d_2048_256.yaml
@@ -0,0 +1,110 @@
+# This is an example that demonstrates how to configure a model file.
+# You can modify the configuration according to your own requirements.
+
+# to print the register_table:
+# from funasr.register import tables
+# tables.print()
+
+# network architecture
+model: Conformer
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+# encoder
+encoder: ConformerEncoder
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder architecture type
+    normalize_before: true
+    pos_enc_layer_type: rel_pos
+    selfattention_layer_type: rel_selfattn
+    activation_type: swish
+    macaron_style: true
+    use_cnn_module: true
+    cnn_module_kernel: 15
+
+# decoder
+decoder: TransformerDecoder
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+
+# frontend related
+frontend: WavFrontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 1
+    lfr_n: 1
+
+specaug: SpecAug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 40
+    num_time_mask: 2
+
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 150
+  keep_nbest_models: 10
+  log_interval: 50
+
+optim: adam
+optim_conf:
+   lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+   warmup_steps: 30000
+
+dataset: AudioDataset
+dataset_conf:
+    index_ds: IndexDSJsonl
+    batch_sampler: EspnetStyleBatchSampler
+    batch_type: length # example or length
+    batch_size: 25000 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
+    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,
+    buffer_size: 1024
+    shuffle: True
+    num_workers: 4
+    preprocessor_speech: SpeechPreprocessSpeedPerturb
+    preprocessor_speech_conf:
+      speed_perturb: [0.9, 1.0, 1.1]
+
+tokenizer: CharTokenizer
+tokenizer_conf:
+  unk_symbol: <unk>
+
+ctc_conf:
+    dropout_rate: 0.0
+    ctc_type: builtin
+    reduce: true
+    ignore_nan_grad: true
+normalize: null
--- a/modules/python/vendors/FunASR/examples/aishell/conformer/conf/conformer_rwkv.yaml
+++ b/modules/python/vendors/FunASR/examples/aishell/conformer/conf/conformer_rwkv.yaml
@@ -0,0 +1,124 @@
+# This is an example that demonstrates how to configure a model file.
+# You can modify the configuration according to your own requirements.
+
+# to print the register_table:
+# from funasr.register import tables
+# tables.print()
+
+# network architecture
+model: Conformer
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+# encoder
+encoder: ConformerEncoder
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder architecture type
+    normalize_before: true
+    pos_enc_layer_type: rel_pos
+    selfattention_layer_type: rel_selfattn
+    activation_type: swish
+    macaron_style: true
+    use_cnn_module: true
+    cnn_module_kernel: 15
+
+# decoder
+decoder: TransformerRWKVDecoder
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+    input_layer: embed
+    rwkv_cfg:
+      n_embd: 256
+      dropout: 0
+      head_size_a: 64
+      ctx_len: 512
+      dim_att: 256 #${model_conf.rwkv_cfg.n_embd}
+      dim_ffn: null
+      head_size_divisor: 4
+      n_layer: 6
+      pre_ffn: 0
+      ln0: false
+      ln1: false
+      init_rwkv: true
+
+
+# frontend related
+frontend: WavFrontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 1
+    lfr_n: 1
+
+specaug: SpecAug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 40
+    num_time_mask: 2
+
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 150
+  keep_nbest_models: 10
+  log_interval: 50
+
+optim: adam
+optim_conf:
+   lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+   warmup_steps: 30000
+
+dataset: AudioDataset
+dataset_conf:
+    index_ds: IndexDSJsonl
+    batch_sampler: EspnetStyleBatchSampler
+    batch_type: length # example or length
+    batch_size: 25000 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
+    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,
+    buffer_size: 1024
+    shuffle: True
+    num_workers: 4
+    preprocessor_speech: SpeechPreprocessSpeedPerturb
+    preprocessor_speech_conf:
+      speed_perturb: [0.9, 1.0, 1.1]
+
+tokenizer: CharTokenizer
+tokenizer_conf:
+  unk_symbol: <unk>
+
+ctc_conf:
+    dropout_rate: 0.0
+    ctc_type: builtin
+    reduce: true
+    ignore_nan_grad: true
+normalize: null
--- a/modules/python/vendors/FunASR/examples/aishell/conformer/demo_infer.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/conformer/demo_infer.sh
@@ -0,0 +1 @@
+../paraformer/demo_infer.sh
--- a/modules/python/vendors/FunASR/examples/aishell/conformer/demo_train_or_finetune.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/conformer/demo_train_or_finetune.sh
@@ -0,0 +1 @@
+../paraformer/demo_train_or_finetune.sh
--- a/modules/python/vendors/FunASR/examples/aishell/conformer/local/aishell_data_prep.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/conformer/local/aishell_data_prep.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+#. ./path.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <audio-path> <text-path> <output-path>"
+  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
+  exit 1;
+fi
+
+aishell_audio_dir=$1
+aishell_text=$2/aishell_transcript_v0.8.txt
+output_dir=$3
+
+train_dir=$output_dir/data/local/train
+dev_dir=$output_dir/data/local/dev
+test_dir=$output_dir/data/local/test
+tmp_dir=$output_dir/data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
+  echo "Error: $0 requires two directory arguments"
+  exit 1;
+fi
+
+# find wav audio file for train, dev and test resp.
+find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=`cat $tmp_dir/wav.flist | wc -l`
+[ $n -ne 141925 ] && \
+  echo Warning: expected 141925 data data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+
+# Transcriptions preparation
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+done
+
+mkdir -p $output_dir/data/train $output_dir/data/dev $output_dir/data/test
+
+for f in wav.scp text; do
+  cp $train_dir/$f $output_dir/data/train/$f || exit 1;
+  cp $dev_dir/$f $output_dir/data/dev/$f || exit 1;
+  cp $test_dir/$f $output_dir/data/test/$f || exit 1;
+done
+
+echo "$0: AISHELL data preparation succeeded"
+exit 0;
--- a/modules/python/vendors/FunASR/examples/aishell/conformer/local/download_and_untar.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/conformer/local/download_and_untar.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
+#             2017  Xingyu Na
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="data_aishell resource_aishell"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+# sizes of the archive files in bytes.
+sizes="15582913665 1246920"
+
+if [ -f $data/$part.tgz ]; then
+  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tgz
+  else
+    echo "$data/$part.tgz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tgz ]; then
+  if ! command -v wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tgz
+  echo "$0: downloading data from $full_url.  This may take some time, please be patient."
+
+  cd $data || exit 1
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+cd $data || exit 1
+
+if ! tar -xvzf $part.tgz; then
+  echo "$0: error un-tarring archive $data/$part.tgz"
+  exit 1;
+fi
+
+touch $data/$part/.complete
+
+if [ $part == "data_aishell" ]; then
+  cd $data/$part/wav || exit 1
+  for wav in ./*.tar.gz; do
+    echo "Extracting wav from $wav"
+    tar -zxf $wav && rm $wav
+  done
+fi
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
+  rm $data/$part.tgz
+fi
+
+exit 0;
--- a/modules/python/vendors/FunASR/examples/aishell/conformer/run.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/conformer/run.sh
@@ -0,0 +1,202 @@
+#!/usr/bin/env bash
+
+
+CUDA_VISIBLE_DEVICES="0,1"
+
+# general configuration
+feats_dir="../DATA" #feature output dictionary
+exp_dir=`pwd`
+lang=zh
+token_type=char
+stage=0
+stop_stage=5
+
+# feature configuration
+nj=32
+
+inference_device="cuda" #"cpu", "cuda:0", "cuda:1"
+inference_checkpoint="model.pt.avg10"
+inference_scp="wav.scp"
+inference_batch_size=1
+
+# data
+raw_data=../raw_data
+data_url=www.openslr.org/resources/33
+
+# exp tag
+tag="exp1"
+workspace=`pwd`
+
+master_port=12345
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train
+valid_set=dev
+test_sets="dev test"
+
+config=conformer_12e_6d_2048_256.yaml
+model_dir="baseline_$(basename "${config}" .yaml)_${lang}_${token_type}_${tag}"
+
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    mkdir -p ${raw_data}
+    local/download_and_untar.sh ${raw_data} ${data_url} data_aishell
+    local/download_and_untar.sh ${raw_data} ${data_url} resource_aishell
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Data preparation
+    local/aishell_data_prep.sh ${raw_data}/data_aishell/wav ${raw_data}/data_aishell/transcript ${feats_dir}
+    for x in train dev test; do
+        cp ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org
+        paste -d " " <(cut -f 1 -d" " ${feats_dir}/data/${x}/text.org) <(cut -f 2- -d" " ${feats_dir}/data/${x}/text.org | tr -d " ") \
+            > ${feats_dir}/data/${x}/text
+        utils/text2token.py -n 1 -s 1 ${feats_dir}/data/${x}/text > ${feats_dir}/data/${x}/text.org
+        mv ${feats_dir}/data/${x}/text.org ${feats_dir}/data/${x}/text
+
+        # convert wav.scp text to jsonl
+        scp_file_list_arg="++scp_file_list='[\"${feats_dir}/data/${x}/wav.scp\",\"${feats_dir}/data/${x}/text\"]'"
+        python ../../../funasr/datasets/audio_datasets/scp2jsonl.py \
+        ++data_type_list='["source", "target"]' \
+        ++jsonl_file_out=${feats_dir}/data/${x}/audio_datasets.jsonl \
+        ${scp_file_list_arg}
+    done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Feature and CMVN Generation"
+    python ../../../funasr/bin/compute_audio_cmvn.py \
+    --config-path "${workspace}/conf" \
+    --config-name "${config}" \
+    ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+    ++cmvn_file="${feats_dir}/data/${train_set}/cmvn.json" \
+
+fi
+
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: Dictionary Preparation"
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
+   
+    echo "make a dictionary"
+    echo "<blank>" > ${token_list}
+    echo "<s>" >> ${token_list}
+    echo "</s>" >> ${token_list}
+    utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/$train_set/text | cut -f 2- -d" " | tr " " "\n" \
+        | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
+    echo "<unk>" >> ${token_list}
+fi
+
+# LM Training Stage
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+  echo "stage 4: ASR Training"
+
+  mkdir -p ${exp_dir}/exp/${model_dir}
+  current_time=$(date "+%Y-%m-%d_%H-%M")
+  log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}"
+  echo "log_file: ${log_file}"
+
+  export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
+  gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  torchrun \
+  --nnodes 1 \
+  --nproc_per_node ${gpu_num} \
+  --master_port ${master_port} \
+  ../../../funasr/bin/train.py \
+  --config-path "${workspace}/conf" \
+  --config-name "${config}" \
+  ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+  ++valid_data_set_list="${feats_dir}/data/${valid_set}/audio_datasets.jsonl" \
+  ++tokenizer_conf.token_list="${token_list}" \
+  ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
+  ++output_dir="${exp_dir}/exp/${model_dir}" &> ${log_file}
+fi
+
+
+
+# Testing Stage
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+  echo "stage 5: Inference"
+
+  if [ ${inference_device} == "cuda" ]; then
+      nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  else
+      inference_batch_size=1
+      CUDA_VISIBLE_DEVICES=""
+      for JOB in $(seq ${nj}); do
+          CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
+      done
+  fi
+
+  for dset in ${test_sets}; do
+
+    inference_dir="${exp_dir}/exp/${model_dir}/inference-${inference_checkpoint}/${dset}"
+    _logdir="${inference_dir}/logdir"
+    echo "inference_dir: ${inference_dir}"
+
+    mkdir -p "${_logdir}"
+    data_dir="${feats_dir}/data/${dset}"
+    key_file=${data_dir}/${inference_scp}
+
+    split_scps=
+    for JOB in $(seq "${nj}"); do
+        split_scps+=" ${_logdir}/keys.${JOB}.scp"
+    done
+    utils/split_scp.pl "${key_file}" ${split_scps}
+
+    gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+          id=$((JOB-1))
+          gpuid=${gpuid_list_array[$id]}
+
+          export CUDA_VISIBLE_DEVICES=${gpuid}
+          python ../../../funasr/bin/inference.py \
+          --config-path="${exp_dir}/exp/${model_dir}" \
+          --config-name="config.yaml" \
+          ++init_param="${exp_dir}/exp/${model_dir}/${inference_checkpoint}" \
+          ++tokenizer_conf.token_list="${token_list}" \
+          ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
+          ++input="${_logdir}/keys.${JOB}.scp" \
+          ++output_dir="${inference_dir}/${JOB}" \
+          ++device="${inference_device}" \
+          ++ncpu=1 \
+          ++disable_log=true \
+          ++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
+        }&
+
+    done
+    wait
+
+    mkdir -p ${inference_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then
+          for JOB in $(seq "${nj}"); do
+              cat "${inference_dir}/${JOB}/1best_recog/${f}"
+          done | sort -k1 >"${inference_dir}/1best_recog/${f}"
+        fi
+    done
+
+    echo "Computing WER ..."
+    python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc
+    python utils/postprocess_text_zh.py  ${data_dir}/text ${inference_dir}/1best_recog/text.ref
+    python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer
+    tail -n 3 ${inference_dir}/1best_recog/text.cer
+  done
+
+fi
--- a/modules/python/vendors/FunASR/examples/aishell/conformer/utils
+++ b/modules/python/vendors/FunASR/examples/aishell/conformer/utils
@@ -0,0 +1 @@
+../paraformer/utils
--- a/modules/python/vendors/FunASR/examples/aishell/e_branchformer/README.md
+++ b/modules/python/vendors/FunASR/examples/aishell/e_branchformer/README.md
@@ -0,0 +1,14 @@
+# E-Branchformer Result
+
+## Training Config
+- Feature info: using raw speech, extracting 80 dims fbank online, global cmvn, speed perturb(0.9, 1.0, 1.1), specaugment
+- Train info: lr 0.001, batch_size 10000, 4 gpu(Tesla V100), acc_grad 1, 180 epochs
+- Train config: conf/train_asr_e_branchformer.yaml
+- LM config: LM was not used
+
+## Results (CER)
+
+|   testset   | CER(%)  |
+|:-----------:|:-------:|
+|     dev     |  4.10   |
+|    test     |  4.52   |
--- a/modules/python/vendors/FunASR/examples/aishell/e_branchformer/conf/e_branchformer_12e_6d_2048_256.yaml
+++ b/modules/python/vendors/FunASR/examples/aishell/e_branchformer/conf/e_branchformer_12e_6d_2048_256.yaml
@@ -0,0 +1,118 @@
+# This is an example that demonstrates how to configure a model file.
+# You can modify the configuration according to your own requirements.
+
+# to print the register_table:
+# from funasr.register import tables
+# tables.print()
+
+# network architecture
+model: Branchformer
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+# encoder
+encoder: EBranchformerEncoder
+encoder_conf:
+    output_size: 256
+    attention_heads: 4
+    attention_layer_type: rel_selfattn
+    pos_enc_layer_type: rel_pos
+    rel_pos_type: latest
+    cgmlp_linear_units: 1024
+    cgmlp_conv_kernel: 31
+    use_linear_after_conv: false
+    gate_activation: identity
+    num_blocks: 12
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d
+    layer_drop_rate: 0.0
+    linear_units: 1024
+    positionwise_layer_type: linear
+    use_ffn: true
+    macaron_ffn: true
+    merge_conv_kernel: 31
+
+# decoder
+decoder: TransformerDecoder
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.
+    src_attention_dropout_rate: 0.
+
+
+# frontend related
+frontend: WavFrontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    dither: 0.0
+    lfr_m: 1
+    lfr_n: 1
+
+specaug: SpecAug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 40
+    num_time_mask: 2
+
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 180
+  keep_nbest_models: 10
+  log_interval: 50
+
+optim: adam
+optim_conf:
+   lr: 0.001
+   weight_decay: 0.000001
+scheduler: warmuplr
+scheduler_conf:
+   warmup_steps: 35000
+
+dataset: AudioDataset
+dataset_conf:
+    index_ds: IndexDSJsonl
+    batch_sampler: EspnetStyleBatchSampler
+    batch_type: length # example or length
+    batch_size: 10000 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
+    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,
+    buffer_size: 1024
+    shuffle: True
+    num_workers: 4
+    preprocessor_speech: SpeechPreprocessSpeedPerturb
+    preprocessor_speech_conf:
+      speed_perturb: [0.9, 1.0, 1.1]
+
+tokenizer: CharTokenizer
+tokenizer_conf:
+  unk_symbol: <unk>
+
+
+ctc_conf:
+    dropout_rate: 0.0
+    ctc_type: builtin
+    reduce: true
+    ignore_nan_grad: true
+normalize: null
--- a/modules/python/vendors/FunASR/examples/aishell/e_branchformer/demo_infer.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/e_branchformer/demo_infer.sh
@@ -0,0 +1 @@
+../paraformer/demo_infer.sh
--- a/modules/python/vendors/FunASR/examples/aishell/e_branchformer/demo_train_or_finetune.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/e_branchformer/demo_train_or_finetune.sh
@@ -0,0 +1 @@
+../paraformer/demo_train_or_finetune.sh
--- a/modules/python/vendors/FunASR/examples/aishell/e_branchformer/local/aishell_data_prep.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/e_branchformer/local/aishell_data_prep.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+#. ./path.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <audio-path> <text-path> <output-path>"
+  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
+  exit 1;
+fi
+
+aishell_audio_dir=$1
+aishell_text=$2/aishell_transcript_v0.8.txt
+output_dir=$3
+
+train_dir=$output_dir/data/local/train
+dev_dir=$output_dir/data/local/dev
+test_dir=$output_dir/data/local/test
+tmp_dir=$output_dir/data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
+  echo "Error: $0 requires two directory arguments"
+  exit 1;
+fi
+
+# find wav audio file for train, dev and test resp.
+find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=`cat $tmp_dir/wav.flist | wc -l`
+[ $n -ne 141925 ] && \
+  echo Warning: expected 141925 data data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+
+# Transcriptions preparation
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+done
+
+mkdir -p $output_dir/data/train $output_dir/data/dev $output_dir/data/test
+
+for f in wav.scp text; do
+  cp $train_dir/$f $output_dir/data/train/$f || exit 1;
+  cp $dev_dir/$f $output_dir/data/dev/$f || exit 1;
+  cp $test_dir/$f $output_dir/data/test/$f || exit 1;
+done
+
+echo "$0: AISHELL data preparation succeeded"
+exit 0;
--- a/modules/python/vendors/FunASR/examples/aishell/e_branchformer/local/download_and_untar.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/e_branchformer/local/download_and_untar.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
+#             2017  Xingyu Na
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="data_aishell resource_aishell"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+# sizes of the archive files in bytes.
+sizes="15582913665 1246920"
+
+if [ -f $data/$part.tgz ]; then
+  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tgz
+  else
+    echo "$data/$part.tgz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tgz ]; then
+  if ! command -v wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tgz
+  echo "$0: downloading data from $full_url.  This may take some time, please be patient."
+
+  cd $data || exit 1
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+cd $data || exit 1
+
+if ! tar -xvzf $part.tgz; then
+  echo "$0: error un-tarring archive $data/$part.tgz"
+  exit 1;
+fi
+
+touch $data/$part/.complete
+
+if [ $part == "data_aishell" ]; then
+  cd $data/$part/wav || exit 1
+  for wav in ./*.tar.gz; do
+    echo "Extracting wav from $wav"
+    tar -zxf $wav && rm $wav
+  done
+fi
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
+  rm $data/$part.tgz
+fi
+
+exit 0;
--- a/modules/python/vendors/FunASR/examples/aishell/e_branchformer/run.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/e_branchformer/run.sh
@@ -0,0 +1,203 @@
+#!/usr/bin/env bash
+
+
+CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+# general configuration
+feats_dir="../DATA" #feature output dictionary
+exp_dir=`pwd`
+lang=zh
+token_type=char
+stage=0
+stop_stage=5
+
+# feature configuration
+nj=32
+
+inference_device="cuda" #"cpu"
+inference_checkpoint="model.pt.avg10"
+inference_scp="wav.scp"
+inference_batch_size=1
+
+# data
+raw_data=../raw_data
+data_url=www.openslr.org/resources/33
+
+# exp tag
+tag="exp1"
+workspace=`pwd`
+
+master_port=12345
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train
+valid_set=dev
+test_sets="dev test"
+
+config=e_branchformer_12e_6d_2048_256.yaml
+model_dir="baseline_$(basename "${config}" .yaml)_${lang}_${token_type}_${tag}"
+
+
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    mkdir -p ${raw_data}
+    local/download_and_untar.sh ${raw_data} ${data_url} data_aishell
+    local/download_and_untar.sh ${raw_data} ${data_url} resource_aishell
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Data preparation
+    local/aishell_data_prep.sh ${raw_data}/data_aishell/wav ${raw_data}/data_aishell/transcript ${feats_dir}
+    for x in train dev test; do
+        cp ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org
+        paste -d " " <(cut -f 1 -d" " ${feats_dir}/data/${x}/text.org) <(cut -f 2- -d" " ${feats_dir}/data/${x}/text.org | tr -d " ") \
+            > ${feats_dir}/data/${x}/text
+        utils/text2token.py -n 1 -s 1 ${feats_dir}/data/${x}/text > ${feats_dir}/data/${x}/text.org
+        mv ${feats_dir}/data/${x}/text.org ${feats_dir}/data/${x}/text
+
+        # convert wav.scp text to jsonl
+        scp_file_list_arg="++scp_file_list='[\"${feats_dir}/data/${x}/wav.scp\",\"${feats_dir}/data/${x}/text\"]'"
+        python ../../../funasr/datasets/audio_datasets/scp2jsonl.py \
+        ++data_type_list='["source", "target"]' \
+        ++jsonl_file_out=${feats_dir}/data/${x}/audio_datasets.jsonl \
+        ${scp_file_list_arg}
+    done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Feature and CMVN Generation"
+    python ../../../funasr/bin/compute_audio_cmvn.py \
+    --config-path "${workspace}/conf" \
+    --config-name "${config}" \
+    ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+    ++cmvn_file="${feats_dir}/data/${train_set}/cmvn.json" \
+
+fi
+
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: Dictionary Preparation"
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
+
+    echo "make a dictionary"
+    echo "<blank>" > ${token_list}
+    echo "<s>" >> ${token_list}
+    echo "</s>" >> ${token_list}
+    utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/$train_set/text | cut -f 2- -d" " | tr " " "\n" \
+        | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
+    echo "<unk>" >> ${token_list}
+fi
+
+# LM Training Stage
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+  echo "stage 4: ASR Training"
+
+  mkdir -p ${exp_dir}/exp/${model_dir}
+  current_time=$(date "+%Y-%m-%d_%H-%M")
+  log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}"
+  echo "log_file: ${log_file}"
+
+  export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
+  gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  torchrun \
+  --nnodes 1 \
+  --nproc_per_node ${gpu_num} \
+  --master_port ${master_port} \
+  ../../../funasr/bin/train.py \
+  --config-path "${workspace}/conf" \
+  --config-name "${config}" \
+  ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+  ++valid_data_set_list="${feats_dir}/data/${valid_set}/audio_datasets.jsonl" \
+  ++tokenizer_conf.token_list="${token_list}" \
+  ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
+  ++output_dir="${exp_dir}/exp/${model_dir}" &> ${log_file}
+fi
+
+
+
+# Testing Stage
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+  echo "stage 5: Inference"
+
+  if [ ${inference_device} == "cuda" ]; then
+      nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  else
+      inference_batch_size=1
+      CUDA_VISIBLE_DEVICES=""
+      for JOB in $(seq ${nj}); do
+          CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
+      done
+  fi
+
+  for dset in ${test_sets}; do
+
+    inference_dir="${exp_dir}/exp/${model_dir}/inference-${inference_checkpoint}/${dset}"
+    _logdir="${inference_dir}/logdir"
+    echo "inference_dir: ${inference_dir}"
+
+    mkdir -p "${_logdir}"
+    data_dir="${feats_dir}/data/${dset}"
+    key_file=${data_dir}/${inference_scp}
+
+    split_scps=
+    for JOB in $(seq "${nj}"); do
+        split_scps+=" ${_logdir}/keys.${JOB}.scp"
+    done
+    utils/split_scp.pl "${key_file}" ${split_scps}
+
+    gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+          id=$((JOB-1))
+          gpuid=${gpuid_list_array[$id]}
+
+          export CUDA_VISIBLE_DEVICES=${gpuid}
+          python ../../../funasr/bin/inference.py \
+          --config-path="${exp_dir}/exp/${model_dir}" \
+          --config-name="config.yaml" \
+          ++init_param="${exp_dir}/exp/${model_dir}/${inference_checkpoint}" \
+          ++tokenizer_conf.token_list="${token_list}" \
+          ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
+          ++input="${_logdir}/keys.${JOB}.scp" \
+          ++output_dir="${inference_dir}/${JOB}" \
+          ++device="${inference_device}" \
+          ++ncpu=1 \
+          ++disable_log=true \
+          ++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
+        }&
+
+    done
+    wait
+
+    mkdir -p ${inference_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then
+          for JOB in $(seq "${nj}"); do
+              cat "${inference_dir}/${JOB}/1best_recog/${f}"
+          done | sort -k1 >"${inference_dir}/1best_recog/${f}"
+        fi
+    done
+
+    echo "Computing WER ..."
+    python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc
+    python utils/postprocess_text_zh.py  ${data_dir}/text ${inference_dir}/1best_recog/text.ref
+    python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer
+    tail -n 3 ${inference_dir}/1best_recog/text.cer
+  done
+
+fi
--- a/modules/python/vendors/FunASR/examples/aishell/e_branchformer/utils
+++ b/modules/python/vendors/FunASR/examples/aishell/e_branchformer/utils
@@ -0,0 +1 @@
+../paraformer/utils
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/README.md
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/README.md
@@ -0,0 +1,24 @@
+# Paraformer
+pretrained model in [ModelScope](https://www.modelscope.cn/home)：[speech_paraformer_asr_nat-aishell1-pytorch](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell1-pytorch/summary)
+
+## Training Config
+- Feature info: using 80 dims fbank, global cmvn, speed perturb(0.9, 1.0, 1.1), specaugment
+- Train info: lr 5e-4, batch_size 25000, 2 gpu(Tesla V100), acc_grad 1, 50 epochs
+- Train config: conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml
+- LM config: LM was not used
+
+## Results (CER)
+
+- Decode config: conf/decode_asr_transformer_noctc_1best.yaml (ctc weight:0.0)
+
+|   testset   | CER(%)  |
+|:-----------:|:-------:|
+|     dev     |  4.66   |
+|    test     |  5.11   |
+
+- Decode config: conf/decode_asr_transformer.yaml (ctc weight:0.5)
+
+|   testset   | CER(%)  |
+|:-----------:|:-------:|
+|     dev     |  4.52   |
+|    test     |  4.94   |
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/conf/paraformer_conformer_12e_6d_2048_256.yaml
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/conf/paraformer_conformer_12e_6d_2048_256.yaml
@@ -0,0 +1,118 @@
+
+# network architecture
+model: Paraformer
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1
+    length_normalized_loss: false
+    predictor_weight: 1.0
+    sampling_ratio: 0.4
+    use_1st_decoder_loss: true
+
+# encoder
+encoder: ConformerEncoder
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder architecture type
+    normalize_before: true
+    pos_enc_layer_type: rel_pos
+    selfattention_layer_type: rel_selfattn
+    activation_type: swish
+    macaron_style: true
+    use_cnn_module: true
+    cnn_module_kernel: 15
+
+# decoder
+decoder: ParaformerSANDecoder
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# predictor
+predictor: CifPredictor
+predictor_conf:
+    idim: 256
+    threshold: 1.0
+    l_order: 1
+    r_order: 1
+    tail_threshold: 0.45
+
+# frontend related
+frontend: WavFrontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 1
+    lfr_n: 1
+
+specaug: SpecAug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 40
+    num_time_mask: 2
+
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 150
+  keep_nbest_models: 10
+  avg_nbest_model: 10
+  log_interval: 50
+
+optim: adam
+optim_conf:
+   lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+   warmup_steps: 30000
+
+dataset: AudioDataset
+dataset_conf:
+    index_ds: IndexDSJsonl
+    batch_sampler: EspnetStyleBatchSampler
+    batch_type: length # example or length
+    batch_size: 25000 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
+    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,
+    buffer_size: 1024
+    shuffle: True
+    num_workers: 4
+    preprocessor_speech: SpeechPreprocessSpeedPerturb
+    preprocessor_speech_conf:
+      speed_perturb: [0.9, 1.0, 1.1]
+
+tokenizer: CharTokenizer
+tokenizer_conf:
+  unk_symbol: <unk>
+
+ctc_conf:
+    dropout_rate: 0.0
+    ctc_type: builtin
+    reduce: true
+    ignore_nan_grad: true
+normalize: null
+
+
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/demo_infer.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/demo_infer.sh
@@ -0,0 +1,15 @@
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+#  MIT License  (https://opensource.org/licenses/MIT)
+
+
+
+python -m funasr.bin.inference \
+--config-path="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3" \
+--config-name="config.yaml" \
++init_param="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3/model.pt.ep38" \
++tokenizer_conf.token_list="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/zh_token_list/char/tokens.txt" \
++frontend_conf.cmvn_file="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/train/am.mvn" \
++input="/mnt/nfs/zhifu.gzf/data/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav" \
++output_dir="./outputs/debug" \
++device="cuda:0" \
+
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/demo_train_or_finetune.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/demo_train_or_finetune.sh
@@ -0,0 +1,51 @@
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+#  MIT License  (https://opensource.org/licenses/MIT)
+
+
+# which gpu to train or finetune
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+# data dir, which contains: train.json, val.json, tokens.jsonl/tokens.txt, am.mvn
+data_dir="/Users/zhifu/funasr1.0/data/list"
+
+## generate jsonl from wav.scp and text.txt
+#python -m funasr.datasets.audio_datasets.scp2jsonl \
+#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
+#++data_type_list='["source", "target"]' \
+#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
+
+train_data="${data_dir}/train.jsonl"
+val_data="${data_dir}/val.jsonl"
+tokens="${data_dir}/tokens.json"
+cmvn_file="${data_dir}/am.mvn"
+
+# exp output dir
+output_dir="/Users/zhifu/exp"
+log_file="${output_dir}/log.txt"
+
+workspace=`pwd`
+config="paraformer_conformer_12e_6d_2048_256.yaml"
+
+init_param="${output_dir}/model.pt"
+
+mkdir -p ${output_dir}
+echo "log_file: ${log_file}"
+
+torchrun \
+--nnodes 1 \
+--nproc_per_node ${gpu_num} \
+../../../funasr/bin/train.py \
+--config-path "${workspace}/conf" \
+--config-name "${config}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++dataset_conf.batch_size=32 \
++dataset_conf.batch_type="example" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=150 \
++optim_conf.lr=0.0002 \
++init_param="${init_param}" \
++output_dir="${output_dir}" &> ${log_file}
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/local/aishell_data_prep.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/local/aishell_data_prep.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+#. ./path.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <audio-path> <text-path> <output-path>"
+  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
+  exit 1;
+fi
+
+aishell_audio_dir=$1
+aishell_text=$2/aishell_transcript_v0.8.txt
+output_dir=$3
+
+train_dir=$output_dir/data/local/train
+dev_dir=$output_dir/data/local/dev
+test_dir=$output_dir/data/local/test
+tmp_dir=$output_dir/data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
+  echo "Error: $0 requires two directory arguments"
+  exit 1;
+fi
+
+# find wav audio file for train, dev and test resp.
+find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=`cat $tmp_dir/wav.flist | wc -l`
+[ $n -ne 141925 ] && \
+  echo Warning: expected 141925 data data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+
+# Transcriptions preparation
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+done
+
+mkdir -p $output_dir/data/train $output_dir/data/dev $output_dir/data/test
+
+for f in wav.scp text; do
+  cp $train_dir/$f $output_dir/data/train/$f || exit 1;
+  cp $dev_dir/$f $output_dir/data/dev/$f || exit 1;
+  cp $test_dir/$f $output_dir/data/test/$f || exit 1;
+done
+
+echo "$0: AISHELL data preparation succeeded"
+exit 0;
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/local/download_and_untar.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/local/download_and_untar.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
+#             2017  Xingyu Na
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="data_aishell resource_aishell"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+# sizes of the archive files in bytes.
+sizes="15582913665 1246920"
+
+if [ -f $data/$part.tgz ]; then
+  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tgz
+  else
+    echo "$data/$part.tgz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tgz ]; then
+  if ! command -v wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tgz
+  echo "$0: downloading data from $full_url.  This may take some time, please be patient."
+
+  cd $data || exit 1
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+cd $data || exit 1
+
+if ! tar -xvzf $part.tgz; then
+  echo "$0: error un-tarring archive $data/$part.tgz"
+  exit 1;
+fi
+
+touch $data/$part/.complete
+
+if [ $part == "data_aishell" ]; then
+  cd $data/$part/wav || exit 1
+  for wav in ./*.tar.gz; do
+    echo "Extracting wav from $wav"
+    tar -zxf $wav && rm $wav
+  done
+fi
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
+  rm $data/$part.tgz
+fi
+
+exit 0;
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/run.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/run.sh
@@ -0,0 +1,201 @@
+#!/usr/bin/env bash
+
+
+CUDA_VISIBLE_DEVICES="0,1"
+
+# general configuration
+feats_dir="../DATA" #feature output dictionary
+exp_dir=`pwd`
+lang=zh
+token_type=char
+stage=0
+stop_stage=5
+
+# feature configuration
+nj=32
+
+inference_device="cuda" #"cpu"
+inference_checkpoint="model.pt.avg10"
+inference_scp="wav.scp"
+inference_batch_size=32
+
+# data
+raw_data=../raw_data
+data_url=www.openslr.org/resources/33
+
+# exp tag
+tag="exp1"
+workspace=`pwd`
+
+master_port=12345
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train
+valid_set=dev
+test_sets="dev test"
+
+config=paraformer_conformer_12e_6d_2048_256.yaml
+model_dir="baseline_$(basename "${config}" .yaml)_${lang}_${token_type}_${tag}"
+
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    mkdir -p ${raw_data}
+    local/download_and_untar.sh ${raw_data} ${data_url} data_aishell
+    local/download_and_untar.sh ${raw_data} ${data_url} resource_aishell
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Data preparation
+    local/aishell_data_prep.sh ${raw_data}/data_aishell/wav ${raw_data}/data_aishell/transcript ${feats_dir}
+    for x in train dev test; do
+        cp ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org
+        paste -d " " <(cut -f 1 -d" " ${feats_dir}/data/${x}/text.org) <(cut -f 2- -d" " ${feats_dir}/data/${x}/text.org | tr -d " ") \
+            > ${feats_dir}/data/${x}/text
+        utils/text2token.py -n 1 -s 1 ${feats_dir}/data/${x}/text > ${feats_dir}/data/${x}/text.org
+        mv ${feats_dir}/data/${x}/text.org ${feats_dir}/data/${x}/text
+
+        # convert wav.scp text to jsonl
+        scp_file_list_arg="++scp_file_list='[\"${feats_dir}/data/${x}/wav.scp\",\"${feats_dir}/data/${x}/text\"]'"
+        python ../../../funasr/datasets/audio_datasets/scp2jsonl.py \
+        ++data_type_list='["source", "target"]' \
+        ++jsonl_file_out=${feats_dir}/data/${x}/audio_datasets.jsonl \
+        ${scp_file_list_arg}
+    done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Feature and CMVN Generation"
+    python ../../../funasr/bin/compute_audio_cmvn.py \
+    --config-path "${workspace}/conf" \
+    --config-name "${config}" \
+    ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+    ++cmvn_file="${feats_dir}/data/${train_set}/cmvn.json"
+fi
+
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: Dictionary Preparation"
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
+   
+    echo "make a dictionary"
+    echo "<blank>" > ${token_list}
+    echo "<s>" >> ${token_list}
+    echo "</s>" >> ${token_list}
+    utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/$train_set/text | cut -f 2- -d" " | tr " " "\n" \
+        | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
+    echo "<unk>" >> ${token_list}
+fi
+
+# LM Training Stage
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+  echo "stage 4: ASR Training"
+
+  mkdir -p ${exp_dir}/exp/${model_dir}
+  current_time=$(date "+%Y-%m-%d_%H-%M")
+  log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}"
+  echo "log_file: ${log_file}"
+
+  export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
+  gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  torchrun \
+  --nnodes 1 \
+  --nproc_per_node ${gpu_num} \
+  --master_port ${master_port} \
+  ../../../funasr/bin/train.py \
+  --config-path "${workspace}/conf" \
+  --config-name "${config}" \
+  ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+  ++valid_data_set_list="${feats_dir}/data/${valid_set}/audio_datasets.jsonl" \
+  ++tokenizer_conf.token_list="${token_list}" \
+  ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
+  ++output_dir="${exp_dir}/exp/${model_dir}" &> ${log_file}
+fi
+
+
+
+# Testing Stage
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+  echo "stage 5: Inference"
+
+  if [ ${inference_device} == "cuda" ]; then
+      nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  else
+      inference_batch_size=1
+      CUDA_VISIBLE_DEVICES=""
+      for JOB in $(seq ${nj}); do
+          CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
+      done
+  fi
+
+  for dset in ${test_sets}; do
+
+    inference_dir="${exp_dir}/exp/${model_dir}/inference-${inference_checkpoint}/${dset}"
+    _logdir="${inference_dir}/logdir"
+    echo "inference_dir: ${inference_dir}"
+
+    mkdir -p "${_logdir}"
+    data_dir="${feats_dir}/data/${dset}"
+    key_file=${data_dir}/${inference_scp}
+
+    split_scps=
+    for JOB in $(seq "${nj}"); do
+        split_scps+=" ${_logdir}/keys.${JOB}.scp"
+    done
+    utils/split_scp.pl "${key_file}" ${split_scps}
+
+    gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+          id=$((JOB-1))
+          gpuid=${gpuid_list_array[$id]}
+
+          export CUDA_VISIBLE_DEVICES=${gpuid}
+          python ../../../funasr/bin/inference.py \
+          --config-path="${exp_dir}/exp/${model_dir}" \
+          --config-name="config.yaml" \
+          ++init_param="${exp_dir}/exp/${model_dir}/${inference_checkpoint}" \
+          ++tokenizer_conf.token_list="${token_list}" \
+          ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
+          ++input="${_logdir}/keys.${JOB}.scp" \
+          ++output_dir="${inference_dir}/${JOB}" \
+          ++device="${inference_device}" \
+          ++ncpu=1 \
+          ++disable_log=true \
+          ++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
+        }&
+
+    done
+    wait
+
+    mkdir -p ${inference_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then
+          for JOB in $(seq "${nj}"); do
+              cat "${inference_dir}/${JOB}/1best_recog/${f}"
+          done | sort -k1 >"${inference_dir}/1best_recog/${f}"
+        fi
+    done
+
+    echo "Computing WER ..."
+    python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc
+    python utils/postprocess_text_zh.py  ${data_dir}/text ${inference_dir}/1best_recog/text.ref
+    python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer
+    tail -n 3 ${inference_dir}/1best_recog/text.cer
+  done
+
+fi
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/compute_wer.py
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/compute_wer.py
@@ -0,0 +1,197 @@
+import os
+import numpy as np
+import sys
+
+
+def compute_wer(ref_file, hyp_file, cer_detail_file):
+    rst = {
+        "Wrd": 0,
+        "Corr": 0,
+        "Ins": 0,
+        "Del": 0,
+        "Sub": 0,
+        "Snt": 0,
+        "Err": 0.0,
+        "S.Err": 0.0,
+        "wrong_words": 0,
+        "wrong_sentences": 0,
+    }
+
+    hyp_dict = {}
+    ref_dict = {}
+    with open(hyp_file, "r") as hyp_reader:
+        for line in hyp_reader:
+            key = line.strip().split()[0]
+            value = line.strip().split()[1:]
+            hyp_dict[key] = value
+    with open(ref_file, "r") as ref_reader:
+        for line in ref_reader:
+            key = line.strip().split()[0]
+            value = line.strip().split()[1:]
+            ref_dict[key] = value
+
+    cer_detail_writer = open(cer_detail_file, "w")
+    for hyp_key in hyp_dict:
+        if hyp_key in ref_dict:
+            out_item = compute_wer_by_line(hyp_dict[hyp_key], ref_dict[hyp_key])
+            rst["Wrd"] += out_item["nwords"]
+            rst["Corr"] += out_item["cor"]
+            rst["wrong_words"] += out_item["wrong"]
+            rst["Ins"] += out_item["ins"]
+            rst["Del"] += out_item["del"]
+            rst["Sub"] += out_item["sub"]
+            rst["Snt"] += 1
+            if out_item["wrong"] > 0:
+                rst["wrong_sentences"] += 1
+            cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + "\n")
+            cer_detail_writer.write(
+                "ref:" + "\t" + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + "\n"
+            )
+            cer_detail_writer.write(
+                "hyp:" + "\t" + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + "\n"
+            )
+
+    if rst["Wrd"] > 0:
+        rst["Err"] = round(rst["wrong_words"] * 100 / rst["Wrd"], 2)
+    if rst["Snt"] > 0:
+        rst["S.Err"] = round(rst["wrong_sentences"] * 100 / rst["Snt"], 2)
+
+    cer_detail_writer.write("\n")
+    cer_detail_writer.write(
+        "%WER "
+        + str(rst["Err"])
+        + " [ "
+        + str(rst["wrong_words"])
+        + " / "
+        + str(rst["Wrd"])
+        + ", "
+        + str(rst["Ins"])
+        + " ins, "
+        + str(rst["Del"])
+        + " del, "
+        + str(rst["Sub"])
+        + " sub ]"
+        + "\n"
+    )
+    cer_detail_writer.write(
+        "%SER "
+        + str(rst["S.Err"])
+        + " [ "
+        + str(rst["wrong_sentences"])
+        + " / "
+        + str(rst["Snt"])
+        + " ]"
+        + "\n"
+    )
+    cer_detail_writer.write(
+        "Scored "
+        + str(len(hyp_dict))
+        + " sentences, "
+        + str(len(hyp_dict) - rst["Snt"])
+        + " not present in hyp."
+        + "\n"
+    )
+
+
+def compute_wer_by_line(hyp, ref):
+    hyp = list(map(lambda x: x.lower(), hyp))
+    ref = list(map(lambda x: x.lower(), ref))
+
+    len_hyp = len(hyp)
+    len_ref = len(ref)
+
+    cost_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int16)
+
+    ops_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int8)
+
+    for i in range(len_hyp + 1):
+        cost_matrix[i][0] = i
+    for j in range(len_ref + 1):
+        cost_matrix[0][j] = j
+
+    for i in range(1, len_hyp + 1):
+        for j in range(1, len_ref + 1):
+            if hyp[i - 1] == ref[j - 1]:
+                cost_matrix[i][j] = cost_matrix[i - 1][j - 1]
+            else:
+                substitution = cost_matrix[i - 1][j - 1] + 1
+                insertion = cost_matrix[i - 1][j] + 1
+                deletion = cost_matrix[i][j - 1] + 1
+
+                compare_val = [substitution, insertion, deletion]
+
+                min_val = min(compare_val)
+                operation_idx = compare_val.index(min_val) + 1
+                cost_matrix[i][j] = min_val
+                ops_matrix[i][j] = operation_idx
+
+    match_idx = []
+    i = len_hyp
+    j = len_ref
+    rst = {"nwords": len_ref, "cor": 0, "wrong": 0, "ins": 0, "del": 0, "sub": 0}
+    while i >= 0 or j >= 0:
+        i_idx = max(0, i)
+        j_idx = max(0, j)
+
+        if ops_matrix[i_idx][j_idx] == 0:  # correct
+            if i - 1 >= 0 and j - 1 >= 0:
+                match_idx.append((j - 1, i - 1))
+                rst["cor"] += 1
+
+            i -= 1
+            j -= 1
+
+        elif ops_matrix[i_idx][j_idx] == 2:  # insert
+            i -= 1
+            rst["ins"] += 1
+
+        elif ops_matrix[i_idx][j_idx] == 3:  # delete
+            j -= 1
+            rst["del"] += 1
+
+        elif ops_matrix[i_idx][j_idx] == 1:  # substitute
+            i -= 1
+            j -= 1
+            rst["sub"] += 1
+
+        if i < 0 and j >= 0:
+            rst["del"] += 1
+        elif j < 0 and i >= 0:
+            rst["ins"] += 1
+
+    match_idx.reverse()
+    wrong_cnt = cost_matrix[len_hyp][len_ref]
+    rst["wrong"] = wrong_cnt
+
+    return rst
+
+
+def print_cer_detail(rst):
+    return (
+        "("
+        + "nwords="
+        + str(rst["nwords"])
+        + ",cor="
+        + str(rst["cor"])
+        + ",ins="
+        + str(rst["ins"])
+        + ",del="
+        + str(rst["del"])
+        + ",sub="
+        + str(rst["sub"])
+        + ") corr:"
+        + "{:.2%}".format(rst["cor"] / rst["nwords"])
+        + ",cer:"
+        + "{:.2%}".format(rst["wrong"] / rst["nwords"])
+    )
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        print("usage : python compute-wer.py test.ref test.hyp test.wer")
+        sys.exit(0)
+
+    ref_file = sys.argv[1]
+    hyp_file = sys.argv[2]
+    cer_detail_file = sys.argv[3]
+    compute_wer(ref_file, hyp_file, cer_detail_file)
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/extract_embeds.py
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/extract_embeds.py
@@ -0,0 +1,49 @@
+from transformers import AutoTokenizer, AutoModel, pipeline
+import numpy as np
+import sys
+import os
+import torch
+from kaldiio import WriteHelper
+import re
+
+text_file_json = sys.argv[1]
+out_ark = sys.argv[2]
+out_scp = sys.argv[3]
+out_shape = sys.argv[4]
+device = int(sys.argv[5])
+model_path = sys.argv[6]
+
+model = AutoModel.from_pretrained(model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+extractor = pipeline(task="feature-extraction", model=model, tokenizer=tokenizer, device=device)
+
+with open(text_file_json, "r") as f:
+    js = f.readlines()
+
+
+f_shape = open(out_shape, "w")
+with WriteHelper("ark,scp:{},{}".format(out_ark, out_scp)) as writer:
+    with torch.no_grad():
+        for idx, line in enumerate(js):
+            id, tokens = line.strip().split(" ", 1)
+            tokens = re.sub(" ", "", tokens.strip())
+            tokens = " ".join([j for j in tokens])
+            token_num = len(tokens.split(" "))
+            outputs = extractor(tokens)
+            outputs = np.array(outputs)
+            embeds = outputs[0, 1:-1, :]
+
+            token_num_embeds, dim = embeds.shape
+            if token_num == token_num_embeds:
+                writer(id, embeds)
+                shape_line = "{} {},{}\n".format(id, token_num_embeds, dim)
+                f_shape.write(shape_line)
+            else:
+                print(
+                    "{}, size has changed, {}, {}, {}".format(
+                        id, token_num, token_num_embeds, tokens
+                    )
+                )
+
+
+f_shape.close()
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/filter_scp.pl
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/filter_scp.pl
@@ -0,0 +1,87 @@
+#!/usr/bin/env perl
+# Copyright 2010-2012 Microsoft Corporation
+#                     Johns Hopkins University (author: Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script takes a list of utterance-ids or any file whose first field
+# of each line is an utterance-id, and filters an scp
+# file (or any file whose "n-th" field is an utterance id), printing
+# out only those lines whose "n-th" field is in id_list. The index of
+# the "n-th" field is 1, by default, but can be changed by using
+# the -f <n> switch
+
+$exclude = 0;
+$field = 1;
+$shifted = 0;
+
+do {
+  $shifted=0;
+  if ($ARGV[0] eq "--exclude") {
+    $exclude = 1;
+    shift @ARGV;
+    $shifted=1;
+  }
+  if ($ARGV[0] eq "-f") {
+    $field = $ARGV[1];
+    shift @ARGV; shift @ARGV;
+    $shifted=1
+  }
+} while ($shifted);
+
+if(@ARGV < 1 || @ARGV > 2) {
+  die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
+      "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
+      "Note: only the first field of each line in id_list matters.  With --exclude, prints\n" .
+      "only the lines that were *not* in id_list.\n" .
+      "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
+      "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
+      "-f option, add 1 to the argument.\n" .
+      "See also: scripts/filter_scp.pl .\n";
+}
+
+
+$idlist = shift @ARGV;
+open(F, "<$idlist") || die "Could not open id-list file $idlist";
+while(<F>) {
+  @A = split;
+  @A>=1 || die "Invalid id-list file line $_";
+  $seen{$A[0]} = 1;
+}
+
+if ($field == 1) { # Treat this as special case, since it is common.
+  while(<>) {
+    $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
+    # $1 is what we filter on.
+    if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
+      print $_;
+    }
+  }
+} else {
+  while(<>) {
+    @A = split;
+    @A > 0 || die "Invalid scp file line $_";
+    @A >= $field || die "Invalid scp file line $_";
+    if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
+      print $_;
+    }
+  }
+}
+
+# tests:
+# the following should print "foo 1"
+# ( echo foo 1; echo bar 2 ) | scripts/filter_scp.pl <(echo foo)
+# the following should print "bar 2".
+# ( echo foo 1; echo bar 2 ) | scripts/filter_scp.pl -f 2 <(echo 2)
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/fix_data.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/fix_data.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+echo "$0 $@"
+data_dir=$1
+
+if [ ! -f ${data_dir}/wav.scp ]; then
+  echo "$0: wav.scp is not found"
+  exit 1;
+fi
+
+if [ ! -f ${data_dir}/text ]; then
+  echo "$0: text is not found"
+  exit 1;
+fi
+
+
+
+mkdir -p ${data_dir}/.backup
+
+awk '{print $1}' ${data_dir}/wav.scp > ${data_dir}/.backup/wav_id
+awk '{print $1}' ${data_dir}/text > ${data_dir}/.backup/text_id
+
+sort ${data_dir}/.backup/wav_id ${data_dir}/.backup/text_id | uniq -d > ${data_dir}/.backup/id
+
+cp ${data_dir}/wav.scp ${data_dir}/.backup/wav.scp
+cp ${data_dir}/text ${data_dir}/.backup/text
+
+mv ${data_dir}/wav.scp ${data_dir}/wav.scp.bak
+mv ${data_dir}/text ${data_dir}/text.bak
+
+utils/filter_scp.pl -f 1 ${data_dir}/.backup/id ${data_dir}/wav.scp.bak | sort -k1,1 -u > ${data_dir}/wav.scp
+utils/filter_scp.pl -f 1 ${data_dir}/.backup/id ${data_dir}/text.bak | sort -k1,1 -u > ${data_dir}/text
+
+rm ${data_dir}/wav.scp.bak
+rm ${data_dir}/text.bak
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/fix_data_feat.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/fix_data_feat.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+echo "$0 $@"
+data_dir=$1
+
+if [ ! -f ${data_dir}/feats.scp ]; then
+  echo "$0: feats.scp is not found"
+  exit 1;
+fi
+
+if [ ! -f ${data_dir}/text ]; then
+  echo "$0: text is not found"
+  exit 1;
+fi
+
+if [ ! -f ${data_dir}/speech_shape ]; then
+  echo "$0: feature lengths is not found"
+  exit 1;
+fi
+
+if [ ! -f ${data_dir}/text_shape ]; then
+  echo "$0: text lengths is not found"
+  exit 1;
+fi
+
+mkdir -p ${data_dir}/.backup
+
+awk '{print $1}' ${data_dir}/feats.scp > ${data_dir}/.backup/wav_id
+awk '{print $1}' ${data_dir}/text > ${data_dir}/.backup/text_id
+
+sort ${data_dir}/.backup/wav_id ${data_dir}/.backup/text_id | uniq -d > ${data_dir}/.backup/id
+
+cp ${data_dir}/feats.scp ${data_dir}/.backup/feats.scp
+cp ${data_dir}/text ${data_dir}/.backup/text
+cp ${data_dir}/speech_shape ${data_dir}/.backup/speech_shape
+cp ${data_dir}/text_shape ${data_dir}/.backup/text_shape
+
+mv ${data_dir}/feats.scp ${data_dir}/feats.scp.bak
+mv ${data_dir}/text ${data_dir}/text.bak
+mv ${data_dir}/speech_shape ${data_dir}/speech_shape.bak
+mv ${data_dir}/text_shape ${data_dir}/text_shape.bak
+
+utils/filter_scp.pl -f 1 ${data_dir}/.backup/id ${data_dir}/feats.scp.bak | sort -k1,1 -u > ${data_dir}/feats.scp
+utils/filter_scp.pl -f 1 ${data_dir}/.backup/id ${data_dir}/text.bak | sort -k1,1 -u > ${data_dir}/text
+utils/filter_scp.pl -f 1 ${data_dir}/.backup/id ${data_dir}/speech_shape.bak | sort -k1,1 -u > ${data_dir}/speech_shape
+utils/filter_scp.pl -f 1 ${data_dir}/.backup/id ${data_dir}/text_shape.bak | sort -k1,1 -u > ${data_dir}/text_shape
+
+rm ${data_dir}/feats.scp.bak
+rm ${data_dir}/text.bak
+rm ${data_dir}/speech_shape.bak
+rm ${data_dir}/text_shape.bak
+
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/parse_options.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/parse_options.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
+#                 Arnab Ghoshal, Karel Vesely
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the
+# $help_message variable (if defined).
+
+
+###
+### The --config file options have lower priority to command line
+### options, so we need to import them first...
+###
+
+# Now import all the configs specified by command-line, in left-to-right order
+for ((argpos=1; argpos<$#; argpos++)); do
+  if [ "${!argpos}" == "--config" ]; then
+    argpos_plus1=$((argpos+1))
+    config=${!argpos_plus1}
+    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
+    . $config  # source the config file.
+  fi
+done
+
+
+###
+### Now we process the command line options
+###
+while true; do
+  [ -z "${1:-}" ] && break;  # break if there are no arguments
+  case "$1" in
+    # If the enclosing script is called with --help option, print the help
+    # message and exit.  Scripts should put help messages in $help_message
+    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
+      else printf "$help_message\n" 1>&2 ; fi;
+      exit 0 ;;
+    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
+      exit 1 ;;
+    # If the first command-line argument begins with "--" (e.g. --foo-bar),
+    # then work out the variable name as $name, which will equal "foo_bar".
+    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
+      # Next we test whether the variable in question is undefned-- if so it's
+      # an invalid option and we die.  Note: $0 evaluates to the name of the
+      # enclosing script.
+      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+      # is undefined.  We then have to wrap this test inside "eval" because
+      # foo_bar is itself inside a variable ($name).
+      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+
+      oldval="`eval echo \\$$name`";
+      # Work out whether we seem to be expecting a Boolean argument.
+      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
+        was_bool=true;
+      else
+        was_bool=false;
+      fi
+
+      # Set the variable to the right value-- the escaped quotes make it work if
+      # the option had spaces, like --cmd "queue.pl -sync y"
+      eval $name=\"$2\";
+
+      # Check that Boolean-valued arguments are really Boolean.
+      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+        exit 1;
+      fi
+      shift 2;
+      ;;
+  *) break;
+  esac
+done
+
+
+# Check for an empty argument to the --cmd option, which can easily occur as a
+# result of scripting errors.
+[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
+
+
+true; # so this script returns exit code 0.
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/postprocess_text_zh.py
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/postprocess_text_zh.py
@@ -0,0 +1,30 @@
+import sys
+import re
+
+in_f = sys.argv[1]
+out_f = sys.argv[2]
+
+
+with open(in_f, "r", encoding="utf-8") as f:
+    lines = f.readlines()
+
+with open(out_f, "w", encoding="utf-8") as f:
+    for line in lines:
+        outs = line.strip().split(" ", 1)
+        if len(outs) == 2:
+            idx, text = outs
+            text = re.sub("</s>", "", text)
+            text = re.sub("<s>", "", text)
+            text = re.sub("@@", "", text)
+            text = re.sub("@", "", text)
+            text = re.sub("<unk>", "", text)
+            text = re.sub(" ", "", text)
+            text = text.lower()
+        else:
+            idx = outs[0]
+            text = " "
+
+        text = [x for x in text]
+        text = " ".join(text)
+        out = "{} {}\n".format(idx, text)
+        f.write(out)
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/shuffle_list.pl
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/shuffle_list.pl
@@ -0,0 +1,44 @@
+#!/usr/bin/env perl
+
+# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+if ($ARGV[0] eq "--srand") {
+  $n = $ARGV[1];
+  $n =~ m/\d+/ || die "Bad argument to --srand option: \"$n\"";
+  srand($ARGV[1]);
+  shift;
+  shift;
+} else {
+  srand(0); # Gives inconsistent behavior if we don't seed.
+}
+
+if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) { # >1 args, or an option we
+  # don't understand.
+  print "Usage: shuffle_list.pl [--srand N] [input file]  > output\n";
+  print "randomizes the order of lines of input.\n";
+  exit(1);
+}
+
+@lines;
+while (<>) {
+  push @lines, [ (rand(), $_)] ;
+}
+
+@lines = sort { $a->[0] cmp $b->[0] } @lines;
+foreach $l (@lines) {
+    print $l->[1];
+}
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/split_scp.pl
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/split_scp.pl
@@ -0,0 +1,246 @@
+#!/usr/bin/env perl
+
+# Copyright 2010-2011 Microsoft Corporation
+
+# See ../../COPYING for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This program splits up any kind of .scp or archive-type file.
+# If there is no utt2spk option it will work on any text  file and
+# will split it up with an approximately equal number of lines in
+# each but.
+# With the --utt2spk option it will work on anything that has the
+# utterance-id as the first entry on each line; the utt2spk file is
+# of the form "utterance speaker" (on each line).
+# It splits it into equal size chunks as far as it can.  If you use the utt2spk
+# option it will make sure these chunks coincide with speaker boundaries.  In
+# this case, if there are more chunks than speakers (and in some other
+# circumstances), some of the resulting chunks will be empty and it will print
+# an error message and exit with nonzero status.
+# You will normally call this like:
+# split_scp.pl scp scp.1 scp.2 scp.3 ...
+# or
+# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
+# Note that you can use this script to split the utt2spk file itself,
+# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
+
+# You can also call the scripts like:
+# split_scp.pl -j 3 0 scp scp.0
+# [note: with this option, it assumes zero-based indexing of the split parts,
+# i.e. the second number must be 0 <= n < num-jobs.]
+
+use warnings;
+
+$num_jobs = 0;
+$job_id = 0;
+$utt2spk_file = "";
+$one_based = 0;
+
+for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
+    if ($ARGV[0] eq "-j") {
+        shift @ARGV;
+        $num_jobs = shift @ARGV;
+        $job_id = shift @ARGV;
+    }
+    if ($ARGV[0] =~ /--utt2spk=(.+)/) {
+        $utt2spk_file=$1;
+        shift;
+    }
+    if ($ARGV[0] eq '--one-based') {
+        $one_based = 1;
+        shift @ARGV;
+    }
+}
+
+if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
+                       $job_id - $one_based >= $num_jobs)) {
+  die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
+      ($one_based ? " --one-based" : "") . "'\n"
+}
+
+$one_based
+    and $job_id--;
+
+if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
+    die
+"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
+   or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
+ ... where 0 <= job-id < num-jobs, or 1 <= job-id <- num-jobs if --one-based.\n";
+}
+
+$error = 0;
+$inscp = shift @ARGV;
+if ($num_jobs == 0) { # without -j option
+    @OUTPUTS = @ARGV;
+} else {
+    for ($j = 0; $j < $num_jobs; $j++) {
+        if ($j == $job_id) {
+            if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
+            else { push @OUTPUTS, "-"; }
+        } else {
+            push @OUTPUTS, "/dev/null";
+        }
+    }
+}
+
+if ($utt2spk_file ne "") {  # We have the --utt2spk option...
+    open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
+    while(<$u_fh>) {
+        @A = split;
+        @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
+        ($u,$s) = @A;
+        $utt2spk{$u} = $s;
+    }
+    close $u_fh;
+    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
+    @spkrs = ();
+    while(<$i_fh>) {
+        @A = split;
+        if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
+        $u = $A[0];
+        $s = $utt2spk{$u};
+        defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
+        if(!defined $spk_count{$s}) {
+            push @spkrs, $s;
+            $spk_count{$s} = 0;
+            $spk_data{$s} = [];  # ref to new empty array.
+        }
+        $spk_count{$s}++;
+        push @{$spk_data{$s}}, $_;
+    }
+    # Now split as equally as possible ..
+    # First allocate spks to files by allocating an approximately
+    # equal number of speakers.
+    $numspks = @spkrs;  # number of speakers.
+    $numscps = @OUTPUTS; # number of output files.
+    if ($numspks < $numscps) {
+      die "$0: Refusing to split data because number of speakers $numspks " .
+          "is less than the number of output .scp files $numscps\n";
+    }
+    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+        $scparray[$scpidx] = []; # [] is array reference.
+    }
+    for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
+        $scpidx = int(($spkidx*$numscps) / $numspks);
+        $spk = $spkrs[$spkidx];
+        push @{$scparray[$scpidx]}, $spk;
+        $scpcount[$scpidx] += $spk_count{$spk};
+    }
+
+    # Now will try to reassign beginning + ending speakers
+    # to different scp's and see if it gets more balanced.
+    # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
+    # We can show that if considering changing just 2 scp's, we minimize
+    # this by minimizing the squared difference in sizes.  This is
+    # equivalent to minimizing the absolute difference in sizes.  This
+    # shows this method is bound to converge.
+
+    $changed = 1;
+    while($changed) {
+        $changed = 0;
+        for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+            # First try to reassign ending spk of this scp.
+            if($scpidx < $numscps-1) {
+                $sz = @{$scparray[$scpidx]};
+                if($sz > 0) {
+                    $spk = $scparray[$scpidx]->[$sz-1];
+                    $count = $spk_count{$spk};
+                    $nutt1 = $scpcount[$scpidx];
+                    $nutt2 = $scpcount[$scpidx+1];
+                    if( abs( ($nutt2+$count) - ($nutt1-$count))
+                        < abs($nutt2 - $nutt1))  { # Would decrease
+                        # size-diff by reassigning spk...
+                        $scpcount[$scpidx+1] += $count;
+                        $scpcount[$scpidx] -= $count;
+                        pop @{$scparray[$scpidx]};
+                        unshift @{$scparray[$scpidx+1]}, $spk;
+                        $changed = 1;
+                    }
+                }
+            }
+            if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
+                $spk = $scparray[$scpidx]->[0];
+                $count = $spk_count{$spk};
+                $nutt1 = $scpcount[$scpidx-1];
+                $nutt2 = $scpcount[$scpidx];
+                if( abs( ($nutt2-$count) - ($nutt1+$count))
+                    < abs($nutt2 - $nutt1))  { # Would decrease
+                    # size-diff by reassigning spk...
+                    $scpcount[$scpidx-1] += $count;
+                    $scpcount[$scpidx] -= $count;
+                    shift @{$scparray[$scpidx]};
+                    push @{$scparray[$scpidx-1]}, $spk;
+                    $changed = 1;
+                }
+            }
+        }
+    }
+    # Now print out the files...
+    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+        $scpfile = $OUTPUTS[$scpidx];
+        ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
+                         : open($f_fh, '>&', \*STDOUT)) ||
+            die "$0: Could not open scp file $scpfile for writing: $!\n";
+        $count = 0;
+        if(@{$scparray[$scpidx]} == 0) {
+            print STDERR "$0: eError: split_scp.pl producing empty .scp file " .
+                         "$scpfile (too many splits and too few speakers?)\n";
+            $error = 1;
+        } else {
+            foreach $spk ( @{$scparray[$scpidx]} ) {
+                print $f_fh @{$spk_data{$spk}};
+                $count += $spk_count{$spk};
+            }
+            $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
+        }
+        close($f_fh);
+    }
+} else {
+   # This block is the "normal" case where there is no --utt2spk
+   # option and we just break into equal size chunks.
+
+    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
+
+    $numscps = @OUTPUTS;  # size of array.
+    @F = ();
+    while(<$i_fh>) {
+        push @F, $_;
+    }
+    $numlines = @F;
+    if($numlines == 0) {
+        print STDERR "$0: error: empty input scp file $inscp\n";
+        $error = 1;
+    }
+    $linesperscp = int( $numlines / $numscps); # the "whole part"..
+    $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
+    $remainder = $numlines - ($linesperscp * $numscps);
+    ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
+    # [just doing int() rounds down].
+    $n = 0;
+    for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
+        $scpfile = $OUTPUTS[$scpidx];
+        ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
+                         : open($o_fh, '>&', \*STDOUT)) ||
+            die "$0: Could not open scp file $scpfile for writing: $!\n";
+        for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
+            print $o_fh $F[$n++];
+        }
+        close($o_fh) || die "$0: Eror closing scp file $scpfile: $!\n";
+    }
+    $n == $numlines || die "$n != $numlines [code error]";
+}
+
+exit ($error);
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/text2token.py
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/text2token.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+
+import argparse
+import codecs
+import re
+import sys
+import json
+
+is_python2 = sys.version_info[0] == 2
+
+
+def exist_or_not(i, match_pos):
+    start_pos = None
+    end_pos = None
+    for pos in match_pos:
+        if pos[0] <= i < pos[1]:
+            start_pos = pos[0]
+            end_pos = pos[1]
+            break
+
+    return start_pos, end_pos
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        description="convert raw text to tokenized text",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--nchar",
+        "-n",
+        default=1,
+        type=int,
+        help="number of characters to split, i.e., \
+                        aabb -> a a b b with -n 1 and aa bb with -n 2",
+    )
+    parser.add_argument("--skip-ncols", "-s", default=0, type=int, help="skip first n columns")
+    parser.add_argument("--space", default="<space>", type=str, help="space symbol")
+    parser.add_argument(
+        "--non-lang-syms",
+        "-l",
+        default=None,
+        type=str,
+        help="list of non-linguistic symobles, e.g., <NOISE> etc.",
+    )
+    parser.add_argument("text", type=str, default=False, nargs="?", help="input text")
+    parser.add_argument(
+        "--trans_type",
+        "-t",
+        type=str,
+        default="char",
+        choices=["char", "phn"],
+        help="""Transcript type. char/phn. e.g., for TIMIT FADG0_SI1279 -
+                        If trans_type is char,
+                        read from SI1279.WRD file -> "bricks are an alternative"
+                        Else if trans_type is phn,
+                        read from SI1279.PHN file -> "sil b r ih sil k s aa r er n aa l
+                        sil t er n ih sil t ih v sil" """,
+    )
+    parser.add_argument(
+        "--text_format",
+        default="text",
+        type=str,
+        help="text, jsonl",
+    )
+    return parser
+
+
+def main():
+    parser = get_parser()
+    args = parser.parse_args()
+
+    rs = []
+    if args.non_lang_syms is not None:
+        with codecs.open(args.non_lang_syms, "r", encoding="utf-8") as f:
+            nls = [x.rstrip() for x in f.readlines()]
+            rs = [re.compile(re.escape(x)) for x in nls]
+
+    if args.text:
+        f = codecs.open(args.text, encoding="utf-8")
+    else:
+        f = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
+
+    sys.stdout = codecs.getwriter("utf-8")(sys.stdout if is_python2 else sys.stdout.buffer)
+    line = f.readline()
+    n = args.nchar
+    while line:
+        if args.text_format == "jsonl":
+            data = json.loads(line.strip())
+            line = data["target"]
+        x = line.split()
+        print(" ".join(x[: args.skip_ncols]), end=" ")
+        a = " ".join(x[args.skip_ncols :])
+
+        # get all matched positions
+        match_pos = []
+        for r in rs:
+            i = 0
+            while i >= 0:
+                m = r.search(a, i)
+                if m:
+                    match_pos.append([m.start(), m.end()])
+                    i = m.end()
+                else:
+                    break
+
+        if args.trans_type == "phn":
+            a = a.split(" ")
+        else:
+            if len(match_pos) > 0:
+                chars = []
+                i = 0
+                while i < len(a):
+                    start_pos, end_pos = exist_or_not(i, match_pos)
+                    if start_pos is not None:
+                        chars.append(a[start_pos:end_pos])
+                        i = end_pos
+                    else:
+                        chars.append(a[i])
+                        i += 1
+                a = chars
+
+            a = [a[j : j + n] for j in range(0, len(a), n)]
+
+        a_flat = []
+        for z in a:
+            a_flat.append("".join(z))
+
+        a_chars = [z.replace(" ", args.space) for z in a_flat]
+        if args.trans_type == "phn":
+            a_chars = [z.replace("sil", args.space) for z in a_chars]
+        print(" ".join(a_chars))
+        line = f.readline()
+
+
+if __name__ == "__main__":
+    main()
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/text_tokenize.py
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/text_tokenize.py
@@ -0,0 +1,104 @@
+import re
+import argparse
+
+
+def load_dict(seg_file):
+    seg_dict = {}
+    with open(seg_file, "r") as infile:
+        for line in infile:
+            s = line.strip().split()
+            key = s[0]
+            value = s[1:]
+            seg_dict[key] = " ".join(value)
+    return seg_dict
+
+
+def forward_segment(text, dic):
+    word_list = []
+    i = 0
+    while i < len(text):
+        longest_word = text[i]
+        for j in range(i + 1, len(text) + 1):
+            word = text[i:j]
+            if word in dic:
+                if len(word) > len(longest_word):
+                    longest_word = word
+        word_list.append(longest_word)
+        i += len(longest_word)
+    return word_list
+
+
+def tokenize(txt, seg_dict):
+    out_txt = ""
+    pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
+    for word in txt:
+        if pattern.match(word):
+            if word in seg_dict:
+                out_txt += seg_dict[word] + " "
+            else:
+                out_txt += "<unk>" + " "
+        else:
+            continue
+    return out_txt.strip()
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        description="text tokenize",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--text-file",
+        "-t",
+        default=False,
+        required=True,
+        type=str,
+        help="input text",
+    )
+    parser.add_argument(
+        "--seg-file",
+        "-s",
+        default=False,
+        required=True,
+        type=str,
+        help="seg file",
+    )
+    parser.add_argument(
+        "--txt-index",
+        "-i",
+        default=1,
+        required=True,
+        type=int,
+        help="txt index",
+    )
+    parser.add_argument(
+        "--output-dir",
+        "-o",
+        default=False,
+        required=True,
+        type=str,
+        help="output dir",
+    )
+    return parser
+
+
+def main():
+    parser = get_parser()
+    args = parser.parse_args()
+
+    txt_writer = open("{}/text.{}.txt".format(args.output_dir, args.txt_index), "w")
+    shape_writer = open("{}/len.{}".format(args.output_dir, args.txt_index), "w")
+    seg_dict = load_dict(args.seg_file)
+    with open(args.text_file, "r") as infile:
+        for line in infile:
+            s = line.strip().split()
+            text_id = s[0]
+            text_list = forward_segment("".join(s[1:]).lower(), seg_dict)
+            text = tokenize(text_list, seg_dict)
+            lens = len(text.strip().split())
+            txt_writer.write(text_id + " " + text + "\n")
+            shape_writer.write(text_id + " " + str(lens) + "\n")
+
+
+if __name__ == "__main__":
+    main()
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/text_tokenize.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/text_tokenize.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+
+# Begin configuration section.
+nj=32
+cmd=utils/run.pl
+
+echo "$0 $@"
+
+. utils/parse_options.sh || exit 1;
+
+# tokenize configuration
+text_dir=$1
+seg_file=$2
+logdir=$3
+output_dir=$4
+
+txt_dir=${output_dir}/txt; mkdir -p ${output_dir}/txt
+mkdir -p ${logdir}
+
+$cmd JOB=1:$nj $logdir/text_tokenize.JOB.log \
+  python utils/text_tokenize.py -t ${text_dir}/txt/text.JOB.txt \
+      -s ${seg_file} -i JOB -o ${txt_dir} \
+      || exit 1;
+
+# concatenate the text files together.
+for n in $(seq $nj); do
+  cat ${txt_dir}/text.$n.txt || exit 1
+done > ${output_dir}/text || exit 1
+
+for n in $(seq $nj); do
+  cat ${txt_dir}/len.$n || exit 1
+done > ${output_dir}/text_shape || exit 1
+
+echo "$0: Succeeded text tokenize"
--- a/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/textnorm_zh.py
+++ b/modules/python/vendors/FunASR/examples/aishell/paraformer/utils/textnorm_zh.py
@@ -0,0 +1,911 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+# Authors:
+#   2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git)
+#   2019.9 Jiayu DU
+#
+# requirements:
+#   - python 3.X
+# notes: python 2.X WILL fail or produce misleading results
+
+import sys, os, argparse, codecs, string, re
+
+# ================================================================================ #
+#                                    basic constant
+# ================================================================================ #
+CHINESE_DIGIS = "零一二三四五六七八九"
+BIG_CHINESE_DIGIS_SIMPLIFIED = "零壹贰叁肆伍陆柒捌玖"
+BIG_CHINESE_DIGIS_TRADITIONAL = "零壹貳參肆伍陸柒捌玖"
+SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = "十百千万"
+SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = "拾佰仟萬"
+LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "亿兆京垓秭穰沟涧正载"
+LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = "億兆京垓秭穰溝澗正載"
+SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "十百千万"
+SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = "拾佰仟萬"
+
+ZERO_ALT = "〇"
+ONE_ALT = "幺"
+TWO_ALTS = ["两", "兩"]
+
+POSITIVE = ["正", "正"]
+NEGATIVE = ["负", "負"]
+POINT = ["点", "點"]
+# PLUS = [u'加', u'加']
+# SIL = [u'杠', u'槓']
+
+FILLER_CHARS = ["呃", "啊"]
+ER_WHITELIST = (
+    "(儿女|儿子|儿孙|女儿|儿媳|妻儿|"
+    "胎儿|婴儿|新生儿|婴幼儿|幼儿|少儿|小儿|儿歌|儿童|儿科|托儿所|孤儿|"
+    "儿戏|儿化|台儿庄|鹿儿岛|正儿八经|吊儿郎当|生儿育女|托儿带女|养儿防老|痴儿呆女|"
+    "佳儿佳妇|儿怜兽扰|儿无常父|儿不嫌母丑|儿行千里母担忧|儿大不由爷|苏乞儿)"
+)
+
+# 中文数字系统类型
+NUMBERING_TYPES = ["low", "mid", "high"]
+
+CURRENCY_NAMES = (
+    "(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|"
+    "里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)"
+)
+CURRENCY_UNITS = (
+    "((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)"
+)
+COM_QUANTIFIERS = (
+    "(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|"
+    "砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|"
+    "针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|"
+    "毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|"
+    "盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|"
+    "纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)"
+)
+
+# punctuation information are based on Zhon project (https://github.com/tsroten/zhon.git)
+CHINESE_PUNC_STOP = "！？｡。"
+CHINESE_PUNC_NON_STOP = "＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏"
+CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP
+
+
+# ================================================================================ #
+#                                    basic class
+# ================================================================================ #
+class ChineseChar(object):
+    """
+    中文字符
+    每个字符对应简体和繁体,
+    e.g. 简体 = '负', 繁体 = '負'
+    转换时可转换为简体或繁体
+    """
+
+    def __init__(self, simplified, traditional):
+        self.simplified = simplified
+        self.traditional = traditional
+        # self.__repr__ = self.__str__
+
+    def __str__(self):
+        return self.simplified or self.traditional or None
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class ChineseNumberUnit(ChineseChar):
+    """
+    中文数字/数位字符
+    每个字符除繁简体外还有一个额外的大写字符
+    e.g. '陆' 和 '陸'
+    """
+
+    def __init__(self, power, simplified, traditional, big_s, big_t):
+        super(ChineseNumberUnit, self).__init__(simplified, traditional)
+        self.power = power
+        self.big_s = big_s
+        self.big_t = big_t
+
+    def __str__(self):
+        return "10^{}".format(self.power)
+
+    @classmethod
+    def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
+
+        if small_unit:
+            return ChineseNumberUnit(
+                power=index + 1,
+                simplified=value[0],
+                traditional=value[1],
+                big_s=value[1],
+                big_t=value[1],
+            )
+        elif numbering_type == NUMBERING_TYPES[0]:
+            return ChineseNumberUnit(
+                power=index + 8,
+                simplified=value[0],
+                traditional=value[1],
+                big_s=value[0],
+                big_t=value[1],
+            )
+        elif numbering_type == NUMBERING_TYPES[1]:
+            return ChineseNumberUnit(
+                power=(index + 2) * 4,
+                simplified=value[0],
+                traditional=value[1],
+                big_s=value[0],
+                big_t=value[1],
+            )
+        elif numbering_type == NUMBERING_TYPES[2]:
+            return ChineseNumberUnit(
+                power=pow(2, index + 3),
+                simplified=value[0],
+                traditional=value[1],
+                big_s=value[0],
+                big_t=value[1],
+            )
+        else:
+            raise ValueError(
+                "Counting type should be in {0} ({1} provided).".format(
+                    NUMBERING_TYPES, numbering_type
+                )
+            )
+
+
+class ChineseNumberDigit(ChineseChar):
+    """
+    中文数字字符
+    """
+
+    def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None):
+        super(ChineseNumberDigit, self).__init__(simplified, traditional)
+        self.value = value
+        self.big_s = big_s
+        self.big_t = big_t
+        self.alt_s = alt_s
+        self.alt_t = alt_t
+
+    def __str__(self):
+        return str(self.value)
+
+    @classmethod
+    def create(cls, i, v):
+        return ChineseNumberDigit(i, v[0], v[1], v[2], v[3])
+
+
+class ChineseMath(ChineseChar):
+    """
+    中文数位字符
+    """
+
+    def __init__(self, simplified, traditional, symbol, expression=None):
+        super(ChineseMath, self).__init__(simplified, traditional)
+        self.symbol = symbol
+        self.expression = expression
+        self.big_s = simplified
+        self.big_t = traditional
+
+
+CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
+
+
+class NumberSystem(object):
+    """
+    中文数字系统
+    """
+
+    pass
+
+
+class MathSymbol(object):
+    """
+    用于中文数字系统的数学符号 (繁/简体), e.g.
+    positive = ['正', '正']
+    negative = ['负', '負']
+    point = ['点', '點']
+    """
+
+    def __init__(self, positive, negative, point):
+        self.positive = positive
+        self.negative = negative
+        self.point = point
+
+    def __iter__(self):
+        for v in self.__dict__.values():
+            yield v
+
+
+# class OtherSymbol(object):
+#     """
+#     其他符号
+#     """
+#
+#     def __init__(self, sil):
+#         self.sil = sil
+#
+#     def __iter__(self):
+#         for v in self.__dict__.values():
+#             yield v
+
+
+# ================================================================================ #
+#                                    basic utils
+# ================================================================================ #
+def create_system(numbering_type=NUMBERING_TYPES[1]):
+    """
+    根据数字系统类型返回创建相应的数字系统，默认为 mid
+    NUMBERING_TYPES = ['low', 'mid', 'high']: 中文数字系统类型
+        low:  '兆' = '亿' * '十' = $10^{9}$,  '京' = '兆' * '十', etc.
+        mid:  '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
+        high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.
+    返回对应的数字系统
+    """
+
+    # chinese number units of '亿' and larger
+    all_larger_units = zip(
+        LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL
+    )
+    larger_units = [CNU.create(i, v, numbering_type, False) for i, v in enumerate(all_larger_units)]
+    # chinese number units of '十, 百, 千, 万'
+    all_smaller_units = zip(
+        SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL
+    )
+    smaller_units = [CNU.create(i, v, small_unit=True) for i, v in enumerate(all_smaller_units)]
+    # digis
+    chinese_digis = zip(
+        CHINESE_DIGIS, CHINESE_DIGIS, BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL
+    )
+    digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)]
+    digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT
+    digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT
+    digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1]
+
+    # symbols
+    positive_cn = CM(POSITIVE[0], POSITIVE[1], "+", lambda x: x)
+    negative_cn = CM(NEGATIVE[0], NEGATIVE[1], "-", lambda x: -x)
+    point_cn = CM(POINT[0], POINT[1], ".", lambda x, y: float(str(x) + "." + str(y)))
+    # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y)))
+    system = NumberSystem()
+    system.units = smaller_units + larger_units
+    system.digits = digits
+    system.math = MathSymbol(positive_cn, negative_cn, point_cn)
+    # system.symbols = OtherSymbol(sil_cn)
+    return system
+
+
+def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
+
+    def get_symbol(char, system):
+        for u in system.units:
+            if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
+                return u
+        for d in system.digits:
+            if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]:
+                return d
+        for m in system.math:
+            if char in [m.traditional, m.simplified]:
+                return m
+
+    def string2symbols(chinese_string, system):
+        int_string, dec_string = chinese_string, ""
+        for p in [system.math.point.simplified, system.math.point.traditional]:
+            if p in chinese_string:
+                int_string, dec_string = chinese_string.split(p)
+                break
+        return [get_symbol(c, system) for c in int_string], [
+            get_symbol(c, system) for c in dec_string
+        ]
+
+    def correct_symbols(integer_symbols, system):
+        """
+        一百八 to 一百八十
+        一亿一千三百万 to 一亿 一千万 三百万
+        """
+
+        if integer_symbols and isinstance(integer_symbols[0], CNU):
+            if integer_symbols[0].power == 1:
+                integer_symbols = [system.digits[1]] + integer_symbols
+
+        if len(integer_symbols) > 1:
+            if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU):
+                integer_symbols.append(CNU(integer_symbols[-2].power - 1, None, None, None, None))
+
+        result = []
+        unit_count = 0
+        for s in integer_symbols:
+            if isinstance(s, CND):
+                result.append(s)
+                unit_count = 0
+            elif isinstance(s, CNU):
+                current_unit = CNU(s.power, None, None, None, None)
+                unit_count += 1
+
+            if unit_count == 1:
+                result.append(current_unit)
+            elif unit_count > 1:
+                for i in range(len(result)):
+                    if (
+                        isinstance(result[-i - 1], CNU)
+                        and result[-i - 1].power < current_unit.power
+                    ):
+                        result[-i - 1] = CNU(
+                            result[-i - 1].power + current_unit.power, None, None, None, None
+                        )
+        return result
+
+    def compute_value(integer_symbols):
+        """
+        Compute the value.
+        When current unit is larger than previous unit, current unit * all previous units will be used as all previous units.
+        e.g. '两千万' = 2000 * 10000 not 2000 + 10000
+        """
+        value = [0]
+        last_power = 0
+        for s in integer_symbols:
+            if isinstance(s, CND):
+                value[-1] = s.value
+            elif isinstance(s, CNU):
+                value[-1] *= pow(10, s.power)
+                if s.power > last_power:
+                    value[:-1] = list(map(lambda v: v * pow(10, s.power), value[:-1]))
+                    last_power = s.power
+                value.append(0)
+        return sum(value)
+
+    system = create_system(numbering_type)
+    int_part, dec_part = string2symbols(chinese_string, system)
+    int_part = correct_symbols(int_part, system)
+    int_str = str(compute_value(int_part))
+    dec_str = "".join([str(d.value) for d in dec_part])
+    if dec_part:
+        return "{0}.{1}".format(int_str, dec_str)
+    else:
+        return int_str
+
+
+def num2chn(
+    number_string,
+    numbering_type=NUMBERING_TYPES[1],
+    big=False,
+    traditional=False,
+    alt_zero=False,
+    alt_one=False,
+    alt_two=True,
+    use_zeros=True,
+    use_units=True,
+):
+
+    def get_value(value_string, use_zeros=True):
+
+        striped_string = value_string.lstrip("0")
+
+        # record nothing if all zeros
+        if not striped_string:
+            return []
+
+        # record one digits
+        elif len(striped_string) == 1:
+            if use_zeros and len(value_string) != len(striped_string):
+                return [system.digits[0], system.digits[int(striped_string)]]
+            else:
+                return [system.digits[int(striped_string)]]
+
+        # recursively record multiple digits
+        else:
+            result_unit = next(u for u in reversed(system.units) if u.power < len(striped_string))
+            result_string = value_string[: -result_unit.power]
+            return (
+                get_value(result_string)
+                + [result_unit]
+                + get_value(striped_string[-result_unit.power :])
+            )
+
+    system = create_system(numbering_type)
+
+    int_dec = number_string.split(".")
+    if len(int_dec) == 1:
+        int_string = int_dec[0]
+        dec_string = ""
+    elif len(int_dec) == 2:
+        int_string = int_dec[0]
+        dec_string = int_dec[1]
+    else:
+        raise ValueError(
+            "invalid input num string with more than one dot: {}".format(number_string)
+        )
+
+    if use_units and len(int_string) > 1:
+        result_symbols = get_value(int_string)
+    else:
+        result_symbols = [system.digits[int(c)] for c in int_string]
+    dec_symbols = [system.digits[int(c)] for c in dec_string]
+    if dec_string:
+        result_symbols += [system.math.point] + dec_symbols
+
+    if alt_two:
+        liang = CND(
+            2,
+            system.digits[2].alt_s,
+            system.digits[2].alt_t,
+            system.digits[2].big_s,
+            system.digits[2].big_t,
+        )
+        for i, v in enumerate(result_symbols):
+            if isinstance(v, CND) and v.value == 2:
+                next_symbol = result_symbols[i + 1] if i < len(result_symbols) - 1 else None
+                previous_symbol = result_symbols[i - 1] if i > 0 else None
+                if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))):
+                    if next_symbol.power != 1 and (
+                        (previous_symbol is None) or (previous_symbol.power != 1)
+                    ):
+                        result_symbols[i] = liang
+
+    # if big is True, '两' will not be used and `alt_two` has no impact on output
+    if big:
+        attr_name = "big_"
+        if traditional:
+            attr_name += "t"
+        else:
+            attr_name += "s"
+    else:
+        if traditional:
+            attr_name = "traditional"
+        else:
+            attr_name = "simplified"
+
+    result = "".join([getattr(s, attr_name) for s in result_symbols])
+
+    # if not use_zeros:
+    #     result = result.strip(getattr(system.digits[0], attr_name))
+
+    if alt_zero:
+        result = result.replace(getattr(system.digits[0], attr_name), system.digits[0].alt_s)
+
+    if alt_one:
+        result = result.replace(getattr(system.digits[1], attr_name), system.digits[1].alt_s)
+
+    for i, p in enumerate(POINT):
+        if result.startswith(p):
+            return CHINESE_DIGIS[0] + result
+
+    # ^10, 11, .., 19
+    if (
+        len(result) >= 2
+        and result[1]
+        in [
+            SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
+            SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0],
+        ]
+        and result[0]
+        in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]
+    ):
+        result = result[1:]
+
+    return result
+
+
+# ================================================================================ #
+#                          different types of rewriters
+# ================================================================================ #
+class Cardinal:
+    """
+    CARDINAL类
+    """
+
+    def __init__(self, cardinal=None, chntext=None):
+        self.cardinal = cardinal
+        self.chntext = chntext
+
+    def chntext2cardinal(self):
+        return chn2num(self.chntext)
+
+    def cardinal2chntext(self):
+        return num2chn(self.cardinal)
+
+
+class Digit:
+    """
+    DIGIT类
+    """
+
+    def __init__(self, digit=None, chntext=None):
+        self.digit = digit
+        self.chntext = chntext
+
+    # def chntext2digit(self):
+    #     return chn2num(self.chntext)
+
+    def digit2chntext(self):
+        return num2chn(self.digit, alt_two=False, use_units=False)
+
+
+class TelePhone:
+    """
+    TELEPHONE类
+    """
+
+    def __init__(self, telephone=None, raw_chntext=None, chntext=None):
+        self.telephone = telephone
+        self.raw_chntext = raw_chntext
+        self.chntext = chntext
+
+    # def chntext2telephone(self):
+    #     sil_parts = self.raw_chntext.split('<SIL>')
+    #     self.telephone = '-'.join([
+    #         str(chn2num(p)) for p in sil_parts
+    #     ])
+    #     return self.telephone
+
+    def telephone2chntext(self, fixed=False):
+
+        if fixed:
+            sil_parts = self.telephone.split("-")
+            self.raw_chntext = "<SIL>".join(
+                [num2chn(part, alt_two=False, use_units=False) for part in sil_parts]
+            )
+            self.chntext = self.raw_chntext.replace("<SIL>", "")
+        else:
+            sp_parts = self.telephone.strip("+").split()
+            self.raw_chntext = "<SP>".join(
+                [num2chn(part, alt_two=False, use_units=False) for part in sp_parts]
+            )
+            self.chntext = self.raw_chntext.replace("<SP>", "")
+        return self.chntext
+
+
+class Fraction:
+    """
+    FRACTION类
+    """
+
+    def __init__(self, fraction=None, chntext=None):
+        self.fraction = fraction
+        self.chntext = chntext
+
+    def chntext2fraction(self):
+        denominator, numerator = self.chntext.split("分之")
+        return chn2num(numerator) + "/" + chn2num(denominator)
+
+    def fraction2chntext(self):
+        numerator, denominator = self.fraction.split("/")
+        return num2chn(denominator) + "分之" + num2chn(numerator)
+
+
+class Date:
+    """
+    DATE类
+    """
+
+    def __init__(self, date=None, chntext=None):
+        self.date = date
+        self.chntext = chntext
+
+    # def chntext2date(self):
+    #     chntext = self.chntext
+    #     try:
+    #         year, other = chntext.strip().split('年', maxsplit=1)
+    #         year = Digit(chntext=year).digit2chntext() + '年'
+    #     except ValueError:
+    #         other = chntext
+    #         year = ''
+    #     if other:
+    #         try:
+    #             month, day = other.strip().split('月', maxsplit=1)
+    #             month = Cardinal(chntext=month).chntext2cardinal() + '月'
+    #         except ValueError:
+    #             day = chntext
+    #             month = ''
+    #         if day:
+    #             day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1]
+    #     else:
+    #         month = ''
+    #         day = ''
+    #     date = year + month + day
+    #     self.date = date
+    #     return self.date
+
+    def date2chntext(self):
+        date = self.date
+        try:
+            year, other = date.strip().split("年", 1)
+            year = Digit(digit=year).digit2chntext() + "年"
+        except ValueError:
+            other = date
+            year = ""
+        if other:
+            try:
+                month, day = other.strip().split("月", 1)
+                month = Cardinal(cardinal=month).cardinal2chntext() + "月"
+            except ValueError:
+                day = date
+                month = ""
+            if day:
+                day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
+        else:
+            month = ""
+            day = ""
+        chntext = year + month + day
+        self.chntext = chntext
+        return self.chntext
+
+
+class Money:
+    """
+    MONEY类
+    """
+
+    def __init__(self, money=None, chntext=None):
+        self.money = money
+        self.chntext = chntext
+
+    # def chntext2money(self):
+    #     return self.money
+
+    def money2chntext(self):
+        money = self.money
+        pattern = re.compile(r"(\d+(\.\d+)?)")
+        matchers = pattern.findall(money)
+        if matchers:
+            for matcher in matchers:
+                money = money.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext())
+        self.chntext = money
+        return self.chntext
+
+
+class Percentage:
+    """
+    PERCENTAGE类
+    """
+
+    def __init__(self, percentage=None, chntext=None):
+        self.percentage = percentage
+        self.chntext = chntext
+
+    def chntext2percentage(self):
+        return chn2num(self.chntext.strip().strip("百分之")) + "%"
+
+    def percentage2chntext(self):
+        return "百分之" + num2chn(self.percentage.strip().strip("%"))
+
+
+def remove_erhua(text, er_whitelist):
+    """
+    去除儿化音词中的儿:
+    他女儿在那边儿 -> 他女儿在那边
+    """
+
+    er_pattern = re.compile(er_whitelist)
+    new_str = ""
+    while re.search("儿", text):
+        a = re.search("儿", text).span()
+        remove_er_flag = 0
+
+        if er_pattern.search(text):
+            b = er_pattern.search(text).span()
+            if b[0] <= a[0]:
+                remove_er_flag = 1
+
+        if remove_er_flag == 0:
+            new_str = new_str + text[0 : a[0]]
+            text = text[a[1] :]
+        else:
+            new_str = new_str + text[0 : b[1]]
+            text = text[b[1] :]
+
+    text = new_str + text
+    return text
+
+
+# ================================================================================ #
+#                            NSW Normalizer
+# ================================================================================ #
+class NSWNormalizer:
+    def __init__(self, raw_text):
+        self.raw_text = "^" + raw_text + "$"
+        self.norm_text = ""
+
+    def _particular(self):
+        text = self.norm_text
+        pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('particular')
+            for matcher in matchers:
+                text = text.replace(matcher[0], matcher[1] + "2" + matcher[2], 1)
+        self.norm_text = text
+        return self.norm_text
+
+    def normalize(self):
+        text = self.raw_text
+
+        # 规范化日期
+        pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('date')
+            for matcher in matchers:
+                text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
+
+        # 规范化金钱
+        pattern = re.compile(
+            r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)"
+        )
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('money')
+            for matcher in matchers:
+                text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)
+
+        # 规范化固话/手机号码
+        # 手机
+        # http://www.jihaoba.com/news/show/13680
+        # 移动：139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
+        # 联通：130、131、132、156、155、186、185、176
+        # 电信：133、153、189、180、181、177
+        pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('telephone')
+            for matcher in matchers:
+                text = text.replace(
+                    matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1
+                )
+        # 固话
+        pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('fixed telephone')
+            for matcher in matchers:
+                text = text.replace(
+                    matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1
+                )
+
+        # 规范化分数
+        pattern = re.compile(r"(\d+/\d+)")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('fraction')
+            for matcher in matchers:
+                text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)
+
+        # 规范化百分数
+        text = text.replace("％", "%")
+        pattern = re.compile(r"(\d+(\.\d+)?%)")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('percentage')
+            for matcher in matchers:
+                text = text.replace(
+                    matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1
+                )
+
+        # 规范化纯数+量词
+        pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('cardinal+quantifier')
+            for matcher in matchers:
+                text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
+
+        # 规范化数字编号
+        pattern = re.compile(r"(\d{4,32})")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('digit')
+            for matcher in matchers:
+                text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)
+
+        # 规范化纯数
+        pattern = re.compile(r"(\d+(\.\d+)?)")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('cardinal')
+            for matcher in matchers:
+                text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
+
+        self.norm_text = text
+        self._particular()
+
+        return self.norm_text.lstrip("^").rstrip("$")
+
+
+def nsw_test_case(raw_text):
+    print("I:" + raw_text)
+    print("O:" + NSWNormalizer(raw_text).normalize())
+    print("")
+
+
+def nsw_test():
+    nsw_test_case("固话：0595-23865596或23880880。")
+    nsw_test_case("固话：0595-23865596或23880880。")
+    nsw_test_case("手机：+86 19859213959或15659451527。")
+    nsw_test_case("分数：32477/76391。")
+    nsw_test_case("百分数：80.03%。")
+    nsw_test_case("编号：31520181154418。")
+    nsw_test_case("纯数：2983.07克或12345.60米。")
+    nsw_test_case("日期：1999年2月20日或09年3月15号。")
+    nsw_test_case("金钱：12块5，34.5元，20.1万")
+    nsw_test_case("特殊：O2O或B2C。")
+    nsw_test_case("3456万吨")
+    nsw_test_case("2938个")
+    nsw_test_case("938")
+    nsw_test_case("今天吃了115个小笼包231个馒头")
+    nsw_test_case("有62％的概率")
+
+
+if __name__ == "__main__":
+    # nsw_test()
+
+    p = argparse.ArgumentParser()
+    p.add_argument("ifile", help="input filename, assume utf-8 encoding")
+    p.add_argument("ofile", help="output filename")
+    p.add_argument("--to_upper", action="store_true", help="convert to upper case")
+    p.add_argument("--to_lower", action="store_true", help="convert to lower case")
+    p.add_argument(
+        "--has_key", action="store_true", help="input text has Kaldi's key as first field."
+    )
+    p.add_argument(
+        "--remove_fillers", type=bool, default=True, help='remove filler chars such as "呃, 啊"'
+    )
+    p.add_argument(
+        "--remove_erhua", type=bool, default=True, help='remove erhua chars such as "这儿"'
+    )
+    p.add_argument(
+        "--log_interval", type=int, default=10000, help="log interval in number of processed lines"
+    )
+    args = p.parse_args()
+
+    ifile = codecs.open(args.ifile, "r", "utf8")
+    ofile = codecs.open(args.ofile, "w+", "utf8")
+
+    n = 0
+    for l in ifile:
+        key = ""
+        text = ""
+        if args.has_key:
+            cols = l.split(maxsplit=1)
+            key = cols[0]
+            if len(cols) == 2:
+                text = cols[1].strip()
+            else:
+                text = ""
+        else:
+            text = l.strip()
+
+        # cases
+        if args.to_upper and args.to_lower:
+            sys.stderr.write("text norm: to_upper OR to_lower?")
+            exit(1)
+        if args.to_upper:
+            text = text.upper()
+        if args.to_lower:
+            text = text.lower()
+
+        # Filler chars removal
+        if args.remove_fillers:
+            for ch in FILLER_CHARS:
+                text = text.replace(ch, "")
+
+        if args.remove_erhua:
+            text = remove_erhua(text, ER_WHITELIST)
+
+        # NSW(Non-Standard-Word) normalization
+        text = NSWNormalizer(text).normalize()
+
+        # Punctuations removal
+        old_chars = CHINESE_PUNC_LIST + string.punctuation  # includes all CN and EN punctuations
+        new_chars = " " * len(old_chars)
+        del_chars = ""
+        text = text.translate(str.maketrans(old_chars, new_chars, del_chars))
+
+        #
+        if args.has_key:
+            ofile.write(key + "\t" + text + "\n")
+        else:
+            ofile.write(text + "\n")
+
+        n += 1
+        if n % args.log_interval == 0:
+            sys.stderr.write("text norm: {} lines done.\n".format(n))
+
+    sys.stderr.write("text norm: {} lines done in total.\n".format(n))
+
+    ifile.close()
+    ofile.close()
--- a/modules/python/vendors/FunASR/examples/aishell/transformer/README.md
+++ b/modules/python/vendors/FunASR/examples/aishell/transformer/README.md
@@ -0,0 +1,16 @@
+
+# Conformer Result
+
+## Training Config
+- Feature info: using 80 dims fbank, global cmvn, speed perturb(0.9, 1.0, 1.1), specaugment
+- Train info: lr 5e-4, batch_size 25000, 2 gpu(Tesla V100), acc_grad 1, 50 epochs
+- Train config: conf/train_asr_transformer.yaml
+- LM config: LM was not used
+- Model size: 46M
+
+## Results (CER)
+
+|   testset   | CER(%) |
+|:-----------:|:------:|
+|     dev     |  4.97  |
+|    test     |  5.37  |
--- a/modules/python/vendors/FunASR/examples/aishell/transformer/conf/transformer_12e_6d_2048_256.yaml
+++ b/modules/python/vendors/FunASR/examples/aishell/transformer/conf/transformer_12e_6d_2048_256.yaml
@@ -0,0 +1,104 @@
+# This is an example that demonstrates how to configure a model file.
+# You can modify the configuration according to your own requirements.
+
+# to print the register_table:
+# from funasr.register import tables
+# tables.print()
+
+# network architecture
+model: Transformer
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+# encoder
+encoder: TransformerEncoder
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder architecture type
+    normalize_before: true
+
+# decoder
+decoder: TransformerDecoder
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+
+# frontend related
+frontend: WavFrontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 1
+    lfr_n: 1
+
+specaug: SpecAug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 40
+    num_time_mask: 2
+
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 150
+  keep_nbest_models: 10
+  log_interval: 50
+
+optim: adam
+optim_conf:
+   lr: 0.002
+scheduler: warmuplr
+scheduler_conf:
+   warmup_steps: 30000
+
+dataset: AudioDataset
+dataset_conf:
+    index_ds: IndexDSJsonl
+    batch_sampler: EspnetStyleBatchSampler
+    batch_type: length # example or length
+    batch_size: 25000 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
+    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,
+    buffer_size: 1024
+    shuffle: True
+    num_workers: 4
+    preprocessor_speech: SpeechPreprocessSpeedPerturb
+    preprocessor_speech_conf:
+      speed_perturb: [0.9, 1.0, 1.1]
+
+tokenizer: CharTokenizer
+tokenizer_conf:
+  unk_symbol: <unk>
+
+ctc_conf:
+    dropout_rate: 0.0
+    ctc_type: builtin
+    reduce: true
+    ignore_nan_grad: true
+normalize: null
--- a/modules/python/vendors/FunASR/examples/aishell/transformer/demo_infer.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/transformer/demo_infer.sh
@@ -0,0 +1 @@
+../paraformer/demo_infer.sh
--- a/modules/python/vendors/FunASR/examples/aishell/transformer/demo_train_or_finetune.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/transformer/demo_train_or_finetune.sh
@@ -0,0 +1 @@
+../paraformer/demo_train_or_finetune.sh
--- a/modules/python/vendors/FunASR/examples/aishell/transformer/local/aishell_data_prep.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/transformer/local/aishell_data_prep.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+#. ./path.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <audio-path> <text-path> <output-path>"
+  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
+  exit 1;
+fi
+
+aishell_audio_dir=$1
+aishell_text=$2/aishell_transcript_v0.8.txt
+output_dir=$3
+
+train_dir=$output_dir/data/local/train
+dev_dir=$output_dir/data/local/dev
+test_dir=$output_dir/data/local/test
+tmp_dir=$output_dir/data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
+  echo "Error: $0 requires two directory arguments"
+  exit 1;
+fi
+
+# find wav audio file for train, dev and test resp.
+find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=`cat $tmp_dir/wav.flist | wc -l`
+[ $n -ne 141925 ] && \
+  echo Warning: expected 141925 data data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+
+# Transcriptions preparation
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+done
+
+mkdir -p $output_dir/data/train $output_dir/data/dev $output_dir/data/test
+
+for f in wav.scp text; do
+  cp $train_dir/$f $output_dir/data/train/$f || exit 1;
+  cp $dev_dir/$f $output_dir/data/dev/$f || exit 1;
+  cp $test_dir/$f $output_dir/data/test/$f || exit 1;
+done
+
+echo "$0: AISHELL data preparation succeeded"
+exit 0;
--- a/modules/python/vendors/FunASR/examples/aishell/transformer/local/download_and_untar.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/transformer/local/download_and_untar.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
+#             2017  Xingyu Na
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="data_aishell resource_aishell"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+# sizes of the archive files in bytes.
+sizes="15582913665 1246920"
+
+if [ -f $data/$part.tgz ]; then
+  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tgz
+  else
+    echo "$data/$part.tgz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tgz ]; then
+  if ! command -v wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tgz
+  echo "$0: downloading data from $full_url.  This may take some time, please be patient."
+
+  cd $data || exit 1
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+cd $data || exit 1
+
+if ! tar -xvzf $part.tgz; then
+  echo "$0: error un-tarring archive $data/$part.tgz"
+  exit 1;
+fi
+
+touch $data/$part/.complete
+
+if [ $part == "data_aishell" ]; then
+  cd $data/$part/wav || exit 1
+  for wav in ./*.tar.gz; do
+    echo "Extracting wav from $wav"
+    tar -zxf $wav && rm $wav
+  done
+fi
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
+  rm $data/$part.tgz
+fi
+
+exit 0;
--- a/modules/python/vendors/FunASR/examples/aishell/transformer/run.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/transformer/run.sh
@@ -0,0 +1,203 @@
+#!/usr/bin/env bash
+
+
+CUDA_VISIBLE_DEVICES="0,1"
+
+# general configuration
+feats_dir="../DATA" #feature output dictionary
+exp_dir=`pwd`
+lang=zh
+token_type=char
+stage=0
+stop_stage=5
+
+# feature configuration
+nj=32
+
+inference_device="cuda" #"cpu"
+inference_checkpoint="model.pt.avg10"
+inference_scp="wav.scp"
+inference_batch_size=1
+
+# data
+raw_data=../raw_data
+data_url=www.openslr.org/resources/33
+
+# exp tag
+tag="exp1"
+workspace=`pwd`
+
+master_port=12345
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train
+valid_set=dev
+test_sets="dev test"
+
+config=transformer_12e_6d_2048_256.yaml
+model_dir="baseline_$(basename "${config}" .yaml)_${lang}_${token_type}_${tag}"
+
+
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    mkdir -p ${raw_data}
+    local/download_and_untar.sh ${raw_data} ${data_url} data_aishell
+    local/download_and_untar.sh ${raw_data} ${data_url} resource_aishell
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Data preparation
+    local/aishell_data_prep.sh ${raw_data}/data_aishell/wav ${raw_data}/data_aishell/transcript ${feats_dir}
+    for x in train dev test; do
+        cp ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org
+        paste -d " " <(cut -f 1 -d" " ${feats_dir}/data/${x}/text.org) <(cut -f 2- -d" " ${feats_dir}/data/${x}/text.org | tr -d " ") \
+            > ${feats_dir}/data/${x}/text
+        utils/text2token.py -n 1 -s 1 ${feats_dir}/data/${x}/text > ${feats_dir}/data/${x}/text.org
+        mv ${feats_dir}/data/${x}/text.org ${feats_dir}/data/${x}/text
+
+        # convert wav.scp text to jsonl
+        scp_file_list_arg="++scp_file_list='[\"${feats_dir}/data/${x}/wav.scp\",\"${feats_dir}/data/${x}/text\"]'"
+        python ../../../funasr/datasets/audio_datasets/scp2jsonl.py \
+        ++data_type_list='["source", "target"]' \
+        ++jsonl_file_out=${feats_dir}/data/${x}/audio_datasets.jsonl \
+        ${scp_file_list_arg}
+    done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Feature and CMVN Generation"
+    python ../../../funasr/bin/compute_audio_cmvn.py \
+    --config-path "${workspace}/conf" \
+    --config-name "${config}" \
+    ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+    ++cmvn_file="${feats_dir}/data/${train_set}/cmvn.json" \
+
+fi
+
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: Dictionary Preparation"
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
+
+    echo "make a dictionary"
+    echo "<blank>" > ${token_list}
+    echo "<s>" >> ${token_list}
+    echo "</s>" >> ${token_list}
+    utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/$train_set/text | cut -f 2- -d" " | tr " " "\n" \
+        | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
+    echo "<unk>" >> ${token_list}
+fi
+
+# LM Training Stage
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+  echo "stage 4: ASR Training"
+
+  mkdir -p ${exp_dir}/exp/${model_dir}
+  current_time=$(date "+%Y-%m-%d_%H-%M")
+  log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}"
+  echo "log_file: ${log_file}"
+
+  export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
+  gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  torchrun \
+  --nnodes 1 \
+  --nproc_per_node ${gpu_num} \
+  --master_port ${master_port} \
+  ../../../funasr/bin/train.py \
+  --config-path "${workspace}/conf" \
+  --config-name "${config}" \
+  ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+  ++valid_data_set_list="${feats_dir}/data/${valid_set}/audio_datasets.jsonl" \
+  ++tokenizer_conf.token_list="${token_list}" \
+  ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
+  ++output_dir="${exp_dir}/exp/${model_dir}" &> ${log_file}
+fi
+
+
+
+# Testing Stage
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+  echo "stage 5: Inference"
+
+  if [ ${inference_device} == "cuda" ]; then
+      nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  else
+      inference_batch_size=1
+      CUDA_VISIBLE_DEVICES=""
+      for JOB in $(seq ${nj}); do
+          CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
+      done
+  fi
+
+  for dset in ${test_sets}; do
+
+    inference_dir="${exp_dir}/exp/${model_dir}/inference-${inference_checkpoint}/${dset}"
+    _logdir="${inference_dir}/logdir"
+    echo "inference_dir: ${inference_dir}"
+
+    mkdir -p "${_logdir}"
+    data_dir="${feats_dir}/data/${dset}"
+    key_file=${data_dir}/${inference_scp}
+
+    split_scps=
+    for JOB in $(seq "${nj}"); do
+        split_scps+=" ${_logdir}/keys.${JOB}.scp"
+    done
+    utils/split_scp.pl "${key_file}" ${split_scps}
+
+    gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+          id=$((JOB-1))
+          gpuid=${gpuid_list_array[$id]}
+
+          export CUDA_VISIBLE_DEVICES=${gpuid}
+          python ../../../funasr/bin/inference.py \
+          --config-path="${exp_dir}/exp/${model_dir}" \
+          --config-name="config.yaml" \
+          ++init_param="${exp_dir}/exp/${model_dir}/${inference_checkpoint}" \
+          ++tokenizer_conf.token_list="${token_list}" \
+          ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
+          ++input="${_logdir}/keys.${JOB}.scp" \
+          ++output_dir="${inference_dir}/${JOB}" \
+          ++device="${inference_device}" \
+          ++ncpu=1 \
+          ++disable_log=true \
+          ++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
+        }&
+
+    done
+    wait
+
+    mkdir -p ${inference_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then
+          for JOB in $(seq "${nj}"); do
+              cat "${inference_dir}/${JOB}/1best_recog/${f}"
+          done | sort -k1 >"${inference_dir}/1best_recog/${f}"
+        fi
+    done
+
+    echo "Computing WER ..."
+    python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc
+    python utils/postprocess_text_zh.py  ${data_dir}/text ${inference_dir}/1best_recog/text.ref
+    python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer
+    tail -n 3 ${inference_dir}/1best_recog/text.cer
+  done
+
+fi
--- a/modules/python/vendors/FunASR/examples/aishell/transformer/utils
+++ b/modules/python/vendors/FunASR/examples/aishell/transformer/utils
@@ -0,0 +1 @@
+../paraformer/utils