mirror of
https://gitee.com/270580156/weiyu.git
synced 2026-05-15 19:58:00 +00:00
Sync from bytedesk-private: update
This commit is contained in:
48
modules/python/vendors/FunASR/runtime/onnxruntime/src/CMakeLists.txt
vendored
Normal file
48
modules/python/vendors/FunASR/runtime/onnxruntime/src/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,48 @@
|
||||
|
||||
file(GLOB files1 "*.cpp")
|
||||
if(APPLE)
|
||||
file(GLOB itn_files "itn-*.cpp")
|
||||
list(REMOVE_ITEM files1 ${itn_files})
|
||||
endif(APPLE)
|
||||
list(REMOVE_ITEM files1 "${CMAKE_CURRENT_SOURCE_DIR}/paraformer-torch.cpp")
|
||||
set(files ${files1})
|
||||
|
||||
if(GPU)
|
||||
set(files ${files} "${CMAKE_CURRENT_SOURCE_DIR}/paraformer-torch.cpp")
|
||||
endif()
|
||||
|
||||
message("files: "${files})
|
||||
|
||||
if(WIN32)
|
||||
add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/execution-charset:utf-8>")
|
||||
add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/source-charset:utf-8>")
|
||||
add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/bigobj>")
|
||||
endif()
|
||||
|
||||
add_library(funasr SHARED ${files})
|
||||
|
||||
if(WIN32)
|
||||
set(EXTRA_LIBS yaml-cpp csrc kaldi-decoder fst glog gflags avutil avcodec avformat swresample onnxruntime)
|
||||
include_directories(${ONNXRUNTIME_DIR}/include)
|
||||
include_directories(${FFMPEG_DIR}/include)
|
||||
target_link_directories(funasr PUBLIC ${ONNXRUNTIME_DIR}/lib)
|
||||
target_link_directories(funasr PUBLIC ${FFMPEG_DIR}/lib)
|
||||
target_compile_definitions(funasr PUBLIC -D_FUNASR_API_EXPORT -DNOMINMAX -DYAML_CPP_DLL)
|
||||
else()
|
||||
set(EXTRA_LIBS pthread yaml-cpp csrc kaldi-decoder fst glog gflags avutil avcodec avformat swresample)
|
||||
include_directories(${ONNXRUNTIME_DIR}/include)
|
||||
include_directories(${FFMPEG_DIR}/include)
|
||||
if(APPLE)
|
||||
target_link_directories(funasr PUBLIC ${ONNXRUNTIME_DIR}/lib)
|
||||
target_link_directories(funasr PUBLIC ${FFMPEG_DIR}/lib)
|
||||
endif(APPLE)
|
||||
endif()
|
||||
|
||||
if(GPU)
|
||||
set(TORCH_DEPS torch torch_cuda torch_cpu c10 c10_cuda torch_blade ral_base_context)
|
||||
endif()
|
||||
|
||||
#message("CXX_FLAGS "${CMAKE_CXX_FLAGS})
|
||||
include_directories(${CMAKE_SOURCE_DIR}/include)
|
||||
include_directories(${CMAKE_SOURCE_DIR}/third_party)
|
||||
target_link_libraries(funasr PUBLIC onnxruntime ${EXTRA_LIBS} ${TORCH_DEPS})
|
||||
21
modules/python/vendors/FunASR/runtime/onnxruntime/src/alignedmem.cpp
vendored
Normal file
21
modules/python/vendors/FunASR/runtime/onnxruntime/src/alignedmem.cpp
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
void *AlignedMalloc(size_t alignment, size_t required_bytes)
|
||||
{
|
||||
void *p1; // original block
|
||||
void **p2; // aligned block
|
||||
int offset = alignment - 1 + sizeof(void *);
|
||||
if ((p1 = (void *)malloc(required_bytes + offset)) == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
p2 = (void **)(((size_t)(p1) + offset) & ~(alignment - 1));
|
||||
p2[-1] = p1;
|
||||
return p2;
|
||||
}
|
||||
|
||||
void AlignedFree(void *p)
|
||||
{
|
||||
free(((void **)p)[-1]);
|
||||
}
|
||||
} // namespace funasr
|
||||
10
modules/python/vendors/FunASR/runtime/onnxruntime/src/alignedmem.h
vendored
Normal file
10
modules/python/vendors/FunASR/runtime/onnxruntime/src/alignedmem.h
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
|
||||
#ifndef ALIGNEDMEM_H
|
||||
#define ALIGNEDMEM_H
|
||||
|
||||
namespace funasr {
|
||||
extern void *AlignedMalloc(size_t alignment, size_t required_bytes);
|
||||
extern void AlignedFree(void *p);
|
||||
|
||||
} // namespace funasr
|
||||
#endif
|
||||
1427
modules/python/vendors/FunASR/runtime/onnxruntime/src/audio.cpp
vendored
Normal file
1427
modules/python/vendors/FunASR/runtime/onnxruntime/src/audio.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
180
modules/python/vendors/FunASR/runtime/onnxruntime/src/bias-lm.cpp
vendored
Normal file
180
modules/python/vendors/FunASR/runtime/onnxruntime/src/bias-lm.cpp
vendored
Normal file
@@ -0,0 +1,180 @@
|
||||
#include "bias-lm.h"
|
||||
#ifdef _WIN32
|
||||
#include "fst-types.cc"
|
||||
#endif
|
||||
namespace funasr {
|
||||
void print(std::queue<StateId> &q) {
|
||||
std::queue<StateId> data = q;
|
||||
while (!data.empty())
|
||||
{
|
||||
cout << data.front() << " ";
|
||||
data.pop();
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
void BiasLm::LoadCfgFromYaml(const char* filename, BiasLmOption &opt) {
|
||||
YAML::Node config;
|
||||
try {
|
||||
config = YAML::LoadFile(filename);
|
||||
} catch(exception const &e) {
|
||||
LOG(INFO) << "Error loading file, yaml file error or not exist.";
|
||||
exit(-1);
|
||||
}
|
||||
try {
|
||||
YAML::Node bias_lm_conf = config["bias_lm_conf"];
|
||||
opt_.incre_bias_ = bias_lm_conf["increment_weight"].as<float>();
|
||||
} catch(exception const &e) {
|
||||
}
|
||||
}
|
||||
|
||||
void BiasLm::BuildGraph(std::vector<std::vector<int>> &split_id_vec,
|
||||
std::vector<float> &custom_weight) {
|
||||
if (split_id_vec.empty()) {
|
||||
LOG(INFO) << "Skip building biaslm graph, hotword not exits.";
|
||||
return ;
|
||||
}
|
||||
assert(split_id_vec.size() == custom_weight.size());
|
||||
// Build prefix tree
|
||||
std::unique_ptr<fst::StdVectorFst> prefix_tree(new fst::StdVectorFst());
|
||||
StateId start_state = prefix_tree->AddState();
|
||||
prefix_tree->SetStart(start_state);
|
||||
int id = 0;
|
||||
for (auto& x : split_id_vec) {
|
||||
StateId state = start_state;
|
||||
StateId next_state = state;
|
||||
float w = custom_weight[id++];
|
||||
std::vector<int> split_id = x;
|
||||
for (int j = 0; j < split_id.size(); j++) {
|
||||
next_state = prefix_tree->AddState();
|
||||
if (j == split_id.size() - 1) {
|
||||
prefix_tree->SetFinal(next_state, w);
|
||||
}
|
||||
prefix_tree->AddArc(state, Arc(split_id[j], split_id[j], opt_.incre_bias_, next_state));
|
||||
state = next_state;
|
||||
}
|
||||
}
|
||||
graph_ = std::unique_ptr<fst::StdVectorFst>(new fst::StdVectorFst());
|
||||
fst::Determinize(*prefix_tree, graph_.get());
|
||||
|
||||
int num_node = graph_->NumStates();
|
||||
node_list_.resize(num_node);
|
||||
for (auto& x : split_id_vec) {
|
||||
StateId cur_state = 0;
|
||||
StateId next_state = 0;
|
||||
std::vector<int> split_id = x;
|
||||
for (int j = 0; j < split_id.size(); j++) {
|
||||
Matcher matcher(*graph_, fst::MATCH_INPUT);
|
||||
matcher.SetState(cur_state);
|
||||
if (matcher.Find(split_id[j])) {
|
||||
next_state = matcher.Value().nextstate;
|
||||
if (graph_->Final(next_state) != Weight::Zero()) {
|
||||
node_list_[next_state].is_final_ = true;
|
||||
}
|
||||
node_list_[next_state].score_ = opt_.incre_bias_ * (j + 1);
|
||||
cur_state = next_state;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Build Aho-Corasick Automata
|
||||
std::queue<StateId> q;
|
||||
Matcher matcher(*graph_, fst::MATCH_INPUT);
|
||||
// Back off state of all child nodes of the root node points to the root node
|
||||
for (ArcIterator aiter(*graph_, start_state); !aiter.Done(); aiter.Next()) {
|
||||
const Arc& arc = aiter.Value();
|
||||
node_list_[arc.nextstate].back_off_ = start_state;
|
||||
float back_off_score = (node_list_[arc.nextstate].is_final_ ? 0 :
|
||||
node_list_[start_state].score_ - node_list_[arc.nextstate].score_);
|
||||
graph_->AddArc(arc.nextstate, Arc(0, 0, back_off_score, start_state));
|
||||
q.push(arc.nextstate);
|
||||
}
|
||||
while (!q.empty()) {
|
||||
StateId state_id = q.front();
|
||||
q.pop();
|
||||
for (ArcIterator aiter(*graph_, state_id); !aiter.Done(); aiter.Next()) {
|
||||
const Arc& arc = aiter.Value();
|
||||
StateId next_state = arc.nextstate;
|
||||
StateId temp_state = node_list_[state_id].back_off_;
|
||||
if (next_state == start_state || next_state == temp_state) {
|
||||
continue;
|
||||
}
|
||||
while (true) {
|
||||
matcher.SetState(temp_state);
|
||||
if (matcher.Find(arc.ilabel)) {
|
||||
node_list_[next_state].back_off_ = matcher.Value().nextstate;
|
||||
break;
|
||||
} else if (temp_state == start_state) {
|
||||
node_list_[next_state].back_off_ = start_state;
|
||||
break;
|
||||
}
|
||||
temp_state = node_list_[temp_state].back_off_;
|
||||
}
|
||||
float back_off_score = (node_list_[next_state].is_final_ ? 0 :
|
||||
node_list_[node_list_[next_state].back_off_].score_ -
|
||||
node_list_[next_state].score_);
|
||||
graph_->AddArc(next_state, Arc(0, 0, back_off_score,
|
||||
node_list_[next_state].back_off_));
|
||||
q.push(next_state);
|
||||
}
|
||||
}
|
||||
fst::ArcSort(graph_.get(), fst::StdILabelCompare());
|
||||
//graph_->Write("graph.final.fst");
|
||||
}
|
||||
|
||||
float BiasLm::BiasLmScore(const StateId &his_state, const Label &lab, Label &new_state) {
|
||||
if (lab < 1 || lab > phn_set_.Size() || !graph_) { return VALUE_ZERO; }
|
||||
StateId cur_state = his_state;
|
||||
StateId next_state;
|
||||
float score = VALUE_ZERO;
|
||||
Matcher matcher(*graph_, fst::MATCH_INPUT);
|
||||
while (true) {
|
||||
StateId prev_state = cur_state;
|
||||
matcher.SetState(cur_state);
|
||||
if (matcher.Find(lab)) {
|
||||
next_state = matcher.Value().nextstate;
|
||||
score += matcher.Value().weight.Value();
|
||||
if (node_list_[next_state].is_final_) {
|
||||
score = score + graph_->Final(next_state).Value();
|
||||
}
|
||||
cur_state = next_state;
|
||||
break;
|
||||
} else {
|
||||
ArcIterator aiter(*graph_, cur_state);
|
||||
const Arc& arc = aiter.Value();
|
||||
if (arc.ilabel == 0) {
|
||||
score += arc.weight.Value();
|
||||
next_state = arc.nextstate;
|
||||
cur_state = next_state;
|
||||
}
|
||||
if (prev_state == ROOT_NODE && cur_state == ROOT_NODE) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
new_state = cur_state;
|
||||
return score;
|
||||
}
|
||||
|
||||
void BiasLm::VocabIdToPhnIdVector(int vocab_id, std::vector<int> &phn_ids) {
|
||||
bool is_oov = false;
|
||||
phn_ids.clear();
|
||||
std::string word = vocab_.Id2String(vocab_id);
|
||||
std::vector<std::string> phn_vec;
|
||||
Utf8ToCharset(word, phn_vec);
|
||||
for (auto& phn : phn_vec) {
|
||||
if (!phn_set_.Find(phn)) {
|
||||
is_oov = true;
|
||||
break;
|
||||
} else {
|
||||
phn_ids.push_back(phn_set_.String2Id(phn));
|
||||
}
|
||||
}
|
||||
if (is_oov) { phn_ids.clear(); }
|
||||
}
|
||||
|
||||
std::string BiasLm::GetPhoneLabel(int phone_id) {
|
||||
if (phone_id < 0 || phone_id >= phn_set_.Size()) { return ""; }
|
||||
return phn_set_.Id2String(phone_id);
|
||||
}
|
||||
}
|
||||
150
modules/python/vendors/FunASR/runtime/onnxruntime/src/bias-lm.h
vendored
Normal file
150
modules/python/vendors/FunASR/runtime/onnxruntime/src/bias-lm.h
vendored
Normal file
@@ -0,0 +1,150 @@
|
||||
#ifndef BIAS_LM_
|
||||
#define BIAS_LM_
|
||||
#include <assert.h>
|
||||
#include "util.h"
|
||||
#include "fst/fstlib.h"
|
||||
#include "phone-set.h"
|
||||
#include "vocab.h"
|
||||
#include "util/text-utils.h"
|
||||
#include <yaml-cpp/yaml.h>
|
||||
#ifdef _WIN32
|
||||
#include "win_func.h"
|
||||
#endif
|
||||
// node type
|
||||
#define ROOT_NODE 0
|
||||
#define VALUE_ZERO 0.0f
|
||||
|
||||
namespace funasr {
|
||||
typedef fst::StdArc Arc;
|
||||
typedef typename Arc::StateId StateId;
|
||||
typedef typename Arc::Weight Weight;
|
||||
typedef typename Arc::Label Label;
|
||||
typedef typename fst::SortedMatcher<fst::StdVectorFst> Matcher;
|
||||
typedef typename fst::ArcIterator<fst::StdVectorFst> ArcIterator;
|
||||
|
||||
class Node {
|
||||
public:
|
||||
Node() : score_(0.0f), is_final_(false), back_off_(-1) {}
|
||||
float score_;
|
||||
bool is_final_;
|
||||
StateId back_off_;
|
||||
};
|
||||
|
||||
class BiasLmOption {
|
||||
public:
|
||||
BiasLmOption() : incre_bias_(20.0f), scale_(1.0f) {}
|
||||
float incre_bias_;
|
||||
float scale_;
|
||||
};
|
||||
|
||||
class BiasLm {
|
||||
public:
|
||||
BiasLm(const string &hws_file, const string &cfg_file,
|
||||
const PhoneSet& phn_set, const Vocab& vocab) :
|
||||
phn_set_(phn_set), vocab_(vocab) {
|
||||
std::string line;
|
||||
std::ifstream ifs_hws(hws_file.c_str());
|
||||
std::vector<float> custom_weight;
|
||||
std::vector<std::vector<int>> split_id_vec;
|
||||
|
||||
struct timeval start, end;
|
||||
gettimeofday(&start, nullptr);
|
||||
|
||||
LoadCfgFromYaml(cfg_file.c_str(), opt_);
|
||||
while (getline(ifs_hws, line)) {
|
||||
Trim(&line);
|
||||
if (line.empty()) {
|
||||
continue;
|
||||
}
|
||||
float score = 1.0f;
|
||||
bool is_oov = false;
|
||||
std::vector<std::string> text;
|
||||
std::vector<std::string> split_str;
|
||||
std::vector<int> split_id;
|
||||
SplitStringToVector(line, "\t", true, &text);
|
||||
if (text.size() > 1) {
|
||||
score = std::stof(text[1]);
|
||||
}
|
||||
SplitChiEngCharacters(text[0], split_str);
|
||||
for (auto &str : split_str) {
|
||||
std::vector<string> lex_vec;
|
||||
std::string lex_str = vocab_.Word2Lex(str);
|
||||
SplitStringToVector(lex_str, " ", true, &lex_vec);
|
||||
for (auto &token : lex_vec) {
|
||||
split_id.push_back(phn_set_.String2Id(token));
|
||||
if (!phn_set_.Find(token)) {
|
||||
is_oov = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!is_oov) {
|
||||
split_id_vec.push_back(split_id);
|
||||
custom_weight.push_back(score);
|
||||
}
|
||||
}
|
||||
BuildGraph(split_id_vec, custom_weight);
|
||||
ifs_hws.close();
|
||||
|
||||
gettimeofday(&end, nullptr);
|
||||
long seconds = (end.tv_sec - start.tv_sec);
|
||||
long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
|
||||
LOG(INFO) << "Build bias lm takes " << (double)modle_init_micros / 1000000 << " s";
|
||||
}
|
||||
|
||||
BiasLm(unordered_map<string, int> &hws_map, int inc_bias,
|
||||
const PhoneSet& phn_set, const Vocab& vocab) :
|
||||
phn_set_(phn_set), vocab_(vocab) {
|
||||
std::vector<float> custom_weight;
|
||||
std::vector<std::vector<int>> split_id_vec;
|
||||
|
||||
struct timeval start, end;
|
||||
gettimeofday(&start, nullptr);
|
||||
opt_.incre_bias_ = inc_bias;
|
||||
for (const pair<string, int>& kv : hws_map) {
|
||||
float score = 1.0f;
|
||||
bool is_oov = false;
|
||||
std::vector<std::string> text;
|
||||
std::vector<std::string> split_str;
|
||||
std::vector<int> split_id;
|
||||
score = kv.second;
|
||||
SplitChiEngCharacters(kv.first, split_str);
|
||||
for (auto &str : split_str) {
|
||||
std::vector<string> lex_vec;
|
||||
std::string lex_str = vocab_.Word2Lex(str);
|
||||
SplitStringToVector(lex_str, " ", true, &lex_vec);
|
||||
for (auto &token : lex_vec) {
|
||||
split_id.push_back(phn_set_.String2Id(token));
|
||||
if (!phn_set_.Find(token)) {
|
||||
is_oov = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!is_oov) {
|
||||
split_id_vec.push_back(split_id);
|
||||
custom_weight.push_back(score);
|
||||
}
|
||||
}
|
||||
BuildGraph(split_id_vec, custom_weight);
|
||||
|
||||
gettimeofday(&end, nullptr);
|
||||
long seconds = (end.tv_sec - start.tv_sec);
|
||||
long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
|
||||
LOG(INFO) << "Build bias lm takes " << (double)modle_init_micros / 1000000 << " s";
|
||||
}
|
||||
|
||||
void BuildGraph(std::vector<std::vector<int>> &vec, std::vector<float> &wts);
|
||||
float BiasLmScore(const StateId &cur_state, const Label &lab, Label &new_state);
|
||||
void VocabIdToPhnIdVector(int vocab_id, std::vector<int> &phn_ids);
|
||||
void LoadCfgFromYaml(const char* filename, BiasLmOption &opt);
|
||||
std::string GetPhoneLabel(int phone_id);
|
||||
private:
|
||||
const PhoneSet& phn_set_;
|
||||
const Vocab& vocab_;
|
||||
std::unique_ptr<fst::StdVectorFst> graph_ = nullptr;
|
||||
std::vector<Node> node_list_;
|
||||
BiasLmOption opt_;
|
||||
};
|
||||
} // namespace funasr
|
||||
#endif // BIAS_LM_
|
||||
6
modules/python/vendors/FunASR/runtime/onnxruntime/src/common-struct.h
vendored
Normal file
6
modules/python/vendors/FunASR/runtime/onnxruntime/src/common-struct.h
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
|
||||
#ifndef COMMONSTRUCT_H
|
||||
#define COMMONSTRUCT_H
|
||||
|
||||
|
||||
#endif
|
||||
109
modules/python/vendors/FunASR/runtime/onnxruntime/src/commonfunc.h
vendored
Normal file
109
modules/python/vendors/FunASR/runtime/onnxruntime/src/commonfunc.h
vendored
Normal file
@@ -0,0 +1,109 @@
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
#ifdef _WIN32
|
||||
#include <codecvt>
|
||||
#endif
|
||||
|
||||
namespace funasr {
|
||||
typedef struct
|
||||
{
|
||||
std::string msg;
|
||||
std::string stamp;
|
||||
std::string stamp_sents;
|
||||
std::string tpass_msg;
|
||||
float snippet_time;
|
||||
}FUNASR_RECOG_RESULT;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
std::vector<std::vector<int>>* segments;
|
||||
float snippet_time;
|
||||
}FUNASR_VAD_RESULT;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
string msg;
|
||||
vector<string> arr_cache;
|
||||
}FUNASR_PUNC_RESULT;
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
#define ORTSTRING(str) StrToWstr(str)
|
||||
#define ORTCHAR(str) StrToWstr(str).c_str()
|
||||
|
||||
inline std::wstring String2wstring(const std::string& str, const std::string& locale)
|
||||
{
|
||||
typedef std::codecvt_byname<wchar_t, char, std::mbstate_t> F;
|
||||
std::wstring_convert<F> strCnv(new F(locale));
|
||||
return strCnv.from_bytes(str);
|
||||
}
|
||||
|
||||
inline std::wstring StrToWstr(std::string str) {
|
||||
if (str.length() == 0)
|
||||
return L"";
|
||||
return String2wstring(str, "zh-CN");
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ORTSTRING(str) str
|
||||
#define ORTCHAR(str) str
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
inline void GetInputName(Ort::Session* session, string& inputName,int nIndex=0) {
|
||||
size_t numInputNodes = session->GetInputCount();
|
||||
if (numInputNodes > 0) {
|
||||
Ort::AllocatorWithDefaultOptions allocator;
|
||||
{
|
||||
auto t = session->GetInputNameAllocated(nIndex, allocator);
|
||||
inputName = t.get();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void GetInputNames(Ort::Session* session, std::vector<std::string> &m_strInputNames,
|
||||
std::vector<const char *> &m_szInputNames) {
|
||||
Ort::AllocatorWithDefaultOptions allocator;
|
||||
size_t numNodes = session->GetInputCount();
|
||||
m_strInputNames.resize(numNodes);
|
||||
m_szInputNames.resize(numNodes);
|
||||
for (size_t i = 0; i != numNodes; ++i) {
|
||||
auto t = session->GetInputNameAllocated(i, allocator);
|
||||
m_strInputNames[i] = t.get();
|
||||
m_szInputNames[i] = m_strInputNames[i].c_str();
|
||||
}
|
||||
}
|
||||
|
||||
inline void GetOutputName(Ort::Session* session, string& outputName, int nIndex = 0) {
|
||||
size_t numOutputNodes = session->GetOutputCount();
|
||||
if (numOutputNodes > 0) {
|
||||
Ort::AllocatorWithDefaultOptions allocator;
|
||||
{
|
||||
auto t = session->GetOutputNameAllocated(nIndex, allocator);
|
||||
outputName = t.get();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void GetOutputNames(Ort::Session* session, std::vector<std::string> &m_strOutputNames,
|
||||
std::vector<const char *> &m_szOutputNames) {
|
||||
Ort::AllocatorWithDefaultOptions allocator;
|
||||
size_t numNodes = session->GetOutputCount();
|
||||
m_strOutputNames.resize(numNodes);
|
||||
m_szOutputNames.resize(numNodes);
|
||||
for (size_t i = 0; i != numNodes; ++i) {
|
||||
auto t = session->GetOutputNameAllocated(i, allocator);
|
||||
m_strOutputNames[i] = t.get();
|
||||
m_szOutputNames[i] = m_strOutputNames[i].c_str();
|
||||
}
|
||||
}
|
||||
|
||||
template <class ForwardIterator>
|
||||
inline static size_t Argmax(ForwardIterator first, ForwardIterator last) {
|
||||
return std::distance(first, std::max_element(first, last));
|
||||
}
|
||||
} // namespace funasr
|
||||
268
modules/python/vendors/FunASR/runtime/onnxruntime/src/ct-transformer-online.cpp
vendored
Normal file
268
modules/python/vendors/FunASR/runtime/onnxruntime/src/ct-transformer-online.cpp
vendored
Normal file
@@ -0,0 +1,268 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
CTTransformerOnline::CTTransformerOnline()
|
||||
:env_(ORT_LOGGING_LEVEL_ERROR, ""),session_options{}
|
||||
{
|
||||
}
|
||||
|
||||
void CTTransformerOnline::InitPunc(const std::string &punc_model, const std::string &punc_config, const std::string &token_file, int thread_num){
|
||||
session_options.SetIntraOpNumThreads(thread_num);
|
||||
session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
|
||||
session_options.DisableCpuMemArena();
|
||||
|
||||
try{
|
||||
m_session = std::make_unique<Ort::Session>(env_, ORTSTRING(punc_model).c_str(), session_options);
|
||||
LOG(INFO) << "Successfully load model from " << punc_model;
|
||||
}
|
||||
catch (std::exception const &e) {
|
||||
LOG(ERROR) << "Error when load punc onnx model: " << e.what();
|
||||
exit(-1);
|
||||
}
|
||||
// read inputnames outputnames
|
||||
GetInputNames(m_session.get(), m_strInputNames, m_szInputNames);
|
||||
GetOutputNames(m_session.get(), m_strOutputNames, m_szOutputNames);
|
||||
|
||||
m_tokenizer.OpenYaml(punc_config.c_str(), token_file.c_str());
|
||||
}
|
||||
|
||||
CTTransformerOnline::~CTTransformerOnline()
|
||||
{
|
||||
}
|
||||
|
||||
string CTTransformerOnline::AddPunc(const char* sz_input, vector<string> &arr_cache, std::string language)
|
||||
{
|
||||
string strResult;
|
||||
vector<string> strOut;
|
||||
vector<int> InputData;
|
||||
string strText; //full_text
|
||||
strText = accumulate(arr_cache.begin(), arr_cache.end(), strText);
|
||||
strText += sz_input; // full_text = precache + text
|
||||
m_tokenizer.Tokenize(strText.c_str(), strOut, InputData);
|
||||
|
||||
int nTotalBatch = ceil((float)InputData.size() / TOKEN_LEN);
|
||||
int nCurBatch = -1;
|
||||
int nSentEnd = -1, nLastCommaIndex = -1;
|
||||
vector<int32_t> RemainIDs; //
|
||||
vector<string> RemainStr; //
|
||||
vector<int> new_mini_sentence_punc; // sentence_punc_list = []
|
||||
vector<string> sentenceOut; // sentenceOut
|
||||
vector<string> sentence_punc_list,sentence_words_list,sentence_punc_list_out; // sentence_words_list = []
|
||||
|
||||
int nSkipNum = 0;
|
||||
int nDiff = 0;
|
||||
for (size_t i = 0; i < InputData.size(); i += TOKEN_LEN)
|
||||
{
|
||||
nDiff = (i + TOKEN_LEN) < InputData.size() ? (0) : (i + TOKEN_LEN - InputData.size());
|
||||
vector<int32_t> InputIDs(InputData.begin() + i, InputData.begin() + i + (TOKEN_LEN - nDiff));
|
||||
vector<string> InputStr(strOut.begin() + i, strOut.begin() + i + (TOKEN_LEN - nDiff));
|
||||
InputIDs.insert(InputIDs.begin(), RemainIDs.begin(), RemainIDs.end()); // RemainIDs+InputIDs;
|
||||
InputStr.insert(InputStr.begin(), RemainStr.begin(), RemainStr.end()); // RemainStr+InputStr;
|
||||
|
||||
auto Punction = Infer(InputIDs, arr_cache.size());
|
||||
nCurBatch = i / TOKEN_LEN;
|
||||
if (nCurBatch < nTotalBatch - 1) // not the last minisetence
|
||||
{
|
||||
nSentEnd = -1;
|
||||
nLastCommaIndex = -1;
|
||||
for (int nIndex = Punction.size() - 2; nIndex > 0; nIndex--)
|
||||
{
|
||||
if (m_tokenizer.Id2Punc(Punction[nIndex]) == m_tokenizer.Id2Punc(PERIOD_INDEX) || m_tokenizer.Id2Punc(Punction[nIndex]) == m_tokenizer.Id2Punc(QUESTION_INDEX))
|
||||
{
|
||||
nSentEnd = nIndex;
|
||||
break;
|
||||
}
|
||||
if (nLastCommaIndex < 0 && m_tokenizer.Id2Punc(Punction[nIndex]) == m_tokenizer.Id2Punc(COMMA_INDEX))
|
||||
{
|
||||
nLastCommaIndex = nIndex;
|
||||
}
|
||||
}
|
||||
if (nSentEnd < 0 && InputStr.size() > CACHE_POP_TRIGGER_LIMIT && nLastCommaIndex > 0)
|
||||
{
|
||||
nSentEnd = nLastCommaIndex;
|
||||
Punction[nSentEnd] = PERIOD_INDEX;
|
||||
}
|
||||
RemainStr.assign(InputStr.begin() + (nSentEnd + 1), InputStr.end());
|
||||
RemainIDs.assign(InputIDs.begin() + (nSentEnd + 1), InputIDs.end());
|
||||
InputStr.assign(InputStr.begin(), InputStr.begin() + (nSentEnd + 1)); // minit_sentence
|
||||
Punction.assign(Punction.begin(), Punction.begin() + (nSentEnd + 1));
|
||||
}
|
||||
|
||||
for (auto& item : Punction)
|
||||
{
|
||||
sentence_punc_list.push_back(m_tokenizer.Id2Punc(item));
|
||||
}
|
||||
|
||||
sentence_words_list.insert(sentence_words_list.end(), InputStr.begin(), InputStr.end());
|
||||
|
||||
new_mini_sentence_punc.insert(new_mini_sentence_punc.end(), Punction.begin(), Punction.end());
|
||||
}
|
||||
vector<string> WordWithPunc;
|
||||
for (int i = 0; i < sentence_words_list.size(); i++) // for i in range(0, len(sentence_words_list)):
|
||||
{
|
||||
if (!(sentence_words_list[i][0] & 0x80) && (i + 1) < sentence_words_list.size() && !(sentence_words_list[i + 1][0] & 0x80))
|
||||
{
|
||||
sentence_words_list[i] = " " + sentence_words_list[i];
|
||||
}
|
||||
if (nSkipNum < arr_cache.size()) // if skip_num < len(cache):
|
||||
nSkipNum++;
|
||||
else
|
||||
WordWithPunc.push_back(sentence_words_list[i]);
|
||||
|
||||
if (nSkipNum >= arr_cache.size())
|
||||
{
|
||||
sentence_punc_list_out.push_back(sentence_punc_list[i]);
|
||||
if (sentence_punc_list[i] != NOTPUNC)
|
||||
{
|
||||
WordWithPunc.push_back(sentence_punc_list[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sentenceOut.insert(sentenceOut.end(), WordWithPunc.begin(), WordWithPunc.end()); //
|
||||
nSentEnd = -1;
|
||||
for (int i = sentence_punc_list.size() - 2; i > 0; i--)
|
||||
{
|
||||
if (new_mini_sentence_punc[i] == PERIOD_INDEX || new_mini_sentence_punc[i] == QUESTION_INDEX)
|
||||
{
|
||||
nSentEnd = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
arr_cache.assign(sentence_words_list.begin() + (nSentEnd + 1), sentence_words_list.end());
|
||||
|
||||
if (sentenceOut.size() > 0 && m_tokenizer.IsPunc(sentenceOut[sentenceOut.size() - 1]))
|
||||
{
|
||||
sentenceOut.assign(sentenceOut.begin(), sentenceOut.end() - 1);
|
||||
sentence_punc_list_out[sentence_punc_list_out.size() - 1] = m_tokenizer.Id2Punc(NOTPUNC_INDEX);
|
||||
}
|
||||
return accumulate(sentenceOut.begin(), sentenceOut.end(), string(""));
|
||||
}
|
||||
|
||||
vector<int> CTTransformerOnline::Infer(vector<int32_t> input_data, int nCacheSize)
|
||||
{
|
||||
Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
|
||||
vector<int> punction;
|
||||
std::array<int64_t, 2> input_shape_{ 1, (int64_t)input_data.size()};
|
||||
Ort::Value onnx_input = Ort::Value::CreateTensor(
|
||||
m_memoryInfo,
|
||||
input_data.data(),
|
||||
input_data.size() * sizeof(int32_t),
|
||||
input_shape_.data(),
|
||||
input_shape_.size(), ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32);
|
||||
|
||||
std::array<int32_t,1> text_lengths{ (int32_t)input_data.size() };
|
||||
std::array<int64_t,1> text_lengths_dim{ 1 };
|
||||
Ort::Value onnx_text_lengths = Ort::Value::CreateTensor<int32_t>(
|
||||
m_memoryInfo,
|
||||
text_lengths.data(),
|
||||
text_lengths.size(),
|
||||
text_lengths_dim.data(),
|
||||
text_lengths_dim.size()); //, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32);
|
||||
|
||||
//vad_mask
|
||||
// vector<float> arVadMask,arSubMask;
|
||||
vector<float> arVadMask;
|
||||
int nTextLength = input_data.size();
|
||||
|
||||
VadMask(nTextLength, nCacheSize, arVadMask);
|
||||
// Triangle(nTextLength, arSubMask);
|
||||
std::array<int64_t, 4> VadMask_Dim{ 1,1, nTextLength ,nTextLength };
|
||||
Ort::Value onnx_vad_mask = Ort::Value::CreateTensor<float>(
|
||||
m_memoryInfo,
|
||||
arVadMask.data(),
|
||||
arVadMask.size(), // * sizeof(float),
|
||||
VadMask_Dim.data(),
|
||||
VadMask_Dim.size()); // , ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
|
||||
//sub_masks
|
||||
|
||||
std::array<int64_t, 4> SubMask_Dim{ 1,1, nTextLength ,nTextLength };
|
||||
Ort::Value onnx_sub_mask = Ort::Value::CreateTensor<float>(
|
||||
m_memoryInfo,
|
||||
arVadMask.data(),
|
||||
arVadMask.size(),
|
||||
SubMask_Dim.data(),
|
||||
SubMask_Dim.size()); // , ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
|
||||
|
||||
std::vector<Ort::Value> input_onnx;
|
||||
input_onnx.emplace_back(std::move(onnx_input));
|
||||
input_onnx.emplace_back(std::move(onnx_text_lengths));
|
||||
input_onnx.emplace_back(std::move(onnx_vad_mask));
|
||||
input_onnx.emplace_back(std::move(onnx_sub_mask));
|
||||
|
||||
try {
|
||||
auto outputTensor = m_session->Run(Ort::RunOptions{nullptr}, m_szInputNames.data(), input_onnx.data(), m_szInputNames.size(), m_szOutputNames.data(), m_szOutputNames.size());
|
||||
std::vector<int64_t> outputShape = outputTensor[0].GetTensorTypeAndShapeInfo().GetShape();
|
||||
|
||||
int64_t outputCount = std::accumulate(outputShape.begin(), outputShape.end(), 1, std::multiplies<int64_t>());
|
||||
float * floatData = outputTensor[0].GetTensorMutableData<float>();
|
||||
|
||||
for (int i = 0; i < outputCount; i += CANDIDATE_NUM)
|
||||
{
|
||||
int index = Argmax(floatData + i, floatData + i + CANDIDATE_NUM-1);
|
||||
punction.push_back(index);
|
||||
}
|
||||
}
|
||||
catch (std::exception const &e)
|
||||
{
|
||||
LOG(ERROR) << "Error when run punc onnx forword: " << (e.what());
|
||||
}
|
||||
return punction;
|
||||
}
|
||||
|
||||
void CTTransformerOnline::VadMask(int nSize, int vad_pos, vector<float>& Result)
|
||||
{
|
||||
Result.resize(0);
|
||||
Result.assign(nSize * nSize, 1);
|
||||
if (vad_pos <= 0 || vad_pos >= nSize)
|
||||
{
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < vad_pos-1; i++)
|
||||
{
|
||||
for (int j = vad_pos; j < nSize; j++)
|
||||
{
|
||||
Result[i * nSize + j] = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CTTransformerOnline::Triangle(int text_length, vector<float>& Result)
|
||||
{
|
||||
Result.resize(0);
|
||||
Result.assign(text_length * text_length,1); // generate a zeros: text_length x text_length
|
||||
|
||||
for (int i = 0; i < text_length; i++) // rows
|
||||
{
|
||||
for (int j = i+1; j<text_length; j++) //cols
|
||||
{
|
||||
Result[i * text_length + j] = 0.0f;
|
||||
}
|
||||
|
||||
}
|
||||
//Transport(Result, text_length, text_length);
|
||||
}
|
||||
|
||||
void CTTransformerOnline::Transport(vector<float>& In,int nRows, int nCols)
|
||||
{
|
||||
vector<float> Out;
|
||||
Out.resize(nRows * nCols);
|
||||
int i = 0;
|
||||
for (int j = 0; j < nCols; j++) {
|
||||
for (; i < nRows * nCols; i++) {
|
||||
Out[i] = In[j + nCols * (i % nRows)];
|
||||
if ((i + 1) % nRows == 0) {
|
||||
i++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
In = Out;
|
||||
}
|
||||
|
||||
} // namespace funasr
|
||||
37
modules/python/vendors/FunASR/runtime/onnxruntime/src/ct-transformer-online.h
vendored
Normal file
37
modules/python/vendors/FunASR/runtime/onnxruntime/src/ct-transformer-online.h
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace funasr {
|
||||
// Streaming (online) punctuation model backed by an ONNX Runtime session.
// Unlike CTTransformer, callers carry a text cache (`arr_cache`) between
// AddPunc calls so punctuation can be predicted incrementally.
class CTTransformerOnline : public PuncModel {
    /**
     * Author: Speech Lab of DAMO Academy, Alibaba Group
     * CT-Transformer: Controllable time-delay transformer for real-time punctuation prediction and disfluency detection
     * https://arxiv.org/pdf/2003.01309.pdf
     */
private:
    CTokenizer m_tokenizer;                           // text <-> token-id / punctuation-id mapping
    vector<string> m_strInputNames, m_strOutputNames; // owned copies of the model's tensor names
    vector<const char*> m_szInputNames;               // C-string views into m_strInputNames for the ORT C API
    vector<const char*> m_szOutputNames;              // C-string views into m_strOutputNames

    std::shared_ptr<Ort::Session> m_session;          // ONNX Runtime inference session
    Ort::Env env_;
    Ort::SessionOptions session_options;
public:
    CTTransformerOnline();
    // Load the ONNX model and tokenizer resources; exits on failure.
    void InitPunc(const std::string &punc_model, const std::string &punc_config, const std::string &token_file, int thread_num);
    ~CTTransformerOnline();
    // Forward pass over token ids; nCacheSize presumably marks how many
    // leading tokens come from the carried cache — TODO confirm in the .cpp.
    vector<int> Infer(vector<int32_t> input_data, int nCacheSize);
    // Punctuate sz_input; unfinished trailing text is carried across calls in arr_cache.
    string AddPunc(const char* sz_input, vector<string> &arr_cache, std::string language="zh-cn");
    // In-place transpose of In viewed as an nRows x nCols row-major matrix.
    void Transport(vector<float>& In, int nRows, int nCols);
    // Builds a size x size mask split at vad_pos (definition not shown here).
    void VadMask(int size, int vad_pos,vector<float>& Result);
    // Builds a triangular (causal) mask of text_length x text_length (definition not shown here).
    void Triangle(int text_length, vector<float>& Result);
};
|
||||
} // namespace funasr
|
||||
201
modules/python/vendors/FunASR/runtime/onnxruntime/src/ct-transformer.cpp
vendored
Normal file
201
modules/python/vendors/FunASR/runtime/onnxruntime/src/ct-transformer.cpp
vendored
Normal file
@@ -0,0 +1,201 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
// Construct an idle model: only the ORT environment (error-level logging) and
// default session options are set up; the session itself is created in InitPunc().
CTTransformer::CTTransformer()
:env_(ORT_LOGGING_LEVEL_ERROR, ""),session_options{}
{
}
|
||||
|
||||
// Load the punctuation ONNX model and the tokenizer resources.
//   punc_model:  path to the .onnx model file
//   punc_config: yaml config consumed by the tokenizer (also used for jieba init)
//   token_file:  token vocabulary file
//   thread_num:  ONNX Runtime intra-op thread count
// Exits the whole process if the model cannot be loaded.
void CTTransformer::InitPunc(const std::string &punc_model, const std::string &punc_config, const std::string &token_file, int thread_num){
    session_options.SetIntraOpNumThreads(thread_num);
    session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
    // Arena disabled — presumably to avoid retaining peak memory for
    // variable-length punctuation inputs; TODO confirm.
    session_options.DisableCpuMemArena();

    try{
        m_session = std::make_unique<Ort::Session>(env_, ORTSTRING(punc_model).c_str(), session_options);
        LOG(INFO) << "Successfully load model from " << punc_model;
    }
    catch (std::exception const &e) {
        LOG(ERROR) << "Error when load punc onnx model: " << e.what();
        exit(-1);
    }
    // Cache the model's input/output tensor names once; Run() uses the raw
    // char* views in m_szInputNames/m_szOutputNames.
    GetInputNames(m_session.get(), m_strInputNames, m_szInputNames);
    GetOutputNames(m_session.get(), m_strOutputNames, m_szOutputNames);

    m_tokenizer.OpenYaml(punc_config.c_str(), token_file.c_str());
    m_tokenizer.JiebaInit(punc_config);
}
|
||||
|
||||
// Nothing to release explicitly: the ORT session is owned by a smart pointer.
CTTransformer::~CTTransformer()
{
}
|
||||
|
||||
// Insert punctuation into sz_input (offline mode).
// The text is tokenized, processed in mini-batches of TOKEN_LEN tokens, and
// any tokens after the last confirmed sentence boundary are carried into the
// next batch so sentences are not punctuated across an arbitrary batch edge.
// For language == "en-bpe" the fullwidth CJK punctuation emitted by the model
// is mapped back to ASCII punctuation before returning.
string CTTransformer::AddPunc(const char* sz_input, std::string language)
{
    string strResult;
    vector<string> strOut;
    vector<int> InputData;
    m_tokenizer.Tokenize(sz_input, strOut, InputData);

    int nTotalBatch = ceil((float)InputData.size() / TOKEN_LEN);
    int nCurBatch = -1;
    int nSentEnd = -1, nLastCommaIndex = -1;
    vector<int32_t> RemainIDs; // token ids carried over from the previous batch
    vector<string> RemainStr;  // token strings carried over from the previous batch
    vector<int> NewPunctuation; // punctuation ids accumulated over all batches
    vector<string> NewString;   // words (with punctuation interleaved) accumulated so far
    vector<string> NewSentenceOut;
    vector<int> NewPuncOut;
    int nDiff = 0;
    for (size_t i = 0; i < InputData.size(); i += TOKEN_LEN)
    {
        // nDiff is how far the last batch falls short of a full TOKEN_LEN.
        nDiff = (i + TOKEN_LEN) < InputData.size() ? (0) : (i + TOKEN_LEN - InputData.size());
        vector<int32_t> InputIDs(InputData.begin() + i, InputData.begin() + i + (TOKEN_LEN - nDiff));
        vector<string> InputStr(strOut.begin() + i, strOut.begin() + i + (TOKEN_LEN - nDiff));
        InputIDs.insert(InputIDs.begin(), RemainIDs.begin(), RemainIDs.end()); // RemainIDs+InputIDs;
        InputStr.insert(InputStr.begin(), RemainStr.begin(), RemainStr.end()); // RemainStr+InputStr;

        auto Punction = Infer(InputIDs);
        nCurBatch = i / TOKEN_LEN;
        if (nCurBatch < nTotalBatch - 1) // not the last mini-sentence
        {
            // Scan backwards for the last sentence-final mark (period/question);
            // also remember the last comma as a fallback split point.
            nSentEnd = -1;
            nLastCommaIndex = -1;
            for (int nIndex = Punction.size() - 2; nIndex > 0; nIndex--)
            {
                if (m_tokenizer.Id2Punc(Punction[nIndex]) == m_tokenizer.Id2Punc(PERIOD_INDEX) || m_tokenizer.Id2Punc(Punction[nIndex]) == m_tokenizer.Id2Punc(QUESTION_INDEX))
                {
                    nSentEnd = nIndex;
                    break;
                }
                if (nLastCommaIndex < 0 && m_tokenizer.Id2Punc(Punction[nIndex]) == m_tokenizer.Id2Punc(COMMA_INDEX))
                {
                    nLastCommaIndex = nIndex;
                }
            }
            // No sentence end found and the carried text is getting long:
            // promote the last comma to a period so the cache cannot grow unboundedly.
            if (nSentEnd < 0 && InputStr.size() > CACHE_POP_TRIGGER_LIMIT && nLastCommaIndex > 0)
            {
                nSentEnd = nLastCommaIndex;
                Punction[nSentEnd] = PERIOD_INDEX;
            }
            // Everything after the boundary is carried into the next batch.
            RemainStr.assign(InputStr.begin() + (nSentEnd + 1), InputStr.end());
            RemainIDs.assign(InputIDs.begin() + (nSentEnd + 1), InputIDs.end());
            InputStr.assign(InputStr.begin(), InputStr.begin() + (nSentEnd + 1)); // mini_sentence
            Punction.assign(Punction.begin(), Punction.begin() + (nSentEnd + 1));
        }

        NewPunctuation.insert(NewPunctuation.end(), Punction.begin(), Punction.end());
        vector<string> WordWithPunc;
        for (int i = 0; i < InputStr.size(); i++)
        {
            // if (i > 0 && !(InputStr[i][0] & 0x80) && (i + 1) <InputStr.size() && !(InputStr[i+1][0] & 0x80)) // (older variant of the ASCII-neighbour check)
            // Two adjacent ASCII (non-UTF-8-multibyte) tokens, i.e. English words:
            // insert a separating space.
            if (i > 0 && !(InputStr[i-1][0] & 0x80) && !(InputStr[i][0] & 0x80))
            {
                InputStr[i] = " " + InputStr[i];
            }
            WordWithPunc.push_back(InputStr[i]);

            // Append the predicted punctuation mark, if any, after the word.
            if (Punction[i] != NOTPUNC_INDEX)
            {
                WordWithPunc.push_back(m_tokenizer.Id2Punc(Punction[i]));
            }
        }

        NewString.insert(NewString.end(), WordWithPunc.begin(), WordWithPunc.end()); // new_mini_sentence += "".join(words_with_punc)
        NewSentenceOut = NewString;
        NewPuncOut = NewPunctuation;
        // last mini sentence: force a sentence-final mark at the very end
        if(nCurBatch == nTotalBatch - 1)
        {
            // Trailing comma / enumeration comma is replaced by a period.
            if (NewString[NewString.size() - 1] == m_tokenizer.Id2Punc(COMMA_INDEX) || NewString[NewString.size() - 1] == m_tokenizer.Id2Punc(DUN_INDEX))
            {
                NewSentenceOut.assign(NewString.begin(), NewString.end() - 1);
                NewSentenceOut.push_back(m_tokenizer.Id2Punc(PERIOD_INDEX));
                NewPuncOut.assign(NewPunctuation.begin(), NewPunctuation.end() - 1);
                NewPuncOut.push_back(PERIOD_INDEX);
            }
            // No sentence-final mark at all: append a period.
            else if (NewString[NewString.size() - 1] != m_tokenizer.Id2Punc(PERIOD_INDEX) && NewString[NewString.size() - 1] != m_tokenizer.Id2Punc(QUESTION_INDEX))
            {
                NewSentenceOut = NewString;
                NewSentenceOut.push_back(m_tokenizer.Id2Punc(PERIOD_INDEX));
                NewPuncOut = NewPunctuation;
                NewPuncOut.push_back(PERIOD_INDEX);
            }
        }
    }

    for (auto& item : NewSentenceOut){
        strResult += item;
    }

    if(language == "en-bpe"){
        // Map fullwidth CJK punctuation back to ASCII. Each of these symbols
        // is exactly 3 bytes in UTF-8, hence the replace(pos, 3, ...) below.
        std::vector<std::string> chineseSymbols;
        chineseSymbols.push_back(",");
        chineseSymbols.push_back("。");
        chineseSymbols.push_back("、");
        chineseSymbols.push_back("?");

        std::string englishSymbols = ",.,?";
        for (size_t i = 0; i < chineseSymbols.size(); i++) {
            size_t pos = 0;
            while ((pos = strResult.find(chineseSymbols[i], pos)) != std::string::npos) {
                // replace 3 UTF-8 bytes with one ASCII character
                strResult.replace(pos, 3, 1, englishSymbols[i]);
                pos++;
            }
        }
    }

    return strResult;
}
|
||||
|
||||
// Run the punctuation model on one batch of token ids.
// Returns one punctuation-class index per input token (empty on inference
// failure — the error is logged, not propagated).
vector<int> CTTransformer::Infer(vector<int32_t> input_data)
{
    Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    vector<int> punction;
    // Input tensor: shape [1, seq_len], wrapping input_data without a copy.
    std::array<int64_t, 2> input_shape_{ 1, (int64_t)input_data.size()};
    Ort::Value onnx_input = Ort::Value::CreateTensor<int32_t>(
        m_memoryInfo,
        input_data.data(),
        input_data.size(),
        input_shape_.data(),
        input_shape_.size());

    // Second input: the sequence length as a 1-element int32 tensor.
    std::array<int32_t,1> text_lengths{ (int32_t)input_data.size() };
    std::array<int64_t,1> text_lengths_dim{ 1 };
    Ort::Value onnx_text_lengths = Ort::Value::CreateTensor(
        m_memoryInfo,
        text_lengths.data(),
        text_lengths.size() * sizeof(int32_t),
        text_lengths_dim.data(),
        text_lengths_dim.size(), ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32);
    std::vector<Ort::Value> input_onnx;
    input_onnx.emplace_back(std::move(onnx_input));
    input_onnx.emplace_back(std::move(onnx_text_lengths));

    try {
        auto outputTensor = m_session->Run(Ort::RunOptions{nullptr}, m_szInputNames.data(), input_onnx.data(), m_szInputNames.size(), m_szOutputNames.data(), m_szOutputNames.size());
        std::vector<int64_t> outputShape = outputTensor[0].GetTensorTypeAndShapeInfo().GetShape();

        // Total number of floats in the output logits tensor.
        int64_t outputCount = std::accumulate(outputShape.begin(), outputShape.end(), 1, std::multiplies<int64_t>());
        float * floatData = outputTensor[0].GetTensorMutableData<float>();

        // The output is CANDIDATE_NUM punctuation logits per token; take the
        // argmax of each group.
        for (int i = 0; i < outputCount; i += CANDIDATE_NUM)
        {
            // NOTE(review): if Argmax takes a half-open [first, last) range,
            // `+ CANDIDATE_NUM - 1` excludes the last candidate class —
            // confirm against Argmax's definition.
            int index = Argmax(floatData + i, floatData + i + CANDIDATE_NUM-1);
            punction.push_back(index);
        }
    }
    catch (std::exception const &e)
    {
        LOG(ERROR) << "Error when run punc onnx forword: " << (e.what());
    }
    return punction;
}
|
||||
|
||||
} // namespace funasr
|
||||
34
modules/python/vendors/FunASR/runtime/onnxruntime/src/ct-transformer.h
vendored
Normal file
34
modules/python/vendors/FunASR/runtime/onnxruntime/src/ct-transformer.h
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace funasr {
|
||||
// Offline (whole-utterance) punctuation model backed by an ONNX Runtime session.
class CTTransformer : public PuncModel {
    /**
     * Author: Speech Lab of DAMO Academy, Alibaba Group
     * CT-Transformer: Controllable time-delay transformer for real-time punctuation prediction and disfluency detection
     * https://arxiv.org/pdf/2003.01309.pdf
     */
private:
    CTokenizer m_tokenizer;                           // text <-> token-id / punctuation-id mapping
    vector<string> m_strInputNames, m_strOutputNames; // owned copies of the model's tensor names
    vector<const char*> m_szInputNames;               // C-string views into m_strInputNames for the ORT C API
    vector<const char*> m_szOutputNames;              // C-string views into m_strOutputNames

    std::shared_ptr<Ort::Session> m_session;          // ONNX Runtime inference session
    Ort::Env env_;
    Ort::SessionOptions session_options;
public:
    CTTransformer();
    // Load the ONNX model and tokenizer resources; exits on failure.
    void InitPunc(const std::string &punc_model, const std::string &punc_config, const std::string &token_file, int thread_num);
    ~CTTransformer();
    // Forward pass: one punctuation-class index per input token.
    vector<int> Infer(vector<int32_t> input_data);
    // Punctuate the whole input string in TOKEN_LEN-sized mini-batches.
    string AddPunc(const char* sz_input, std::string language="zh-cn");
};
|
||||
} // namespace funasr
|
||||
785
modules/python/vendors/FunASR/runtime/onnxruntime/src/e2e-vad.h
vendored
Normal file
785
modules/python/vendors/FunASR/runtime/onnxruntime/src/e2e-vad.h
vendored
Normal file
@@ -0,0 +1,785 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
* Contributed by zhuzizyf(China Telecom).
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <cassert>
|
||||
|
||||
namespace funasr {
|
||||
// Overall VAD state machine for one utterance.
enum class VadStateMachine {
    kVadInStateStartPointNotDetected = 1,  // still waiting for speech to begin
    kVadInStateInSpeechSegment = 2,        // inside a confirmed speech segment
    kVadInStateEndPointDetected = 3        // end point found; detection finished
};
|
||||
|
||||
// Raw per-frame classification, before window smoothing.
enum class FrameState {
    kFrameStateInvalid = -1,  // frame could not be classified
    kFrameStateSpeech = 1,
    kFrameStateSil = 0
};
|
||||
|
||||
// final voice/unvoice state per frame
// Transition event reported by WindowDetector after smoothing one frame.
enum class AudioChangeState {
    kChangeStateSpeech2Speech = 0,
    kChangeStateSpeech2Sil = 1,
    kChangeStateSil2Sil = 2,
    kChangeStateSil2Speech = 3,
    kChangeStateNoBegin = 4,
    kChangeStateInvalid = 5
};
|
||||
|
||||
// Whether detection stops after the first utterance or keeps segmenting.
enum class VadDetectMode {
    kVadSingleUtteranceDetectMode = 0,
    kVadMutipleUtteranceDetectMode = 1
};
|
||||
|
||||
// Tunables for the end-to-end VAD. The constructor parameters are
// abbreviations of the member names and map to them 1:1 in declaration order.
class VADXOptions {
public:
    int sample_rate;                   // audio sample rate in Hz
    int detect_mode;                   // a VadDetectMode value (stored as int)
    int snr_mode;
    int max_end_silence_time;          // ms of trailing silence that closes a segment
    int max_start_silence_time;        // ms of leading silence tolerated before speech
    bool do_start_point_detection;
    bool do_end_point_detection;
    int window_size_ms;                // smoothing window for WindowDetector (ms)
    int sil_to_speech_time_thres;      // ms threshold: silence -> speech
    int speech_to_sil_time_thres;      // ms threshold: speech -> silence
    float speech_2_noise_ratio;        // exponent applied to the noise probability (see GetFrameState)
    int do_extend;                     // extend segments by lookback/lookahead
    int lookback_time_start_point;     // ms added before a detected start point
    int lookahead_time_end_point;      // ms added after a detected end point
    int max_single_segment_time;       // ms cap on one segment's length
    int nn_eval_block_size;            // frames scored per NN evaluation (overwritten per chunk in ComputeScores)
    int dcd_block_size;
    float snr_thres;                   // minimum SNR (dB) for a speech frame
    int noise_frame_num_used_for_snr;  // smoothing horizon for the noise-level estimate
    float decibel_thres;               // absolute energy floor (dB) for speech
    float speech_noise_thres;          // margin speech prob must exceed noise prob by
    float fe_prior_thres;
    int silence_pdf_num;               // must equal sil_pdf_ids.size() (asserted in GetFrameState)
    std::vector<int> sil_pdf_ids;      // columns of the score matrix treated as silence
    float speech_noise_thresh_low;
    float speech_noise_thresh_high;
    bool output_frame_probs;           // record per-frame probabilities in frame_probs
    int frame_in_ms;                   // frame shift (ms)
    int frame_length_ms;               // frame length (ms)

    explicit VADXOptions(
        int sr = 16000,
        int dm = static_cast<int>(VadDetectMode::kVadMutipleUtteranceDetectMode),
        int sm = 0,
        int mset = 800,
        int msst = 3000,
        bool dspd = true,
        bool depd = true,
        int wsm = 200,
        int ststh = 150,
        int sttsh = 150,
        float s2nr = 1.0,
        int de = 1,
        int lbtps = 200,
        int latsp = 100,
        int mss = 15000,
        int nebs = 8,
        int dbs = 4,
        float st = -100.0,
        int nfnus = 100,
        float dt = -100.0,
        float snt = 0.9,
        float fept = 1e-4,
        int spn = 1,
        std::vector<int> spids = {0},
        float sntl = -0.1,
        float snth = 0.3,
        bool ofp = false,
        int fim = 10,
        int flm = 25
    ) :
        sample_rate(sr),
        detect_mode(dm),
        snr_mode(sm),
        max_end_silence_time(mset),
        max_start_silence_time(msst),
        do_start_point_detection(dspd),
        do_end_point_detection(depd),
        window_size_ms(wsm),
        sil_to_speech_time_thres(ststh),
        speech_to_sil_time_thres(sttsh),
        speech_2_noise_ratio(s2nr),
        do_extend(de),
        lookback_time_start_point(lbtps),
        lookahead_time_end_point(latsp),
        max_single_segment_time(mss),
        nn_eval_block_size(nebs),
        dcd_block_size(dbs),
        snr_thres(st),
        noise_frame_num_used_for_snr(nfnus),
        decibel_thres(dt),
        speech_noise_thres(snt),
        fe_prior_thres(fept),
        silence_pdf_num(spn),
        sil_pdf_ids(std::move(spids)),
        speech_noise_thresh_low(sntl),
        speech_noise_thresh_high(snth),
        output_frame_probs(ofp),
        frame_in_ms(fim),
        frame_length_ms(flm) {}
};
|
||||
|
||||
// One detected (possibly still-open) speech segment plus its audio samples.
class E2EVadSpeechBufWithDoa {
public:
    int start_ms;                  // segment start, milliseconds
    int end_ms;                    // segment end so far, milliseconds
    std::vector<float> buffer;     // audio samples belonging to this segment
    bool contain_seg_start_point;  // true once the segment's start point is recorded
    bool contain_seg_end_point;    // true once the segment's end point is recorded
    int doa;                       // direction of arrival (always set to 0 here)

    // Delegate initialization to Reset() so construction and reuse share one code path.
    E2EVadSpeechBufWithDoa() { Reset(); }

    // Return the buffer to its freshly-constructed state.
    void Reset() {
        start_ms = 0;
        end_ms = 0;
        buffer.clear();
        contain_seg_start_point = false;
        contain_seg_end_point = false;
        doa = 0;
    }
};
|
||||
|
||||
// Per-frame probability record, kept only when output_frame_probs is enabled.
class E2EVadFrameProb {
public:
    double noise_prob{0.0};   // log-probability of noise for this frame
    double speech_prob{0.0};  // log-probability of speech for this frame
    double score{0.0};        // raw speech score for this frame
    int frame_id{0};          // absolute frame index
    int frm_state{0};         // smoothed frame state (see FrameState)

    // In-class initializers above make the explicit member-init list redundant.
    E2EVadFrameProb() = default;
};
|
||||
|
||||
class WindowDetector {
|
||||
public:
|
||||
int window_size_ms;
|
||||
int sil_to_speech_time;
|
||||
int speech_to_sil_time;
|
||||
int frame_size_ms;
|
||||
int win_size_frame;
|
||||
int win_sum;
|
||||
std::vector<int> win_state;
|
||||
int cur_win_pos;
|
||||
FrameState pre_frame_state;
|
||||
FrameState cur_frame_state;
|
||||
int sil_to_speech_frmcnt_thres;
|
||||
int speech_to_sil_frmcnt_thres;
|
||||
int voice_last_frame_count;
|
||||
int noise_last_frame_count;
|
||||
int hydre_frame_count;
|
||||
|
||||
WindowDetector(int window_size_ms, int sil_to_speech_time, int speech_to_sil_time, int frame_size_ms) :
|
||||
window_size_ms(window_size_ms),
|
||||
sil_to_speech_time(sil_to_speech_time),
|
||||
speech_to_sil_time(speech_to_sil_time),
|
||||
frame_size_ms(frame_size_ms),
|
||||
win_size_frame(window_size_ms / frame_size_ms),
|
||||
win_sum(0),
|
||||
win_state(std::vector<int>(win_size_frame, 0)),
|
||||
cur_win_pos(0),
|
||||
pre_frame_state(FrameState::kFrameStateSil),
|
||||
cur_frame_state(FrameState::kFrameStateSil),
|
||||
sil_to_speech_frmcnt_thres(sil_to_speech_time / frame_size_ms),
|
||||
speech_to_sil_frmcnt_thres(speech_to_sil_time / frame_size_ms),
|
||||
voice_last_frame_count(0),
|
||||
noise_last_frame_count(0),
|
||||
hydre_frame_count(0) {}
|
||||
|
||||
void Reset() {
|
||||
cur_win_pos = 0;
|
||||
win_sum = 0;
|
||||
win_state = std::vector<int>(win_size_frame, 0);
|
||||
pre_frame_state = FrameState::kFrameStateSil;
|
||||
cur_frame_state = FrameState::kFrameStateSil;
|
||||
voice_last_frame_count = 0;
|
||||
noise_last_frame_count = 0;
|
||||
hydre_frame_count = 0;
|
||||
}
|
||||
|
||||
int GetWinSize() {
|
||||
return win_size_frame;
|
||||
}
|
||||
|
||||
AudioChangeState DetectOneFrame(FrameState frameState, int frame_count) {
|
||||
int cur_frame_state = 0;
|
||||
if (frameState == FrameState::kFrameStateSpeech) {
|
||||
cur_frame_state = 1;
|
||||
} else if (frameState == FrameState::kFrameStateSil) {
|
||||
cur_frame_state = 0;
|
||||
} else {
|
||||
return AudioChangeState::kChangeStateInvalid;
|
||||
}
|
||||
win_sum -= win_state[cur_win_pos];
|
||||
win_sum += cur_frame_state;
|
||||
win_state[cur_win_pos] = cur_frame_state;
|
||||
cur_win_pos = (cur_win_pos + 1) % win_size_frame;
|
||||
if (pre_frame_state == FrameState::kFrameStateSil && win_sum >= sil_to_speech_frmcnt_thres) {
|
||||
pre_frame_state = FrameState::kFrameStateSpeech;
|
||||
return AudioChangeState::kChangeStateSil2Speech;
|
||||
}
|
||||
if (pre_frame_state == FrameState::kFrameStateSpeech && win_sum <= speech_to_sil_frmcnt_thres) {
|
||||
pre_frame_state = FrameState::kFrameStateSil;
|
||||
return AudioChangeState::kChangeStateSpeech2Sil;
|
||||
}
|
||||
if (pre_frame_state == FrameState::kFrameStateSil) {
|
||||
return AudioChangeState::kChangeStateSil2Sil;
|
||||
}
|
||||
if (pre_frame_state == FrameState::kFrameStateSpeech) {
|
||||
return AudioChangeState::kChangeStateSpeech2Speech;
|
||||
}
|
||||
return AudioChangeState::kChangeStateInvalid;
|
||||
}
|
||||
|
||||
int FrameSizeMs() {
|
||||
return frame_size_ms;
|
||||
}
|
||||
};
|
||||
|
||||
class E2EVadModel {
|
||||
public:
|
||||
// Initialize the detector with default options and a clean state.
// NOTE(review): this duplicates most of AllResetDetection() by hand;
// the member defaults here must be kept in sync with it.
E2EVadModel() {
    this->vad_opts = VADXOptions();
    // this->windows_detector = WindowDetector(200,150,150,10);
    // this->encoder = encoder;
    // init variables
    this->is_final = false;
    this->data_buf_start_frame = 0;
    this->frm_cnt = 0;
    this->latest_confirmed_speech_frame = 0;
    this->lastest_confirmed_silence_frame = -1;
    this->continous_silence_frame_count = 0;
    this->vad_state_machine = VadStateMachine::kVadInStateStartPointNotDetected;
    this->confirmed_start_frame = -1;
    this->confirmed_end_frame = -1;
    this->number_end_time_detected = 0;
    this->sil_frame = 0;
    this->sil_pdf_ids = this->vad_opts.sil_pdf_ids;
    this->noise_average_decibel = -100.0;
    this->pre_end_silence_detected = false;
    this->next_seg = true;
    // this->output_data_buf = [];
    this->output_data_buf_offset = 0;
    // this->frame_probs = [];
    // Trailing-silence budget (frames-in-ms) after the speech->sil debounce.
    this->max_end_sil_frame_cnt_thresh =
        this->vad_opts.max_end_silence_time - this->vad_opts.speech_to_sil_time_thres;
    this->speech_noise_thres = this->vad_opts.speech_noise_thres;
    this->max_time_out = false;
    // this->decibel = [];
    this->ResetDetection();
}
|
||||
|
||||
// Process one chunk of acoustic scores + audio and return the segments that
// became available. Each segment is {start_ms, end_ms}. In online mode, -1
// stands for "not known yet": {-1, end} closes a segment whose start was
// reported earlier, {start, -1} opens a segment whose end is still pending.
//   score:    per-frame pdf posteriors for this chunk (frames x pdfs)
//   waveform: the chunk's audio samples (used for energy/decibel)
//   is_final: true on the last chunk — flushes and fully resets the detector
//   online:   streaming mode (incremental segment reporting) vs offline
std::vector<std::vector<int>>
operator()(const std::vector<std::vector<float>> &score, const std::vector<float> &waveform, bool is_final = false,
           bool online = false, int max_end_sil = 800, int max_single_segment_time = 15000,
           float speech_noise_thres = 0.8, int sample_rate = 16000) {
    // Per-call overrides of the option defaults.
    max_end_sil_frame_cnt_thresh = max_end_sil - vad_opts.speech_to_sil_time_thres;
    this->waveform = waveform;
    this->vad_opts.max_single_segment_time = max_single_segment_time;
    this->speech_noise_thres = speech_noise_thres;
    this->vad_opts.sample_rate = sample_rate;

    ComputeDecibel();
    ComputeScores(score);
    if (!is_final) {
        DetectCommonFrames();
    } else {
        DetectLastFrames();
    }

    // Drain newly completed (or, online, newly started) segments from
    // output_data_buf, starting at output_data_buf_offset.
    std::vector<std::vector<int>> segment_batch;
    if (output_data_buf.size() > 0) {
        for (size_t i = output_data_buf_offset; i < output_data_buf.size(); i++) {
            int start_ms;
            int end_ms;
            if (online) {

                if (!output_data_buf[i].contain_seg_start_point) {
                    continue;
                }
                // Segment already announced and still open: nothing new to report.
                if (!next_seg && !output_data_buf[i].contain_seg_end_point) {
                    continue;
                }
                // Report the start only the first time this segment is seen.
                start_ms = next_seg ? output_data_buf[i].start_ms : -1;

                if (output_data_buf[i].contain_seg_end_point) {
                    end_ms = output_data_buf[i].end_ms;
                    next_seg = true;
                    output_data_buf_offset += 1;
                } else {
                    end_ms = -1;     // segment still open
                    next_seg = false;
                }
            } else {
                // Offline: only emit fully closed segments (unless flushing).
                if (!is_final &&
                    (!output_data_buf[i].contain_seg_start_point || !output_data_buf[i].contain_seg_end_point)) {
                    continue;
                }
                start_ms = output_data_buf[i].start_ms;
                end_ms = output_data_buf[i].end_ms;
                output_data_buf_offset += 1;
            }
            std::vector<int> segment = {start_ms, end_ms};
            segment_batch.push_back(segment);
        }
    }

    // Final chunk: wipe all state so the model can start a new stream.
    if (is_final) {
        AllResetDetection();
    }
    return segment_batch;
}
|
||||
|
||||
private:
|
||||
VADXOptions vad_opts;
|
||||
WindowDetector windows_detector = WindowDetector(200, 150, 150, 10);
|
||||
bool is_final;
|
||||
int data_buf_start_frame;
|
||||
int frm_cnt;
|
||||
int latest_confirmed_speech_frame;
|
||||
int lastest_confirmed_silence_frame;
|
||||
int continous_silence_frame_count;
|
||||
VadStateMachine vad_state_machine;
|
||||
int confirmed_start_frame;
|
||||
int confirmed_end_frame;
|
||||
int number_end_time_detected;
|
||||
int sil_frame;
|
||||
std::vector<int> sil_pdf_ids;
|
||||
float noise_average_decibel;
|
||||
bool pre_end_silence_detected;
|
||||
bool next_seg;
|
||||
std::vector<E2EVadSpeechBufWithDoa> output_data_buf;
|
||||
int output_data_buf_offset;
|
||||
std::vector<E2EVadFrameProb> frame_probs;
|
||||
int max_end_sil_frame_cnt_thresh;
|
||||
float speech_noise_thres;
|
||||
std::vector<std::vector<float>> scores;
|
||||
int idx_pre_chunk = 0;
|
||||
bool max_time_out;
|
||||
std::vector<float> decibel;
|
||||
int data_buf_size = 0;
|
||||
int data_buf_all_size = 0;
|
||||
std::vector<float> waveform;
|
||||
|
||||
void AllResetDetection() {
|
||||
is_final = false;
|
||||
data_buf_start_frame = 0;
|
||||
frm_cnt = 0;
|
||||
latest_confirmed_speech_frame = 0;
|
||||
lastest_confirmed_silence_frame = -1;
|
||||
continous_silence_frame_count = 0;
|
||||
vad_state_machine = VadStateMachine::kVadInStateStartPointNotDetected;
|
||||
confirmed_start_frame = -1;
|
||||
confirmed_end_frame = -1;
|
||||
number_end_time_detected = 0;
|
||||
sil_frame = 0;
|
||||
sil_pdf_ids = vad_opts.sil_pdf_ids;
|
||||
noise_average_decibel = -100.0;
|
||||
pre_end_silence_detected = false;
|
||||
next_seg = true;
|
||||
output_data_buf.clear();
|
||||
output_data_buf_offset = 0;
|
||||
frame_probs.clear();
|
||||
max_end_sil_frame_cnt_thresh = vad_opts.max_end_silence_time - vad_opts.speech_to_sil_time_thres;
|
||||
speech_noise_thres = vad_opts.speech_noise_thres;
|
||||
scores.clear();
|
||||
idx_pre_chunk = 0;
|
||||
max_time_out = false;
|
||||
decibel.clear();
|
||||
int data_buf_size = 0;
|
||||
int data_buf_all_size = 0;
|
||||
waveform.clear();
|
||||
ResetDetection();
|
||||
}
|
||||
|
||||
// Reset only the per-segment detection state (keeps the stream-level
// buffers/scores intact); used between segments and from AllResetDetection().
void ResetDetection() {
    continous_silence_frame_count = 0;
    latest_confirmed_speech_frame = 0;
    lastest_confirmed_silence_frame = -1;
    confirmed_start_frame = -1;
    confirmed_end_frame = -1;
    vad_state_machine = VadStateMachine::kVadInStateStartPointNotDetected;
    windows_detector.Reset();
    sil_frame = 0;
    frame_probs.clear();
}
|
||||
|
||||
// Append per-frame energy (in dB) for the current waveform chunk to
// `decibel`, and update the running sample-count bookkeeping.
void ComputeDecibel() {
    int frame_sample_length = int(vad_opts.frame_length_ms * vad_opts.sample_rate / 1000);
    int frame_shift_length = int(vad_opts.frame_in_ms * vad_opts.sample_rate / 1000);
    if (data_buf_all_size == 0) {
        // First chunk of the stream.
        data_buf_all_size = waveform.size();
        data_buf_size = data_buf_all_size;
    } else {
        data_buf_all_size += waveform.size();
    }
    // Frames are windows of frame_sample_length samples, advanced by
    // frame_shift_length. NOTE(review): `offset + frame_sample_length - 1`
    // is int compared against size_t — fine for realistic chunk sizes, but
    // worth confirming there is no overflow path for very large chunks.
    for (int offset = 0; offset + frame_sample_length -1 < waveform.size(); offset += frame_shift_length) {
        float sum = 0.0;
        for (int i = 0; i < frame_sample_length; i++) {
            sum += waveform[offset + i] * waveform[offset + i];
        }
        // +1e-6 avoids log10(0) on digital silence.
        this->decibel.push_back(10 * log10(sum + 0.000001));
    }
}
|
||||
|
||||
// Store this chunk's acoustic scores and advance the global frame counter.
// Note: nn_eval_block_size is overwritten with the chunk's frame count and
// drives the loops in DetectCommonFrames/DetectLastFrames.
void ComputeScores(const std::vector<std::vector<float>> &scores) {
    vad_opts.nn_eval_block_size = scores.size();
    frm_cnt += scores.size();
    this->scores = scores;
}
|
||||
|
||||
void PopDataBufTillFrame(int frame_idx) {
|
||||
int frame_sample_length = int(vad_opts.frame_in_ms * vad_opts.sample_rate / 1000);
|
||||
while (data_buf_start_frame < frame_idx) {
|
||||
if (data_buf_size >= frame_sample_length) {
|
||||
data_buf_start_frame += 1;
|
||||
data_buf_size = data_buf_all_size - data_buf_start_frame * frame_sample_length;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Move `frm_cnt` frames starting at `start_frm` from the internal data buffer
// into the current output segment, opening a new segment when needed and
// marking start/end flags.
//   first_frm_is_start_point: this frame begins a new speech segment
//   last_frm_is_end_point:    this frame ends the current segment
//   end_point_is_sent_end:    the end point is also the end of the stream
void PopDataToOutputBuf(int start_frm, int frm_cnt, bool first_frm_is_start_point, bool last_frm_is_end_point,
                        bool end_point_is_sent_end) {
    // Discard anything before the segment start.
    PopDataBufTillFrame(start_frm);
    int expected_sample_number = int(frm_cnt * vad_opts.sample_rate * vad_opts.frame_in_ms / 1000);
    if (last_frm_is_end_point) {
        // The last frame extends to the full window length, not just one shift.
        int extra_sample = std::max(0, int(vad_opts.frame_length_ms * vad_opts.sample_rate / 1000 -
                                           vad_opts.sample_rate * vad_opts.frame_in_ms / 1000));
        expected_sample_number += int(extra_sample);
    }
    if (end_point_is_sent_end) {
        expected_sample_number = std::max(expected_sample_number, data_buf_size);
    }
    if (data_buf_size < expected_sample_number) {
        std::cout << "error in calling pop data_buf\n";
    }
    // Open a fresh segment when none exists or a new start point arrived.
    if (output_data_buf.size() == 0 || first_frm_is_start_point) {
        output_data_buf.push_back(E2EVadSpeechBufWithDoa());
        output_data_buf[output_data_buf.size() - 1].Reset();
        output_data_buf[output_data_buf.size() - 1].start_ms = start_frm * vad_opts.frame_in_ms;
        output_data_buf[output_data_buf.size() - 1].end_ms = output_data_buf[output_data_buf.size() - 1].start_ms;
        output_data_buf[output_data_buf.size() - 1].doa = 0;
    }
    E2EVadSpeechBufWithDoa &cur_seg = output_data_buf.back();
    // Frames must be appended contiguously to the current segment.
    if (cur_seg.end_ms != start_frm * vad_opts.frame_in_ms) {
        std::cout << "warning\n";
    }

    int data_to_pop;
    if (end_point_is_sent_end) {
        data_to_pop = expected_sample_number;
    } else {
        data_to_pop = int(frm_cnt * vad_opts.frame_in_ms * vad_opts.sample_rate / 1000);
    }
    if (data_to_pop > data_buf_size) {
        std::cout << "VAD data_to_pop is bigger than data_buf.size()!!!\n";
        data_to_pop = data_buf_size;
        expected_sample_number = data_buf_size;
    }
    cur_seg.doa = 0;

    // Duplicate contiguity check kept from the original implementation.
    if (cur_seg.end_ms != start_frm * vad_opts.frame_in_ms) {
        std::cout << "Something wrong with the VAD algorithm\n";
    }
    data_buf_start_frame += frm_cnt;
    cur_seg.end_ms = (start_frm + frm_cnt) * vad_opts.frame_in_ms;
    if (first_frm_is_start_point) {
        cur_seg.contain_seg_start_point = true;
    }
    if (last_frm_is_end_point) {
        cur_seg.contain_seg_end_point = true;
    }
}
|
||||
|
||||
// A frame was confirmed as silence. Before any speech has started, the
// silent audio is simply discarded from the data buffer.
void OnSilenceDetected(int valid_frame) {
    lastest_confirmed_silence_frame = valid_frame;
    if (vad_state_machine == VadStateMachine::kVadInStateStartPointNotDetected) {
        PopDataBufTillFrame(valid_frame);
    }
    // silence_detected_callback_
    // pass
}
|
||||
|
||||
// A frame was confirmed as speech: append it to the current output segment.
void OnVoiceDetected(int valid_frame) {
    latest_confirmed_speech_frame = valid_frame;
    PopDataToOutputBuf(valid_frame, 1, false, false, false);
}
|
||||
|
||||
// A speech start point was detected at start_frame. With fake_result the
// start is recorded but no segment is opened (used when forcing a split).
void OnVoiceStart(int start_frame, bool fake_result = false) {
    if (vad_opts.do_start_point_detection) {
        // pass
    }
    if (confirmed_start_frame != -1) {
        std::cout << "not reset vad properly\n";
    } else {
        confirmed_start_frame = start_frame;
    }
    if (!fake_result && vad_state_machine == VadStateMachine::kVadInStateStartPointNotDetected) {
        // Open a new segment whose first frame is the start point.
        PopDataToOutputBuf(confirmed_start_frame, 1, true, false, false);
    }
}
|
||||
|
||||
|
||||
// A speech end point was detected at end_frame. Flushes any speech frames
// between the last confirmed one and the end point, then closes the segment.
//   fake_result:   record the end without emitting data
//   is_last_frame: the end point is also the end of the stream
void OnVoiceEnd(int end_frame, bool fake_result, bool is_last_frame) {
    // Emit the frames that were classified speech but not yet pushed out.
    for (int t = latest_confirmed_speech_frame + 1; t < end_frame; t++) {
        OnVoiceDetected(t);
    }
    if (vad_opts.do_end_point_detection) {
        // pass
    }
    if (confirmed_end_frame != -1) {
        std::cout << "not reset vad properly\n";
    } else {
        confirmed_end_frame = end_frame;
    }
    if (!fake_result) {
        sil_frame = 0;
        // Close the segment with the end frame.
        PopDataToOutputBuf(confirmed_end_frame, 1, false, true, is_last_frame);
    }
    number_end_time_detected++;
}
|
||||
|
||||
// If this is the stream's final frame, force the current segment closed.
void MaybeOnVoiceEndIfLastFrame(bool is_final_frame, int cur_frm_idx) {
    if (is_final_frame) {
        OnVoiceEnd(cur_frm_idx, false, true);
        vad_state_machine = VadStateMachine::kVadInStateEndPointDetected;
    }
}
|
||||
|
||||
// Detection latency at a start point, in milliseconds.
int GetLatency() {
    return int(LatencyFrmNumAtStartPoint() * vad_opts.frame_in_ms);
}
|
||||
|
||||
// Detection latency at a start point, in frames: the smoothing window plus
// (when segments are extended) the lookback applied before the start point.
int LatencyFrmNumAtStartPoint() {
    int vad_latency = windows_detector.GetWinSize();
    if (vad_opts.do_extend) {
        vad_latency += int(vad_opts.lookback_time_start_point / vad_opts.frame_in_ms);
    }
    return vad_latency;
}
|
||||
|
||||
// Classify frame t (absolute index) as speech or silence using both the
// frame energy and the acoustic silence-pdf scores, and keep the running
// noise-level estimate updated.
FrameState GetFrameState(int t) {
    FrameState frame_state = FrameState::kFrameStateInvalid;
    float cur_decibel = decibel[t];
    float cur_snr = cur_decibel - noise_average_decibel;
    // Energy gate: frames below the absolute decibel floor are silence.
    if (cur_decibel < vad_opts.decibel_thres) {
        frame_state = FrameState::kFrameStateSil;
        // NOTE(review): DetectCommonFrames/DetectLastFrames call
        // DetectOneFrame again for the same frame right after this returns,
        // so low-energy frames appear to be fed to the window twice —
        // confirm against the reference implementation.
        DetectOneFrame(frame_state, t, false);
        return frame_state;
    }
    float sum_score = 0.0;
    float noise_prob = 0.0;
    assert(sil_pdf_ids.size() == vad_opts.silence_pdf_num);
    if (sil_pdf_ids.size() > 0) {
        // Sum the posteriors of all silence pdfs for this frame; scores are
        // indexed relative to the current chunk (hence t - idx_pre_chunk).
        std::vector<float> sil_pdf_scores;
        for (auto sil_pdf_id: sil_pdf_ids) {
            sil_pdf_scores.push_back(scores[t - idx_pre_chunk][sil_pdf_id]);
        }
        sum_score = accumulate(sil_pdf_scores.begin(), sil_pdf_scores.end(), 0.0);
        noise_prob = log(sum_score) * vad_opts.speech_2_noise_ratio;
        // Speech probability is the complement of the silence mass.
        float total_score = 1.0;
        sum_score = total_score - sum_score;
    }
    float speech_prob = log(sum_score);
    if (vad_opts.output_frame_probs) {
        E2EVadFrameProb frame_prob;
        frame_prob.noise_prob = noise_prob;
        frame_prob.speech_prob = speech_prob;
        frame_prob.score = sum_score;
        frame_prob.frame_id = t;
        frame_probs.push_back(frame_prob);
    }
    // Speech wins only if it beats noise by the configured margin AND the
    // frame passes both the SNR and absolute-energy gates.
    if (exp(speech_prob) >= exp(noise_prob) + speech_noise_thres) {
        if (cur_snr >= vad_opts.snr_thres && cur_decibel >= vad_opts.decibel_thres) {
            frame_state = FrameState::kFrameStateSpeech;
        } else {
            frame_state = FrameState::kFrameStateSil;
        }
    } else {
        frame_state = FrameState::kFrameStateSil;
        // Update the running noise-level estimate from this silence frame.
        if (noise_average_decibel < -99.9) {
            noise_average_decibel = cur_decibel;  // first silence frame seen
        } else {
            noise_average_decibel =
                (cur_decibel + noise_average_decibel * (vad_opts.noise_frame_num_used_for_snr - 1)) /
                vad_opts.noise_frame_num_used_for_snr;
        }
    }
    return frame_state;
}
|
||||
|
||||
int DetectCommonFrames() {
|
||||
if (vad_state_machine == VadStateMachine::kVadInStateEndPointDetected) {
|
||||
return 0;
|
||||
}
|
||||
for (int i = vad_opts.nn_eval_block_size - 1; i >= 0; i--) {
|
||||
FrameState frame_state = FrameState::kFrameStateInvalid;
|
||||
frame_state = GetFrameState(frm_cnt - 1 - i);
|
||||
DetectOneFrame(frame_state, frm_cnt - 1 - i, false);
|
||||
}
|
||||
idx_pre_chunk += scores.size();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int DetectLastFrames() {
|
||||
if (vad_state_machine == VadStateMachine::kVadInStateEndPointDetected) {
|
||||
return 0;
|
||||
}
|
||||
for (int i = vad_opts.nn_eval_block_size - 1; i >= 0; i--) {
|
||||
FrameState frame_state = FrameState::kFrameStateInvalid;
|
||||
frame_state = GetFrameState(frm_cnt - 1 - i);
|
||||
if (i != 0) {
|
||||
DetectOneFrame(frame_state, frm_cnt - 1 - i, false);
|
||||
} else {
|
||||
DetectOneFrame(frame_state, frm_cnt - 1, true);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Advance the VAD state machine by one classified frame.
// cur_frm_state  - raw classification from GetFrameState().
// cur_frm_idx    - absolute frame index.
// is_final_frame - true only for the stream's last frame; forces any open
//                  speech segment to be closed.
void DetectOneFrame(FrameState cur_frm_state, int cur_frm_idx, bool is_final_frame) {
    FrameState tmp_cur_frm_state = FrameState::kFrameStateInvalid;
    if (cur_frm_state == FrameState::kFrameStateSpeech) {
        // NOTE(review): std::fabs(1.0) is the constant 1.0, so this branch is
        // always taken whenever fe_prior_thres < 1.0; presumably the original
        // intent was fabs of a frame probability — confirm upstream.
        if (std::fabs(1.0) > vad_opts.fe_prior_thres) {
            tmp_cur_frm_state = FrameState::kFrameStateSpeech;
        } else {
            tmp_cur_frm_state = FrameState::kFrameStateSil;
        }
    } else if (cur_frm_state == FrameState::kFrameStateSil) {
        tmp_cur_frm_state = FrameState::kFrameStateSil;
    }
    // Smooth the raw per-frame decision with the sliding-window detector.
    AudioChangeState state_change = windows_detector.DetectOneFrame(tmp_cur_frm_state, cur_frm_idx);
    int frm_shift_in_ms = vad_opts.frame_in_ms;
    if (AudioChangeState::kChangeStateSil2Speech == state_change) {
        // Silence -> speech: open a new segment (with start-point latency
        // look-back) or extend a running one.
        int silence_frame_count = continous_silence_frame_count;  // NOTE(review): stored but unused — confirm.
        continous_silence_frame_count = 0;
        pre_end_silence_detected = false;
        int start_frame = 0;
        if (vad_state_machine == VadStateMachine::kVadInStateStartPointNotDetected) {
            // Back-date the start to account for detection latency, but never
            // before the start of the buffered data.
            start_frame = std::max(data_buf_start_frame, cur_frm_idx - LatencyFrmNumAtStartPoint());
            OnVoiceStart(start_frame);
            vad_state_machine = VadStateMachine::kVadInStateInSpeechSegment;
            for (int t = start_frame + 1; t <= cur_frm_idx; t++) {
                OnVoiceDetected(t);
            }
        } else if (vad_state_machine == VadStateMachine::kVadInStateInSpeechSegment) {
            // Confirm all frames since the last confirmed speech frame.
            for (int t = latest_confirmed_speech_frame + 1; t < cur_frm_idx; t++) {
                OnVoiceDetected(t);
            }
            // Force-close segments that exceed the maximum allowed duration.
            if (cur_frm_idx - confirmed_start_frame + 1 > vad_opts.max_single_segment_time / frm_shift_in_ms) {
                OnVoiceEnd(cur_frm_idx, false, false);
                vad_state_machine = VadStateMachine::kVadInStateEndPointDetected;
            } else if (!is_final_frame) {
                OnVoiceDetected(cur_frm_idx);
            } else {
                MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
            }
        }
    } else if (AudioChangeState::kChangeStateSpeech2Sil == state_change) {
        // Speech -> silence: keep the segment open; only duration overflow or
        // end-of-stream can close it here.
        continous_silence_frame_count = 0;
        if (vad_state_machine == VadStateMachine::kVadInStateStartPointNotDetected) {
            // do nothing
        } else if (vad_state_machine == VadStateMachine::kVadInStateInSpeechSegment) {
            if (cur_frm_idx - confirmed_start_frame + 1 >
                vad_opts.max_single_segment_time / frm_shift_in_ms) {
                OnVoiceEnd(cur_frm_idx, false, false);
                vad_state_machine = VadStateMachine::kVadInStateEndPointDetected;
            } else if (!is_final_frame) {
                OnVoiceDetected(cur_frm_idx);
            } else {
                MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
            }
        }
    } else if (AudioChangeState::kChangeStateSpeech2Speech == state_change) {
        // Continuing speech: watch only for the max-duration timeout.
        continous_silence_frame_count = 0;
        if (vad_state_machine == VadStateMachine::kVadInStateInSpeechSegment) {
            if (cur_frm_idx - confirmed_start_frame + 1 >
                vad_opts.max_single_segment_time / frm_shift_in_ms) {
                max_time_out = true;
                OnVoiceEnd(cur_frm_idx, false, false);
                vad_state_machine = VadStateMachine::kVadInStateEndPointDetected;
            } else if (!is_final_frame) {
                OnVoiceDetected(cur_frm_idx);
            } else {
                MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
            }
        }
    } else if (AudioChangeState::kChangeStateSil2Sil == state_change) {
        continous_silence_frame_count += 1;
        if (vad_state_machine == VadStateMachine::kVadInStateStartPointNotDetected) {
            // No speech yet: in single-utterance mode, too much leading
            // silence (or a final frame with no detections) emits an empty
            // [0, 0] segment and terminates.
            if ((vad_opts.detect_mode == static_cast<int>(VadDetectMode::kVadSingleUtteranceDetectMode) &&
                 (continous_silence_frame_count * frm_shift_in_ms > vad_opts.max_start_silence_time)) ||
                (is_final_frame && number_end_time_detected == 0)) {
                for (int t = lastest_confirmed_silence_frame + 1; t < cur_frm_idx; t++) {
                    OnSilenceDetected(t);
                }
                OnVoiceStart(0, true);
                OnVoiceEnd(0, true, false);
                vad_state_machine = VadStateMachine::kVadInStateEndPointDetected;
            } else {
                // Confirm silence up to the latency window boundary.
                if (cur_frm_idx >= LatencyFrmNumAtStartPoint()) {
                    OnSilenceDetected(cur_frm_idx - LatencyFrmNumAtStartPoint());
                }
            }
        } else if (vad_state_machine == VadStateMachine::kVadInStateInSpeechSegment) {
            // In-segment silence: enough trailing silence ends the segment,
            // looking back so the end point excludes the silence tail.
            if (continous_silence_frame_count * frm_shift_in_ms >= max_end_sil_frame_cnt_thresh) {
                int lookback_frame = max_end_sil_frame_cnt_thresh / frm_shift_in_ms;
                if (vad_opts.do_extend) {
                    // Keep the configured lookahead inside the segment.
                    lookback_frame -= vad_opts.lookahead_time_end_point / frm_shift_in_ms;
                    lookback_frame -= 1;
                    lookback_frame = std::max(0, lookback_frame);
                }
                OnVoiceEnd(cur_frm_idx - lookback_frame, false, false);
                vad_state_machine = VadStateMachine::kVadInStateEndPointDetected;
            } else if (cur_frm_idx - confirmed_start_frame + 1 >
                       vad_opts.max_single_segment_time / frm_shift_in_ms) {
                OnVoiceEnd(cur_frm_idx, false, false);
                vad_state_machine = VadStateMachine::kVadInStateEndPointDetected;
            } else if (vad_opts.do_extend && !is_final_frame) {
                // Extend the segment through short pauses within the lookahead.
                if (continous_silence_frame_count <= vad_opts.lookahead_time_end_point / frm_shift_in_ms) {
                    OnVoiceDetected(cur_frm_idx);
                }
            } else {
                MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
            }
        }
    }
    // In multi-utterance mode, re-arm the detector after each end point.
    if (vad_state_machine == VadStateMachine::kVadInStateEndPointDetected &&
        vad_opts.detect_mode == static_cast<int>(VadDetectMode::kVadMutipleUtteranceDetectMode)) {
        ResetDetection();
    }
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace funasr
|
||||
575
modules/python/vendors/FunASR/runtime/onnxruntime/src/encode_converter.cpp
vendored
Normal file
575
modules/python/vendors/FunASR/runtime/onnxruntime/src/encode_converter.cpp
vendored
Normal file
@@ -0,0 +1,575 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
#include "encode_converter.h"
|
||||
#include <assert.h>
|
||||
|
||||
|
||||
namespace funasr {
|
||||
using namespace std;
|
||||
|
||||
U16CHAR_T UTF16[8];
|
||||
U8CHAR_T UTF8[8];
|
||||
|
||||
size_t MyUtf8ToUtf16(const U8CHAR_T* pu8, size_t ilen, U16CHAR_T* pu16);
|
||||
size_t MyUtf16ToUtf8(const U16CHAR_T* pu16, U8CHAR_T* pu8);
|
||||
|
||||
|
||||
// Swap the byte order of each 16-bit unit in pbuf, in place.
void EncodeConverter::SwapEndian(U16CHAR_T* pbuf, size_t len)
{
    for (size_t idx = 0; idx < len; ++idx) {
        const U16CHAR_T v = pbuf[idx];
        pbuf[idx] = static_cast<U16CHAR_T>((v >> 8) | (v << 8));
    }
}
|
||||
|
||||
|
||||
size_t MyUtf16ToUtf8(const U16CHAR_T* pu16, U8CHAR_T* pu8)
|
||||
{
|
||||
size_t n = 0;
|
||||
if (pu16[0] <= 0x007F)
|
||||
{
|
||||
pu8[0] = (pu16[0] & 0x7F);
|
||||
n = 1;
|
||||
}
|
||||
else if (pu16[0] >= 0x0080 && pu16[0] <= 0x07FF)
|
||||
{
|
||||
pu8[1] = (0x80 | (pu16[0] & 0x003F));
|
||||
pu8[0] = (0xC0 | ((pu16[0] >> 6) & 0x001F));
|
||||
n = 2;
|
||||
}
|
||||
else if (pu16[0] >= 0x0800)
|
||||
{
|
||||
pu8[2] = (0x80 | (pu16[0] & 0x003F));
|
||||
pu8[1] = (0x80 | ((pu16[0] >> 6) & 0x003F));
|
||||
pu8[0] = (0xE0 | ((pu16[0] >> 12) & 0x000F));
|
||||
n = 3;
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
#define is2ByteUtf16(u16) ( (u16) >= 0x0080 && (u16) <= 0x07FF )
|
||||
#define is3ByteUtf16(u16) ( (u16) >= 0x0800 )
|
||||
|
||||
size_t EncodeConverter::Utf16ToUtf8(const U16CHAR_T* pu16, U8CHAR_T* pu8)
|
||||
{
|
||||
size_t n = 0;
|
||||
if (pu16[0] <= 0x007F)
|
||||
{
|
||||
pu8[0] = (pu16[0] & 0x7F);
|
||||
n = 1;
|
||||
}
|
||||
else if (pu16[0] >= 0x0080 && pu16[0] <= 0x07FF)
|
||||
{
|
||||
pu8[1] = (0x80 | (pu16[0] & 0x003F));
|
||||
pu8[0] = (0xC0 | ((pu16[0] >> 6) & 0x001F));
|
||||
n = 2;
|
||||
}
|
||||
else if (pu16[0] >= 0x0800)
|
||||
{
|
||||
pu8[2] = (0x80 | (pu16[0] & 0x003F));
|
||||
pu8[1] = (0x80 | ((pu16[0] >> 6) & 0x003F));
|
||||
pu8[0] = (0xE0 | ((pu16[0] >> 12) & 0x000F));
|
||||
n = 3;
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
size_t EncodeConverter::Utf16ToUtf8(const U16CHAR_T* pu16, size_t ilen,
|
||||
U8CHAR_T* pu8, size_t olen)
|
||||
{
|
||||
size_t offset = 0;
|
||||
size_t sz = 0;
|
||||
/*
|
||||
for (size_t i = 0; i < ilen && offset < static_cast<int>(olen) - 3; i++) {
|
||||
sz = utf16ToUtf8(pu16 + i, pu8 + offset);
|
||||
offset += sz;
|
||||
}
|
||||
*/
|
||||
for (size_t i = 0; i < ilen && static_cast<int>(offset) < static_cast<int>(olen); i++) {
|
||||
sz = Utf16ToUtf8(pu16 + i, pu8 + offset);
|
||||
if (static_cast<int>(offset + static_cast<int>(sz)) <= static_cast<int>(olen))
|
||||
offset += sz;
|
||||
}
|
||||
|
||||
// pu8[offset] = '\0';
|
||||
return offset;
|
||||
}
|
||||
|
||||
// Convert a UTF-16 string to UTF-8.
// Uses a string as the scratch buffer so no explicit delete is needed
// (exception-safe; replaces the former raw new[]/delete[]).
u8string EncodeConverter::Utf16ToUtf8(const u16string& u16str)
{
    // Worst case: each BMP code unit expands to 3 UTF-8 bytes.
    size_t buflen = u16str.length() * 3 + 1;
    u8string buf(buflen, 0);
    size_t len = Utf16ToUtf8(u16str.data(), u16str.length(), &buf[0], buflen);
    buf.resize(len);
    return buf;
}
|
||||
|
||||
size_t EncodeConverter::Utf8ToUtf16(const U8CHAR_T* pu8, U16CHAR_T* pu16)
|
||||
{
|
||||
size_t n = 0;
|
||||
if ((pu8[0] & 0xF0) == 0xE0)
|
||||
{
|
||||
if ((pu8[1] & 0xC0) == 0x80 &&
|
||||
(pu8[2] & 0xC0) == 0x80)
|
||||
{
|
||||
pu16[0] = (((pu8[0] & 0x0F) << 4) | ((pu8[1] & 0x3C) >> 2));
|
||||
pu16[0] <<= 8;
|
||||
pu16[0] |= (((pu8[1] & 0x03) << 6) | (pu8[2] & 0x3F));
|
||||
}
|
||||
else
|
||||
{
|
||||
pu16[0] = defUniChar;
|
||||
}
|
||||
n = 3;
|
||||
}
|
||||
else if ((pu8[0] & 0xE0) == 0xC0)
|
||||
{
|
||||
if ((pu8[1] & 0xC0) == 0x80)
|
||||
{
|
||||
pu16[0] = ((pu8[0] & 0x1C) >> 2);
|
||||
pu16[0] <<= 8;
|
||||
pu16[0] |= (((pu8[0] & 0x03) << 6) | (pu8[1] & 0x3F));
|
||||
}
|
||||
else
|
||||
{
|
||||
pu16[0] = defUniChar;
|
||||
}
|
||||
n = 2;
|
||||
}
|
||||
else if ((pu8[0] & 0x80) == 0x00)
|
||||
{
|
||||
pu16[0] = pu8[0];
|
||||
n = 1;
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
size_t MyUtf8ToUtf16(const U8CHAR_T* pu8, size_t ilen, U16CHAR_T* pu16)
|
||||
{
|
||||
size_t n = 0;
|
||||
if ((pu8[0] & 0xF0) == 0xE0 && ilen >= 3)
|
||||
{
|
||||
if ((pu8[1] & 0xC0) == 0x80 &&
|
||||
(pu8[2] & 0xC0) == 0x80)
|
||||
{
|
||||
pu16[0] = (((pu8[0] & 0x0F) << 4) | ((pu8[1] & 0x3C) >> 2));
|
||||
pu16[0] <<= 8;
|
||||
pu16[0] |= (((pu8[1] & 0x03) << 6) | (pu8[2] & 0x3F));
|
||||
n = 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
pu16[0] = 0x0000;
|
||||
n = 1;
|
||||
}
|
||||
}
|
||||
else if ((pu8[0] & 0xE0) == 0xC0 && ilen >= 2)
|
||||
{
|
||||
if ((pu8[1] & 0xC0) == 0x80)
|
||||
{
|
||||
pu16[0] = ((pu8[0] & 0x1C) >> 2);
|
||||
pu16[0] <<= 8;
|
||||
pu16[0] |= (((pu8[0] & 0x03) << 6) | (pu8[1] & 0x3F));
|
||||
n = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
pu16[0] = 0x0000;
|
||||
n = 1;
|
||||
}
|
||||
}
|
||||
else if ((pu8[0] & 0x80) == 0x00)
|
||||
{
|
||||
pu16[0] = pu8[0];
|
||||
n = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
pu16[0] = 0x0000;
|
||||
n = 1;
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
size_t EncodeConverter::Utf8ToUtf16(const U8CHAR_T* pu8, size_t ilen, U16CHAR_T* pu16)
|
||||
{
|
||||
size_t n = 0;
|
||||
if ((pu8[0] & 0xF0) == 0xE0 && ilen >= 3)
|
||||
{
|
||||
if ((pu8[1] & 0xC0) == 0x80 &&
|
||||
(pu8[2] & 0xC0) == 0x80)
|
||||
{
|
||||
pu16[0] = (((pu8[0] & 0x0F) << 4) | ((pu8[1] & 0x3C) >> 2));
|
||||
pu16[0] <<= 8;
|
||||
pu16[0] |= (((pu8[1] & 0x03) << 6) | (pu8[2] & 0x3F));
|
||||
n = 3;
|
||||
if( !is3ByteUtf16(pu16[0]) )
|
||||
{
|
||||
pu16[0] = 0x0000;
|
||||
n = 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pu16[0] = 0x0000;
|
||||
n = 1;
|
||||
}
|
||||
}
|
||||
else if ((pu8[0] & 0xE0) == 0xC0 && ilen >= 2)
|
||||
{
|
||||
if ((pu8[1] & 0xC0) == 0x80)
|
||||
{
|
||||
pu16[0] = ((pu8[0] & 0x1C) >> 2);
|
||||
pu16[0] <<= 8;
|
||||
pu16[0] |= (((pu8[0] & 0x03) << 6) | (pu8[1] & 0x3F));
|
||||
n = 2;
|
||||
if( !is2ByteUtf16(pu16[0]) )
|
||||
{
|
||||
pu16[0] = 0x0000;
|
||||
n = 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pu16[0] = 0x0000;
|
||||
n = 1;
|
||||
}
|
||||
}
|
||||
else if ((pu8[0] & 0x80) == 0x00)
|
||||
{
|
||||
pu16[0] = pu8[0];
|
||||
n = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
pu16[0] = 0x0000;
|
||||
n = 1;
|
||||
}
|
||||
|
||||
return n;
|
||||
/*
|
||||
size_t n = 0;
|
||||
if ((pu8[0] & 0xF0) == 0xE0)
|
||||
{
|
||||
if (ilen >= 3 && (pu8[1] & 0xC0) == 0x80 &&
|
||||
(pu8[2] & 0xC0) == 0x80)
|
||||
{
|
||||
pu16[0] = (((pu8[0] & 0x0F) << 4) | ((pu8[1] & 0x3C) >> 2));
|
||||
pu16[0] <<= 8;
|
||||
pu16[0] |= (((pu8[1] & 0x03) << 6) | (pu8[2] & 0x3F));
|
||||
}
|
||||
else
|
||||
{
|
||||
pu16[0] = defUniChar;
|
||||
}
|
||||
n = 3;
|
||||
}
|
||||
else if ((pu8[0] & 0xE0) == 0xC0)
|
||||
{
|
||||
if( ilen >= 2 && (pu8[1] & 0xC0) == 0x80)
|
||||
{
|
||||
pu16[0] = ((pu8[0] & 0x1C) >> 2);
|
||||
pu16[0] <<= 8;
|
||||
pu16[0] |= (((pu8[0] & 0x03) << 6) | (pu8[1] & 0x3F));
|
||||
}
|
||||
else
|
||||
{
|
||||
pu16[0] = defUniChar;
|
||||
}
|
||||
n = 2;
|
||||
}
|
||||
else if ((pu8[0] & 0x80) == 0x00)
|
||||
{
|
||||
pu16[0] = pu8[0];
|
||||
n = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
pu16[0] = defUniChar;
|
||||
n = 1;
|
||||
for (size_t i = 1; i < ilen; i++)
|
||||
{
|
||||
if ((pu8[i] & 0xF0) == 0xE0 || (pu8[i] & 0xE0) == 0xC0 || (pu8[i] & 0x80) == 0x00)
|
||||
break;
|
||||
n++;
|
||||
}
|
||||
}
|
||||
|
||||
return n;
|
||||
*/
|
||||
}
|
||||
|
||||
size_t EncodeConverter::Utf8ToUtf16(const U8CHAR_T* pu8, size_t ilen,
|
||||
U16CHAR_T* pu16, size_t olen)
|
||||
{
|
||||
int offset = 0;
|
||||
size_t sz = 0;
|
||||
for (size_t i = 0; i < ilen && offset < static_cast<int>(olen); offset ++)
|
||||
{
|
||||
sz = Utf8ToUtf16(pu8 + i, ilen - i, pu16 + offset);
|
||||
i += sz;
|
||||
if (sz == 0) {
|
||||
// failed
|
||||
// assert(sz != 0);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// pu16[offset] = '\0';
|
||||
|
||||
return offset;
|
||||
}
|
||||
|
||||
// Convert a UTF-8 string to UTF-16.
// Uses a string as the scratch buffer so no explicit delete is needed
// (exception-safe; replaces the former raw new[]/delete[]).
u16string EncodeConverter::Utf8ToUtf16(const u8string& u8str)
{
    // A UTF-16 result never has more code units than the UTF-8 byte count.
    size_t buflen = u8str.length() + 1;
    u16string buf(buflen, 0);
    size_t len = Utf8ToUtf16(u8str.data(), u8str.length(), &buf[0], buflen);
    buf.resize(len);
    return buf;
}
|
||||
|
||||
// Validate that the ilen bytes at pu8 form well-formed 1..3-byte UTF-8
// sequences. Returns false on the first malformed or truncated sequence.
bool EncodeConverter::IsUTF8(const U8CHAR_T* pu8, size_t ilen)
{
    size_t i = 0;
    while (i < ilen) {
        if ((pu8[i] & 0xF0) == 0xE0) {
            // 3-byte sequence. The length check prevents the out-of-range
            // reads of pu8[i+1]/pu8[i+2] the previous code performed when a
            // multi-byte sequence was truncated at the end of the buffer.
            if (i + 2 >= ilen ||
                (pu8[i + 1] & 0xC0) != 0x80 ||
                (pu8[i + 2] & 0xC0) != 0x80) {
                return false;
            }
            i += 3;
        } else if ((pu8[i] & 0xE0) == 0xC0) {
            // 2-byte sequence.
            if (i + 1 >= ilen || (pu8[i + 1] & 0xC0) != 0x80) {
                return false;
            }
            i += 2;
        } else if ((pu8[i] & 0x80) == 0x00) {
            // ASCII.
            i += 1;
        } else {
            return false;
        }
    }
    return true;
}
|
||||
|
||||
bool EncodeConverter::IsUTF8(const u8string& u8str)
|
||||
{
|
||||
return IsUTF8(u8str.data(), u8str.length());
|
||||
}
|
||||
|
||||
// Count the UTF-8 characters (1..3-byte sequences) in the buffer; stops at
// the first malformed or truncated sequence.
// Fixes two defects of the previous version: (1) out-of-range reads of
// pu8[i+1]/pu8[i+2] on a truncated tail; (2) it returned 0 for a fully
// valid string, contradicting the header doc ("@return the word number of
// UTF8"). Now returns the number of characters decoded (the full count
// when the whole buffer is valid).
size_t EncodeConverter::GetUTF8Len(const U8CHAR_T* pu8, size_t ilen)
{
    size_t i = 0;
    size_t rlen = 0;
    while (i < ilen) {
        if ((pu8[i] & 0xF0) == 0xE0) {
            if (i + 2 >= ilen ||
                (pu8[i + 1] & 0xC0) != 0x80 ||
                (pu8[i + 2] & 0xC0) != 0x80) {
                break;
            }
            i += 3;
        } else if ((pu8[i] & 0xE0) == 0xC0) {
            if (i + 1 >= ilen || (pu8[i + 1] & 0xC0) != 0x80) {
                break;
            }
            i += 2;
        } else if ((pu8[i] & 0x80) == 0x00) {
            i += 1;
        } else {
            break;
        }
        rlen++;
    }
    return rlen;
}
|
||||
|
||||
size_t EncodeConverter::GetUTF8Len(const u8string& u8str)
|
||||
{
|
||||
return GetUTF8Len(u8str.data(), u8str.length());
|
||||
}
|
||||
|
||||
|
||||
size_t EncodeConverter::Utf16ToUtf8Len(const U16CHAR_T* pu16, size_t ilen)
|
||||
{
|
||||
int offset = 0;
|
||||
for (size_t i = 0; i < ilen ; i++) {
|
||||
if (pu16[i] <= 0x007F)
|
||||
{
|
||||
offset += 1;
|
||||
}
|
||||
else if (pu16[i] >= 0x0080 && pu16[i] <= 0x07FF)
|
||||
{
|
||||
offset += 2;
|
||||
}
|
||||
else if (pu16[i] >= 0x0800)
|
||||
{
|
||||
offset += 3;
|
||||
}
|
||||
}
|
||||
|
||||
return offset;
|
||||
}
|
||||
|
||||
uint16_t EncodeConverter::ToUni(const char* sc, int &len)
|
||||
{
|
||||
uint16_t wide[2];
|
||||
len = (int)Utf8ToUtf16((const U8CHAR_T*)sc, wide);
|
||||
return wide[0];
|
||||
}
|
||||
|
||||
bool EncodeConverter::IsAllChineseCharactor(const U8CHAR_T* pu8, size_t ilen) {
|
||||
if (pu8 == nullptr || ilen <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
U16CHAR_T* p16 = new U16CHAR_T[ilen + 1];
|
||||
size_t len = Utf8ToUtf16(pu8, ilen, p16, ilen + 1);
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
if (p16[i] < 0x4e00 || p16[i] > 0x9fff) {
|
||||
delete[] p16;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
delete[] p16;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool EncodeConverter::HasAlpha(const U8CHAR_T* pu8, size_t ilen) {
|
||||
if (pu8 == nullptr || ilen <= 0) {
|
||||
return false;
|
||||
}
|
||||
for (size_t i = 0; i < ilen; i++) {
|
||||
if (pu8[i]> 0 && isalpha(pu8[i])){
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool EncodeConverter::IsAllAlpha(const U8CHAR_T* pu8, size_t ilen) {
|
||||
if (pu8 == nullptr || ilen <= 0) {
|
||||
return false;
|
||||
}
|
||||
for (size_t i = 0; i < ilen; i++) {
|
||||
if (!(pu8[i]> 0 && isalpha(pu8[i]))){
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool EncodeConverter::IsAllAlphaAndPunct(const U8CHAR_T* pu8, size_t ilen) {
|
||||
if (pu8 == nullptr || ilen <= 0) {
|
||||
return false;
|
||||
}
|
||||
bool flag1 = HasAlpha(pu8, ilen);
|
||||
if (flag1 == false) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ilen; i++) {
|
||||
if (!(pu8[i]> 0 && (isalpha(pu8[i]) || (ispunct(pu8[i]))))){
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool EncodeConverter::IsAllAlphaAndDigit(const U8CHAR_T* pu8, size_t ilen) {
|
||||
if (pu8 == nullptr || ilen <= 0) {
|
||||
return false;
|
||||
}
|
||||
bool flag1 = HasAlpha(pu8, ilen);
|
||||
if (flag1 == false) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ilen; i++) {
|
||||
if (!(pu8[i]> 0 && (isalnum(pu8[i]) || isalpha(pu8[i]) || pu8[i] == '\''))){
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
bool EncodeConverter::IsAllAlphaAndDigitAndBlank(const U8CHAR_T* pu8, size_t ilen) {
|
||||
if (pu8 == nullptr || ilen <= 0) {
|
||||
return false;
|
||||
}
|
||||
for (size_t i = 0; i < ilen; i++) {
|
||||
if (!(pu8[i]> 0 && (isalnum(pu8[i]) || isalpha(pu8[i]) || isblank(pu8[i]) || pu8[i] == '\''))){
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
bool EncodeConverter::NeedAddTailBlank(std::string str) {
|
||||
U8CHAR_T *pu8 = (U8CHAR_T*)str.data();
|
||||
size_t ilen = str.size();
|
||||
if (pu8 == nullptr || ilen <= 0) {
|
||||
return false;
|
||||
}
|
||||
if (IsAllAlpha(pu8, ilen) || IsAllAlphaAndPunct(pu8, ilen) || IsAllAlphaAndDigit(pu8, ilen)) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Merge word pieces: where merge_mask[i] == 1 the piece is appended to the
// previous output token, otherwise it starts a new token. Modifies
// str_vec_input in place and returns it.
// Fixes the former signed/unsigned loop comparison and guards against
// merge_mask being longer than str_vec_input (previously out-of-range).
std::vector<std::string> EncodeConverter::MergeEnglishWord(std::vector<std::string> &str_vec_input,
                                                           std::vector<int> &merge_mask) {
    std::vector<std::string> merged;
    merged.reserve(str_vec_input.size());
    // NOTE(review): assumes merge_mask.size() == str_vec_input.size();
    // the clamp below only prevents reading past str_vec_input.
    const size_t count = merge_mask.size() < str_vec_input.size()
                             ? merge_mask.size() : str_vec_input.size();
    for (size_t i = 0; i < count; i++) {
        if (merge_mask[i] == 1 && i > 0 && !merged.empty()) {
            merged.back() += str_vec_input[i];
        } else {
            merged.push_back(str_vec_input[i]);
        }
    }
    str_vec_input.swap(merged);
    return str_vec_input;
}
|
||||
// Split a UTF-8 string into one substring per character (sequence length
// derived from the lead byte) and append them to output. Returns the
// total number of entries in output.
// Fixes the former `pos != length` loop condition: a truncated multi-byte
// tail could push pos past length, causing an out-of-range input[pos]
// read and a non-terminating loop.
size_t EncodeConverter::Utf8ToCharset(const std::string &input, std::vector<std::string> &output) {
    size_t pos = 0;
    while (pos < input.length()) {
        const unsigned char lead = (unsigned char)input[pos];
        size_t seq_len;
        if (lead >= 0xFC)       // length 6
            seq_len = 6;
        else if (lead >= 0xF8)
            seq_len = 5;
        else if (lead >= 0xF0)
            seq_len = 4;
        else if (lead >= 0xE0)
            seq_len = 3;
        else if (lead >= 0xC0)
            seq_len = 2;
        else
            seq_len = 1;
        // substr clamps at the end of the string for a truncated tail.
        output.push_back(input.substr(pos, seq_len));
        pos += seq_len;
    }
    return output.size();
}
|
||||
}
|
||||
109
modules/python/vendors/FunASR/runtime/onnxruntime/src/encode_converter.h
vendored
Normal file
109
modules/python/vendors/FunASR/runtime/onnxruntime/src/encode_converter.h
vendored
Normal file
@@ -0,0 +1,109 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
#ifndef __WS__ENCODE_CONVERTER_H__
|
||||
#define __WS__ENCODE_CONVERTER_H__
|
||||
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
#ifdef _MSC_VER
|
||||
#include <windows.h>
|
||||
#endif // _MSC_VER
|
||||
|
||||
namespace funasr {
|
||||
typedef unsigned char U8CHAR_T;
|
||||
typedef unsigned short U16CHAR_T;
|
||||
typedef std::basic_string<U8CHAR_T> u8string;
|
||||
typedef std::basic_string<U16CHAR_T> u16string;
|
||||
|
||||
class EncodeConverter {
|
||||
public:
|
||||
static const U16CHAR_T defUniChar = 0x25a1; //WHITE SQUARE
|
||||
|
||||
public:
|
||||
static void SwapEndian(U16CHAR_T* pbuf, size_t len);
|
||||
|
||||
static size_t Utf16ToUtf8(const U16CHAR_T* pu16, U8CHAR_T* pu8);
|
||||
|
||||
///< @param pu16 UTF16 string
|
||||
///< @param pu8 UTF8 string
|
||||
static size_t Utf16ToUtf8(const U16CHAR_T* pu16, size_t ilen,
|
||||
U8CHAR_T* pu8, size_t olen);
|
||||
|
||||
static u8string Utf16ToUtf8(const u16string& u16str);
|
||||
|
||||
static size_t Utf8ToUtf16(const U8CHAR_T* pu8, U16CHAR_T* pu16);
|
||||
|
||||
static size_t Utf8ToUtf16(const U8CHAR_T* pu8, size_t ilen, U16CHAR_T* pu16);
|
||||
|
||||
///< @param pu8 UTF8 string
|
||||
///< @param pu16 UTF16 string
|
||||
static size_t Utf8ToUtf16(const U8CHAR_T* pu8, size_t ilen,
|
||||
U16CHAR_T* pu16, size_t olen);
|
||||
|
||||
static u16string Utf8ToUtf16(const u8string& u8str);
|
||||
|
||||
///< @param pu8 string
|
||||
///< @return if string is encoded as UTF8 - true, otherwise false
|
||||
static bool IsUTF8(const U8CHAR_T* pu8, size_t ilen);
|
||||
|
||||
///< @param u8str string
|
||||
///< @return if string is encoded as UTF8 - true, otherwise false
|
||||
static bool IsUTF8(const u8string& u8str);
|
||||
|
||||
///< @param UTF8 string
|
||||
///< @return the word number of UTF8
|
||||
static size_t GetUTF8Len(const U8CHAR_T* pu8, size_t ilen);
|
||||
|
||||
///< @param UTF8 string
|
||||
///< @return the word number of UTF8
|
||||
static size_t GetUTF8Len(const u8string& u8str);
|
||||
|
||||
///< @param pu16 UTF16 string
|
||||
///< @param ilen UTF16 length
|
||||
///< @return UTF8 string length
|
||||
static size_t Utf16ToUtf8Len(const U16CHAR_T* pu16, size_t ilen);
|
||||
|
||||
static uint16_t ToUni(const char* sc, int &len);
|
||||
|
||||
static bool IsChineseCharacter(U16CHAR_T &u16) {
|
||||
return (u16 >= 0x4e00 && u16 <= 0x9fff) // common
|
||||
|| (u16 >= 0x3400 && u16 <= 0x4dff); // rare, extension A
|
||||
}
|
||||
|
||||
// whether the string is all Chinese
|
||||
static bool IsAllChineseCharactor(const U8CHAR_T* pu8, size_t ilen);
|
||||
static bool HasAlpha(const U8CHAR_T* pu8, size_t ilen);
|
||||
static bool NeedAddTailBlank(std::string str);
|
||||
static bool IsAllAlpha(const U8CHAR_T* pu8, size_t ilen);
|
||||
static bool IsAllAlphaAndPunct(const U8CHAR_T* pu8, size_t ilen);
|
||||
static bool IsAllAlphaAndDigit(const U8CHAR_T* pu8, size_t ilen);
|
||||
static bool IsAllAlphaAndDigitAndBlank(const U8CHAR_T* pu8, size_t ilen);
|
||||
static std::vector<std::string> MergeEnglishWord(std::vector<std::string> &str_vec_input,
|
||||
std::vector<int> &merge_mask);
|
||||
static size_t Utf8ToCharset(const std::string &input, std::vector<std::string> &output);
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// convert to the local ansi page
|
||||
static std::string UTF8ToLocaleAnsi(const std::string& strUTF8) {
|
||||
int len = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, nullptr, 0);
|
||||
unsigned short*wszGBK = new unsigned short[len + 1];
|
||||
memset(wszGBK, 0, len * 2 + 2);
|
||||
MultiByteToWideChar(CP_UTF8, 0, (LPCCH)strUTF8.c_str(), -1, (LPWSTR)wszGBK, len);
|
||||
|
||||
len = WideCharToMultiByte(CP_ACP, 0, (LPCWCH)wszGBK, -1, nullptr, 0, nullptr, nullptr);
|
||||
char *szGBK = new char[len + 1];
|
||||
memset(szGBK, 0, len + 1);
|
||||
WideCharToMultiByte(CP_ACP, 0, (LPCWCH)wszGBK, -1, szGBK, len, nullptr, nullptr);
|
||||
std::string strTemp(szGBK);
|
||||
delete[]szGBK;
|
||||
delete[]wszGBK;
|
||||
return strTemp;
|
||||
}
|
||||
#endif
|
||||
};
|
||||
}
|
||||
|
||||
#endif //__WS__ENCODE_CONVERTER_H__
|
||||
215
modules/python/vendors/FunASR/runtime/onnxruntime/src/fsmn-vad-online.cpp
vendored
Normal file
215
modules/python/vendors/FunASR/runtime/onnxruntime/src/fsmn-vad-online.cpp
vendored
Normal file
@@ -0,0 +1,215 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#include <fstream>
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
|
||||
// Compute fbank features for a streaming audio chunk.
// Prepends samples cached from the previous call, processes only whole
// frames, and stashes the un-framed tail back into input_cache_ for the
// next call. `waves` is trimmed in place to the samples actually framed;
// features are appended to vad_feats.
void FsmnVadOnline::FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
                               std::vector<float> &waves) {
    knf::OnlineFbank fbank(fbank_opts_);
    // cache merge: prepend leftover samples from the previous chunk
    waves.insert(waves.begin(), input_cache_.begin(), input_cache_.end());
    int frame_number = ComputeFrameNum(waves.size(), frame_sample_length_, frame_shift_sample_length_);
    // Send the audio after the last frame shift position to the cache
    input_cache_.clear();
    input_cache_.insert(input_cache_.begin(), waves.begin() + frame_number * frame_shift_sample_length_, waves.end());
    if (frame_number == 0) {
        // Not enough samples for a single frame; everything stays cached.
        return;
    }
    // Delete audio that haven't undergone fbank processing
    waves.erase(waves.begin() + (frame_number - 1) * frame_shift_sample_length_ + frame_sample_length_, waves.end());

    // Scale normalized [-1, 1] samples to 16-bit range expected by the
    // fbank frontend.
    std::vector<float> buf(waves.size());
    for (int32_t i = 0; i != waves.size(); ++i) {
        buf[i] = waves[i] * 32768;
    }
    fbank.AcceptWaveform(sample_rate, buf.data(), buf.size());
    // fbank.AcceptWaveform(sample_rate, &waves[0], waves.size());
    int32_t frames = fbank.NumFramesReady();
    for (int32_t i = 0; i != frames; ++i) {
        const float *frame = fbank.GetFrame(i);
        vector<float> frame_vector(frame, frame + fbank_opts_.mel_opts.num_bins);
        vad_feats.emplace_back(frame_vector);
    }
}
|
||||
|
||||
// Streaming feature extraction: fbank, then online LFR (low frame rate)
// stacking with CMVN. Maintains two caches across calls:
//   reserve_waveforms_  - raw samples matching frames still held in the
//                         LFR splice cache (so `waves` stays aligned with
//                         the emitted features),
//   lfr_splice_cache_   - fbank frames not yet consumed by LFR.
// On the final chunk, flushes the caches and resets all streaming state.
void FsmnVadOnline::ExtractFeats(float sample_rate, vector<std::vector<float>> &vad_feats,
                                 vector<float> &waves, bool input_finished) {
    FbankKaldi(sample_rate, vad_feats, waves);
    // cache deal & online lfr,cmvn
    if (vad_feats.size() > 0) {
        if (!reserve_waveforms_.empty()) {
            // Re-attach samples reserved for frames still in the splice cache.
            waves.insert(waves.begin(), reserve_waveforms_.begin(), reserve_waveforms_.end());
        }
        if (lfr_splice_cache_.empty()) {
            // First chunk: left-pad by replicating the first frame
            // (lfr_m - 1) / 2 times, mirroring offline LFR padding.
            for (int i = 0; i < (lfr_m - 1) / 2; i++) {
                lfr_splice_cache_.emplace_back(vad_feats[0]);
            }
        }
        if (vad_feats.size() + lfr_splice_cache_.size() >= lfr_m) {
            // Enough frames to emit at least one LFR output.
            vad_feats.insert(vad_feats.begin(), lfr_splice_cache_.begin(), lfr_splice_cache_.end());
            int frame_from_waves = (waves.size() - frame_sample_length_) / frame_shift_sample_length_ + 1;
            int minus_frame = reserve_waveforms_.empty() ? (lfr_m - 1) / 2 : 0;
            int lfr_splice_frame_idxs = OnlineLfrCmvn(vad_feats, input_finished);
            // Reserve the raw samples corresponding to frames left in the
            // splice cache for the next call.
            int reserve_frame_idx = std::abs(lfr_splice_frame_idxs - minus_frame);
            reserve_waveforms_.clear();
            reserve_waveforms_.insert(reserve_waveforms_.begin(),
                                      waves.begin() + reserve_frame_idx * frame_shift_sample_length_,
                                      waves.begin() + frame_from_waves * frame_shift_sample_length_);
            int sample_length = (frame_from_waves - 1) * frame_shift_sample_length_ + frame_sample_length_;
            waves.erase(waves.begin() + sample_length, waves.end());
        } else {
            // Not enough frames yet: stash everything and emit nothing.
            reserve_waveforms_.clear();
            reserve_waveforms_.insert(reserve_waveforms_.begin(),
                                      waves.begin() + frame_sample_length_ - frame_shift_sample_length_, waves.end());
            lfr_splice_cache_.insert(lfr_splice_cache_.end(), vad_feats.begin(), vad_feats.end());
        }
    } else {
        if (input_finished) {
            // Final flush: process whatever is left in the caches.
            if (!reserve_waveforms_.empty()) {
                waves = reserve_waveforms_;
            }
            vad_feats = lfr_splice_cache_;
            if(vad_feats.size() == 0){
                LOG(ERROR) << "vad_feats's size is 0";
            }else{
                OnlineLfrCmvn(vad_feats, input_finished);
            }
        }
    }
    if(input_finished){
        // End of stream: reset NN caches and feature-extraction state.
        Reset();
        ResetCache();
    }
}
|
||||
|
||||
// Apply low-frame-rate stacking (concatenate lfr_m frames every lfr_n)
// followed by CMVN to vad_feats, in place.
// Frames that cannot yet form a full LFR window are moved into
// lfr_splice_cache_ for the next call; on the final chunk the last window
// is padded by repeating the final frame instead.
// Returns the index (in input-frame units) where the splice cache begins.
int FsmnVadOnline::OnlineLfrCmvn(vector<vector<float>> &vad_feats, bool input_finished) {
    vector<vector<float>> out_feats;
    int T = vad_feats.size();
    // Number of LFR outputs producible from T input frames.
    int T_lrf = ceil((T - (lfr_m - 1) / 2) / (float)lfr_n);
    int lfr_splice_frame_idxs = T_lrf;
    vector<float> p;
    for (int i = 0; i < T_lrf; i++) {
        if (lfr_m <= T - i * lfr_n) {
            // Full window available: concatenate lfr_m consecutive frames.
            for (int j = 0; j < lfr_m; j++) {
                p.insert(p.end(), vad_feats[i * lfr_n + j].begin(), vad_feats[i * lfr_n + j].end());
            }
            out_feats.emplace_back(p);
            p.clear();
        } else {
            if (input_finished) {
                // Final chunk: pad the short window with copies of the
                // last frame.
                int num_padding = lfr_m - (T - i * lfr_n);
                for (int j = 0; j < (vad_feats.size() - i * lfr_n); j++) {
                    p.insert(p.end(), vad_feats[i * lfr_n + j].begin(), vad_feats[i * lfr_n + j].end());
                }
                for (int j = 0; j < num_padding; j++) {
                    p.insert(p.end(), vad_feats[vad_feats.size() - 1].begin(), vad_feats[vad_feats.size() - 1].end());
                }
                out_feats.emplace_back(p);
                p.clear();
            } else {
                // Streaming: defer the incomplete window to the next call.
                lfr_splice_frame_idxs = i;
                break;
            }
        }
    }
    // Convert the LFR output index back to an input-frame index and cache
    // the unconsumed frames.
    lfr_splice_frame_idxs = std::min(T - 1, lfr_splice_frame_idxs * lfr_n);
    lfr_splice_cache_.clear();
    lfr_splice_cache_.insert(lfr_splice_cache_.begin(), vad_feats.begin() + lfr_splice_frame_idxs, vad_feats.end());

    // Apply cmvn
    // NOTE(review): means are ADDED then scaled — presumably means_list_
    // stores negated means and vars_list_ inverse stddevs; confirm against
    // the model's cmvn file loader.
    for (auto &out_feat: out_feats) {
        for (int j = 0; j < means_list_.size(); j++) {
            out_feat[j] = (out_feat[j] + means_list_[j]) * vars_list_[j];
        }
    }
    vad_feats = out_feats;
    return lfr_splice_frame_idxs;
}
|
||||
|
||||
std::vector<std::vector<int>>
|
||||
FsmnVadOnline::Infer(std::vector<float> &waves, bool input_finished) {
|
||||
std::vector<std::vector<int>> vad_segments;
|
||||
std::vector<std::vector<float>> vad_feats;
|
||||
std::vector<std::vector<float>> vad_probs;
|
||||
ExtractFeats(vad_sample_rate_, vad_feats, waves, input_finished);
|
||||
if(vad_feats.size() == 0){
|
||||
return vad_segments;
|
||||
}
|
||||
fsmnvad_handle_->Forward(vad_feats, &vad_probs, &in_cache_, input_finished);
|
||||
if(vad_probs.size() == 0){
|
||||
return vad_segments;
|
||||
}
|
||||
|
||||
vad_segments = vad_scorer(vad_probs, waves, input_finished, true, vad_silence_duration_, vad_max_len_,
|
||||
vad_speech_noise_thres_, vad_sample_rate_);
|
||||
return vad_segments;
|
||||
}
|
||||
|
||||
void FsmnVadOnline::InitCache(){
|
||||
std::vector<float> cache_feats(128 * 19 * 1, 0);
|
||||
for (int i=0;i<4;i++){
|
||||
in_cache_.emplace_back(cache_feats);
|
||||
}
|
||||
};
|
||||
|
||||
// Drops the accumulated FSMN caches and re-creates them zeroed, so the next
// stream starts from a clean state.
void FsmnVadOnline::Reset(){
    in_cache_.clear();
    InitCache();
}  // fixed: removed stray ';' after function definition
|
||||
|
||||
// Debug hook; intentionally empty.
void FsmnVadOnline::Test() {
}
|
||||
|
||||
// Copies the session, node names, fbank/CMVN configuration and VAD thresholds
// out of the shared offline FsmnVad handle so this online instance runs with
// identical settings. NOTE(review): `env` is accepted but unused here —
// presumably kept for signature symmetry; confirm before removing.
void FsmnVadOnline::InitOnline(std::shared_ptr<Ort::Session> &vad_session,
                               Ort::Env &env,
                               std::vector<const char *> &vad_in_names,
                               std::vector<const char *> &vad_out_names,
                               knf::FbankOptions &fbank_opts,
                               std::vector<float> &means_list,
                               std::vector<float> &vars_list,
                               int vad_sample_rate,
                               int vad_silence_duration,
                               int vad_max_len,
                               double vad_speech_noise_thres) {
    vad_session_ = vad_session;
    vad_in_names_ = vad_in_names;
    vad_out_names_ = vad_out_names;
    fbank_opts_ = fbank_opts;
    means_list_ = means_list;
    vars_list_ = vars_list;
    vad_sample_rate_ = vad_sample_rate;
    vad_silence_duration_ = vad_silence_duration;
    vad_max_len_ = vad_max_len;
    vad_speech_noise_thres_ = vad_speech_noise_thres;

    // 25 ms frame length / 10 ms frame shift, converted to sample counts.
    frame_sample_length_ = vad_sample_rate_ / 1000 * 25;  // fixed: removed stray ';;'
    frame_shift_sample_length_ = vad_sample_rate_ / 1000 * 10;

    // 2pass
    audio_handle = make_unique<Audio>(vad_sample_rate, 1);
}
|
||||
|
||||
// Nothing to release: fsmnvad_handle_ is a non-owning pointer and all other
// members clean up via their own destructors (RAII).
FsmnVadOnline::~FsmnVadOnline() {
}
|
||||
|
||||
// Builds an online VAD on top of an already-initialized offline FsmnVad:
// zeroed caches first, then all shared state copied over via InitOnline.
// The handle is borrowed, not owned (the destructor does not delete it).
// fixed: dropped the no-op std::move on a raw pointer (misleading; a raw
// pointer is trivially copyable — clang-tidy performance-move-const-arg).
FsmnVadOnline::FsmnVadOnline(FsmnVad* fsmnvad_handle):fsmnvad_handle_(fsmnvad_handle),session_options_{}{
    InitCache();
    InitOnline(fsmnvad_handle_->vad_session_,
               fsmnvad_handle_->env_,
               fsmnvad_handle_->vad_in_names_,
               fsmnvad_handle_->vad_out_names_,
               fsmnvad_handle_->fbank_opts_,
               fsmnvad_handle_->means_list_,
               fsmnvad_handle_->vars_list_,
               fsmnvad_handle_->vad_sample_rate_,
               fsmnvad_handle_->vad_silence_duration_,
               fsmnvad_handle_->vad_max_len_,
               fsmnvad_handle_->vad_speech_noise_thres_);
}
|
||||
|
||||
} // namespace funasr
|
||||
92
modules/python/vendors/FunASR/runtime/onnxruntime/src/fsmn-vad-online.h
vendored
Normal file
92
modules/python/vendors/FunASR/runtime/onnxruntime/src/fsmn-vad-online.h
vendored
Normal file
@@ -0,0 +1,92 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
class FsmnVadOnline : public VadModel {
|
||||
/**
|
||||
* Author: Speech Lab of DAMO Academy, Alibaba Group
|
||||
* Deep-FSMN for Large Vocabulary Continuous Speech Recognition
|
||||
* https://arxiv.org/abs/1803.05030
|
||||
*/
|
||||
|
||||
public:
|
||||
explicit FsmnVadOnline(FsmnVad* fsmnvad_handle);
|
||||
~FsmnVadOnline();
|
||||
void Test();
|
||||
std::vector<std::vector<int>> Infer(std::vector<float> &waves, bool input_finished);
|
||||
void ExtractFeats(float sample_rate, vector<vector<float>> &vad_feats, vector<float> &waves, bool input_finished);
|
||||
void Reset();
|
||||
int GetVadSampleRate() { return vad_sample_rate_; };
|
||||
|
||||
// 2pass
|
||||
std::unique_ptr<Audio> audio_handle = nullptr;
|
||||
|
||||
private:
|
||||
E2EVadModel vad_scorer = E2EVadModel();
|
||||
// std::unique_ptr<FsmnVad> fsmnvad_handle_;
|
||||
FsmnVad* fsmnvad_handle_ = nullptr;
|
||||
|
||||
void FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
|
||||
std::vector<float> &waves);
|
||||
int OnlineLfrCmvn(vector<vector<float>> &vad_feats, bool input_finished);
|
||||
void InitVad(const std::string &vad_model, const std::string &vad_cmvn, const std::string &vad_config, int thread_num){}
|
||||
void InitCache();
|
||||
void InitOnline(std::shared_ptr<Ort::Session> &vad_session,
|
||||
Ort::Env &env,
|
||||
std::vector<const char *> &vad_in_names,
|
||||
std::vector<const char *> &vad_out_names,
|
||||
knf::FbankOptions &fbank_opts,
|
||||
std::vector<float> &means_list,
|
||||
std::vector<float> &vars_list,
|
||||
int vad_sample_rate,
|
||||
int vad_silence_duration,
|
||||
int vad_max_len,
|
||||
double vad_speech_noise_thres);
|
||||
|
||||
static int ComputeFrameNum(int sample_length, int frame_sample_length, int frame_shift_sample_length) {
|
||||
int frame_num = static_cast<int>((sample_length - frame_sample_length) / frame_shift_sample_length + 1);
|
||||
if (frame_num >= 1 && sample_length >= frame_sample_length)
|
||||
return frame_num;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
void ResetCache() {
|
||||
reserve_waveforms_.clear();
|
||||
input_cache_.clear();
|
||||
lfr_splice_cache_.clear();
|
||||
}
|
||||
|
||||
// from fsmnvad_handle_
|
||||
std::shared_ptr<Ort::Session> vad_session_ = nullptr;
|
||||
Ort::Env env_;
|
||||
Ort::SessionOptions session_options_;
|
||||
std::vector<const char *> vad_in_names_;
|
||||
std::vector<const char *> vad_out_names_;
|
||||
knf::FbankOptions fbank_opts_;
|
||||
std::vector<float> means_list_;
|
||||
std::vector<float> vars_list_;
|
||||
|
||||
std::vector<std::vector<float>> in_cache_;
|
||||
// The reserved waveforms by fbank
|
||||
std::vector<float> reserve_waveforms_;
|
||||
// waveforms reserved after last shift position
|
||||
std::vector<float> input_cache_;
|
||||
// lfr reserved cache
|
||||
std::vector<std::vector<float>> lfr_splice_cache_;
|
||||
|
||||
int vad_sample_rate_ = MODEL_SAMPLE_RATE;
|
||||
int vad_silence_duration_ = VAD_SILENCE_DURATION;
|
||||
int vad_max_len_ = VAD_MAX_LEN;
|
||||
double vad_speech_noise_thres_ = VAD_SPEECH_NOISE_THRES;
|
||||
int lfr_m = VAD_LFR_M;
|
||||
int lfr_n = VAD_LFR_N;
|
||||
int frame_sample_length_ = vad_sample_rate_ / 1000 * 25;;
|
||||
int frame_shift_sample_length_ = vad_sample_rate_ / 1000 * 10;
|
||||
};
|
||||
|
||||
} // namespace funasr
|
||||
274
modules/python/vendors/FunASR/runtime/onnxruntime/src/fsmn-vad.cpp
vendored
Normal file
274
modules/python/vendors/FunASR/runtime/onnxruntime/src/fsmn-vad.cpp
vendored
Normal file
@@ -0,0 +1,274 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#include <fstream>
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
// Full offline initialization: configures the ORT session, loads the onnx
// model, CMVN statistics and YAML config, then allocates the FSMN caches.
// Any load failure terminates the process (see the callees).
void FsmnVad::InitVad(const std::string &vad_model, const std::string &vad_cmvn, const std::string &vad_config, int thread_num) {
    session_options_.SetIntraOpNumThreads(thread_num);
    session_options_.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
    // Arena disabled to keep memory bounded for long-running services.
    session_options_.DisableCpuMemArena();

    ReadModel(vad_model.c_str());
    LoadCmvn(vad_cmvn.c_str());
    LoadConfigFromYaml(vad_config.c_str());
    InitCache();
}
|
||||
|
||||
// Reads sample rate, silence/segment limits, the speech/noise threshold and
// the fbank frontend settings from the model's YAML config.
// Exits the process on a missing/unparsable file or missing keys.
void FsmnVad::LoadConfigFromYaml(const char* filename){

    YAML::Node config;
    try{
        config = YAML::LoadFile(filename);
    }catch(exception const &e){
        LOG(ERROR) << "Error loading file, yaml file error or not exist.";
        exit(-1);
    }

    try{
        YAML::Node frontend_conf = config["frontend_conf"];
        YAML::Node post_conf = config["model_conf"];

        this->vad_sample_rate_ = frontend_conf["fs"].as<int>();
        this->vad_silence_duration_ = post_conf["max_end_silence_time"].as<int>();
        this->vad_max_len_ = post_conf["max_single_segment_time"].as<int>();
        this->vad_speech_noise_thres_ = post_conf["speech_noise_thres"].as<double>();

        // Frontend (fbank) options mirror the training-time frontend.
        fbank_opts_.frame_opts.dither = frontend_conf["dither"].as<float>();
        fbank_opts_.mel_opts.num_bins = frontend_conf["n_mels"].as<int>();
        fbank_opts_.frame_opts.samp_freq = (float)vad_sample_rate_;
        fbank_opts_.frame_opts.window_type = frontend_conf["window"].as<string>();
        fbank_opts_.frame_opts.frame_shift_ms = frontend_conf["frame_shift"].as<float>();
        fbank_opts_.frame_opts.frame_length_ms = frontend_conf["frame_length"].as<float>();
        fbank_opts_.energy_floor = 0;
        fbank_opts_.mel_opts.debug_mel = false;
    }catch(exception const &e){
        LOG(ERROR) << "Error when load argument from vad config YAML.";
        exit(-1);
    }
}
|
||||
|
||||
// Creates the ONNX Runtime session for the VAD model and caches its
// input/output node names. Exits the process if the model fails to load.
void FsmnVad::ReadModel(const char* vad_model) {
    try {
        vad_session_ = std::make_shared<Ort::Session>(
            env_, ORTCHAR(vad_model), session_options_);
        LOG(INFO) << "Successfully load model from " << vad_model;
    } catch (std::exception const &e) {
        LOG(ERROR) << "Error when load vad onnx model: " << e.what();
        exit(-1);
    }
    // Node names are queried once and reused on every Run() call.
    GetInputNames(vad_session_.get(), m_strInputNames, vad_in_names_);
    GetOutputNames(vad_session_.get(), m_strOutputNames, vad_out_names_);
}
|
||||
|
||||
void FsmnVad::Forward(
|
||||
const std::vector<std::vector<float>> &chunk_feats,
|
||||
std::vector<std::vector<float>> *out_prob,
|
||||
std::vector<std::vector<float>> *in_cache,
|
||||
bool is_final) {
|
||||
Ort::MemoryInfo memory_info =
|
||||
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
|
||||
|
||||
int num_frames = chunk_feats.size();
|
||||
const int feature_dim = chunk_feats[0].size();
|
||||
|
||||
// 2. Generate input nodes tensor
|
||||
// vad node { batch,frame number,feature dim }
|
||||
const int64_t vad_feats_shape[3] = {1, num_frames, feature_dim};
|
||||
std::vector<float> vad_feats;
|
||||
for (const auto &chunk_feat: chunk_feats) {
|
||||
vad_feats.insert(vad_feats.end(), chunk_feat.begin(), chunk_feat.end());
|
||||
}
|
||||
Ort::Value vad_feats_ort = Ort::Value::CreateTensor<float>(
|
||||
memory_info, vad_feats.data(), vad_feats.size(), vad_feats_shape, 3);
|
||||
|
||||
// 3. Put nodes into onnx input vector
|
||||
std::vector<Ort::Value> vad_inputs;
|
||||
vad_inputs.emplace_back(std::move(vad_feats_ort));
|
||||
// 4 caches
|
||||
// cache node {batch,128,19,1}
|
||||
const int64_t cache_feats_shape[4] = {1, 128, 19, 1};
|
||||
for (int i = 0; i < in_cache->size(); i++) {
|
||||
vad_inputs.emplace_back(std::move(Ort::Value::CreateTensor<float>(
|
||||
memory_info, (*in_cache)[i].data(), (*in_cache)[i].size(), cache_feats_shape, 4)));
|
||||
}
|
||||
|
||||
// 4. Onnx infer
|
||||
std::vector<Ort::Value> vad_ort_outputs;
|
||||
try {
|
||||
vad_ort_outputs = vad_session_->Run(
|
||||
Ort::RunOptions{nullptr}, vad_in_names_.data(), vad_inputs.data(),
|
||||
vad_inputs.size(), vad_out_names_.data(), vad_out_names_.size());
|
||||
} catch (std::exception const &e) {
|
||||
LOG(ERROR) << "Error when run vad onnx forword: " << (e.what());
|
||||
return;
|
||||
}
|
||||
|
||||
// 5. Change infer result to output shapes
|
||||
float *logp_data = vad_ort_outputs[0].GetTensorMutableData<float>();
|
||||
auto type_info = vad_ort_outputs[0].GetTensorTypeAndShapeInfo();
|
||||
|
||||
int num_outputs = type_info.GetShape()[1];
|
||||
int output_dim = type_info.GetShape()[2];
|
||||
out_prob->resize(num_outputs);
|
||||
for (int i = 0; i < num_outputs; i++) {
|
||||
(*out_prob)[i].resize(output_dim);
|
||||
memcpy((*out_prob)[i].data(), logp_data + i * output_dim,
|
||||
sizeof(float) * output_dim);
|
||||
}
|
||||
|
||||
// get 4 caches outputs,each size is 128*19
|
||||
if(!is_final){
|
||||
for (int i = 1; i < 5; i++) {
|
||||
float* data = vad_ort_outputs[i].GetTensorMutableData<float>();
|
||||
memcpy((*in_cache)[i-1].data(), data, sizeof(float) * 128*19);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void FsmnVad::FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
|
||||
std::vector<float> &waves) {
|
||||
knf::OnlineFbank fbank(fbank_opts_);
|
||||
|
||||
std::vector<float> buf(waves.size());
|
||||
for (int32_t i = 0; i != waves.size(); ++i) {
|
||||
buf[i] = waves[i] * 32768;
|
||||
}
|
||||
fbank.AcceptWaveform(sample_rate, buf.data(), buf.size());
|
||||
int32_t frames = fbank.NumFramesReady();
|
||||
for (int32_t i = 0; i != frames; ++i) {
|
||||
const float *frame = fbank.GetFrame(i);
|
||||
std::vector<float> frame_vector(frame, frame + fbank_opts_.mel_opts.num_bins);
|
||||
vad_feats.emplace_back(frame_vector);
|
||||
}
|
||||
}
|
||||
|
||||
void FsmnVad::LoadCmvn(const char *filename)
|
||||
{
|
||||
try{
|
||||
using namespace std;
|
||||
ifstream cmvn_stream(filename);
|
||||
if (!cmvn_stream.is_open()) {
|
||||
LOG(ERROR) << "Failed to open file: " << filename;
|
||||
exit(-1);
|
||||
}
|
||||
string line;
|
||||
|
||||
while (getline(cmvn_stream, line)) {
|
||||
istringstream iss(line);
|
||||
vector<string> line_item{istream_iterator<string>{iss}, istream_iterator<string>{}};
|
||||
if (line_item[0] == "<AddShift>") {
|
||||
getline(cmvn_stream, line);
|
||||
istringstream means_lines_stream(line);
|
||||
vector<string> means_lines{istream_iterator<string>{means_lines_stream}, istream_iterator<string>{}};
|
||||
if (means_lines[0] == "<LearnRateCoef>") {
|
||||
for (int j = 3; j < means_lines.size() - 1; j++) {
|
||||
means_list_.push_back(stof(means_lines[j]));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else if (line_item[0] == "<Rescale>") {
|
||||
getline(cmvn_stream, line);
|
||||
istringstream vars_lines_stream(line);
|
||||
vector<string> vars_lines{istream_iterator<string>{vars_lines_stream}, istream_iterator<string>{}};
|
||||
if (vars_lines[0] == "<LearnRateCoef>") {
|
||||
for (int j = 3; j < vars_lines.size() - 1; j++) {
|
||||
// vars_list_.push_back(stof(vars_lines[j])*scale);
|
||||
vars_list_.push_back(stof(vars_lines[j]));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}catch(std::exception const &e) {
|
||||
LOG(ERROR) << "Error when load vad cmvn : " << e.what();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
// Low-frame-rate stacking + CMVN, in place:
// stacks lfr_m consecutive frames into one row every lfr_n frames (padding
// with the first/last frame at the edges), then applies (x + mean) * var.
// vad_feats goes from [T][dim] to [ceil(T/lfr_n)][lfr_m*dim].
void FsmnVad::LfrCmvn(std::vector<std::vector<float>> &vad_feats) {

    std::vector<std::vector<float>> out_feats;
    int T = vad_feats.size();
    int T_lrf = ceil(1.0 * T / lfr_n);

    // Pad frames at start(copy first frame)
    for (int i = 0; i < (lfr_m - 1) / 2; i++) {
        vad_feats.insert(vad_feats.begin(), vad_feats[0]);
    }
    // Merge lfr_m frames as one,lfr_n frames per window
    T = T + (lfr_m - 1) / 2;
    std::vector<float> p;
    for (int i = 0; i < T_lrf; i++) {
        if (lfr_m <= T - i * lfr_n) {
            // Full window available: concatenate lfr_m frames.
            for (int j = 0; j < lfr_m; j++) {
                p.insert(p.end(), vad_feats[i * lfr_n + j].begin(), vad_feats[i * lfr_n + j].end());
            }
            out_feats.emplace_back(p);
            p.clear();
        } else {
            // Fill to lfr_m frames at last window if less than lfr_m frames (copy last frame)
            int num_padding = lfr_m - (T - i * lfr_n);
            for (int j = 0; j < (vad_feats.size() - i * lfr_n); j++) {
                p.insert(p.end(), vad_feats[i * lfr_n + j].begin(), vad_feats[i * lfr_n + j].end());
            }
            for (int j = 0; j < num_padding; j++) {
                p.insert(p.end(), vad_feats[vad_feats.size() - 1].begin(), vad_feats[vad_feats.size() - 1].end());
            }
            out_feats.emplace_back(p);
            p.clear();
        }
    }
    // Apply cmvn
    for (auto &out_feat: out_feats) {
        for (int j = 0; j < means_list_.size(); j++) {
            out_feat[j] = (out_feat[j] + means_list_[j]) * vars_list_[j];
        }
    }
    vad_feats = out_feats;
}
|
||||
|
||||
// Offline (whole-utterance) VAD: fbank -> LFR/CMVN -> model forward -> score.
// Returns [start_ms, end_ms] segments; empty when no features were produced.
// NOTE(review): the scorer is called with is_final=true and online=false
// regardless of input_finished — presumably intentional for the offline
// path; confirm against the online variant.
std::vector<std::vector<int>>
FsmnVad::Infer(std::vector<float> &waves, bool input_finished) {
    std::vector<std::vector<float>> vad_feats;
    std::vector<std::vector<float>> vad_probs;
    std::vector<std::vector<int>> vad_segments;
    FbankKaldi(vad_sample_rate_, vad_feats, waves);
    if(vad_feats.size() == 0){
        return vad_segments;
    }
    LfrCmvn(vad_feats);
    Forward(vad_feats, &vad_probs, &in_cache_, input_finished);

    // A fresh scorer per call: offline inference carries no state across calls.
    E2EVadModel vad_scorer = E2EVadModel();
    vad_segments = vad_scorer(vad_probs, waves, true, false, vad_silence_duration_, vad_max_len_,
                              vad_speech_noise_thres_, vad_sample_rate_);
    return vad_segments;
}
|
||||
|
||||
void FsmnVad::InitCache(){
|
||||
std::vector<float> cache_feats(128 * 19 * 1, 0);
|
||||
for (int i=0;i<4;i++){
|
||||
in_cache_.emplace_back(cache_feats);
|
||||
}
|
||||
};
|
||||
|
||||
// Discards the accumulated FSMN caches and re-creates them zeroed.
void FsmnVad::Reset(){
    in_cache_.clear();
    InitCache();
}  // fixed: removed stray ';' after function definition
|
||||
|
||||
// Debug hook; intentionally empty.
void FsmnVad::Test() {
}
|
||||
|
||||
// All members (shared session, vectors, options) clean up via RAII.
FsmnVad::~FsmnVad() {
}
|
||||
|
||||
// Constructs with an error-only ORT logging environment; the model itself is
// loaded later via InitVad.
FsmnVad::FsmnVad():env_(ORT_LOGGING_LEVEL_ERROR, ""),session_options_{} {
}
|
||||
|
||||
} // namespace funasr
|
||||
68
modules/python/vendors/FunASR/runtime/onnxruntime/src/fsmn-vad.h
vendored
Normal file
68
modules/python/vendors/FunASR/runtime/onnxruntime/src/fsmn-vad.h
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#ifndef VAD_SERVER_FSMNVAD_H
|
||||
#define VAD_SERVER_FSMNVAD_H
|
||||
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
// Offline FSMN VAD over ONNX Runtime. Also serves as the shared state donor
// for FsmnVadOnline, which is why the session/config members are public.
class FsmnVad : public VadModel {
    /**
     * Author: Speech Lab of DAMO Academy, Alibaba Group
     * Deep-FSMN for Large Vocabulary Continuous Speech Recognition
     * https://arxiv.org/abs/1803.05030
     */

public:
    FsmnVad();
    ~FsmnVad();
    void Test();
    // Loads model + cmvn + yaml config; exits the process on failure.
    void InitVad(const std::string &vad_model, const std::string &vad_cmvn, const std::string &vad_config, int thread_num);
    // Whole-utterance inference; returns [start_ms, end_ms] segments.
    std::vector<std::vector<int>> Infer(std::vector<float> &waves, bool input_finished=true);
    // One model forward pass; in_cache is updated in place unless is_final.
    void Forward(
        const std::vector<std::vector<float>> &chunk_feats,
        std::vector<std::vector<float>> *out_prob,
        std::vector<std::vector<float>> *in_cache,
        bool is_final);
    void Reset();

    int GetVadSampleRate() { return vad_sample_rate_; };

    // Shared with FsmnVadOnline (see its InitOnline), hence public.
    std::shared_ptr<Ort::Session> vad_session_ = nullptr;
    Ort::Env env_;
    Ort::SessionOptions session_options_;
    vector<string> m_strInputNames, m_strOutputNames;
    std::vector<const char *> vad_in_names_;
    std::vector<const char *> vad_out_names_;
    // 4 FSMN memory tensors, 128*19 floats each.
    std::vector<std::vector<float>> in_cache_;

    knf::FbankOptions fbank_opts_;
    std::vector<float> means_list_;
    std::vector<float> vars_list_;

    int vad_sample_rate_ = MODEL_SAMPLE_RATE;
    int vad_silence_duration_ = VAD_SILENCE_DURATION;
    int vad_max_len_ = VAD_MAX_LEN;
    double vad_speech_noise_thres_ = VAD_SPEECH_NOISE_THRES;
    int lfr_m = VAD_LFR_M;
    int lfr_n = VAD_LFR_N;

private:

    void ReadModel(const char* vad_model);
    void LoadConfigFromYaml(const char* filename);

    void FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
                    std::vector<float> &waves);

    void LfrCmvn(std::vector<std::vector<float>> &vad_feats);
    void LoadCmvn(const char *filename);
    void InitCache();

};
|
||||
|
||||
} // namespace funasr
|
||||
#endif //VAD_SERVER_FSMNVAD_H
|
||||
883
modules/python/vendors/FunASR/runtime/onnxruntime/src/funasrruntime.cpp
vendored
Normal file
883
modules/python/vendors/FunASR/runtime/onnxruntime/src/funasrruntime.cpp
vendored
Normal file
@@ -0,0 +1,883 @@
|
||||
#include "precomp.h"
|
||||
#include <vector>
|
||||
|
||||
|
||||
// APIs for Init
|
||||
// C-API: create an ASR model handle; caller owns the returned handle.
_FUNASRAPI FUNASR_HANDLE FunASRInit(std::map<std::string, std::string>& model_path, int thread_num, ASR_TYPE type)
{
    return funasr::CreateModel(model_path, thread_num, type);
}
|
||||
|
||||
// C-API: create a streaming ASR handle layered on an offline handle.
_FUNASRAPI FUNASR_HANDLE FunASROnlineInit(FUNASR_HANDLE asr_hanlde, std::vector<int> chunk_size)
{
    return funasr::CreateModel(asr_hanlde, chunk_size);
}
|
||||
|
||||
// C-API: create an offline VAD handle; caller owns the returned handle.
_FUNASRAPI FUNASR_HANDLE FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num)
{
    return funasr::CreateVadModel(model_path, thread_num);
}
|
||||
|
||||
// C-API: create a streaming VAD handle sharing an offline VAD's model.
_FUNASRAPI FUNASR_HANDLE FsmnVadOnlineInit(FUNASR_HANDLE fsmnvad_handle)
{
    return funasr::CreateVadModel(fsmnvad_handle);
}
|
||||
|
||||
// C-API: create a punctuation-model handle; caller owns the returned handle.
_FUNASRAPI FUNASR_HANDLE CTTransformerInit(std::map<std::string, std::string>& model_path, int thread_num, PUNC_TYPE type)
{
    return funasr::CreatePuncModel(model_path, thread_num, type);
}
|
||||
|
||||
// C-API: create an offline pipeline (VAD + ASR + punc/ITN) handle.
_FUNASRAPI FUNASR_HANDLE FunOfflineInit(std::map<std::string, std::string>& model_path, int thread_num, bool use_gpu, int batch_size)
{
    return funasr::CreateOfflineStream(model_path, thread_num, use_gpu, batch_size);
}
|
||||
|
||||
// C-API: create a two-pass stream handle; caller owns the returned handle.
_FUNASRAPI FUNASR_HANDLE FunTpassInit(std::map<std::string, std::string>& model_path, int thread_num)
{
    return funasr::CreateTpassStream(model_path, thread_num);
}
|
||||
|
||||
// C-API: create the online stage of a two-pass stream.
_FUNASRAPI FUNASR_HANDLE FunTpassOnlineInit(FUNASR_HANDLE tpass_handle, std::vector<int> chunk_size)
{
    FUNASR_HANDLE online_stream = funasr::CreateTpassOnlineStream(tpass_handle, chunk_size);
    return online_stream;
}
|
||||
|
||||
// APIs for ASR Infer
|
||||
// C-API: run ASR on an in-memory audio buffer ("pcm" raw samples, otherwise
// decoded via ffmpeg). Returns a heap-allocated FUNASR_RECOG_RESULT the
// caller must free, or nullptr on handle/decode failure.
_FUNASRAPI FUNASR_RESULT FunASRInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, bool input_finished, int sampling_rate, std::string wav_format)
{
    funasr::Model* recog_obj = (funasr::Model*)handle;
    if (!recog_obj)
        return nullptr;

    funasr::Audio audio(recog_obj->GetAsrSampleRate(),1);
    if(wav_format == "pcm" || wav_format == "PCM"){
        if (!audio.LoadPcmwav(sz_buf, n_len, &sampling_rate))
            return nullptr;
    }else{
        if (!audio.FfmpegLoad(sz_buf, n_len))
            return nullptr;
    }

    float* buff;
    int len;
    int flag = 0;
    funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
    p_result->snippet_time = audio.GetTimeLen();
    // Empty audio: return an empty result rather than running the model.
    if(p_result->snippet_time == 0){
        return p_result;
    }

    // Decode each fetched audio piece and accumulate the transcript.
    while (audio.Fetch(buff, len, flag) > 0) {
        string msg = recog_obj->Forward(buff, len, input_finished);
        p_result->msg += msg;
    }
    return p_result;
}
|
||||
|
||||
// C-API: run ASR on a file path. ".wav" and ".pcm" are loaded directly; any
// other extension goes through ffmpeg. Returns a heap-allocated result the
// caller must free, or nullptr on handle/load failure.
_FUNASRAPI FUNASR_RESULT FunASRInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
{
    funasr::Model* recog_obj = (funasr::Model*)handle;
    if (!recog_obj)
        return nullptr;

    funasr::Audio audio(recog_obj->GetAsrSampleRate(),1);
    if(funasr::is_target_file(sz_filename, "wav")){
        // WAV carries its own sample rate; the caller's value is ignored.
        int32_t sampling_rate_ = -1;
        if(!audio.LoadWav(sz_filename, &sampling_rate_))
            return nullptr;
    }else if(funasr::is_target_file(sz_filename, "pcm")){
        if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
            return nullptr;
    }else{
        if (!audio.FfmpegLoad(sz_filename))
            return nullptr;
    }

    float* buff;
    int len;
    int flag = 0;
    funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
    p_result->snippet_time = audio.GetTimeLen();
    if(p_result->snippet_time == 0){
        return p_result;
    }
    // Offline path: every chunk is final (input_finished = true).
    while (audio.Fetch(buff, len, flag) > 0) {
        string msg = recog_obj->Forward(buff, len, true);
        p_result->msg += msg;
    }
    return p_result;
}
|
||||
|
||||
// APIs for VAD Infer
|
||||
// C-API: run VAD on an in-memory buffer. Returns a heap-allocated
// FUNASR_VAD_RESULT (segments list is also heap-allocated) the caller must
// free, or nullptr on handle/decode failure.
_FUNASRAPI FUNASR_RESULT FsmnVadInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, QM_CALLBACK fn_callback, bool input_finished, int sampling_rate, std::string wav_format)
{
    funasr::VadModel* vad_obj = (funasr::VadModel*)handle;
    if (!vad_obj)
        return nullptr;

    funasr::Audio audio(vad_obj->GetVadSampleRate(),1);
    if(wav_format == "pcm" || wav_format == "PCM"){
        if (!audio.LoadPcmwav(sz_buf, n_len, &sampling_rate))
            return nullptr;
    }else{
        if (!audio.FfmpegLoad(sz_buf, n_len))
            return nullptr;
    }

    funasr::FUNASR_VAD_RESULT* p_result = new funasr::FUNASR_VAD_RESULT;
    p_result->snippet_time = audio.GetTimeLen();
    // Empty audio: return an empty (but valid, non-null) segment list.
    if(p_result->snippet_time == 0){
        p_result->segments = new vector<std::vector<int>>();
        return p_result;
    }

    vector<std::vector<int>> vad_segments;
    audio.Split(vad_obj, vad_segments, input_finished);
    p_result->segments = new vector<std::vector<int>>(vad_segments);

    return p_result;
}
|
||||
|
||||
// C-API: run VAD on a file path (".wav"/".pcm" direct, otherwise ffmpeg).
// Returns a heap-allocated result the caller must free, or nullptr on
// handle/load failure.
_FUNASRAPI FUNASR_RESULT FsmnVadInfer(FUNASR_HANDLE handle, const char* sz_filename, QM_CALLBACK fn_callback, int sampling_rate)
{
    funasr::VadModel* vad_obj = (funasr::VadModel*)handle;
    if (!vad_obj)
        return nullptr;

    funasr::Audio audio(vad_obj->GetVadSampleRate(),1);
    if(funasr::is_target_file(sz_filename, "wav")){
        // WAV carries its own sample rate; the caller's value is ignored.
        int32_t sampling_rate_ = -1;
        if(!audio.LoadWav(sz_filename, &sampling_rate_))
            return nullptr;
    }else if(funasr::is_target_file(sz_filename, "pcm")){
        if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
            return nullptr;
    }else{
        if (!audio.FfmpegLoad(sz_filename))
            return nullptr;
    }

    funasr::FUNASR_VAD_RESULT* p_result = new funasr::FUNASR_VAD_RESULT;
    p_result->snippet_time = audio.GetTimeLen();
    if(p_result->snippet_time == 0){
        p_result->segments = new vector<std::vector<int>>();
        return p_result;
    }

    // Offline path: whole file available, so input_finished = true.
    vector<std::vector<int>> vad_segments;
    audio.Split(vad_obj, vad_segments, true);
    p_result->segments = new vector<std::vector<int>>(vad_segments);

    return p_result;
}
|
||||
|
||||
// APIs for PUNC Infer
|
||||
// C-API: add punctuation to a sentence. Offline mode allocates a fresh
// result; online mode reuses `pre_result` (if given) so the model's
// punctuation cache persists across calls. Exits on an unknown PUNC_TYPE.
_FUNASRAPI FUNASR_RESULT CTTransformerInfer(FUNASR_HANDLE handle, const char* sz_sentence, FUNASR_MODE mode, QM_CALLBACK fn_callback, PUNC_TYPE type, FUNASR_RESULT pre_result)
{
    funasr::PuncModel* punc_obj = (funasr::PuncModel*)handle;
    if (!punc_obj)
        return nullptr;

    FUNASR_RESULT p_result = nullptr;
    if (type==PUNC_OFFLINE){
        p_result = (FUNASR_RESULT)new funasr::FUNASR_PUNC_RESULT;
        ((funasr::FUNASR_PUNC_RESULT*)p_result)->msg = punc_obj->AddPunc(sz_sentence);
    }else if(type==PUNC_ONLINE){
        // Reuse the previous result object to carry arr_cache between calls.
        if (!pre_result)
            p_result = (FUNASR_RESULT)new funasr::FUNASR_PUNC_RESULT;
        else
            p_result = pre_result;
        ((funasr::FUNASR_PUNC_RESULT*)p_result)->msg = punc_obj->AddPunc(sz_sentence, ((funasr::FUNASR_PUNC_RESULT*)p_result)->arr_cache);
    }else{
        LOG(ERROR) << "Wrong PUNC_TYPE";
        exit(-1);
    }

    return p_result;
}
|
||||
|
||||
// APIs for Offline-stream Infer
|
||||
// C-API: full offline pipeline on an in-memory buffer:
// decode -> (optional) VAD split -> batched ASR -> timestamp assembly ->
// (optional) punctuation -> (optional) ITN. Returns a heap-allocated
// FUNASR_RECOG_RESULT the caller must free, or nullptr on failure.
_FUNASRAPI FUNASR_RESULT FunOfflineInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len,
                                               FUNASR_MODE mode, QM_CALLBACK fn_callback, const std::vector<std::vector<float>> &hw_emb,
                                               int sampling_rate, std::string wav_format, bool itn, FUNASR_DEC_HANDLE dec_handle,
                                               std::string svs_lang, bool svs_itn)
{
    funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
    if (!offline_stream)
        return nullptr;

    funasr::Audio audio(offline_stream->asr_handle->GetAsrSampleRate(),1);
    try{
        if(wav_format == "pcm" || wav_format == "PCM"){
            if (!audio.LoadPcmwav(sz_buf, n_len, &sampling_rate))
                return nullptr;
        }else{
            if (!audio.FfmpegLoad(sz_buf, n_len))
                return nullptr;
        }
    }catch (std::exception const &e)
    {
        LOG(ERROR)<<e.what();
        return nullptr;
    }

    funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
    p_result->snippet_time = audio.GetTimeLen();
    if(p_result->snippet_time == 0){
        return p_result;
    }
    // index_vector maps batch order back to the original segment order so the
    // transcript is reassembled chronologically after batched decoding.
    std::vector<int> index_vector={0};
    int msg_idx = 0;
    if(offline_stream->UseVad()){
        audio.CutSplit(offline_stream, index_vector);
    }
    std::vector<string> msgs(index_vector.size());
    std::vector<float> msg_stimes(index_vector.size());

    // Arrays are allocated by FetchDynamic and released below each iteration.
    float** buff;
    int* len;
    int* flag;
    float* start_time;
    int batch_size = offline_stream->asr_handle->GetBatchSize();
    int batch_in = 0;

    std::string cur_stamp = "[";
    std::string lang = (offline_stream->asr_handle)->GetLang();
    while (audio.FetchDynamic(buff, len, flag, start_time, batch_size, batch_in) > 0) {
        // dec reset
        funasr::WfstDecoder* wfst_decoder = (funasr::WfstDecoder*)dec_handle;
        if (wfst_decoder){
            wfst_decoder->StartUtterance();
        }
        vector<string> msg_batch;
        if(offline_stream->GetModelType() == MODEL_SVS){
            msg_batch = (offline_stream->asr_handle)->Forward(buff, len, true, svs_lang, svs_itn, batch_in);
        }else{
            msg_batch = (offline_stream->asr_handle)->Forward(buff, len, true, hw_emb, dec_handle, batch_in);
        }
        // Scatter batch outputs back to their original segment positions.
        for(int idx=0; idx<batch_in; idx++){
            string msg = msg_batch[idx];
            if(msg_idx < index_vector.size()){
                msgs[index_vector[msg_idx]] = msg;
                msg_stimes[index_vector[msg_idx]] = start_time[idx];
                msg_idx++;
            }else{
                LOG(ERROR) << "msg_idx: " << msg_idx <<" is out of range " << index_vector.size();
            }
        }

        // release
        delete[] buff;
        buff = nullptr;
        delete[] len;
        len = nullptr;
        delete[] flag;
        flag = nullptr;
        delete[] start_time;
        start_time = nullptr;
    }
    // Concatenate per-segment transcripts; each msg is "text | stamps".
    for(int idx=0; idx<msgs.size(); idx++){
        string msg = msgs[idx];
        std::vector<std::string> msg_vec = funasr::SplitStr(msg, " | ");
        if(msg_vec.size()==0){
            continue;
        }
        if(lang == "en-bpe" && p_result->msg != ""){
            p_result->msg += " ";
        }
        p_result->msg += msg_vec[0];
        //timestamp
        if(msg_vec.size() > 1){
            // Pairs of begin/end seconds, offset by the segment start, emitted
            // as millisecond pairs "[b,e],".
            std::vector<std::string> msg_stamp = funasr::split(msg_vec[1], ',');
            for(int i=0; i<msg_stamp.size()-1; i+=2){
                float begin = std::stof(msg_stamp[i])+msg_stimes[idx];
                float end = std::stof(msg_stamp[i+1])+msg_stimes[idx];
                cur_stamp += "["+std::to_string((int)(1000*begin))+","+std::to_string((int)(1000*end))+"],";
            }
        }
    }
    if(cur_stamp != "["){
        // Replace the trailing comma with the closing bracket.
        cur_stamp.erase(cur_stamp.length() - 1);
        p_result->stamp += cur_stamp + "]";
    }
    if(offline_stream->UsePunc()){
        string punc_res = (offline_stream->punc_handle)->AddPunc((p_result->msg).c_str(), lang);
        p_result->msg = punc_res;
    }
#if !defined(__APPLE__)
    // ITN is compiled out on macOS (see CMakeLists removing itn-*.cpp).
    if(offline_stream->UseITN() && itn){
        string msg_itn = offline_stream->itn_handle->Normalize(p_result->msg);
        if(!(p_result->stamp).empty()){
            // Re-align timestamps with the normalized text when possible.
            std::string new_stamp = funasr::TimestampSmooth(p_result->msg, msg_itn, p_result->stamp);
            if(!new_stamp.empty()){
                p_result->stamp = new_stamp;
            }
        }
        p_result->msg = msg_itn;
    }
#endif
    if (!(p_result->stamp).empty()){
        p_result->stamp_sents = funasr::TimestampSentence(p_result->msg, p_result->stamp);
    }
    return p_result;
}
|
||||
|
||||
// Offline (non-streaming) recognition of an audio file.
// Pipeline: load audio (wav / pcm / anything-else-via-ffmpeg), optionally
// VAD-split into segments, batch-decode all segments, stitch text and
// timestamps back in original order, then apply punctuation and (non-Apple
// builds) inverse text normalization.
// The caller owns the returned FUNASR_RECOG_RESULT and must release it with
// FunASRFreeResult. `mode` and `fn_callback` are accepted for API parity but
// are not used in this implementation.
_FUNASRAPI FUNASR_RESULT FunOfflineInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback,
                                         const std::vector<std::vector<float>> &hw_emb, int sampling_rate, bool itn, FUNASR_DEC_HANDLE dec_handle)
{
    funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
    if (!offline_stream)
        return nullptr;

    funasr::Audio audio((offline_stream->asr_handle)->GetAsrSampleRate(),1);
    // Load the input; file extension selects the loader, ffmpeg is the
    // fallback for compressed formats.
    try{
        if(funasr::is_target_file(sz_filename, "wav")){
            int32_t sampling_rate_ = -1;
            if(!audio.LoadWav(sz_filename, &sampling_rate_))
                return nullptr;
        }else if(funasr::is_target_file(sz_filename, "pcm")){
            if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
                return nullptr;
        }else{
            if (!audio.FfmpegLoad(sz_filename))
                return nullptr;
        }
    }catch (std::exception const &e)
    {
        LOG(ERROR)<<e.what();
        return nullptr;
    }

    funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
    p_result->snippet_time = audio.GetTimeLen();
    // Empty audio: return an empty (but valid) result object.
    if(p_result->snippet_time == 0){
        return p_result;
    }
    // index_vector maps decode order back to original segment order after
    // VAD splitting (CutSplit may reorder segments, e.g. by length).
    std::vector<int> index_vector={0};
    int msg_idx = 0;
    if(offline_stream->UseVad()){
        audio.CutSplit(offline_stream, index_vector);
    }
    std::vector<string> msgs(index_vector.size());
    std::vector<float> msg_stimes(index_vector.size());

    // Batch buffers filled by FetchDynamic; ownership transfers to this loop,
    // which frees them at the end of every iteration.
    float** buff;
    int* len;
    int* flag;
    float* start_time;
    int batch_size = offline_stream->asr_handle->GetBatchSize();
    int batch_in = 0;

    std::string cur_stamp = "[";
    std::string lang = (offline_stream->asr_handle)->GetLang();
    while (audio.FetchDynamic(buff, len, flag, start_time, batch_size, batch_in) > 0) {
        // dec reset: the WFST decoder keeps per-utterance state.
        funasr::WfstDecoder* wfst_decoder = (funasr::WfstDecoder*)dec_handle;
        if (wfst_decoder){
            wfst_decoder->StartUtterance();
        }
        vector<string> msg_batch = (offline_stream->asr_handle)->Forward(buff, len, true, hw_emb, dec_handle, batch_in);
        // Scatter batch results back to their original segment slots.
        for(int idx=0; idx<batch_in; idx++){
            string msg = msg_batch[idx];
            if(msg_idx < index_vector.size()){
                msgs[index_vector[msg_idx]] = msg;
                msg_stimes[index_vector[msg_idx]] = start_time[idx];
                msg_idx++;
            }else{
                LOG(ERROR) << "msg_idx: " << msg_idx <<" is out of range " << index_vector.size();
            }
        }

        // release the arrays allocated by FetchDynamic
        delete[] buff;
        buff = nullptr;
        delete[] len;
        len = nullptr;
        delete[] flag;
        flag = nullptr;
        delete[] start_time;
        start_time = nullptr;
    }
    // Concatenate segment texts in original order and collect timestamps.
    // Each msg is "text | t0,t1,t2,t3,..." — SplitStr separates text from the
    // comma-joined begin/end pairs.
    for(int idx=0; idx<msgs.size(); idx++){
        string msg = msgs[idx];
        std::vector<std::string> msg_vec = funasr::SplitStr(msg, " | ");
        if(msg_vec.size()==0){
            continue;
        }
        // BPE English output needs explicit spaces between segments.
        if(lang == "en-bpe" && p_result->msg != ""){
            p_result->msg += " ";
        }
        p_result->msg += msg_vec[0];
        //timestamp: shift segment-local times by the segment start, to ms.
        if(msg_vec.size() > 1){
            std::vector<std::string> msg_stamp = funasr::split(msg_vec[1], ',');
            // NOTE(review): if split() can return an empty vector here,
            // msg_stamp.size()-1 underflows (size_t) — confirm split()'s
            // behavior on empty input.
            for(int i=0; i<msg_stamp.size()-1; i+=2){
                float begin = std::stof(msg_stamp[i])+msg_stimes[idx];
                float end = std::stof(msg_stamp[i+1])+msg_stimes[idx];
                cur_stamp += "["+std::to_string((int)(1000*begin))+","+std::to_string((int)(1000*end))+"],";
            }
        }
    }
    // Close the JSON-ish stamp string, dropping the trailing comma.
    if(cur_stamp != "["){
        cur_stamp.erase(cur_stamp.length() - 1);
        p_result->stamp += cur_stamp + "]";
    }
    // Optional punctuation restoration.
    if(offline_stream->UsePunc()){
        string punc_res = (offline_stream->punc_handle)->AddPunc((p_result->msg).c_str(), lang);
        p_result->msg = punc_res;
    }
#if !defined(__APPLE__)
    // Optional inverse text normalization; timestamps are re-aligned to the
    // normalized text when present.
    if(offline_stream->UseITN() && itn){
        string msg_itn = offline_stream->itn_handle->Normalize(p_result->msg);
        if(!(p_result->stamp).empty()){
            std::string new_stamp = funasr::TimestampSmooth(p_result->msg, msg_itn, p_result->stamp);
            if(!new_stamp.empty()){
                p_result->stamp = new_stamp;
            }
        }
        p_result->msg = msg_itn;
    }
#endif
    // Sentence-level timestamps derived from the final text + char stamps.
    if (!(p_result->stamp).empty()){
        p_result->stamp_sents = funasr::TimestampSentence(p_result->msg, p_result->stamp);
    }
    return p_result;
}
|
||||
|
||||
//#if !defined(__APPLE__)
|
||||
_FUNASRAPI const std::vector<std::vector<float>> CompileHotwordEmbedding(FUNASR_HANDLE handle, std::string &hotwords, ASR_TYPE mode)
|
||||
{
|
||||
if (mode == ASR_OFFLINE){
|
||||
funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
|
||||
std::vector<std::vector<float>> emb;
|
||||
if (!offline_stream)
|
||||
return emb;
|
||||
return (offline_stream->asr_handle)->CompileHotwordEmbedding(hotwords);
|
||||
}
|
||||
else if (mode == ASR_TWO_PASS){
|
||||
funasr::TpassStream* tpass_stream = (funasr::TpassStream*)handle;
|
||||
std::vector<std::vector<float>> emb;
|
||||
if (!tpass_stream)
|
||||
return emb;
|
||||
return (tpass_stream->asr_handle)->CompileHotwordEmbedding(hotwords);
|
||||
}
|
||||
else{
|
||||
LOG(ERROR) << "Not implement: Online model does not support Hotword yet!";
|
||||
std::vector<std::vector<float>> emb;
|
||||
return emb;
|
||||
}
|
||||
|
||||
}
|
||||
//#endif
|
||||
|
||||
// APIs for 2pass-stream Infer
|
||||
_FUNASRAPI FUNASR_RESULT FunTpassInferBuffer(FUNASR_HANDLE handle, FUNASR_HANDLE online_handle, const char* sz_buf,
|
||||
int n_len, std::vector<std::vector<std::string>> &punc_cache, bool input_finished,
|
||||
int sampling_rate, std::string wav_format, ASR_TYPE mode,
|
||||
const std::vector<std::vector<float>> &hw_emb, bool itn, FUNASR_DEC_HANDLE dec_handle)
|
||||
{
|
||||
funasr::TpassStream* tpass_stream = (funasr::TpassStream*)handle;
|
||||
funasr::TpassOnlineStream* tpass_online_stream = (funasr::TpassOnlineStream*)online_handle;
|
||||
if (!tpass_stream || !tpass_online_stream)
|
||||
return nullptr;
|
||||
|
||||
funasr::VadModel* vad_online_handle = (tpass_online_stream->vad_online_handle).get();
|
||||
if (!vad_online_handle)
|
||||
return nullptr;
|
||||
|
||||
funasr::Audio* audio = ((funasr::FsmnVadOnline*)vad_online_handle)->audio_handle.get();
|
||||
|
||||
funasr::Model* asr_online_handle = (tpass_online_stream->asr_online_handle).get();
|
||||
if (!asr_online_handle)
|
||||
return nullptr;
|
||||
int chunk_len = ((funasr::ParaformerOnline*)asr_online_handle)->chunk_len;
|
||||
|
||||
funasr::Model* asr_handle = (tpass_stream->asr_handle).get();
|
||||
if (!asr_handle)
|
||||
return nullptr;
|
||||
|
||||
funasr::PuncModel* punc_online_handle = (tpass_stream->punc_online_handle).get();
|
||||
if (!punc_online_handle)
|
||||
return nullptr;
|
||||
|
||||
if(wav_format == "pcm" || wav_format == "PCM"){
|
||||
if (!audio->LoadPcmwavOnline(sz_buf, n_len, &sampling_rate))
|
||||
return nullptr;
|
||||
}else{
|
||||
// if (!audio->FfmpegLoad(sz_buf, n_len))
|
||||
// return nullptr;
|
||||
LOG(ERROR) <<"Wrong wav_format: " << wav_format ;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
|
||||
p_result->snippet_time = audio->GetTimeLen();
|
||||
|
||||
audio->Split(vad_online_handle, chunk_len, input_finished, mode);
|
||||
|
||||
funasr::AudioFrame* frame = nullptr;
|
||||
while(audio->FetchChunck(frame) > 0){
|
||||
string msg = ((funasr::ParaformerOnline*)asr_online_handle)->Forward(frame->data, frame->len, frame->is_final);
|
||||
if(mode == ASR_ONLINE){
|
||||
((funasr::ParaformerOnline*)asr_online_handle)->online_res += msg;
|
||||
if(frame->is_final){
|
||||
string online_msg = ((funasr::ParaformerOnline*)asr_online_handle)->online_res;
|
||||
string msg_punc = punc_online_handle->AddPunc(online_msg.c_str(), punc_cache[0]);
|
||||
p_result->tpass_msg = msg_punc;
|
||||
#if !defined(__APPLE__)
|
||||
// ITN
|
||||
if(tpass_stream->UseITN() && itn){
|
||||
string msg_itn = tpass_stream->itn_handle->Normalize(msg_punc);
|
||||
p_result->tpass_msg = msg_itn;
|
||||
}
|
||||
#endif
|
||||
((funasr::ParaformerOnline*)asr_online_handle)->online_res = "";
|
||||
p_result->msg += msg;
|
||||
}else{
|
||||
p_result->msg += msg;
|
||||
}
|
||||
}else if(mode == ASR_TWO_PASS){
|
||||
p_result->msg += msg;
|
||||
}
|
||||
if(frame != nullptr){
|
||||
delete frame;
|
||||
frame = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// timestamp
|
||||
std::string cur_stamp = "[";
|
||||
while(audio->FetchTpass(frame) > 0){
|
||||
// dec reset
|
||||
funasr::WfstDecoder* wfst_decoder = (funasr::WfstDecoder*)dec_handle;
|
||||
if (wfst_decoder){
|
||||
wfst_decoder->StartUtterance();
|
||||
}
|
||||
float** buff;
|
||||
int* len;
|
||||
buff = new float*[1];
|
||||
len = new int[1];
|
||||
buff[0] = frame->data;
|
||||
len[0] = frame->len;
|
||||
vector<string> msgs = ((funasr::Paraformer*)asr_handle)->Forward(buff, len, frame->is_final, hw_emb, dec_handle);
|
||||
string msg = msgs.size()>0?msgs[0]:"";
|
||||
std::vector<std::string> msg_vec = funasr::SplitStr(msg, " | "); // split with timestamp
|
||||
if(msg_vec.size()==0){
|
||||
continue;
|
||||
}
|
||||
msg = msg_vec[0];
|
||||
//timestamp
|
||||
if(msg_vec.size() > 1){
|
||||
std::vector<std::string> msg_stamp = funasr::split(msg_vec[1], ',');
|
||||
for(int i=0; i<msg_stamp.size()-1; i+=2){
|
||||
float begin = std::stof(msg_stamp[i]) + float(frame->global_start)/1000.0;
|
||||
float end = std::stof(msg_stamp[i+1]) + float(frame->global_start)/1000.0;
|
||||
cur_stamp += "["+std::to_string((int)(1000*begin))+","+std::to_string((int)(1000*end))+"],";
|
||||
}
|
||||
}
|
||||
|
||||
if(cur_stamp != "["){
|
||||
cur_stamp.erase(cur_stamp.length() - 1);
|
||||
p_result->stamp += cur_stamp + "]";
|
||||
}
|
||||
|
||||
string msg_punc = punc_online_handle->AddPunc(msg.c_str(), punc_cache[1]);
|
||||
if(input_finished){
|
||||
msg_punc += "。";
|
||||
}
|
||||
p_result->tpass_msg = msg_punc;
|
||||
#if !defined(__APPLE__)
|
||||
if(tpass_stream->UseITN() && itn){
|
||||
string msg_itn = tpass_stream->itn_handle->Normalize(msg_punc);
|
||||
// TimestampSmooth
|
||||
if(!(p_result->stamp).empty()){
|
||||
std::string new_stamp = funasr::TimestampSmooth(p_result->tpass_msg, msg_itn, p_result->stamp);
|
||||
if(!new_stamp.empty()){
|
||||
p_result->stamp = new_stamp;
|
||||
}
|
||||
}
|
||||
p_result->tpass_msg = msg_itn;
|
||||
}
|
||||
#endif
|
||||
if (!(p_result->stamp).empty()){
|
||||
p_result->stamp_sents = funasr::TimestampSentence(p_result->tpass_msg, p_result->stamp);
|
||||
}
|
||||
if(frame != nullptr){
|
||||
delete frame;
|
||||
frame = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
if(input_finished){
|
||||
audio->ResetIndex();
|
||||
}
|
||||
|
||||
return p_result;
|
||||
}
|
||||
|
||||
// A valid result handle always carries exactly one recognition result.
_FUNASRAPI const int FunASRGetRetNumber(FUNASR_RESULT result)
{
    return result ? 1 : 0;
}
|
||||
|
||||
// APIs for GetRetSnippetTime
|
||||
_FUNASRAPI const float FunASRGetRetSnippetTime(FUNASR_RESULT result)
|
||||
{
|
||||
if (!result)
|
||||
return 0.0f;
|
||||
|
||||
return ((funasr::FUNASR_RECOG_RESULT*)result)->snippet_time;
|
||||
}
|
||||
|
||||
// Audio duration (snippet_time) of a VAD result; 0 for null input.
_FUNASRAPI const float FsmnVadGetRetSnippetTime(FUNASR_RESULT result)
{
    funasr::FUNASR_VAD_RESULT* p_result = (funasr::FUNASR_VAD_RESULT*)result;
    return p_result ? p_result->snippet_time : 0.0f;
}
|
||||
|
||||
// APIs for GetResult
|
||||
_FUNASRAPI const char* FunASRGetResult(FUNASR_RESULT result,int n_index)
|
||||
{
|
||||
funasr::FUNASR_RECOG_RESULT * p_result = (funasr::FUNASR_RECOG_RESULT*)result;
|
||||
if(!p_result)
|
||||
return nullptr;
|
||||
|
||||
return p_result->msg.c_str();
|
||||
}
|
||||
|
||||
// Character-level timestamp string of a result; nullptr for null input.
_FUNASRAPI const char* FunASRGetStamp(FUNASR_RESULT result)
{
    funasr::FUNASR_RECOG_RESULT* p_result = (funasr::FUNASR_RECOG_RESULT*)result;
    return p_result ? p_result->stamp.c_str() : nullptr;
}
|
||||
|
||||
// Sentence-level timestamp string of a result; nullptr for null input.
_FUNASRAPI const char* FunASRGetStampSents(FUNASR_RESULT result)
{
    funasr::FUNASR_RECOG_RESULT* p_result = (funasr::FUNASR_RECOG_RESULT*)result;
    return p_result ? p_result->stamp_sents.c_str() : nullptr;
}
|
||||
|
||||
// Second-pass (tpass) text of a result; nullptr for null input.
// `n_index` is unused.
_FUNASRAPI const char* FunASRGetTpassResult(FUNASR_RESULT result,int n_index)
{
    funasr::FUNASR_RECOG_RESULT* p_result = (funasr::FUNASR_RECOG_RESULT*)result;
    return p_result ? p_result->tpass_msg.c_str() : nullptr;
}
|
||||
|
||||
// Punctuated text of a CT-Transformer result; nullptr for null input.
// `n_index` is unused.
_FUNASRAPI const char* CTTransformerGetResult(FUNASR_RESULT result,int n_index)
{
    funasr::FUNASR_PUNC_RESULT* p_result = (funasr::FUNASR_PUNC_RESULT*)result;
    return p_result ? p_result->msg.c_str() : nullptr;
}
|
||||
|
||||
// Segment boundary list of a VAD result; nullptr for null input.
// The pointer remains owned by the result object. `n_index` is unused.
_FUNASRAPI vector<std::vector<int>>* FsmnVadGetResult(FUNASR_RESULT result,int n_index)
{
    funasr::FUNASR_VAD_RESULT* p_result = (funasr::FUNASR_VAD_RESULT*)result;
    return p_result ? p_result->segments : nullptr;
}
|
||||
|
||||
// APIs for FreeResult
|
||||
_FUNASRAPI void FunASRFreeResult(FUNASR_RESULT result)
|
||||
{
|
||||
if (result)
|
||||
{
|
||||
delete (funasr::FUNASR_RECOG_RESULT*)result;
|
||||
}
|
||||
}
|
||||
|
||||
// Release a punctuation result; safe on nullptr.
_FUNASRAPI void CTTransformerFreeResult(FUNASR_RESULT result)
{
    if (!result)
        return;
    delete (funasr::FUNASR_PUNC_RESULT*)result;
}
|
||||
|
||||
// Release a VAD result together with its owned segment list; safe on nullptr.
_FUNASRAPI void FsmnVadFreeResult(FUNASR_RESULT result)
{
    funasr::FUNASR_VAD_RESULT* p_result = (funasr::FUNASR_VAD_RESULT*)result;
    if (!p_result)
        return;
    delete p_result->segments;  // deleting nullptr is a no-op
    delete p_result;
}
|
||||
|
||||
// APIs for decoder status reset
|
||||
_FUNASRAPI void FunASRReset(FUNASR_HANDLE handle, FUNASR_DEC_HANDLE dec_handle)
|
||||
{
|
||||
funasr::Model* recog_obj = (funasr::Model*)handle;
|
||||
recog_obj->StartUtterance();
|
||||
funasr::WfstDecoder* wfst_decoder = (funasr::WfstDecoder*)dec_handle;
|
||||
if (wfst_decoder)
|
||||
wfst_decoder->StartUtterance();
|
||||
}
|
||||
|
||||
// Reset per-utterance state of an offline stream's ASR model and its
// optional WFST decoder ahead of a new utterance.
// Fix: the stream handle was previously dereferenced without a null check,
// unlike every other API in this file; null handles are now tolerated.
_FUNASRAPI void FunOfflineReset(FUNASR_HANDLE handle, FUNASR_DEC_HANDLE dec_handle)
{
    funasr::OfflineStream* recog_obj = (funasr::OfflineStream*)handle;
    if (recog_obj)
        recog_obj->asr_handle->StartUtterance();
    funasr::WfstDecoder* wfst_decoder = (funasr::WfstDecoder*)dec_handle;
    if (wfst_decoder)
        wfst_decoder->StartUtterance();
}
|
||||
|
||||
// APIs for Uninit
|
||||
_FUNASRAPI void FunASRUninit(FUNASR_HANDLE handle)
|
||||
{
|
||||
funasr::Model* recog_obj = (funasr::Model*)handle;
|
||||
|
||||
if (!recog_obj)
|
||||
return;
|
||||
|
||||
delete recog_obj;
|
||||
}
|
||||
|
||||
// Destroy a VAD model handle; safe on nullptr.
_FUNASRAPI void FsmnVadUninit(FUNASR_HANDLE handle)
{
    delete (funasr::VadModel*)handle;  // deleting nullptr is a no-op
}
|
||||
|
||||
// Destroy a punctuation model handle; safe on nullptr.
_FUNASRAPI void CTTransformerUninit(FUNASR_HANDLE handle)
{
    delete (funasr::PuncModel*)handle;  // deleting nullptr is a no-op
}
|
||||
|
||||
// Destroy an offline stream handle; safe on nullptr.
_FUNASRAPI void FunOfflineUninit(FUNASR_HANDLE handle)
{
    delete (funasr::OfflineStream*)handle;  // deleting nullptr is a no-op
}
|
||||
|
||||
// Destroy a two-pass stream handle; safe on nullptr.
_FUNASRAPI void FunTpassUninit(FUNASR_HANDLE handle)
{
    delete (funasr::TpassStream*)handle;  // deleting nullptr is a no-op
}
|
||||
|
||||
// Destroy a per-session two-pass online stream handle; safe on nullptr.
_FUNASRAPI void FunTpassOnlineUninit(FUNASR_HANDLE handle)
{
    delete (funasr::TpassOnlineStream*)handle;  // deleting nullptr is a no-op
}
|
||||
|
||||
// Create a WFST (LM-rescoring) decoder bound to the stream's ASR model.
// Returns nullptr when the handle's model type is unrecognized or the model
// has no language model loaded (lm_ is empty). Caller frees the returned
// decoder with FunASRWfstDecoderUninit.
_FUNASRAPI FUNASR_DEC_HANDLE FunASRWfstDecoderInit(FUNASR_HANDLE handle, int asr_type, float glob_beam, float lat_beam, float am_scale)
{
    funasr::WfstDecoder* mm = nullptr;
    if (asr_type == ASR_OFFLINE) {
        funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
        // dynamic_cast probes the concrete model type behind the handle.
        auto paraformer = dynamic_cast<funasr::Paraformer*>(offline_stream->asr_handle.get());
        if(paraformer !=nullptr){
            if (paraformer->lm_){
                mm = new funasr::WfstDecoder(paraformer->lm_.get(),
                    paraformer->GetPhoneSet(), paraformer->GetLmVocab(), glob_beam, lat_beam, am_scale);
            }
            return mm;
        }
#ifdef USE_GPU
        // GPU builds may carry a Torch-backed Paraformer instead.
        auto paraformer_torch = dynamic_cast<funasr::ParaformerTorch*>(offline_stream->asr_handle.get());
        if(paraformer_torch !=nullptr){
            if (paraformer_torch->lm_){
                mm = new funasr::WfstDecoder(paraformer_torch->lm_.get(),
                    paraformer_torch->GetPhoneSet(), paraformer_torch->GetLmVocab(), glob_beam, lat_beam, am_scale);
            }
            return mm;
        }
#endif

    } else if (asr_type == ASR_TWO_PASS){
        // Same probing logic for the two-pass stream's offline model.
        funasr::TpassStream* tpass_stream = (funasr::TpassStream*)handle;
        auto paraformer = dynamic_cast<funasr::Paraformer*>(tpass_stream->asr_handle.get());
        if(paraformer !=nullptr){
            if (paraformer->lm_){
                mm = new funasr::WfstDecoder(paraformer->lm_.get(),
                    paraformer->GetPhoneSet(), paraformer->GetLmVocab(), glob_beam, lat_beam, am_scale);
            }
            return mm;
        }
#ifdef USE_GPU
        auto paraformer_torch = dynamic_cast<funasr::ParaformerTorch*>(tpass_stream->asr_handle.get());
        if(paraformer_torch !=nullptr){
            if (paraformer_torch->lm_){
                mm = new funasr::WfstDecoder(paraformer_torch->lm_.get(),
                    paraformer_torch->GetPhoneSet(), paraformer_torch->GetLmVocab(), glob_beam, lat_beam, am_scale);
            }
            return mm;
        }
#endif
    }
    return mm;
}
|
||||
|
||||
// Destroy a WFST decoder created by FunASRWfstDecoderInit; safe on nullptr.
_FUNASRAPI void FunASRWfstDecoderUninit(FUNASR_DEC_HANDLE handle)
{
    delete (funasr::WfstDecoder*)handle;  // deleting nullptr is a no-op
}
|
||||
|
||||
// Install hotword biasing resources into the WFST decoder; no-op on nullptr.
_FUNASRAPI void FunWfstDecoderLoadHwsRes(FUNASR_DEC_HANDLE handle, int inc_bias, unordered_map<string, int> &hws_map)
{
    if (auto* wfst_decoder = (funasr::WfstDecoder*)handle)
        wfst_decoder->LoadHwsRes(inc_bias, hws_map);
}
|
||||
// Remove previously installed hotword resources; no-op on nullptr.
_FUNASRAPI void FunWfstDecoderUnloadHwsRes(FUNASR_DEC_HANDLE handle)
{
    if (auto* wfst_decoder = (funasr::WfstDecoder*)handle)
        wfst_decoder->UnloadHwsRes();
}
|
||||
89
modules/python/vendors/FunASR/runtime/onnxruntime/src/itn-processor.cpp
vendored
Normal file
89
modules/python/vendors/FunASR/runtime/onnxruntime/src/itn-processor.cpp
vendored
Normal file
@@ -0,0 +1,89 @@
|
||||
// Acknowledgement: this code is adapted from
|
||||
// https://github.com/wenet-e2e/WeTextProcessing/blob/master/runtime/processor/processor.cc
|
||||
// Retrieved in Aug 2023.
|
||||
|
||||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
||||
// 2023 Jing Du (thuduj12@163.com)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "itn-processor.h"
|
||||
|
||||
using fst::StringTokenType;
|
||||
|
||||
namespace funasr {
|
||||
// FST models are loaded later via InitITN(); nothing to do at (de)construction
// beyond the shared_ptr members' own cleanup.
ITNProcessor::ITNProcessor(){};
ITNProcessor::~ITNProcessor(){};
|
||||
|
||||
void ITNProcessor::InitITN(const std::string& tagger_path,
|
||||
const std::string& verbalizer_path,
|
||||
int thread_num) {
|
||||
try{
|
||||
tagger_.reset(StdVectorFst::Read(tagger_path));
|
||||
LOG(INFO) << "Successfully load model from " << tagger_path;
|
||||
verbalizer_.reset(StdVectorFst::Read(verbalizer_path));
|
||||
LOG(INFO) << "Successfully load model from " << verbalizer_path;
|
||||
}catch(exception const &e){
|
||||
LOG(ERROR) << "Error loading itn models";
|
||||
exit(-1);
|
||||
}
|
||||
compiler_ = std::make_shared<StringCompiler<StdArc>>(StringTokenType::BYTE);
|
||||
printer_ = std::make_shared<StringPrinter<StdArc>>(StringTokenType::BYTE);
|
||||
|
||||
if (tagger_path.find("_tn_") != tagger_path.npos) {
|
||||
parse_type_ = ParseType::kTN;
|
||||
} else if (tagger_path.find("_itn_") != tagger_path.npos) {
|
||||
parse_type_ = ParseType::kITN;
|
||||
} else {
|
||||
LOG(FATAL) << "Invalid fst prefix, prefix should contain"
|
||||
<< " either \"_tn_\" or \"_itn_\".";
|
||||
}
|
||||
}
|
||||
|
||||
std::string ITNProcessor::shortest_path(const StdVectorFst& lattice) {
|
||||
StdVectorFst shortest_path;
|
||||
fst::ShortestPath(lattice, &shortest_path, 1, true);
|
||||
|
||||
std::string output;
|
||||
printer_->operator()(shortest_path, &output);
|
||||
return output;
|
||||
}
|
||||
|
||||
std::string ITNProcessor::compose(const std::string& input,
|
||||
const StdVectorFst* fst) {
|
||||
StdVectorFst input_fst;
|
||||
compiler_->operator()(input, &input_fst);
|
||||
|
||||
StdVectorFst lattice;
|
||||
fst::Compose(input_fst, *fst, &lattice);
|
||||
return shortest_path(lattice);
|
||||
}
|
||||
|
||||
std::string ITNProcessor::tag(const std::string& input) {
|
||||
return compose(input, tagger_.get());
|
||||
}
|
||||
|
||||
std::string ITNProcessor::verbalize(const std::string& input) {
|
||||
if (input.empty()) {
|
||||
return "";
|
||||
}
|
||||
TokenParser parser(parse_type_);
|
||||
std::string output = parser.reorder(input);
|
||||
return compose(output, verbalizer_.get());
|
||||
}
|
||||
|
||||
std::string ITNProcessor::Normalize(const std::string& input) {
|
||||
return verbalize(tag(input));
|
||||
}
|
||||
|
||||
} // namespace funasr
|
||||
56
modules/python/vendors/FunASR/runtime/onnxruntime/src/itn-processor.h
vendored
Normal file
56
modules/python/vendors/FunASR/runtime/onnxruntime/src/itn-processor.h
vendored
Normal file
@@ -0,0 +1,56 @@
|
||||
// Acknowledgement: this code is adapted from
|
||||
// https://github.com/wenet-e2e/WeTextProcessing/blob/master/runtime/processor/processor.h
|
||||
// Retrieved in Aug 2023.
|
||||
|
||||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
||||
// 2023 Jing Du (thuduj12@163.com)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef ITN_PROCESSOR_H_
|
||||
#define ITN_PROCESSOR_H_
|
||||
|
||||
#include "fst/fstlib.h"
|
||||
#include "precomp.h"
|
||||
#include "itn-token-parser.h"
|
||||
|
||||
using fst::StdArc;
|
||||
using fst::StdVectorFst;
|
||||
using fst::StringCompiler;
|
||||
using fst::StringPrinter;
|
||||
|
||||
namespace funasr {
|
||||
// WFST-based text normalizer following the WeTextProcessing runtime design:
// input text is composed with a tagger FST (marks semiotic spans) and then a
// verbalizer FST (rewrites them), taking the shortest path at each stage.
class ITNProcessor : public ITNModel {
 public:
  ITNProcessor();
  // Load tagger/verbalizer FSTs from disk. thread_num appears unused by the
  // implementation in this file.
  void InitITN(const std::string &itn_tagger, const std::string &itn_verbalizer, int thread_num);
  ~ITNProcessor();

  // Stage 1: mark semiotic spans in the input text.
  std::string tag(const std::string& input);
  // Stage 2: reorder tagged fields and rewrite them to normalized form.
  std::string verbalize(const std::string& input);
  // Full pipeline: verbalize(tag(input)).
  std::string Normalize(const std::string& input);

 private:
  // Best path of `lattice`, printed as a byte string.
  std::string shortest_path(const StdVectorFst& lattice);
  // Compile `input`, compose with `fst`, return the best-path string.
  std::string compose(const std::string& input, const StdVectorFst* fst);

  ParseType parse_type_;                                 // TN vs ITN, from tagger file name
  std::shared_ptr<StdVectorFst> tagger_ = nullptr;       // tagging FST
  std::shared_ptr<StdVectorFst> verbalizer_ = nullptr;   // verbalizing FST
  std::shared_ptr<StringCompiler<StdArc>> compiler_ = nullptr;  // string -> FST (byte tokens)
  std::shared_ptr<StringPrinter<StdArc>> printer_ = nullptr;    // FST -> string (byte tokens)
};
|
||||
|
||||
} // namespace funasr
|
||||
|
||||
#endif // ITN_PROCESSOR_H_
|
||||
157
modules/python/vendors/FunASR/runtime/onnxruntime/src/itn-token-parser.cpp
vendored
Normal file
157
modules/python/vendors/FunASR/runtime/onnxruntime/src/itn-token-parser.cpp
vendored
Normal file
@@ -0,0 +1,157 @@
|
||||
// Acknowledgement: this code is adapted from
|
||||
// https://github.com/wenet-e2e/WeTextProcessing/blob/master/runtime/processor/token_parser.cc
|
||||
// Retrieved in Aug 2023.
|
||||
|
||||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
||||
// 2023 Jing Du (thuduj12@163.com)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "itn-token-parser.h"
|
||||
#include <glog/logging.h>
|
||||
#include "utf8-string.h"
|
||||
|
||||
namespace funasr {
|
||||
// Sentinel returned by the cursor once the input is exhausted.
const std::string EOS = "<EOS>";
// Whitespace characters, mirroring Python's string.whitespace
// (" \t\n\r\x0b\x0c"). FIX: the previous initializer fused "\x0b\x0c" into a
// single two-character entry (missing comma), so lone vertical-tab or
// form-feed characters never matched set membership.
const std::set<std::string> UTF8_WHITESPACE = {" ", "\t", "\n", "\r",
                                               "\x0b", "\x0c"};
// Characters allowed in token/key identifiers.
const std::set<std::string> ASCII_LETTERS = {
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
    "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B",
    "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
    "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "_"};
// Canonical field emission order per token class, text-normalization mode.
const std::unordered_map<std::string, std::vector<std::string>> TN_ORDERS = {
    {"date", {"year", "month", "day"}},
    {"fraction", {"denominator", "numerator"}},
    {"measure", {"denominator", "numerator", "value"}},
    {"money", {"value", "currency"}},
    {"time", {"noon", "hour", "minute", "second"}}};
// Canonical field emission order per token class, inverse-TN mode.
const std::unordered_map<std::string, std::vector<std::string>> ITN_ORDERS = {
    {"date", {"year", "month", "day"}},
    {"fraction", {"sign", "numerator", "denominator"}},
    {"measure", {"numerator", "denominator", "value"}},
    {"money", {"currency", "value"}},
    {"time", {"hour", "minute", "second", "noon"}}};
|
||||
|
||||
// Select the key-ordering table matching the normalization direction.
TokenParser::TokenParser(ParseType type) {
  orders = (type == ParseType::kTN) ? TN_ORDERS : ITN_ORDERS;
}
|
||||
|
||||
// Split `input` into UTF-8 characters (via the project's string2chars
// helper) and position the cursor on the first one.
// Aborts (glog CHECK) on empty input.
void TokenParser::load(const std::string& input) {
  string2chars(input, &text);
  CHECK_GT(text.size(), 0);
  index = 0;
  ch = text[0];
}
|
||||
|
||||
bool TokenParser::read() {
|
||||
if (index < text.size() - 1) {
|
||||
index += 1;
|
||||
ch = text[index];
|
||||
return true;
|
||||
}
|
||||
ch = EOS;
|
||||
return false;
|
||||
}
|
||||
|
||||
bool TokenParser::parse_ws() {
|
||||
bool not_eos = ch != EOS;
|
||||
while (not_eos && ch == " ") {
|
||||
not_eos = read();
|
||||
}
|
||||
return not_eos;
|
||||
}
|
||||
|
||||
bool TokenParser::parse_char(const std::string& exp) {
|
||||
if (ch == exp) {
|
||||
read();
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool TokenParser::parse_chars(const std::string& exp) {
|
||||
bool ok = false;
|
||||
std::vector<std::string> chars;
|
||||
string2chars(exp, &chars);
|
||||
for (const auto& x : chars) {
|
||||
ok |= parse_char(x);
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
std::string TokenParser::parse_key() {
|
||||
CHECK_NE(ch, EOS);
|
||||
CHECK_EQ(UTF8_WHITESPACE.count(ch), 0);
|
||||
|
||||
std::string key = "";
|
||||
while (ASCII_LETTERS.count(ch) > 0) {
|
||||
key += ch;
|
||||
read();
|
||||
}
|
||||
return key;
|
||||
}
|
||||
|
||||
// Read characters up to the next unescaped double quote and return them;
// the closing quote itself is left for the caller to consume.
// A backslash escapes the following character (including '"'), and both the
// backslash and the escaped character are kept in the returned value.
std::string TokenParser::parse_value() {
  CHECK_NE(ch, EOS);
  bool escape = false;

  std::string value = "";
  while (ch != "\"") {
    value += ch;
    // A backslash that is not itself escaped marks the next char as escaped.
    escape = ch == "\\" && !escape;
    read();
    if (escape) {
      // Append the escaped character unconditionally, bypassing the quote
      // check, then move past it.
      value += ch;
      read();
    }
  }
  return value;
}
|
||||
|
||||
// Parse the whole tagger output — a whitespace-separated sequence of
// `name { key: "value" ... }` groups — appending one Token per group to
// `tokens`.
void TokenParser::parse(const std::string& input) {
  load(input);
  while (parse_ws()) {
    std::string name = parse_key();
    parse_chars(" { ");  // consume the " { " that opens the group

    Token token(name);
    while (parse_ws()) {
      if (ch == "}") {
        parse_char("}");  // end of this token group
        break;
      }
      std::string key = parse_key();
      parse_chars(": \"");  // consume the separator up to the opening quote
      std::string value = parse_value();
      parse_char("\"");     // consume the closing quote
      token.append(key, value);
    }
    tokens.emplace_back(token);
  }
}
|
||||
|
||||
std::string TokenParser::reorder(const std::string& input) {
|
||||
parse(input);
|
||||
std::string output = "";
|
||||
for (auto& token : tokens) {
|
||||
output += token.string(orders) + " ";
|
||||
}
|
||||
return trim(output);
|
||||
}
|
||||
|
||||
} // namespace funasr
|
||||
96
modules/python/vendors/FunASR/runtime/onnxruntime/src/itn-token-parser.h
vendored
Normal file
96
modules/python/vendors/FunASR/runtime/onnxruntime/src/itn-token-parser.h
vendored
Normal file
@@ -0,0 +1,96 @@
|
||||
// Acknowledgement: this code is adapted from
|
||||
// https://github.com/wenet-e2e/WeTextProcessing/blob/master/runtime/processor/token_parser.h
|
||||
// Retrieved in Aug 2023.
|
||||
|
||||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
||||
// 2023 Jing Du (thuduj12@163.com)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef ITN_TOKEN_PARSER_H_
|
||||
#define ITN_TOKEN_PARSER_H_
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace funasr {
|
||||
|
||||
extern const std::string EOS;
|
||||
extern const std::set<std::string> UTF8_WHITESPACE;
|
||||
extern const std::set<std::string> ASCII_LETTERS;
|
||||
extern const std::unordered_map<std::string, std::vector<std::string>>
|
||||
TN_ORDERS;
|
||||
extern const std::unordered_map<std::string, std::vector<std::string>>
|
||||
ITN_ORDERS;
|
||||
|
||||
// One parsed tagger token, e.g. `date { year: "2020" month: "3" }`.
struct Token {
  std::string name;                                      // token class name
  std::vector<std::string> order;                        // key emission order
  std::unordered_map<std::string, std::string> members;  // key -> value

  Token(const std::string& name) : name(name) {}

  // Record a key/value pair, remembering insertion order for serialization.
  void append(const std::string& key, const std::string& value) {
    order.push_back(key);
    members[key] = value;
  }

  // Serialize back to `name { key: "value" ... }`. When `orders` defines a
  // canonical key order for this token's name, it replaces the insertion
  // order; keys without a recorded value are skipped.
  std::string string(
      const std::unordered_map<std::string, std::vector<std::string>>& orders) {
    auto canonical = orders.find(name);
    if (canonical != orders.end()) {
      order = canonical->second;
    }

    std::string result = name + " {";
    for (const auto& key : order) {
      auto member = members.find(key);
      if (member != members.end()) {
        result += " " + key + ": \"" + member->second + "\"";
      }
    }
    result += " }";
    return result;
  }
};
|
||||
|
||||
// Direction of text processing handled by a TokenParser instance.
enum ParseType {
  kTN = 0x00,  // Text Normalization
  kITN = 0x01  // Inverse Text Normalization
};
|
||||
|
||||
// Declarations only; definitions live elsewhere. Parses tagger output of
// the form `name { key: "value" ... }` into Token objects and re-emits it
// with each token's members reordered per TN_ORDERS / ITN_ORDERS.
class TokenParser {
 public:
  // `type` selects which reorder table (TN vs ITN) this parser applies.
  TokenParser(ParseType type);
  // Parses `input` and returns it with token members reordered.
  std::string reorder(const std::string& input);

 private:
  void load(const std::string& input);
  bool read();
  bool parse_ws();
  bool parse_char(const std::string& exp);
  bool parse_chars(const std::string& exp);
  std::string parse_key();
  std::string parse_value();
  void parse(const std::string& input);

  int index;                      // current read position into `text`
  std::string ch;                 // most recently read unit (NOTE(review): presumably one UTF-8 character — confirm in the .cpp)
  std::vector<std::string> text;  // loaded input split into string units
  std::vector<Token> tokens;      // tokens produced by parse()
  std::unordered_map<std::string, std::vector<std::string>> orders;  // active reorder table
};
|
||||
|
||||
} // funasr
|
||||
|
||||
#endif // ITN_TOKEN_PARSER_H_
|
||||
60
modules/python/vendors/FunASR/runtime/onnxruntime/src/model.cpp
vendored
Normal file
60
modules/python/vendors/FunASR/runtime/onnxruntime/src/model.cpp
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
/**
 * Factory for a CPU ASR model. Builds the model-file paths under
 * model_path[MODEL_DIR] (switching to the quantized files when
 * model_path[QUANTIZE] == "true"), constructs a Paraformer and initializes
 * it. The caller owns the returned pointer. Exits the process on an
 * unknown `type`.
 */
Model *CreateModel(std::map<std::string, std::string>& model_path, int thread_num, ASR_TYPE type)
{
    if (type == ASR_OFFLINE) {
        // Offline recognizer: one combined AM model file.
        bool quantized = model_path.find(QUANTIZE) != model_path.end()
                         && model_path.at(QUANTIZE) == "true";
        string am_model_path = PathAppend(model_path.at(MODEL_DIR),
                                          quantized ? QUANT_MODEL_NAME : MODEL_NAME);
        string am_cmvn_path = PathAppend(model_path.at(MODEL_DIR), AM_CMVN_NAME);
        string am_config_path = PathAppend(model_path.at(MODEL_DIR), AM_CONFIG_NAME);
        string token_path = PathAppend(model_path.at(MODEL_DIR), TOKEN_PATH);

        Model *mm = new Paraformer();
        mm->InitAsr(am_model_path, am_cmvn_path, am_config_path, token_path, thread_num);
        return mm;
    } else if (type == ASR_ONLINE) {
        // Streaming recognizer: separate encoder/decoder model files.
        bool quantized = model_path.find(QUANTIZE) != model_path.end()
                         && model_path.at(QUANTIZE) == "true";
        string en_model_path = PathAppend(model_path.at(MODEL_DIR),
                                          quantized ? QUANT_ENCODER_NAME : ENCODER_NAME);
        string de_model_path = PathAppend(model_path.at(MODEL_DIR),
                                          quantized ? QUANT_DECODER_NAME : DECODER_NAME);
        string am_cmvn_path = PathAppend(model_path.at(MODEL_DIR), AM_CMVN_NAME);
        string am_config_path = PathAppend(model_path.at(MODEL_DIR), AM_CONFIG_NAME);
        string token_path = PathAppend(model_path.at(MODEL_DIR), TOKEN_PATH);

        Model *mm = new Paraformer();
        mm->InitAsr(en_model_path, de_model_path, am_cmvn_path, am_config_path, token_path, thread_num);
        return mm;
    } else {
        LOG(ERROR)<<"Wrong ASR_TYPE : " << type;
        exit(-1);
    }
}
|
||||
|
||||
/**
 * Wrap an already-initialized offline Paraformer handle into a streaming
 * model. `chunk_size` is the 3-element chunk layout (indices 0/1/2 are used
 * as left-context / main / right-context frame counts downstream).
 * The caller owns the returned pointer; the offline handle is borrowed.
 */
Model *CreateModel(void* asr_handle, std::vector<int> chunk_size)
{
    return new ParaformerOnline(static_cast<Paraformer*>(asr_handle), chunk_size);
}
|
||||
|
||||
} // namespace funasr
|
||||
160
modules/python/vendors/FunASR/runtime/onnxruntime/src/offline-stream.cpp
vendored
Normal file
160
modules/python/vendors/FunASR/runtime/onnxruntime/src/offline-stream.cpp
vendored
Normal file
@@ -0,0 +1,160 @@
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
/**
 * Assemble the offline recognition pipeline from a directory map.
 * Each stage (VAD, AM, LM, PUNC, ITN) is optional: it is loaded only when
 * its key is present in `model_path` AND its files exist on disk; otherwise
 * the stage is skipped with a log message and the corresponding use_* flag
 * stays false.
 *
 * @param model_path  keys like VAD_DIR/MODEL_DIR/LM_DIR/PUNC_DIR/ITN_DIR ->
 *                    model directories, plus "true"/"false" quantize flags
 * @param thread_num  intra-op thread count handed to each model
 * @param use_gpu     request the TorchScript GPU backend (falls back to CPU
 *                    when the binary was built without USE_GPU)
 * @param batch_size  batch size for the GPU backend only
 */
OfflineStream::OfflineStream(std::map<std::string, std::string>& model_path, int thread_num, bool use_gpu, int batch_size)
{
    // VAD model
    if(model_path.find(VAD_DIR) != model_path.end()){
        string vad_model_path;
        string vad_cmvn_path;
        string vad_config_path;

        // Prefer the quantized model file when VAD_QUANT == "true".
        vad_model_path = PathAppend(model_path.at(VAD_DIR), MODEL_NAME);
        if(model_path.find(VAD_QUANT) != model_path.end() && model_path.at(VAD_QUANT) == "true"){
            vad_model_path = PathAppend(model_path.at(VAD_DIR), QUANT_MODEL_NAME);
        }
        vad_cmvn_path = PathAppend(model_path.at(VAD_DIR), VAD_CMVN_NAME);
        vad_config_path = PathAppend(model_path.at(VAD_DIR), VAD_CONFIG_NAME);
        if (access(vad_model_path.c_str(), F_OK) != 0 ||
            access(vad_cmvn_path.c_str(), F_OK) != 0 ||
            access(vad_config_path.c_str(), F_OK) != 0 )
        {
            LOG(INFO) << "VAD model file is not exist, skip load vad model.";
        }else{
            vad_handle = make_unique<FsmnVad>();
            vad_handle->InitVad(vad_model_path, vad_cmvn_path, vad_config_path, thread_num);
            use_vad = true;
        }
    }

    // AM model
    if(model_path.find(MODEL_DIR) != model_path.end()){
        string am_model_path;
        string am_cmvn_path;
        string am_config_path;
        string token_path;
        string hw_cpu_model_path;
        string hw_gpu_model_path;
        string seg_dict_path;

        // Pick the backend implementation first; the model files are
        // resolved below based on which backend was chosen.
        if(use_gpu){
#ifdef USE_GPU
            asr_handle = make_unique<ParaformerTorch>();
            asr_handle->SetBatchSize(batch_size);
#else
            // Built without GPU support: warn and fall back to the CPU path.
            LOG(ERROR) <<"GPU is not supported! CPU will be used! If you want to use GPU, please add -DGPU=ON when cmake";
            asr_handle = make_unique<Paraformer>();
            use_gpu = false;
#endif
        }else{
            // A SenseVoiceSmall model is detected purely by its directory
            // name containing MODEL_SVS.
            if (model_path.at(MODEL_DIR).find(MODEL_SVS) != std::string::npos)
            {
                asr_handle = make_unique<SenseVoiceSmall>();
                model_type = MODEL_SVS;
            }else{
                asr_handle = make_unique<Paraformer>();
            }
        }

        // Hotword support is enabled by the mere presence of the embedding
        // model files. NOTE(review): enable_hotword is set but never read
        // again in this constructor.
        bool enable_hotword = false;
        hw_cpu_model_path = PathAppend(model_path.at(MODEL_DIR), MODEL_EB_NAME);
        hw_gpu_model_path = PathAppend(model_path.at(MODEL_DIR), TORCH_MODEL_EB_NAME);
        seg_dict_path = PathAppend(model_path.at(MODEL_DIR), MODEL_SEG_DICT);
        if (access(hw_cpu_model_path.c_str(), F_OK) == 0) { // if model_eb.onnx exist, hotword enabled
            enable_hotword = true;
            asr_handle->InitHwCompiler(hw_cpu_model_path, thread_num);
            asr_handle->InitSegDict(seg_dict_path);
        }
        if (use_gpu && access(hw_gpu_model_path.c_str(), F_OK) == 0) { // if model_eb.torchscript exist, hotword enabled
            enable_hotword = true;
            asr_handle->InitHwCompiler(hw_gpu_model_path, thread_num);
            asr_handle->InitSegDict(seg_dict_path);
        }

        // Resolve the AM model file: default -> quantized -> torch/blade,
        // later checks overriding earlier ones.
        am_model_path = PathAppend(model_path.at(MODEL_DIR), MODEL_NAME);
        if(model_path.find(QUANTIZE) != model_path.end() && model_path.at(QUANTIZE) == "true"){
            am_model_path = PathAppend(model_path.at(MODEL_DIR), QUANT_MODEL_NAME);
        }
        if(use_gpu){
            am_model_path = PathAppend(model_path.at(MODEL_DIR), TORCH_MODEL_NAME);
            if(model_path.find(BLADEDISC) != model_path.end() && model_path.at(BLADEDISC) == "true"){
                am_model_path = PathAppend(model_path.at(MODEL_DIR), BLADE_MODEL_NAME);
            }
        }

        am_cmvn_path = PathAppend(model_path.at(MODEL_DIR), AM_CMVN_NAME);
        am_config_path = PathAppend(model_path.at(MODEL_DIR), AM_CONFIG_NAME);
        token_path = PathAppend(model_path.at(MODEL_DIR), TOKEN_PATH);

        asr_handle->InitAsr(am_model_path, am_cmvn_path, am_config_path, token_path, thread_num);
    }

    // Lm resource
    if (model_path.find(LM_DIR) != model_path.end() && model_path.at(LM_DIR) != "") {
        string fst_path, lm_config_path, lex_path;
        fst_path = PathAppend(model_path.at(LM_DIR), LM_FST_RES);
        lm_config_path = PathAppend(model_path.at(LM_DIR), LM_CONFIG_NAME);
        lex_path = PathAppend(model_path.at(LM_DIR), LEX_PATH);
        // Only the lexicon is existence-checked here; the FST/config paths
        // are handed to InitLm as-is.
        if (access(lex_path.c_str(), F_OK) != 0 )
        {
            LOG(ERROR) << "Lexicon.txt file is not exist, please use the latest version. Skip load LM model.";
        }else{
            asr_handle->InitLm(fst_path, lm_config_path, lex_path);
        }
    }

    // PUNC model
    if(model_path.find(PUNC_DIR) != model_path.end()){
        string punc_model_path;
        string punc_config_path;
        string token_path;  // shadows nothing here: AM's token_path is scoped to its block

        punc_model_path = PathAppend(model_path.at(PUNC_DIR), MODEL_NAME);
        if(model_path.find(PUNC_QUANT) != model_path.end() && model_path.at(PUNC_QUANT) == "true"){
            punc_model_path = PathAppend(model_path.at(PUNC_DIR), QUANT_MODEL_NAME);
        }
        punc_config_path = PathAppend(model_path.at(PUNC_DIR), PUNC_CONFIG_NAME);
        token_path = PathAppend(model_path.at(PUNC_DIR), TOKEN_PATH);

        if (access(punc_model_path.c_str(), F_OK) != 0 ||
            access(punc_config_path.c_str(), F_OK) != 0 ||
            access(token_path.c_str(), F_OK) != 0)
        {
            LOG(INFO) << "PUNC model file is not exist, skip load punc model.";
        }else{
            punc_handle = make_unique<CTTransformer>();
            punc_handle->InitPunc(punc_model_path, punc_config_path, token_path, thread_num);
            use_punc = true;
        }
    }
#if !defined(__APPLE__)
    // Optional: ITN, here we just support language_type=MandarinEnglish
    // (ITN sources are excluded from the Apple build; see CMakeLists.txt.)
    if(model_path.find(ITN_DIR) != model_path.end() && model_path.at(ITN_DIR) != ""){
        string itn_tagger_path = PathAppend(model_path.at(ITN_DIR), ITN_TAGGER_NAME);
        string itn_verbalizer_path = PathAppend(model_path.at(ITN_DIR), ITN_VERBALIZER_NAME);

        if (access(itn_tagger_path.c_str(), F_OK) != 0 ||
            access(itn_verbalizer_path.c_str(), F_OK) != 0 )
        {
            LOG(INFO) << "ITN model file is not exist, skip load ITN model.";
        }else{
            itn_handle = make_unique<ITNProcessor>();
            itn_handle->InitITN(itn_tagger_path, itn_verbalizer_path, thread_num);
            use_itn = true;
        }
    }
#endif
    // SenseVoiceSmall handles punctuation/ITN itself, so force both off
    // even if those models were loaded above.
    if(model_type == MODEL_SVS){
        use_itn = false;
        use_punc = false;
    }
}
|
||||
|
||||
/**
 * Factory wrapper around the OfflineStream constructor; the caller owns
 * the returned pointer. See OfflineStream::OfflineStream for parameter
 * semantics.
 */
OfflineStream *CreateOfflineStream(std::map<std::string, std::string>& model_path, int thread_num, bool use_gpu, int batch_size)
{
    return new OfflineStream(model_path, thread_num, use_gpu, batch_size);
}
|
||||
|
||||
} // namespace funasr
|
||||
560
modules/python/vendors/FunASR/runtime/onnxruntime/src/paraformer-online.cpp
vendored
Normal file
560
modules/python/vendors/FunASR/runtime/onnxruntime/src/paraformer-online.cpp
vendored
Normal file
@@ -0,0 +1,560 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#include "precomp.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace funasr {
|
||||
|
||||
/**
 * Build a streaming recognizer on top of an already-initialized offline
 * Paraformer. The online model shares (does not own — see the empty
 * destructor) the offline handle's ONNX sessions, I/O names, CMVN lists
 * and fbank options.
 *
 * Fixes vs. the original: `std::move` on the raw pointer was a no-op and
 * misleading, so it is removed; the `chunk_size` vector is now moved into
 * the member instead of copied.
 *
 * @param para_handle  initialized offline Paraformer; must outlive this object
 * @param chunk_size   3-element chunk layout; [0]/[2] are the cached
 *                     left/right context sizes, [1] the main chunk (frames)
 */
ParaformerOnline::ParaformerOnline(Paraformer* para_handle, std::vector<int> chunk_size)
:para_handle_(para_handle), chunk_size(std::move(chunk_size)), session_options_{}{
    InitOnline(
        para_handle_->fbank_opts_,
        para_handle_->encoder_session_,
        para_handle_->decoder_session_,
        para_handle_->en_szInputNames_,
        para_handle_->en_szOutputNames_,
        para_handle_->de_szInputNames_,
        para_handle_->de_szOutputNames_,
        para_handle_->means_list_,
        para_handle_->vars_list_);
    InitCache();
}
|
||||
|
||||
/**
 * Copy the shared inference state (sessions, tensor I/O names, fbank
 * options, CMVN statistics) and the model hyper-parameters from the
 * offline Paraformer into this streaming instance, then derive the
 * streaming-only constants (chunk length in samples, FSMN zero cache,
 * frame geometry).
 */
void ParaformerOnline::InitOnline(
    knf::FbankOptions &fbank_opts,
    std::shared_ptr<Ort::Session> &encoder_session,
    std::shared_ptr<Ort::Session> &decoder_session,
    vector<const char*> &en_szInputNames,
    vector<const char*> &en_szOutputNames,
    vector<const char*> &de_szInputNames,
    vector<const char*> &de_szOutputNames,
    vector<float> &means_list,
    vector<float> &vars_list){
    fbank_opts_ = fbank_opts;
    // shared_ptr copies: the ONNX sessions stay owned jointly with the
    // offline handle.
    encoder_session_ = encoder_session;
    decoder_session_ = decoder_session;
    en_szInputNames_ = en_szInputNames;
    en_szOutputNames_ = en_szOutputNames;
    de_szInputNames_ = de_szInputNames;
    de_szOutputNames_ = de_szOutputNames;
    means_list_ = means_list;
    vars_list_ = vars_list;

    // Mirror the offline model's hyper-parameters.
    frame_length = para_handle_->frame_length;
    frame_shift = para_handle_->frame_shift;
    n_mels = para_handle_->n_mels;
    lfr_m = para_handle_->lfr_m;
    lfr_n = para_handle_->lfr_n;
    encoder_size = para_handle_->encoder_size;
    fsmn_layers = para_handle_->fsmn_layers;
    fsmn_lorder = para_handle_->fsmn_lorder;
    fsmn_dims = para_handle_->fsmn_dims;
    cif_threshold = para_handle_->cif_threshold;
    tail_alphas = para_handle_->tail_alphas;

    // other vars
    // Encoder outputs are scaled by sqrt(d_model) before adding position
    // embeddings (see Forward()).
    sqrt_factor = std::sqrt(encoder_size);
    // Flat zero buffer backing the initial FSMN cache tensors (see InitCache).
    for(int i=0; i<fsmn_lorder*fsmn_dims; i++){
        fsmn_init_cache_.emplace_back(0);
    }
    // Main chunk length converted from LFR frames to raw samples.
    chunk_len = chunk_size[1]*frame_shift*lfr_n*para_handle_->asr_sample_rate/1000;

    // Frame window / hop in samples (frame_length and frame_shift are in ms).
    frame_sample_length_ = para_handle_->asr_sample_rate / 1000 * frame_length;
    frame_shift_sample_length_ = para_handle_->asr_sample_rate / 1000 * frame_shift;

}
|
||||
|
||||
/**
 * Compute kaldi-style fbank features for a streaming chunk.
 * Leftover samples that do not fill a whole frame are carried over in
 * input_cache_ and prepended to the next call. On return, `waves` is
 * trimmed to exactly the samples consumed by the emitted frames, and the
 * frames are appended to `wav_feats`.
 */
void ParaformerOnline::FbankKaldi(float sample_rate, std::vector<std::vector<float>> &wav_feats,
                                  std::vector<float> &waves) {
    knf::OnlineFbank fbank(fbank_opts_);
    // cache merge
    waves.insert(waves.begin(), input_cache_.begin(), input_cache_.end());
    int frame_number = ComputeFrameNum(waves.size(), frame_sample_length_, frame_shift_sample_length_);
    // Send the audio after the last frame shift position to the cache
    input_cache_.clear();
    input_cache_.insert(input_cache_.begin(), waves.begin() + frame_number * frame_shift_sample_length_, waves.end());
    if (frame_number == 0) {
        // Not enough samples for a single frame; everything is cached.
        return;
    }
    // Delete audio that haven't undergone fbank processing
    waves.erase(waves.begin() + (frame_number - 1) * frame_shift_sample_length_ + frame_sample_length_, waves.end());

    // Rescale to 16-bit PCM range before feature extraction
    // (NOTE(review): assumes input samples are normalized to [-1, 1] — confirm upstream).
    std::vector<float> buf(waves.size());
    for (int32_t i = 0; i != waves.size(); ++i) {
        buf[i] = waves[i] * 32768;
    }
    fbank.AcceptWaveform(sample_rate, buf.data(), buf.size());
    int32_t frames = fbank.NumFramesReady();
    for (int32_t i = 0; i != frames; ++i) {
        const float *frame = fbank.GetFrame(i);
        vector<float> frame_vector(frame, frame + fbank_opts_.mel_opts.num_bins);
        wav_feats.emplace_back(frame_vector);
    }
}
|
||||
|
||||
/**
 * Full streaming front-end for one chunk: fbank extraction (FbankKaldi)
 * followed by online LFR stacking and CMVN (OnlineLfrCmvn), with waveform
 * and frame carry-over between calls via reserve_waveforms_ and
 * lfr_splice_cache_. When `input_finished` is true, the cached remainder
 * is flushed and the front-end caches are reset.
 */
void ParaformerOnline::ExtractFeats(float sample_rate, vector<std::vector<float>> &wav_feats,
                                    vector<float> &waves, bool input_finished) {
    FbankKaldi(sample_rate, wav_feats, waves);
    // cache deal & online lfr,cmvn
    if (wav_feats.size() > 0) {
        if (!reserve_waveforms_.empty()) {
            // Re-attach waveform samples held back from the previous call.
            waves.insert(waves.begin(), reserve_waveforms_.begin(), reserve_waveforms_.end());
        }
        if (lfr_splice_cache_.empty()) {
            // First chunk of the utterance: left-pad the LFR window by
            // repeating the first frame (lfr_m - 1) / 2 times.
            for (int i = 0; i < (lfr_m - 1) / 2; i++) {
                lfr_splice_cache_.emplace_back(wav_feats[0]);
            }
        }
        if (wav_feats.size() + lfr_splice_cache_.size() >= lfr_m) {
            // Enough frames for at least one LFR output: splice cache + new
            // frames, run LFR/CMVN, then hold back the waveform samples that
            // correspond to the frames re-cached by OnlineLfrCmvn.
            wav_feats.insert(wav_feats.begin(), lfr_splice_cache_.begin(), lfr_splice_cache_.end());
            int frame_from_waves = (waves.size() - frame_sample_length_) / frame_shift_sample_length_ + 1;
            int minus_frame = reserve_waveforms_.empty() ? (lfr_m - 1) / 2 : 0;
            int lfr_splice_frame_idxs = OnlineLfrCmvn(wav_feats, input_finished);
            int reserve_frame_idx = std::abs(lfr_splice_frame_idxs - minus_frame);
            reserve_waveforms_.clear();
            reserve_waveforms_.insert(reserve_waveforms_.begin(),
                                      waves.begin() + reserve_frame_idx * frame_shift_sample_length_,
                                      waves.begin() + frame_from_waves * frame_shift_sample_length_);
            int sample_length = (frame_from_waves - 1) * frame_shift_sample_length_ + frame_sample_length_;
            waves.erase(waves.begin() + sample_length, waves.end());
        } else {
            // Not enough frames yet: reserve all but one hop of audio and
            // accumulate the frames into the splice cache for the next call.
            reserve_waveforms_.clear();
            reserve_waveforms_.insert(reserve_waveforms_.begin(),
                                      waves.begin() + frame_sample_length_ - frame_shift_sample_length_, waves.end());
            lfr_splice_cache_.insert(lfr_splice_cache_.end(), wav_feats.begin(), wav_feats.end());
        }
    } else {
        // No new frames from this chunk; on the final call flush whatever
        // frames remain in the splice cache.
        if (input_finished) {
            if (!reserve_waveforms_.empty()) {
                waves = reserve_waveforms_;
            }
            wav_feats = lfr_splice_cache_;
            if(wav_feats.size() == 0){
                LOG(ERROR) << "wav_feats's size is 0";
            }else{
                OnlineLfrCmvn(wav_feats, input_finished);
            }
        }
    }
    if(input_finished){
        ResetCache();
    }
}
|
||||
|
||||
/**
 * Low Frame Rate stacking + CMVN for a streaming chunk: every lfr_n-th
 * frame is concatenated with the following lfr_m-1 frames into one output
 * vector, then normalized. Frames that cannot yet form a complete LFR
 * window are re-cached in lfr_splice_cache_ (on the final chunk they are
 * padded by repeating the last frame instead). `wav_feats` is replaced
 * by the stacked+normalized features.
 *
 * @return index (in input frames) from which frames were re-cached; the
 *         caller uses it to hold back the matching waveform samples.
 */
int ParaformerOnline::OnlineLfrCmvn(vector<vector<float>> &wav_feats, bool input_finished) {
    vector<vector<float>> out_feats;
    int T = wav_feats.size();
    // Number of LFR outputs producible from T frames.
    int T_lrf = ceil((T - (lfr_m - 1) / 2) / (float)lfr_n);
    int lfr_splice_frame_idxs = T_lrf;
    vector<float> p;
    for (int i = 0; i < T_lrf; i++) {
        if (lfr_m <= T - i * lfr_n) {
            // Full window available: concatenate lfr_m consecutive frames.
            for (int j = 0; j < lfr_m; j++) {
                p.insert(p.end(), wav_feats[i * lfr_n + j].begin(), wav_feats[i * lfr_n + j].end());
            }
            out_feats.emplace_back(p);
            p.clear();
        } else {
            if (input_finished) {
                // Final chunk: pad the tail window by repeating the last frame.
                int num_padding = lfr_m - (T - i * lfr_n);
                for (int j = 0; j < (wav_feats.size() - i * lfr_n); j++) {
                    p.insert(p.end(), wav_feats[i * lfr_n + j].begin(), wav_feats[i * lfr_n + j].end());
                }
                for (int j = 0; j < num_padding; j++) {
                    p.insert(p.end(), wav_feats[wav_feats.size() - 1].begin(), wav_feats[wav_feats.size() - 1].end());
                }
                out_feats.emplace_back(p);
                p.clear();
            } else {
                // Mid-stream: stop here and re-cache the incomplete tail.
                lfr_splice_frame_idxs = i;
                break;
            }
        }
    }
    // Re-cache input frames from the first incomplete window onward.
    lfr_splice_frame_idxs = std::min(T - 1, lfr_splice_frame_idxs * lfr_n);
    lfr_splice_cache_.clear();
    lfr_splice_cache_.insert(lfr_splice_cache_.begin(), wav_feats.begin() + lfr_splice_frame_idxs, wav_feats.end());

    // Apply cmvn
    // NOTE(review): the add-then-multiply form implies means_list_ holds
    // negated means and vars_list_ inverse stddevs as loaded from the cmvn
    // file — confirm against the loader.
    for (auto &out_feat: out_feats) {
        for (int j = 0; j < means_list_.size(); j++) {
            out_feat[j] = (out_feat[j] + means_list_[j]) * vars_list_[j];
        }
    }
    wav_feats = out_feats;
    return lfr_splice_frame_idxs;
}
|
||||
|
||||
void ParaformerOnline::GetPosEmb(std::vector<std::vector<float>> &wav_feats, int timesteps, int feat_dim)
|
||||
{
|
||||
int start_idx = start_idx_cache_;
|
||||
start_idx_cache_ += timesteps;
|
||||
int mm = start_idx_cache_;
|
||||
|
||||
int i;
|
||||
float scale = -0.0330119726594128;
|
||||
|
||||
std::vector<float> tmp(mm*feat_dim);
|
||||
|
||||
for (i = 0; i < feat_dim/2; i++) {
|
||||
float tmptime = exp(i * scale);
|
||||
int j;
|
||||
for (j = 0; j < mm; j++) {
|
||||
int sin_idx = j * feat_dim + i;
|
||||
int cos_idx = j * feat_dim + i + feat_dim/2;
|
||||
float coe = tmptime * (j + 1);
|
||||
tmp[sin_idx] = sin(coe);
|
||||
tmp[cos_idx] = cos(coe);
|
||||
}
|
||||
}
|
||||
|
||||
for (i = start_idx; i < start_idx + timesteps; i++) {
|
||||
for (int j = 0; j < feat_dim; j++) {
|
||||
wav_feats[i-start_idx][j] += tmp[i*feat_dim+j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Continuous Integrate-and-Fire (CIF) over the encoder output: accumulate
 * the per-frame weights `alphas` and emit one weighted-sum acoustic
 * embedding into `list_frame` each time the accumulator crosses
 * cif_threshold. The partially-integrated remainder is carried to the next
 * chunk via hidden_cache_ / alphas_cache_. The chunk's left-context
 * (chunk_size[0]) and right-context (beyond sum(chunk_size[0..1])) weights
 * are zeroed so overlapped frames are not double-counted.
 *
 * NOTE(review): the `is_final` parameter is unused — the member
 * is_last_chunk is consulted instead; confirm that is intentional.
 * `hidden` and `alphas` are taken by value deliberately: they are mutated
 * locally after the caches are spliced in.
 */
void ParaformerOnline::CifSearch(std::vector<std::vector<float>> hidden, std::vector<float> alphas, bool is_final, std::vector<std::vector<float>>& list_frame)
{
    try{
        int hidden_size = 0;
        if(hidden.size() > 0){
            hidden_size = hidden[0].size();
        }
        // cache
        int i,j;
        // Zero the weights of the left-context frames...
        int chunk_size_pre = chunk_size[0];
        for (i = 0; i < chunk_size_pre; i++)
            alphas[i] = 0.0;

        // ...and of the right-context frames.
        int chunk_size_suf = std::accumulate(chunk_size.begin(), chunk_size.end()-1, 0);
        for (i = chunk_size_suf; i < alphas.size(); i++){
            alphas[i] = 0.0;
        }

        // Prepend the remainder carried over from the previous chunk.
        if(hidden_cache_.size()>0){
            hidden.insert(hidden.begin(), hidden_cache_.begin(), hidden_cache_.end());
            alphas.insert(alphas.begin(), alphas_cache_.begin(), alphas_cache_.end());
            hidden_cache_.clear();
            alphas_cache_.clear();
        }

        // On the last chunk, append a zero frame with weight tail_alphas to
        // force the final partial accumulation to fire.
        if (is_last_chunk) {
            std::vector<float> tail_hidden(hidden_size, 0);
            hidden.emplace_back(tail_hidden);
            alphas.emplace_back(tail_alphas);
        }

        float intergrate = 0.0;
        int len_time = alphas.size();
        std::vector<float> frames(hidden_size, 0);
        std::vector<float> list_fire;

        for (i = 0; i < len_time; i++) {
            float alpha = alphas[i];
            if (alpha + intergrate < cif_threshold) {
                // Keep integrating: weighted sum of hidden states.
                intergrate += alpha;
                list_fire.emplace_back(intergrate);
                for (j = 0; j < hidden_size; j++) {
                    frames[j] += alpha * hidden[i][j];
                }
            } else {
                // Threshold crossed: spend only the portion of alpha needed
                // to reach the threshold, emit the frame, and start the next
                // accumulation with the leftover weight.
                for (j = 0; j < hidden_size; j++) {
                    frames[j] += (cif_threshold - intergrate) * hidden[i][j];
                }
                std::vector<float> frames_cp(frames);
                list_frame.emplace_back(frames_cp);
                intergrate += alpha;
                list_fire.emplace_back(intergrate);
                intergrate -= cif_threshold;
                for (j = 0; j < hidden_size; j++) {
                    frames[j] = intergrate * hidden[i][j];
                }
            }
        }

        // cache
        // Carry the unfired remainder to the next chunk. The hidden cache is
        // un-weighted (divided by the accumulated alpha) so it can be
        // re-weighted when spliced in above.
        alphas_cache_.emplace_back(intergrate);
        if (intergrate > 0.0) {
            std::vector<float> hidden_cache(hidden_size, 0);
            for (i = 0; i < hidden_size; i++) {
                hidden_cache[i] = frames[i] / intergrate;
            }
            hidden_cache_.emplace_back(hidden_cache);
        } else {
            std::vector<float> frames_cp(frames);
            hidden_cache_.emplace_back(frames_cp);
        }
    }catch (std::exception const &e)
    {
        LOG(ERROR)<<e.what();
    }
}
|
||||
|
||||
void ParaformerOnline::InitCache(){
|
||||
|
||||
start_idx_cache_ = 0;
|
||||
is_first_chunk = true;
|
||||
is_last_chunk = false;
|
||||
hidden_cache_.clear();
|
||||
alphas_cache_.clear();
|
||||
feats_cache_.clear();
|
||||
decoder_onnx.clear();
|
||||
|
||||
// cif cache
|
||||
std::vector<float> hidden_cache(encoder_size, 0);
|
||||
hidden_cache_.emplace_back(hidden_cache);
|
||||
alphas_cache_.emplace_back(0);
|
||||
|
||||
// feats
|
||||
std::vector<float> feat_cache(feat_dims, 0);
|
||||
for(int i=0; i<(chunk_size[0]+chunk_size[2]); i++){
|
||||
feats_cache_.emplace_back(feat_cache);
|
||||
}
|
||||
|
||||
// fsmn cache
|
||||
#ifdef _WIN_X86
|
||||
Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
|
||||
#else
|
||||
Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
|
||||
#endif
|
||||
const int64_t fsmn_shape_[3] = {1, fsmn_dims, fsmn_lorder};
|
||||
for(int l=0; l<fsmn_layers; l++){
|
||||
Ort::Value onnx_fsmn_cache = Ort::Value::CreateTensor<float>(
|
||||
m_memoryInfo,
|
||||
fsmn_init_cache_.data(),
|
||||
fsmn_init_cache_.size(),
|
||||
fsmn_shape_,
|
||||
3);
|
||||
decoder_onnx.emplace_back(std::move(onnx_fsmn_cache));
|
||||
}
|
||||
};
|
||||
|
||||
// Public reset entry point: re-initializes all per-utterance streaming
// state (delegates to InitCache).
void ParaformerOnline::Reset()
{
    InitCache();
}
|
||||
|
||||
// Clear only the front-end (feature extraction) caches: reserved waveform
// samples, raw-sample carry-over, and the LFR splice frames. The CIF/FSMN
// decoder caches are reset separately via Reset()/InitCache().
void ParaformerOnline::ResetCache() {
    reserve_waveforms_.clear();
    input_cache_.clear();
    lfr_splice_cache_.clear();
}
|
||||
|
||||
void ParaformerOnline::AddOverlapChunk(std::vector<std::vector<float>> &wav_feats, bool input_finished){
|
||||
wav_feats.insert(wav_feats.begin(), feats_cache_.begin(), feats_cache_.end());
|
||||
if(input_finished){
|
||||
feats_cache_.clear();
|
||||
feats_cache_.insert(feats_cache_.begin(), wav_feats.end()-chunk_size[0], wav_feats.end());
|
||||
if(!is_last_chunk){
|
||||
int padding_length = std::accumulate(chunk_size.begin(), chunk_size.end(), 0) - wav_feats.size();
|
||||
std::vector<float> tmp(feat_dims, 0);
|
||||
for(int i=0; i<padding_length; i++){
|
||||
wav_feats.emplace_back(feat_dims);
|
||||
}
|
||||
}
|
||||
}else{
|
||||
feats_cache_.clear();
|
||||
feats_cache_.insert(feats_cache_.begin(), wav_feats.end()-chunk_size[0]-chunk_size[2], wav_feats.end());
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Run one prepared chunk through encoder -> CIF -> decoder and return the
 * greedy-search transcription for it (empty string when CIF fires no
 * frames or on error). The per-layer FSMN caches ride along in
 * decoder_onnx: decoder inputs are [enc, enc_lens, acoustic_embeds,
 * acoustic_embeds_len, fsmn_cache_0..N-1], and the decoder's output caches
 * (outputs 2..2+N-1) replace them for the next call.
 */
string ParaformerOnline::ForwardChunk(std::vector<std::vector<float>> &chunk_feats, bool input_finished)
{
    string result;
    try{
        int32_t num_frames = chunk_feats.size();

#ifdef _WIN_X86
        Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
#else
        Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
#endif
        // Flatten [num_frames][feat_dims] into one contiguous buffer for the
        // encoder input tensor (CreateTensor does not copy; wav_feats must
        // outlive the Run call).
        const int64_t input_shape_[3] = {1, num_frames, feat_dims};
        std::vector<float> wav_feats;
        for (const auto &chunk_feat: chunk_feats) {
            wav_feats.insert(wav_feats.end(), chunk_feat.begin(), chunk_feat.end());
        }
        Ort::Value onnx_feats = Ort::Value::CreateTensor<float>(
            m_memoryInfo,
            wav_feats.data(),
            wav_feats.size(),
            input_shape_,
            3);

        const int64_t paraformer_length_shape[1] = {1};
        std::vector<int32_t> paraformer_length;
        paraformer_length.emplace_back(num_frames);
        Ort::Value onnx_feats_len = Ort::Value::CreateTensor<int32_t>(
            m_memoryInfo, paraformer_length.data(), paraformer_length.size(), paraformer_length_shape, 1);

        std::vector<Ort::Value> input_onnx;
        input_onnx.emplace_back(std::move(onnx_feats));
        input_onnx.emplace_back(std::move(onnx_feats_len));

        auto encoder_tensor = encoder_session_->Run(Ort::RunOptions{nullptr}, en_szInputNames_.data(), input_onnx.data(), input_onnx.size(), en_szOutputNames_.data(), en_szOutputNames_.size());

        // get enc_vec
        // Encoder output 0: hidden states [1, T, encoder_size].
        std::vector<int64_t> enc_shape = encoder_tensor[0].GetTensorTypeAndShapeInfo().GetShape();
        float* enc_data = encoder_tensor[0].GetTensorMutableData<float>();
        std::vector<std::vector<float>> enc_vec(enc_shape[1], std::vector<float>(enc_shape[2]));
        for (int i = 0; i < enc_shape[1]; i++) {
            for (int j = 0; j < enc_shape[2]; j++) {
                enc_vec[i][j] = enc_data[i * enc_shape[2] + j];
            }
        }

        // get alpha_vec
        // Encoder output 2: per-frame CIF weights [1, T].
        std::vector<int64_t> alpha_shape = encoder_tensor[2].GetTensorTypeAndShapeInfo().GetShape();
        float* alpha_data = encoder_tensor[2].GetTensorMutableData<float>();
        std::vector<float> alpha_vec(alpha_shape[1]);
        for (int i = 0; i < alpha_shape[1]; i++) {
            alpha_vec[i] = alpha_data[i];
        }

        std::vector<std::vector<float>> list_frame;
        CifSearch(enc_vec, alpha_vec, input_finished, list_frame);


        if(list_frame.size()>0){
            // Assemble decoder inputs in front of the FSMN cache tensors
            // already held in decoder_onnx.
            // enc
            decoder_onnx.insert(decoder_onnx.begin(), std::move(encoder_tensor[0]));
            // enc_lens
            decoder_onnx.insert(decoder_onnx.begin()+1, std::move(encoder_tensor[1]));

            // acoustic_embeds
            const int64_t emb_shape_[3] = {1, (int64_t)list_frame.size(), (int64_t)list_frame[0].size()};
            std::vector<float> emb_input;
            for (const auto &list_frame_: list_frame) {
                emb_input.insert(emb_input.end(), list_frame_.begin(), list_frame_.end());
            }
            Ort::Value onnx_emb = Ort::Value::CreateTensor<float>(
                m_memoryInfo,
                emb_input.data(),
                emb_input.size(),
                emb_shape_,
                3);
            decoder_onnx.insert(decoder_onnx.begin()+2, std::move(onnx_emb));

            // acoustic_embeds_len
            const int64_t emb_length_shape[1] = {1};
            std::vector<int32_t> emb_length;
            emb_length.emplace_back(list_frame.size());
            Ort::Value onnx_emb_len = Ort::Value::CreateTensor<int32_t>(
                m_memoryInfo, emb_length.data(), emb_length.size(), emb_length_shape, 1);
            decoder_onnx.insert(decoder_onnx.begin()+3, std::move(onnx_emb_len));

            auto decoder_tensor = decoder_session_->Run(Ort::RunOptions{nullptr}, de_szInputNames_.data(), decoder_onnx.data(), decoder_onnx.size(), de_szOutputNames_.data(), de_szOutputNames_.size());
            // fsmn cache
            // Replace decoder_onnx with the updated per-layer caches
            // (decoder outputs 2..2+fsmn_layers-1) for the next chunk.
            try{
                decoder_onnx.clear();
            }catch (std::exception const &e)
            {
                LOG(ERROR)<<e.what();
                return result;
            }
            for(int l=0;l<fsmn_layers;l++){
                decoder_onnx.emplace_back(std::move(decoder_tensor[2+l]));
            }

            // Decoder output 0: token logits; greedy-decode only the frames
            // CIF actually emitted.
            std::vector<int64_t> decoder_shape = decoder_tensor[0].GetTensorTypeAndShapeInfo().GetShape();
            float* float_data = decoder_tensor[0].GetTensorMutableData<float>();
            result = para_handle_->GreedySearch(float_data, list_frame.size(), decoder_shape[2]);
        }
    }catch (std::exception const &e)
    {
        LOG(ERROR)<<e.what();
        return result;
    }
    return result;
}
|
||||
|
||||
/**
 * Streaming entry point: feed `len` samples from `din`, get back the text
 * recognized so far for this call (possibly empty). Set `input_finished`
 * on the last call of the utterance to flush all caches; all streaming
 * state is reset afterwards.
 *
 * NOTE(review): `hw_emb` (hotword embeddings) and `wfst_decoder` are
 * accepted but unused in this streaming implementation — presumably kept
 * for interface parity with the offline model; confirm.
 */
string ParaformerOnline::Forward(float* din, int len, bool input_finished, const std::vector<std::vector<float>> &hw_emb, void* wfst_decoder)
{
    std::vector<std::vector<float>> wav_feats;
    std::vector<float> waves(din, din+len);

    string result="";
    try{
        // Final call with a tiny residue (< 960 samples, i.e. 60 ms at
        // 16 kHz): skip feature extraction and just flush the cached
        // overlap frames as the last chunk.
        if(len <16*60 && input_finished && !is_first_chunk){
            is_last_chunk = true;
            wav_feats = feats_cache_;
            result = ForwardChunk(wav_feats, is_last_chunk);
            // reset
            ResetCache();
            Reset();
            return result;
        }
        if(is_first_chunk){
            is_first_chunk = false;
        }
        ExtractFeats(para_handle_->asr_sample_rate, wav_feats, waves, input_finished);
        if(wav_feats.size() == 0){
            // Not enough audio for a frame yet; everything stayed cached.
            return result;
        }

        // Scale by sqrt(encoder_size) before adding position embeddings.
        for (auto& row : wav_feats) {
            for (auto& val : row) {
                val *= sqrt_factor;
            }
        }

        GetPosEmb(wav_feats, wav_feats.size(), wav_feats[0].size());
        if(input_finished){
            if(wav_feats.size()+chunk_size[2] <= chunk_size[1]){
                // Everything fits in one padded chunk.
                is_last_chunk = true;
                AddOverlapChunk(wav_feats, input_finished);
            }else{
                // Too long for one chunk: run a normal chunk first, then a
                // final flush chunk over the overlapping tail.
                // first chunk
                std::vector<std::vector<float>> first_chunk;
                first_chunk.insert(first_chunk.begin(), wav_feats.begin(), wav_feats.end());
                AddOverlapChunk(first_chunk, input_finished);
                string str_first_chunk = ForwardChunk(first_chunk, is_last_chunk);

                // last chunk
                is_last_chunk = true;
                std::vector<std::vector<float>> last_chunk;
                last_chunk.insert(last_chunk.begin(), wav_feats.end()-(wav_feats.size()+chunk_size[2]-chunk_size[1]), wav_feats.end());
                AddOverlapChunk(last_chunk, input_finished);
                string str_last_chunk = ForwardChunk(last_chunk, is_last_chunk);

                result = str_first_chunk+str_last_chunk;
                // reset
                ResetCache();
                Reset();
                return result;
            }
        }else{
            AddOverlapChunk(wav_feats, input_finished);
        }

        result = ForwardChunk(wav_feats, is_last_chunk);
        if(input_finished){
            // reset
            ResetCache();
            Reset();
        }
    }catch (std::exception const &e)
    {
        LOG(ERROR)<<e.what();
        return result;
    }

    return result;
}
|
||||
|
||||
// Intentionally empty: all members are RAII-managed, and para_handle_ is
// borrowed from the caller (never deleted here).
ParaformerOnline::~ParaformerOnline()
{
}
|
||||
|
||||
// Second-pass rescoring is not implemented for the streaming model;
// logs an error and always returns an empty string.
string ParaformerOnline::Rescoring()
{
    LOG(ERROR)<<"Not Imp!!!!!!";
    return "";
}
|
||||
} // namespace funasr
|
||||
122
modules/python/vendors/FunASR/runtime/onnxruntime/src/paraformer-online.h
vendored
Normal file
122
modules/python/vendors/FunASR/runtime/onnxruntime/src/paraformer-online.h
vendored
Normal file
@@ -0,0 +1,122 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
|
||||
class ParaformerOnline : public Model {
/**
 * Author: Speech Lab of DAMO Academy, Alibaba Group
 * ParaformerOnline: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
 * https://arxiv.org/pdf/2206.08317.pdf
 *
 * Streaming (chunked) front-end + encoder/decoder driver built on top of an
 * offline Paraformer instance (sessions, CMVN and vocab are borrowed from it).
 */
private:

    // Compute fbank features for one audio chunk (appends to wav_feats).
    void FbankKaldi(float sample_rate, std::vector<std::vector<float>> &wav_feats,
                    std::vector<float> &waves);
    // Low-frame-rate stacking + CMVN with streaming splice cache.
    int OnlineLfrCmvn(vector<vector<float>> &wav_feats, bool input_finished);
    // Add sinusoidal position embeddings in place (position resumes from start_idx_cache_).
    void GetPosEmb(std::vector<std::vector<float>> &wav_feats, int timesteps, int feat_dim);
    // CIF (continuous integrate-and-fire) token-boundary search over encoder output.
    void CifSearch(std::vector<std::vector<float>> hidden, std::vector<float> alphas, bool is_final, std::vector<std::vector<float>> &list_frame);

    // Number of whole frames obtainable from sample_length samples; 0 when the
    // buffer is shorter than one frame.
    static int ComputeFrameNum(int sample_length, int frame_sample_length, int frame_shift_sample_length) {
        int frame_num = static_cast<int>((sample_length - frame_sample_length) / frame_shift_sample_length + 1);
        if (frame_num >= 1 && sample_length >= frame_sample_length)
            return frame_num;
        else
            return 0;
    }
    // Copy the shared resources (fbank options, ONNX sessions, IO names, CMVN
    // statistics) out of the offline handle into this streaming instance.
    void InitOnline(
        knf::FbankOptions &fbank_opts,
        std::shared_ptr<Ort::Session> &encoder_session,
        std::shared_ptr<Ort::Session> &decoder_session,
        vector<const char*> &en_szInputNames,
        vector<const char*> &en_szOutputNames,
        vector<const char*> &de_szInputNames,
        vector<const char*> &de_szOutputNames,
        vector<float> &means_list,
        vector<float> &vars_list);

    // Intentionally empty utterance hooks.
    void StartUtterance()
    {
    }

    void EndUtterance()
    {
    }

    // Non-owning pointer to the offline model whose sessions/config we reuse.
    Paraformer* para_handle_ = nullptr;
    // from para_handle_
    knf::FbankOptions fbank_opts_;
    std::shared_ptr<Ort::Session> encoder_session_ = nullptr;
    std::shared_ptr<Ort::Session> decoder_session_ = nullptr;
    Ort::SessionOptions session_options_;
    vector<const char*> en_szInputNames_;
    vector<const char*> en_szOutputNames_;
    vector<const char*> de_szInputNames_;
    vector<const char*> de_szOutputNames_;
    vector<float> means_list_;
    vector<float> vars_list_;
    // configs from para_handle_
    int frame_length = 25;          // ms per frame
    int frame_shift = 10;           // ms per shift
    int n_mels = 80;                // mel filterbank bins
    int lfr_m = PARA_LFR_M;         // LFR stacking factor
    int lfr_n = PARA_LFR_N;         // LFR subsampling factor
    int encoder_size = 512;
    int fsmn_layers = 16;
    int fsmn_lorder = 10;
    int fsmn_dims = 512;
    float cif_threshold = 1.0;      // CIF firing threshold
    float tail_alphas = 0.45;       // alpha appended at utterance end

    // configs
    int feat_dims = lfr_m*n_mels;
    // {left context, chunk, right context} in LFR frames.
    std::vector<int> chunk_size = {5,10,5};
    int frame_sample_length_ = MODEL_SAMPLE_RATE / 1000 * frame_length;
    int frame_shift_sample_length_ = MODEL_SAMPLE_RATE / 1000 * frame_shift;

    // The reserved waveforms by fbank
    std::vector<float> reserve_waveforms_;
    // waveforms reserved after last shift position
    std::vector<float> input_cache_;
    // lfr reserved cache
    std::vector<std::vector<float>> lfr_splice_cache_;
    // position index cache
    int start_idx_cache_ = 0;
    // cif alpha
    std::vector<float> alphas_cache_;
    std::vector<std::vector<float>> hidden_cache_;
    std::vector<std::vector<float>> feats_cache_;
    // fsmn init caches
    std::vector<float> fsmn_init_cache_;
    std::vector<Ort::Value> decoder_onnx;

    bool is_first_chunk = true;
    bool is_last_chunk = false;
    // Feature scaling factor applied before position embedding
    // (presumably sqrt(encoder_size) — confirm in the .cpp initialization).
    double sqrt_factor;

public:
    ParaformerOnline(Paraformer* para_handle, std::vector<int> chunk_size);
    ~ParaformerOnline();
    // Reset streaming state between utterances.
    void Reset();
    void ResetCache();
    void InitCache();
    // Frontend: waveform chunk -> LFR+CMVN features, with streaming caches.
    void ExtractFeats(float sample_rate, vector<vector<float>> &wav_feats, vector<float> &waves, bool input_finished);
    // Prepend/append cached context frames around the current chunk.
    void AddOverlapChunk(std::vector<std::vector<float>> &wav_feats, bool input_finished);

    // Run encoder + CIF + decoder on one prepared chunk.
    string ForwardChunk(std::vector<std::vector<float>> &wav_feats, bool input_finished);
    // Entry point: consume raw samples, return partial/final text.
    string Forward(float* din, int len, bool input_finished, const std::vector<std::vector<float>> &hw_emb={{0.0}}, void* wfst_decoder=nullptr);
    string Rescoring();

    int GetAsrSampleRate() { return para_handle_->asr_sample_rate; };

    // 2pass
    std::string online_res;
    int chunk_len;
};
|
||||
|
||||
} // namespace funasr
|
||||
638
modules/python/vendors/FunASR/runtime/onnxruntime/src/paraformer-torch.cpp
vendored
Normal file
638
modules/python/vendors/FunASR/runtime/onnxruntime/src/paraformer-torch.cpp
vendored
Normal file
@@ -0,0 +1,638 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#include "precomp.h"
|
||||
#include "paraformer-torch.h"
|
||||
#include "encode_converter.h"
|
||||
#include <cstddef>
|
||||
|
||||
using namespace std;
|
||||
namespace funasr {
|
||||
|
||||
// Constructor only clears the hotword flag; the TorchScript modules and all
// other resources are loaded later by InitAsr/InitHwCompiler.
ParaformerTorch::ParaformerTorch()
:use_hotword(false){
}
|
||||
|
||||
// offline
|
||||
// Initialize the offline torch ASR pipeline:
//   1. parse the AM YAML config (sample rate, language),
//   2. configure the Kaldi-compatible fbank frontend,
//   3. load vocabulary, phone set and CMVN statistics,
//   4. load the TorchScript acoustic model (CUDA when built with USE_GPU,
//      otherwise CPU) and disable JIT graph optimization/fusion.
// Exits the process on any load failure.  `thread_num` is not used by this
// torch backend.
void ParaformerTorch::InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){
    LoadConfigFromYaml(am_config.c_str());
    // knf options
    fbank_opts_.frame_opts.dither = 0;
    fbank_opts_.mel_opts.num_bins = n_mels;
    fbank_opts_.frame_opts.samp_freq = asr_sample_rate;
    fbank_opts_.frame_opts.window_type = window_type;
    fbank_opts_.frame_opts.frame_shift_ms = frame_shift;
    fbank_opts_.frame_opts.frame_length_ms = frame_length;
    fbank_opts_.energy_floor = 0;
    fbank_opts_.mel_opts.debug_mel = false;

    vocab = new Vocab(token_file.c_str());
    phone_set_ = new PhoneSet(token_file.c_str());
    LoadCmvn(am_cmvn.c_str());

    // Pick the device before loading so torch::jit::load maps weights directly.
    torch::DeviceType device = at::kCPU;
#ifdef USE_GPU
    if (!torch::cuda::is_available()) {
        LOG(ERROR) << "CUDA is not available! Please check your GPU settings";
        exit(-1);
    } else {
        LOG(INFO) << "CUDA is available, running on GPU";
        device = at::kCUDA;
    }
#endif
#ifdef USE_IPEX
    torch::jit::setTensorExprFuserEnabled(false);
#endif

    try {
        torch::jit::script::Module model = torch::jit::load(am_model, device);
        model_ = std::make_shared<TorchModule>(std::move(model));
        LOG(INFO) << "Successfully load model from " << am_model;
        torch::NoGradGuard no_grad;
        model_->eval();
        // Static fusion strategy with 0 specializations: avoid JIT recompiles
        // for varying batch/length shapes at inference time.
        torch::jit::setGraphExecutorOptimize(false);
        torch::jit::FusionStrategy static0 = {{torch::jit::FusionBehavior::STATIC, 0}};
        torch::jit::setFusionStrategy(static0);
#ifdef USE_GPU
        WarmUp();
#endif
    } catch (std::exception const &e) {
        LOG(ERROR) << "Error when load am model: " << am_model << e.what();
        exit(-1);
    }
}
|
||||
|
||||
void ParaformerTorch::InitLm(const std::string &lm_file,
|
||||
const std::string &lm_cfg_file,
|
||||
const std::string &lex_file) {
|
||||
try {
|
||||
lm_ = std::shared_ptr<fst::Fst<fst::StdArc>>(
|
||||
fst::Fst<fst::StdArc>::Read(lm_file));
|
||||
if (lm_){
|
||||
lm_vocab = new Vocab(lm_cfg_file.c_str(), lex_file.c_str());
|
||||
LOG(INFO) << "Successfully load lm file " << lm_file;
|
||||
}else{
|
||||
LOG(ERROR) << "Failed to load lm file " << lm_file;
|
||||
}
|
||||
} catch (std::exception const &e) {
|
||||
LOG(ERROR) << "Error when load lm file: " << e.what();
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
void ParaformerTorch::LoadConfigFromYaml(const char* filename){
|
||||
|
||||
YAML::Node config;
|
||||
try{
|
||||
config = YAML::LoadFile(filename);
|
||||
}catch(exception const &e){
|
||||
LOG(ERROR) << "Error loading file, yaml file error or not exist.";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
try{
|
||||
YAML::Node frontend_conf = config["frontend_conf"];
|
||||
this->asr_sample_rate = frontend_conf["fs"].as<int>();
|
||||
|
||||
YAML::Node lang_conf = config["lang"];
|
||||
if (lang_conf.IsDefined()){
|
||||
language = lang_conf.as<string>();
|
||||
}
|
||||
}catch(exception const &e){
|
||||
LOG(ERROR) << "Error when load argument from vad config YAML.";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
// Load the TorchScript hotword-embedding compiler model and enable hotword
// support (use_hotword = true) so Forward() expects an hw_emb input.
// Exits the process when CUDA is unavailable (GPU build) or the model fails
// to load.  `thread_num` is unused by this backend.
void ParaformerTorch::InitHwCompiler(const std::string &hw_model, int thread_num) {
    // TODO
    torch::DeviceType device = at::kCPU;
#ifdef USE_GPU
    if (!torch::cuda::is_available()) {
        // LOG(ERROR) << "CUDA is not available! Please check your GPU settings";
        exit(-1);
    } else {
        // LOG(INFO) << "CUDA is available, running on GPU";
        device = at::kCUDA;
    }
#endif

    try {
        torch::jit::script::Module model = torch::jit::load(hw_model, device);
        hw_model_ = std::make_shared<TorchModule>(std::move(model));
        LOG(INFO) << "Successfully load model from " << hw_model;
        torch::NoGradGuard no_grad;
        hw_model_->eval();
    } catch (std::exception const &e) {
        LOG(ERROR) << "Error when load hw model: " << hw_model << e.what();
        exit(-1);
    }
    use_hotword = true;
}
|
||||
|
||||
// Load the word -> token-pieces segmentation dictionary used by
// CompileHotwordEmbedding to tokenize English hotwords.
void ParaformerTorch::InitSegDict(const std::string &seg_dict_model) {
    seg_dict = new SegDict(seg_dict_model.c_str());
}
|
||||
|
||||
// Destructor: free the raw-pointer members allocated in the Init* methods.
// The TorchScript modules and the LM FST are shared_ptrs and release
// themselves.
ParaformerTorch::~ParaformerTorch()
{
    if(vocab){
        delete vocab;
        vocab = nullptr;
    }
    if(lm_vocab){
        delete lm_vocab;
        lm_vocab = nullptr;
    }
    if(seg_dict){
        delete seg_dict;
        seg_dict = nullptr;
    }
    if(phone_set_){
        delete phone_set_;
        phone_set_ = nullptr;
    }
}
|
||||
|
||||
// Intentionally empty: the offline torch backend keeps no per-utterance
// state of its own.
void ParaformerTorch::StartUtterance()
{
}
|
||||
|
||||
// Intentionally empty: nothing to finalize per utterance in this backend.
void ParaformerTorch::EndUtterance()
{
}
|
||||
|
||||
// Intentionally empty: the offline model is stateless between calls.
void ParaformerTorch::Reset()
{
}
|
||||
|
||||
// Compute Kaldi-style fbank features for one whole utterance.
// Input samples are rescaled by 32768 before extraction (presumably they
// arrive as normalized floats in [-1, 1] — confirm against the callers).
// One num_bins-dimensional frame vector is appended to asr_feats per ready
// frame; a fresh OnlineFbank is built per call, so no state leaks between
// utterances.
void ParaformerTorch::FbankKaldi(float sample_rate, const float* waves, int len, std::vector<std::vector<float>> &asr_feats) {
    knf::OnlineFbank fbank_(fbank_opts_);
    std::vector<float> buf(len);
    for (int32_t i = 0; i != len; ++i) {
        buf[i] = waves[i] * 32768;
    }
    fbank_.AcceptWaveform(sample_rate, buf.data(), buf.size());

    int32_t frames = fbank_.NumFramesReady();
    for (int32_t i = 0; i != frames; ++i) {
        const float *frame = fbank_.GetFrame(i);
        std::vector<float> frame_vector(frame, frame + fbank_opts_.mel_opts.num_bins);
        asr_feats.emplace_back(frame_vector);
    }
}
|
||||
|
||||
void ParaformerTorch::LoadCmvn(const char *filename)
|
||||
{
|
||||
ifstream cmvn_stream(filename);
|
||||
if (!cmvn_stream.is_open()) {
|
||||
LOG(ERROR) << "Failed to open file: " << filename;
|
||||
exit(-1);
|
||||
}
|
||||
string line;
|
||||
|
||||
while (getline(cmvn_stream, line)) {
|
||||
istringstream iss(line);
|
||||
vector<string> line_item{istream_iterator<string>{iss}, istream_iterator<string>{}};
|
||||
if (line_item[0] == "<AddShift>") {
|
||||
getline(cmvn_stream, line);
|
||||
istringstream means_lines_stream(line);
|
||||
vector<string> means_lines{istream_iterator<string>{means_lines_stream}, istream_iterator<string>{}};
|
||||
if (means_lines[0] == "<LearnRateCoef>") {
|
||||
for (int j = 3; j < means_lines.size() - 1; j++) {
|
||||
means_list_.push_back(stof(means_lines[j]));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else if (line_item[0] == "<Rescale>") {
|
||||
getline(cmvn_stream, line);
|
||||
istringstream vars_lines_stream(line);
|
||||
vector<string> vars_lines{istream_iterator<string>{vars_lines_stream}, istream_iterator<string>{}};
|
||||
if (vars_lines[0] == "<LearnRateCoef>") {
|
||||
for (int j = 3; j < vars_lines.size() - 1; j++) {
|
||||
vars_list_.push_back(stof(vars_lines[j])*scale);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Greedy (argmax) decoding of the acoustic posteriors.
//
// @param in          n_len x token_nums posterior matrix (row-major)
// @param n_len       number of valid decoder steps
// @param token_nums  vocabulary size
// @param is_stamp    when true, also compute per-character timestamps from the
//                    upsampled CIF alphas/peaks and return the post-processed
//                    text with timing.
// @return recognized text (plain, or with timestamps when is_stamp).
string ParaformerTorch::GreedySearch(float * in, int n_len, int64_t token_nums, bool is_stamp, std::vector<float> us_alphas, std::vector<float> us_cif_peak)
{
    vector<int> hyps;
    int Tmax = n_len;
    // Pick the highest-scoring token at every step.
    for (int i = 0; i < Tmax; i++) {
        int max_idx;
        float max_val;
        FindMax(in + i * token_nums, token_nums, max_val, max_idx);
        hyps.push_back(max_idx);
    }
    if(!is_stamp){
        return vocab->Vector2StringV2(hyps, language);
    }else{
        std::vector<string> char_list;
        std::vector<std::vector<float>> timestamp_list;
        std::string res_str;
        vocab->Vector2String(hyps, char_list);
        // Keep an untouched copy: TimestampOnnx mutates char_list.
        std::vector<string> raw_char(char_list);
        TimestampOnnx(us_alphas, us_cif_peak, char_list, res_str, timestamp_list);

        return PostProcess(raw_char, timestamp_list);
    }
}
|
||||
|
||||
// Feed one block of acoustic posteriors into the WFST decoder and return its
// current (partial) hypothesis.
string ParaformerTorch::BeamSearch(WfstDecoder* &wfst_decoder, float *in, int len, int64_t token_nums)
{
    return wfst_decoder->Search(in, len, token_nums);
}
|
||||
|
||||
// Flush the WFST decoder at end of input and return the final hypothesis,
// optionally with timestamps derived from the CIF alphas/peaks.
string ParaformerTorch::FinalizeDecode(WfstDecoder* &wfst_decoder,
                                       bool is_stamp, std::vector<float> us_alphas, std::vector<float> us_cif_peak)
{
    return wfst_decoder->FinalizeDecode(is_stamp, us_alphas, us_cif_peak);
}
|
||||
|
||||
// Low-frame-rate (LFR) stacking + CMVN, in place.
// Every output frame concatenates lfr_m consecutive input frames taken every
// lfr_n frames, so T frames become ceil(T / lfr_n) frames of dimension
// lfr_m * n_mels.  The first frame is replicated as left padding and the last
// frame as right padding.  Finally each coefficient is normalized as
// (x + mean) * var using the statistics loaded by LoadCmvn.
void ParaformerTorch::LfrCmvn(std::vector<std::vector<float>> &asr_feats) {

    std::vector<std::vector<float>> out_feats;
    int T = asr_feats.size();
    int T_lrf = ceil(1.0 * T / lfr_n);

    // Pad frames at start(copy first frame)
    for (int i = 0; i < (lfr_m - 1) / 2; i++) {
        asr_feats.insert(asr_feats.begin(), asr_feats[0]);
    }
    // Merge lfr_m frames as one,lfr_n frames per window
    T = T + (lfr_m - 1) / 2;
    std::vector<float> p;
    for (int i = 0; i < T_lrf; i++) {
        if (lfr_m <= T - i * lfr_n) {
            // Full window available: concatenate lfr_m frames.
            for (int j = 0; j < lfr_m; j++) {
                p.insert(p.end(), asr_feats[i * lfr_n + j].begin(), asr_feats[i * lfr_n + j].end());
            }
            out_feats.emplace_back(p);
            p.clear();
        } else {
            // Fill to lfr_m frames at last window if less than lfr_m frames  (copy last frame)
            int num_padding = lfr_m - (T - i * lfr_n);
            for (int j = 0; j < (asr_feats.size() - i * lfr_n); j++) {
                p.insert(p.end(), asr_feats[i * lfr_n + j].begin(), asr_feats[i * lfr_n + j].end());
            }
            for (int j = 0; j < num_padding; j++) {
                p.insert(p.end(), asr_feats[asr_feats.size() - 1].begin(), asr_feats[asr_feats.size() - 1].end());
            }
            out_feats.emplace_back(p);
            p.clear();
        }
    }
    // Apply cmvn
    for (auto &out_feat: out_feats) {
        for (int j = 0; j < means_list_.size(); j++) {
            out_feat[j] = (out_feat[j] + means_list_[j]) * vars_list_[j];
        }
    }
    asr_feats = out_feats;
}
|
||||
|
||||
// Batched offline recognition.
// din/len hold `batch_in` waveform buffers and their sample counts.  Pipeline:
//   1. fbank + LFR/CMVN per utterance,
//   2. zero-pad every utterance to the longest one and pack a single
//      (batch, frames, feat) tensor,
//   3. one TorchScript forward for the whole batch (hotword embeddings are
//      replicated per item when use_hotword),
//   4. per item: greedy search, or WFST beam search when an LM is loaded;
//      when the model returns 4 outputs the extra CIF alphas/peaks drive
//      timestamp generation.
// Early failures return one empty string per batch item; an exception during
// the forward/decode step is logged and whatever results were accumulated are
// returned.
std::vector<std::string> ParaformerTorch::Forward(float** din, int* len, bool input_finished, const std::vector<std::vector<float>> &hw_emb, void* decoder_handle, int batch_in)
{
    vector<std::string> results;
    string result="";

    WfstDecoder* wfst_decoder = (WfstDecoder*)decoder_handle;
    int32_t in_feat_dim = fbank_opts_.mel_opts.num_bins;
    int32_t feature_dim = lfr_m*in_feat_dim;

    // 1. feature extraction per batch item; track the longest utterance.
    std::vector<vector<float>> feats_batch;
    std::vector<int32_t> paraformer_length;
    int max_size = 0;
    int max_frames = 0;
    for(int index=0; index<batch_in; index++){
        std::vector<std::vector<float>> asr_feats;
        FbankKaldi(asr_sample_rate, din[index], len[index], asr_feats);
        if(asr_feats.size() != 0){
            LfrCmvn(asr_feats);
        }
        int32_t num_frames = asr_feats.size();
        paraformer_length.emplace_back(num_frames);
        if(max_size < asr_feats.size()*feature_dim){
            max_size = asr_feats.size()*feature_dim;
            max_frames = num_frames;
        }

        // Flatten frames row-major for the later memcpy into the batch tensor.
        std::vector<float> flattened;
        for (const auto& sub_vector : asr_feats) {
            flattened.insert(flattened.end(), sub_vector.begin(), sub_vector.end());
        }
        feats_batch.emplace_back(flattened);
    }

    // No item produced any frames: return empty results.
    if(max_frames == 0){
        for(int index=0; index<batch_in; index++){
            results.push_back(result);
        }
        return results;
    }

    // padding
    std::vector<float> all_feats(batch_in * max_frames * feature_dim);
    for(int index=0; index<batch_in; index++){
        // resize() zero-fills up to the longest utterance's size.
        feats_batch[index].resize(max_size);
        std::memcpy(&all_feats[index * max_frames * feature_dim], feats_batch[index].data(),
                    max_frames * feature_dim * sizeof(float));
    }
    torch::Tensor feats =
        torch::from_blob(all_feats.data(),
                         {batch_in, max_frames, feature_dim}, torch::kFloat).contiguous();
    torch::Tensor feat_lens = torch::from_blob(paraformer_length.data(),
                                               {batch_in}, torch::kInt32);

    // 2. forward
#ifdef USE_GPU
    feats = feats.to(at::kCUDA);
    feat_lens = feat_lens.to(at::kCUDA);
#endif
    std::vector<torch::jit::IValue> inputs = {feats, feat_lens};

    // Optional hotword embedding input, replicated once per batch item.
    // NOTE: batch_embedding must outlive the forward call — tensor_hw_emb
    // views it via from_blob without copying.
    std::vector<float> batch_embedding;
    std::vector<float> embedding;
    try{
        if (use_hotword) {
            if(hw_emb.size()<=0){
                LOG(ERROR) << "hw_emb is null";
                for(int index=0; index<batch_in; index++){
                    results.push_back(result);
                }
                return results;
            }

            embedding.reserve(hw_emb.size() * hw_emb[0].size());
            for (auto item : hw_emb) {
                embedding.insert(embedding.end(), item.begin(), item.end());
            }
            batch_embedding.reserve(batch_in * embedding.size());
            for (size_t index = 0; index < batch_in; ++index) {
                batch_embedding.insert(batch_embedding.end(), embedding.begin(), embedding.end());
            }

            torch::Tensor tensor_hw_emb =
                torch::from_blob(batch_embedding.data(),
                                 {batch_in, static_cast<int64_t>(hw_emb.size()), static_cast<int64_t>(hw_emb[0].size())}, torch::kFloat).contiguous();
#ifdef USE_GPU
            tensor_hw_emb = tensor_hw_emb.to(at::kCUDA);
#endif
            inputs.emplace_back(tensor_hw_emb);
        }
    }catch (std::exception const &e)
    {
        LOG(ERROR)<<e.what();
        for(int index=0; index<batch_in; index++){
            results.push_back(result);
        }
        return results;
    }

    try {
        if(inputs.size() == 0){
            LOG(ERROR) << "inputs of forward is null";
            for(int index=0; index<batch_in; index++){
                results.push_back(result);
            }
            return results;
        }
        auto outputs = model_->forward(inputs).toTuple()->elements();
        torch::Tensor am_scores;
        torch::Tensor valid_token_lens;
#ifdef USE_GPU
        am_scores = outputs[0].toTensor().to(at::kCPU);
        valid_token_lens = outputs[1].toTensor().to(at::kCPU);
#else
        am_scores = outputs[0].toTensor();
        valid_token_lens = outputs[1].toTensor();
#endif

        // With 4 outputs the model also returns upsampled CIF alphas/peaks
        // used for timestamp prediction.
        torch::Tensor us_alphas_tensor;
        torch::Tensor us_peaks_tensor;
        if(outputs.size() == 4){
#ifdef USE_GPU
            us_alphas_tensor = outputs[2].toTensor().to(at::kCPU);
            us_peaks_tensor = outputs[3].toTensor().to(at::kCPU);
#else
            us_alphas_tensor = outputs[2].toTensor();
            us_peaks_tensor = outputs[3].toTensor();
#endif
        }

        // timestamp
        for(int index=0; index<batch_in; index++){
            result="";
            if(outputs.size() == 4){
                // Copy this item's alphas/peaks; 3x upsampling relative to
                // the LFR frame count.
                float* us_alphas_data = us_alphas_tensor[index].data_ptr<float>();
                std::vector<float> us_alphas(paraformer_length[index]*3);
                for (int i = 0; i < us_alphas.size(); i++) {
                    us_alphas[i] = us_alphas_data[i];
                }

                float* us_peaks_data = us_peaks_tensor[index].data_ptr<float>();
                std::vector<float> us_peaks(paraformer_length[index]*3);
                for (int i = 0; i < us_peaks.size(); i++) {
                    us_peaks[i] = us_peaks_data[i];
                }
                if (lm_ == nullptr) {
                    result = GreedySearch(am_scores[index].data_ptr<float>(), valid_token_lens[index].item<int>(), am_scores.size(2), true, us_alphas, us_peaks);
                } else {
                    result = BeamSearch(wfst_decoder, am_scores[index].data_ptr<float>(), valid_token_lens[index].item<int>(), am_scores.size(2));
                    if (input_finished) {
                        result = FinalizeDecode(wfst_decoder, true, us_alphas, us_peaks);
                    }
                }
            }else{
                if (lm_ == nullptr) {
                    result = GreedySearch(am_scores[index].data_ptr<float>(), valid_token_lens[index].item<int>(), am_scores.size(2));
                } else {
                    result = BeamSearch(wfst_decoder, am_scores[index].data_ptr<float>(), valid_token_lens[index].item<int>(), am_scores.size(2));
                    if (input_finished) {
                        result = FinalizeDecode(wfst_decoder);
                    }
                }
            }
            results.push_back(result);
            // Re-arm the WFST decoder for the next batch item.
            if (wfst_decoder){
                wfst_decoder->StartUtterance();
            }
        }
    }
    catch (std::exception const &e)
    {
        LOG(ERROR)<<e.what();
    }

    return results;
}
|
||||
|
||||
// One dummy forward pass on CUDA so kernel compilation/initialization happens
// at startup rather than on the first real request.  Uses a single 10-frame
// constant-filled feature tensor (and, for hotword models, an embedding
// compiled from an empty hotword string).  Called from InitAsr under USE_GPU;
// the unconditional .to(at::kCUDA) makes it GPU-only.  Exceptions are logged
// and swallowed — warm-up failure is non-fatal.
void ParaformerTorch::WarmUp()
{
    int32_t in_feat_dim = fbank_opts_.mel_opts.num_bins;
    int32_t feature_dim = lfr_m*in_feat_dim;
    int batch_in = 1;
    int max_frames = 10;
    std::vector<int32_t> paraformer_length;
    paraformer_length.push_back(max_frames);

    std::vector<float> all_feats(batch_in * max_frames * feature_dim, 0.1);
    torch::Tensor feats =
        torch::from_blob(all_feats.data(),
                         {batch_in, max_frames, feature_dim}, torch::kFloat).contiguous();
    torch::Tensor feat_lens = torch::from_blob(paraformer_length.data(),
                                               {batch_in}, torch::kInt32);

    // 2. forward
    feats = feats.to(at::kCUDA);
    feat_lens = feat_lens.to(at::kCUDA);
    std::vector<torch::jit::IValue> inputs = {feats, feat_lens};

    if (use_hotword) {
        // Hotword-aware models require the embedding input even when empty.
        std::string hotwords_wp = "";
        std::vector<std::vector<float>> hw_emb = CompileHotwordEmbedding(hotwords_wp);
        std::vector<float> embedding;
        embedding.reserve(hw_emb.size() * hw_emb[0].size());
        for (auto item : hw_emb) {
            embedding.insert(embedding.end(), item.begin(), item.end());
        }
        torch::Tensor tensor_hw_emb =
            torch::from_blob(embedding.data(),
                             {batch_in, static_cast<int64_t>(hw_emb.size()), static_cast<int64_t>(hw_emb[0].size())}, torch::kFloat).contiguous();
        tensor_hw_emb = tensor_hw_emb.to(at::kCUDA);
        inputs.emplace_back(tensor_hw_emb);
    }

    try {
        auto outputs = model_->forward(inputs).toTuple()->elements();
    }
    catch (std::exception const &e)
    {
        LOG(ERROR)<<e.what();
    }
}
|
||||
|
||||
// Turn a space-separated hotword string into per-hotword embedding vectors via
// the hotword TorchScript model.
//   - Without hotword support, returns a single zero vector of encoder_size.
//   - Each hotword is tokenized (character split for pure-Chinese words,
//     seg_dict lookup otherwise), mapped to phone ids, truncated/zero-padded
//     to max_hotword_len; hotwords with out-of-vocabulary tokens are skipped.
//   - A "blank" entry (id 1 then zeros) is always appended as the last row.
// Returns one embedding per surviving hotword plus the blank entry; an empty
// vector is returned when the hw model forward throws.
std::vector<std::vector<float>> ParaformerTorch::CompileHotwordEmbedding(std::string &hotwords) {
    int embedding_dim = encoder_size;
    std::vector<std::vector<float>> hw_emb;
    if (!use_hotword) {
        // No hotword model loaded: a single zero embedding keeps the
        // downstream input shape valid.
        std::vector<float> vec(embedding_dim, 0);
        hw_emb.push_back(vec);
        return hw_emb;
    }
    int max_hotword_len = 10;
    std::vector<int32_t> hotword_matrix;
    std::vector<int32_t> lengths;
    int hotword_size = 1;
    int real_hw_size = 0;
    if (!hotwords.empty()) {
        std::vector<std::string> hotword_array = split(hotwords, ' ');
        hotword_size = hotword_array.size() + 1;
        hotword_matrix.reserve(hotword_size * max_hotword_len);
        for (auto hotword : hotword_array) {
            std::vector<std::string> chars;
            if (EncodeConverter::IsAllChineseCharactor((const U8CHAR_T*)hotword.c_str(), hotword.size())) {
                // Chinese: one token per character.
                KeepChineseCharacterAndSplit(hotword, chars);
            } else {
                // for english
                std::vector<std::string> words = split(hotword, ' ');
                for (auto word : words) {
                    std::vector<string> tokens = seg_dict->GetTokensByWord(word);
                    chars.insert(chars.end(), tokens.begin(), tokens.end());
                }
            }
            if(chars.size()==0){
                continue;
            }
            // Map tokens to phone ids, zero-padded to max_hotword_len.
            std::vector<int32_t> hw_vector(max_hotword_len, 0);
            int vector_len = std::min(max_hotword_len, (int)chars.size());
            int chs_oov = false;
            for (int i=0; i<vector_len; i++) {
                hw_vector[i] = phone_set_->String2Id(chars[i]);
                if(hw_vector[i] == -1){
                    chs_oov = true;
                    break;
                }
            }
            // Skip hotwords containing out-of-vocabulary tokens.
            if(chs_oov){
                LOG(INFO) << "OOV: " << hotword;
                continue;
            }
            LOG(INFO) << hotword;
            lengths.push_back(vector_len);
            real_hw_size += 1;
            hotword_matrix.insert(hotword_matrix.end(), hw_vector.begin(), hw_vector.end());
        }
        hotword_size = real_hw_size + 1;
    }
    // Trailing blank entry.
    std::vector<int32_t> blank_vec(max_hotword_len, 0);
    blank_vec[0] = 1;
    hotword_matrix.insert(hotword_matrix.end(), blank_vec.begin(), blank_vec.end());
    lengths.push_back(1);

    torch::Tensor feats =
        torch::from_blob(hotword_matrix.data(),
                         {hotword_size, max_hotword_len}, torch::kInt32).contiguous();

    // 2. forward
#ifdef USE_GPU
    feats = feats.to(at::kCUDA);
#endif
    std::vector<torch::jit::IValue> inputs = {feats};
    std::vector<std::vector<float>> result;
    try {
        auto output = hw_model_->forward(inputs);
        torch::Tensor emb_tensor;
#ifdef USE_GPU
        emb_tensor = output.toTensor().to(at::kCPU);
#else
        emb_tensor = output.toTensor();
#endif
        assert(emb_tensor.size(0) == max_hotword_len);
        assert(emb_tensor.size(1) == hotword_size);
        embedding_dim = emb_tensor.size(2);

        // For each hotword, take the embedding at its last valid token
        // position (layout is [len, hotword, dim]).
        float* floatData = emb_tensor.data_ptr<float>();
        for (int j = 0; j < hotword_size; j++)
        {
            int start_pos = hotword_size * (lengths[j] - 1) * embedding_dim + j * embedding_dim;
            std::vector<float> embedding;
            embedding.insert(embedding.begin(), floatData + start_pos, floatData + start_pos + embedding_dim);
            result.push_back(embedding);
        }
    }
    catch (std::exception const &e)
    {
        LOG(ERROR)<<e.what();
    }
    return result;
}
|
||||
|
||||
// Accessor for the ASR token vocabulary (non-owning; lifetime bound to this).
Vocab* ParaformerTorch::GetVocab()
{
    return vocab;
}
|
||||
|
||||
// Accessor for the LM vocabulary loaded by InitLm (may be null when no LM).
Vocab* ParaformerTorch::GetLmVocab()
{
    return lm_vocab;
}
|
||||
|
||||
// Accessor for the phone set used for hotword token lookup (non-owning).
PhoneSet* ParaformerTorch::GetPhoneSet()
{
    return phone_set_;
}
|
||||
|
||||
// Rescoring is not implemented for the torch backend;
// log an error and return an empty hypothesis.
string ParaformerTorch::Rescoring()
{
    LOG(ERROR) << "Not Imp!!!!!!";
    return string{};
}
|
||||
} // namespace funasr
|
||||
98
modules/python/vendors/FunASR/runtime/onnxruntime/src/paraformer-torch.h
vendored
Normal file
98
modules/python/vendors/FunASR/runtime/onnxruntime/src/paraformer-torch.h
vendored
Normal file
@@ -0,0 +1,98 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
#pragma once
|
||||
#define C10_USE_GLOG
|
||||
#include <torch/serialize.h>
|
||||
#include <torch/script.h>
|
||||
#include <torch/torch.h>
|
||||
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
|
||||
#include "precomp.h"
|
||||
#include "fst/fstlib.h"
|
||||
#include "fst/symbol-table.h"
|
||||
#include "bias-lm.h"
|
||||
#include "phone-set.h"
|
||||
|
||||
namespace funasr {
|
||||
|
||||
class ParaformerTorch : public Model {
/**
 * Author: Speech Lab of DAMO Academy, Alibaba Group
 * Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
 * https://arxiv.org/pdf/2206.08317.pdf
 *
 * LibTorch (TorchScript) backend for offline Paraformer recognition, with
 * optional hotword embedding model and WFST LM rescoring.
 */
private:
    Vocab* vocab = nullptr;          // ASR token vocabulary (owned)
    Vocab* lm_vocab = nullptr;       // LM vocabulary (owned, null without LM)
    SegDict* seg_dict = nullptr;     // word -> token pieces, for English hotwords (owned)
    PhoneSet* phone_set_ = nullptr;  // token -> phone-id mapping (owned)
    //const float scale = 22.6274169979695;
    const float scale = 1.0;         // extra factor applied to CMVN variances

    void LoadConfigFromYaml(const char* filename);
    void LoadCmvn(const char *filename);
    void LfrCmvn(std::vector<std::vector<float>> &asr_feats);

    using TorchModule = torch::jit::script::Module;
    std::shared_ptr<TorchModule> model_ = nullptr;     // acoustic model
    std::shared_ptr<TorchModule> hw_model_ = nullptr;  // hotword embedding model
    std::vector<torch::Tensor> encoder_outs_;
    bool use_hotword;  // set by InitHwCompiler

public:
    ParaformerTorch();
    ~ParaformerTorch();
    void InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num);
    void InitHwCompiler(const std::string &hw_model, int thread_num);
    void InitSegDict(const std::string &seg_dict_model);
    std::vector<std::vector<float>> CompileHotwordEmbedding(std::string &hotwords);
    void Reset();
    void FbankKaldi(float sample_rate, const float* waves, int len, std::vector<std::vector<float>> &asr_feats);
    // GPU warm-up pass (called under USE_GPU).
    void WarmUp();
    // Batched recognition entry point; one result string per batch item.
    std::vector<std::string> Forward(float** din, int* len, bool input_finished=true, const std::vector<std::vector<float>> &hw_emb={{0.0}}, void* wfst_decoder=nullptr, int batch_in=1);
    string GreedySearch( float* in, int n_len, int64_t token_nums,
                         bool is_stamp=false, std::vector<float> us_alphas={0}, std::vector<float> us_cif_peak={0});

    string Rescoring();
    string GetLang(){return language;};
    int GetAsrSampleRate() { return asr_sample_rate; };
    void SetBatchSize(int batch_size) {batch_size_ = batch_size;};
    int GetBatchSize() {return batch_size_;};
    void StartUtterance();
    void EndUtterance();
    void InitLm(const std::string &lm_file, const std::string &lm_cfg_file, const std::string &lex_file);
    string BeamSearch(WfstDecoder* &wfst_decoder, float* in, int n_len, int64_t token_nums);
    string FinalizeDecode(WfstDecoder* &wfst_decoder,
                          bool is_stamp=false, std::vector<float> us_alphas={0}, std::vector<float> us_cif_peak={0});
    Vocab* GetVocab();
    Vocab* GetLmVocab();
    PhoneSet* GetPhoneSet();

    knf::FbankOptions fbank_opts_;
    vector<float> means_list_;  // CMVN means (see LoadCmvn)
    vector<float> vars_list_;   // CMVN variances, pre-scaled
    int lfr_m = PARA_LFR_M;     // LFR stacking factor
    int lfr_n = PARA_LFR_N;     // LFR subsampling factor

    // paraformer-offline
    std::string language="zh-cn";

    // lm
    std::shared_ptr<fst::Fst<fst::StdArc>> lm_ = nullptr;

    string window_type = "hamming";
    int frame_length = 25;   // ms
    int frame_shift = 10;    // ms
    int n_mels = 80;
    int encoder_size = 512;
    int fsmn_layers = 16;
    int fsmn_lorder = 10;
    int fsmn_dims = 512;
    float cif_threshold = 1.0;
    float tail_alphas = 0.45;
    int asr_sample_rate = MODEL_SAMPLE_RATE;
    int batch_size_ = 1;
};
|
||||
|
||||
} // namespace funasr
|
||||
701
modules/python/vendors/FunASR/runtime/onnxruntime/src/paraformer.cpp
vendored
Normal file
701
modules/python/vendors/FunASR/runtime/onnxruntime/src/paraformer.cpp
vendored
Normal file
@@ -0,0 +1,701 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#include "precomp.h"
|
||||
#include "paraformer.h"
|
||||
#include "encode_converter.h"
|
||||
#include <cstddef>
|
||||
|
||||
using namespace std;
|
||||
namespace funasr {
|
||||
|
||||
// Constructor: create the ONNXRuntime environments (error-level logging only)
// for the acoustic model and the hotword compiler.  Sessions are created
// later in InitAsr / the hotword init path.
Paraformer::Paraformer()
:use_hotword(false),
env_(ORT_LOGGING_LEVEL_ERROR, "paraformer"),session_options_{},
hw_env_(ORT_LOGGING_LEVEL_ERROR, "paraformer_hw"),hw_session_options{} {
}
|
||||
|
||||
// offline
|
||||
void Paraformer::InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){
|
||||
LoadConfigFromYaml(am_config.c_str());
|
||||
// knf options
|
||||
fbank_opts_.frame_opts.dither = 0;
|
||||
fbank_opts_.mel_opts.num_bins = n_mels;
|
||||
fbank_opts_.frame_opts.samp_freq = asr_sample_rate;
|
||||
fbank_opts_.frame_opts.window_type = window_type;
|
||||
fbank_opts_.frame_opts.frame_shift_ms = frame_shift;
|
||||
fbank_opts_.frame_opts.frame_length_ms = frame_length;
|
||||
fbank_opts_.energy_floor = 0;
|
||||
fbank_opts_.mel_opts.debug_mel = false;
|
||||
// fbank_ = std::make_unique<knf::OnlineFbank>(fbank_opts);
|
||||
|
||||
// session_options_.SetInterOpNumThreads(1);
|
||||
session_options_.SetIntraOpNumThreads(thread_num);
|
||||
session_options_.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
|
||||
// DisableCpuMemArena can improve performance
|
||||
session_options_.DisableCpuMemArena();
|
||||
|
||||
try {
|
||||
m_session_ = std::make_unique<Ort::Session>(env_, ORTSTRING(am_model).c_str(), session_options_);
|
||||
LOG(INFO) << "Successfully load model from " << am_model;
|
||||
} catch (std::exception const &e) {
|
||||
LOG(ERROR) << "Error when load am onnx model: " << e.what();
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
GetInputNames(m_session_.get(), m_strInputNames, m_szInputNames);
|
||||
GetOutputNames(m_session_.get(), m_strOutputNames, m_szOutputNames);
|
||||
vocab = new Vocab(token_file.c_str());
|
||||
phone_set_ = new PhoneSet(token_file.c_str());
|
||||
LoadCmvn(am_cmvn.c_str());
|
||||
}
|
||||
|
||||
// online
|
||||
void Paraformer::InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){
|
||||
|
||||
LoadOnlineConfigFromYaml(am_config.c_str());
|
||||
// knf options
|
||||
fbank_opts_.frame_opts.dither = 0;
|
||||
fbank_opts_.mel_opts.num_bins = n_mels;
|
||||
fbank_opts_.frame_opts.samp_freq = asr_sample_rate;
|
||||
fbank_opts_.frame_opts.window_type = window_type;
|
||||
fbank_opts_.frame_opts.frame_shift_ms = frame_shift;
|
||||
fbank_opts_.frame_opts.frame_length_ms = frame_length;
|
||||
fbank_opts_.energy_floor = 0;
|
||||
fbank_opts_.mel_opts.debug_mel = false;
|
||||
|
||||
// session_options_.SetInterOpNumThreads(1);
|
||||
session_options_.SetIntraOpNumThreads(thread_num);
|
||||
session_options_.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
|
||||
// DisableCpuMemArena can improve performance
|
||||
session_options_.DisableCpuMemArena();
|
||||
|
||||
try {
|
||||
encoder_session_ = std::make_unique<Ort::Session>(env_, ORTSTRING(en_model).c_str(), session_options_);
|
||||
LOG(INFO) << "Successfully load model from " << en_model;
|
||||
} catch (std::exception const &e) {
|
||||
LOG(ERROR) << "Error when load am encoder model: " << e.what();
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
try {
|
||||
decoder_session_ = std::make_unique<Ort::Session>(env_, ORTSTRING(de_model).c_str(), session_options_);
|
||||
LOG(INFO) << "Successfully load model from " << de_model;
|
||||
} catch (std::exception const &e) {
|
||||
LOG(ERROR) << "Error when load am decoder model: " << e.what();
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// encoder
|
||||
string strName;
|
||||
GetInputName(encoder_session_.get(), strName);
|
||||
en_strInputNames.push_back(strName.c_str());
|
||||
GetInputName(encoder_session_.get(), strName,1);
|
||||
en_strInputNames.push_back(strName);
|
||||
|
||||
GetOutputName(encoder_session_.get(), strName);
|
||||
en_strOutputNames.push_back(strName);
|
||||
GetOutputName(encoder_session_.get(), strName,1);
|
||||
en_strOutputNames.push_back(strName);
|
||||
GetOutputName(encoder_session_.get(), strName,2);
|
||||
en_strOutputNames.push_back(strName);
|
||||
|
||||
for (auto& item : en_strInputNames)
|
||||
en_szInputNames_.push_back(item.c_str());
|
||||
for (auto& item : en_strOutputNames)
|
||||
en_szOutputNames_.push_back(item.c_str());
|
||||
|
||||
// decoder
|
||||
int de_input_len = 4 + fsmn_layers;
|
||||
int de_out_len = 2 + fsmn_layers;
|
||||
for(int i=0;i<de_input_len; i++){
|
||||
GetInputName(decoder_session_.get(), strName, i);
|
||||
de_strInputNames.push_back(strName.c_str());
|
||||
}
|
||||
|
||||
for(int i=0;i<de_out_len; i++){
|
||||
GetOutputName(decoder_session_.get(), strName,i);
|
||||
de_strOutputNames.push_back(strName);
|
||||
}
|
||||
|
||||
for (auto& item : de_strInputNames)
|
||||
de_szInputNames_.push_back(item.c_str());
|
||||
for (auto& item : de_strOutputNames)
|
||||
de_szOutputNames_.push_back(item.c_str());
|
||||
|
||||
vocab = new Vocab(token_file.c_str());
|
||||
phone_set_ = new PhoneSet(token_file.c_str());
|
||||
LoadCmvn(am_cmvn.c_str());
|
||||
}
|
||||
|
||||
// 2pass
|
||||
void Paraformer::InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){
|
||||
// online
|
||||
InitAsr(en_model, de_model, am_cmvn, am_config, token_file, thread_num);
|
||||
|
||||
// offline
|
||||
try {
|
||||
m_session_ = std::make_unique<Ort::Session>(env_, ORTSTRING(am_model).c_str(), session_options_);
|
||||
LOG(INFO) << "Successfully load model from " << am_model;
|
||||
} catch (std::exception const &e) {
|
||||
LOG(ERROR) << "Error when load am onnx model: " << e.what();
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// string strName;
|
||||
// GetInputName(m_session_.get(), strName);
|
||||
// m_strInputNames.push_back(strName.c_str());
|
||||
// GetInputName(m_session_.get(), strName,1);
|
||||
// m_strInputNames.push_back(strName);
|
||||
|
||||
// if (use_hotword) {
|
||||
// GetInputName(m_session_.get(), strName, 2);
|
||||
// m_strInputNames.push_back(strName);
|
||||
// }
|
||||
|
||||
// // support time stamp
|
||||
// size_t numOutputNodes = m_session_->GetOutputCount();
|
||||
// for(int index=0; index<numOutputNodes; index++){
|
||||
// GetOutputName(m_session_.get(), strName, index);
|
||||
// m_strOutputNames.push_back(strName);
|
||||
// }
|
||||
|
||||
// for (auto& item : m_strInputNames)
|
||||
// m_szInputNames.push_back(item.c_str());
|
||||
// for (auto& item : m_strOutputNames)
|
||||
// m_szOutputNames.push_back(item.c_str());
|
||||
GetInputNames(m_session_.get(), m_strInputNames, m_szInputNames);
|
||||
GetOutputNames(m_session_.get(), m_strOutputNames, m_szOutputNames);
|
||||
}
|
||||
|
||||
void Paraformer::InitLm(const std::string &lm_file,
|
||||
const std::string &lm_cfg_file,
|
||||
const std::string &lex_file) {
|
||||
try {
|
||||
lm_ = std::shared_ptr<fst::Fst<fst::StdArc>>(
|
||||
fst::Fst<fst::StdArc>::Read(lm_file));
|
||||
if (lm_){
|
||||
lm_vocab = new Vocab(lm_cfg_file.c_str(), lex_file.c_str());
|
||||
LOG(INFO) << "Successfully load lm file " << lm_file;
|
||||
}else{
|
||||
LOG(ERROR) << "Failed to load lm file " << lm_file;
|
||||
}
|
||||
} catch (std::exception const &e) {
|
||||
LOG(ERROR) << "Error when load lm file: " << e.what();
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
void Paraformer::LoadConfigFromYaml(const char* filename){
|
||||
|
||||
YAML::Node config;
|
||||
try{
|
||||
config = YAML::LoadFile(filename);
|
||||
}catch(exception const &e){
|
||||
LOG(ERROR) << "Error loading file, yaml file error or not exist.";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
try{
|
||||
YAML::Node frontend_conf = config["frontend_conf"];
|
||||
this->asr_sample_rate = frontend_conf["fs"].as<int>();
|
||||
|
||||
YAML::Node lang_conf = config["lang"];
|
||||
if (lang_conf.IsDefined()){
|
||||
language = lang_conf.as<string>();
|
||||
}
|
||||
}catch(exception const &e){
|
||||
LOG(ERROR) << "Error when load argument from vad config YAML.";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void Paraformer::LoadOnlineConfigFromYaml(const char* filename){
|
||||
|
||||
YAML::Node config;
|
||||
try{
|
||||
config = YAML::LoadFile(filename);
|
||||
}catch(exception const &e){
|
||||
LOG(ERROR) << "Error loading file, yaml file error or not exist.";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
try{
|
||||
YAML::Node frontend_conf = config["frontend_conf"];
|
||||
YAML::Node encoder_conf = config["encoder_conf"];
|
||||
YAML::Node decoder_conf = config["decoder_conf"];
|
||||
YAML::Node predictor_conf = config["predictor_conf"];
|
||||
|
||||
this->window_type = frontend_conf["window"].as<string>();
|
||||
this->n_mels = frontend_conf["n_mels"].as<int>();
|
||||
this->frame_length = frontend_conf["frame_length"].as<int>();
|
||||
this->frame_shift = frontend_conf["frame_shift"].as<int>();
|
||||
this->lfr_m = frontend_conf["lfr_m"].as<int>();
|
||||
this->lfr_n = frontend_conf["lfr_n"].as<int>();
|
||||
|
||||
this->encoder_size = encoder_conf["output_size"].as<int>();
|
||||
this->fsmn_dims = encoder_conf["output_size"].as<int>();
|
||||
|
||||
this->fsmn_layers = decoder_conf["num_blocks"].as<int>();
|
||||
this->fsmn_lorder = decoder_conf["kernel_size"].as<int>()-1;
|
||||
|
||||
this->cif_threshold = predictor_conf["threshold"].as<double>();
|
||||
this->tail_alphas = predictor_conf["tail_threshold"].as<double>();
|
||||
|
||||
this->asr_sample_rate = frontend_conf["fs"].as<int>();
|
||||
|
||||
|
||||
}catch(exception const &e){
|
||||
LOG(ERROR) << "Error when load argument from vad config YAML.";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void Paraformer::InitHwCompiler(const std::string &hw_model, int thread_num) {
|
||||
hw_session_options.SetIntraOpNumThreads(thread_num);
|
||||
hw_session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
|
||||
// DisableCpuMemArena can improve performance
|
||||
hw_session_options.DisableCpuMemArena();
|
||||
|
||||
try {
|
||||
hw_m_session = std::make_unique<Ort::Session>(hw_env_, ORTSTRING(hw_model).c_str(), hw_session_options);
|
||||
LOG(INFO) << "Successfully load model from " << hw_model;
|
||||
} catch (std::exception const &e) {
|
||||
LOG(ERROR) << "Error when load hw compiler onnx model: " << e.what();
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
string strName;
|
||||
GetInputName(hw_m_session.get(), strName);
|
||||
hw_m_strInputNames.push_back(strName.c_str());
|
||||
//GetInputName(hw_m_session.get(), strName,1);
|
||||
//hw_m_strInputNames.push_back(strName);
|
||||
|
||||
GetOutputName(hw_m_session.get(), strName);
|
||||
hw_m_strOutputNames.push_back(strName);
|
||||
|
||||
for (auto& item : hw_m_strInputNames)
|
||||
hw_m_szInputNames.push_back(item.c_str());
|
||||
for (auto& item : hw_m_strOutputNames)
|
||||
hw_m_szOutputNames.push_back(item.c_str());
|
||||
// if init hotword compiler is called, this is a hotword paraformer model
|
||||
use_hotword = true;
|
||||
}
|
||||
|
||||
// Load the word -> model-unit segmentation dictionary used to split
// English hotwords into tokens (consumed by CompileHotwordEmbedding).
void Paraformer::InitSegDict(const std::string &seg_dict_model) {
    seg_dict = new SegDict(seg_dict_model.c_str());
}
|
||||
|
||||
// Release the helper objects created in the Init* methods.
// `delete` on a null pointer is a no-op, so no guards are needed; members
// that were never initialized stay nullptr and are safely ignored.
Paraformer::~Paraformer()
{
    delete vocab;
    delete lm_vocab;
    delete seg_dict;
    delete phone_set_;
}
|
||||
|
||||
// Utterance-lifecycle hook from the Model interface; the offline
// Paraformer keeps no per-utterance state, so this is intentionally empty.
void Paraformer::StartUtterance()
{
}
|
||||
|
||||
// Utterance-lifecycle hook from the Model interface; intentionally empty
// (no per-utterance state to flush in this implementation).
void Paraformer::EndUtterance()
{
}
|
||||
|
||||
// Reset hook from the Model interface; intentionally empty — this class
// holds no streaming caches of its own to clear.
void Paraformer::Reset()
{
}
|
||||
|
||||
void Paraformer::FbankKaldi(float sample_rate, const float* waves, int len, std::vector<std::vector<float>> &asr_feats) {
|
||||
knf::OnlineFbank fbank_(fbank_opts_);
|
||||
std::vector<float> buf(len);
|
||||
for (int32_t i = 0; i != len; ++i) {
|
||||
buf[i] = waves[i] * 32768;
|
||||
}
|
||||
fbank_.AcceptWaveform(sample_rate, buf.data(), buf.size());
|
||||
|
||||
int32_t frames = fbank_.NumFramesReady();
|
||||
for (int32_t i = 0; i != frames; ++i) {
|
||||
const float *frame = fbank_.GetFrame(i);
|
||||
std::vector<float> frame_vector(frame, frame + fbank_opts_.mel_opts.num_bins);
|
||||
asr_feats.emplace_back(frame_vector);
|
||||
}
|
||||
}
|
||||
|
||||
void Paraformer::LoadCmvn(const char *filename)
|
||||
{
|
||||
ifstream cmvn_stream(filename);
|
||||
if (!cmvn_stream.is_open()) {
|
||||
LOG(ERROR) << "Failed to open file: " << filename;
|
||||
exit(-1);
|
||||
}
|
||||
string line;
|
||||
|
||||
while (getline(cmvn_stream, line)) {
|
||||
istringstream iss(line);
|
||||
vector<string> line_item{istream_iterator<string>{iss}, istream_iterator<string>{}};
|
||||
if (line_item[0] == "<AddShift>") {
|
||||
getline(cmvn_stream, line);
|
||||
istringstream means_lines_stream(line);
|
||||
vector<string> means_lines{istream_iterator<string>{means_lines_stream}, istream_iterator<string>{}};
|
||||
if (means_lines[0] == "<LearnRateCoef>") {
|
||||
for (int j = 3; j < means_lines.size() - 1; j++) {
|
||||
means_list_.push_back(stof(means_lines[j]));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else if (line_item[0] == "<Rescale>") {
|
||||
getline(cmvn_stream, line);
|
||||
istringstream vars_lines_stream(line);
|
||||
vector<string> vars_lines{istream_iterator<string>{vars_lines_stream}, istream_iterator<string>{}};
|
||||
if (vars_lines[0] == "<LearnRateCoef>") {
|
||||
for (int j = 3; j < vars_lines.size() - 1; j++) {
|
||||
vars_list_.push_back(stof(vars_lines[j])*scale);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Greedy (argmax-per-frame) decoding of the AM posteriors.
// `in` points at n_len * token_nums scores; when is_stamp is set, the CIF
// alphas/peaks are used to attach timestamps to the decoded characters.
string Paraformer::GreedySearch(float * in, int n_len, int64_t token_nums, bool is_stamp, std::vector<float> us_alphas, std::vector<float> us_cif_peak)
{
    vector<int> hyps;
    hyps.reserve(n_len);
    for (int frame = 0; frame < n_len; ++frame) {
        int best_id;
        float best_score;
        FindMax(in + frame * token_nums, token_nums, best_score, best_id);
        hyps.push_back(best_id);
    }

    if (!is_stamp) {
        // Plain text path.
        return vocab->Vector2StringV2(hyps, language);
    }

    // Timestamped path: decode to characters, align them against the CIF
    // peaks, then post-process characters + timestamps into the result.
    std::vector<string> char_list;
    std::vector<std::vector<float>> timestamp_list;
    std::string res_str;
    vocab->Vector2String(hyps, char_list);
    std::vector<string> raw_char(char_list);
    TimestampOnnx(us_alphas, us_cif_peak, char_list, res_str, timestamp_list);

    return PostProcess(raw_char, timestamp_list);
}
|
||||
|
||||
// WFST beam search over one chunk of AM posteriors; thin delegate to the
// per-connection WfstDecoder (which holds the search state across chunks).
string Paraformer::BeamSearch(WfstDecoder* &wfst_decoder, float *in, int len, int64_t token_nums)
{
    return wfst_decoder->Search(in, len, token_nums);
}
|
||||
|
||||
// Finish the WFST search at end of input and return the final hypothesis;
// optionally produces timestamps from the CIF alphas/peaks. Delegates to
// the per-connection WfstDecoder.
string Paraformer::FinalizeDecode(WfstDecoder* &wfst_decoder,
                                  bool is_stamp, std::vector<float> us_alphas, std::vector<float> us_cif_peak)
{
    return wfst_decoder->FinalizeDecode(is_stamp, us_alphas, us_cif_peak);
}
|
||||
|
||||
// Low-frame-rate stacking + CMVN, in place on asr_feats:
// stacks lfr_m consecutive frames into one (stride lfr_n), then applies
// (x + mean) * var per dimension. Output has ceil(T / lfr_n) frames of
// lfr_m * n_mels values. The exact statement order below matters: T is
// re-read after the front padding is inserted.
void Paraformer::LfrCmvn(std::vector<std::vector<float>> &asr_feats) {

    std::vector<std::vector<float>> out_feats;
    int T = asr_feats.size();
    int T_lrf = ceil(1.0 * T / lfr_n);

    // Pad frames at start(copy first frame)
    for (int i = 0; i < (lfr_m - 1) / 2; i++) {
        asr_feats.insert(asr_feats.begin(), asr_feats[0]);
    }
    // Merge lfr_m frames as one,lfr_n frames per window
    T = T + (lfr_m - 1) / 2;  // effective length including front padding
    std::vector<float> p;
    for (int i = 0; i < T_lrf; i++) {
        if (lfr_m <= T - i * lfr_n) {
            // Full window available: concatenate lfr_m frames.
            for (int j = 0; j < lfr_m; j++) {
                p.insert(p.end(), asr_feats[i * lfr_n + j].begin(), asr_feats[i * lfr_n + j].end());
            }
            out_feats.emplace_back(p);
            p.clear();
        } else {
            // Fill to lfr_m frames at last window if less than lfr_m frames (copy last frame)
            int num_padding = lfr_m - (T - i * lfr_n);
            for (int j = 0; j < (asr_feats.size() - i * lfr_n); j++) {
                p.insert(p.end(), asr_feats[i * lfr_n + j].begin(), asr_feats[i * lfr_n + j].end());
            }
            for (int j = 0; j < num_padding; j++) {
                p.insert(p.end(), asr_feats[asr_feats.size() - 1].begin(), asr_feats[asr_feats.size() - 1].end());
            }
            out_feats.emplace_back(p);
            p.clear();
        }
    }
    // Apply cmvn
    for (auto &out_feat: out_feats) {
        for (int j = 0; j < means_list_.size(); j++) {
            out_feat[j] = (out_feat[j] + means_list_[j]) * vars_list_[j];
        }
    }
    asr_feats = out_feats;
}
|
||||
|
||||
// Run offline recognition on one utterance.
// din[0]/len[0]: float samples and sample count (only batch_in == 1 is
// supported; any other batch size returns a single empty result).
// hw_emb: hotword embeddings from CompileHotwordEmbedding (required when
// the model was loaded with a hotword compiler).
// decoder_handle: optional WfstDecoder* — when an LM is loaded, beam search
// is used instead of greedy search.
// Returns a vector with exactly one result string (possibly empty on error).
std::vector<std::string> Paraformer::Forward(float** din, int* len, bool input_finished, const std::vector<std::vector<float>> &hw_emb, void* decoder_handle, int batch_in)
{
    std::vector<std::string> results;
    string result="";
    WfstDecoder* wfst_decoder = (WfstDecoder*)decoder_handle;
    int32_t in_feat_dim = fbank_opts_.mel_opts.num_bins;

    // This implementation only handles a batch of one.
    if(batch_in != 1){
        results.push_back(result);
        return results;
    }

    // Frontend: fbank -> LFR stacking + CMVN.
    std::vector<std::vector<float>> asr_feats;
    FbankKaldi(asr_sample_rate, din[0], len[0], asr_feats);
    if(asr_feats.size() == 0){
        results.push_back(result);
        return results;
    }
    LfrCmvn(asr_feats);
    int32_t feat_dim = lfr_m*in_feat_dim;
    int32_t num_frames = asr_feats.size();

    // Flatten [num_frames][feat_dim] into one contiguous buffer for ORT.
    // NOTE: wav_feats must outlive the Run() call — CreateTensor does not copy.
    std::vector<float> wav_feats;
    for (const auto &frame_feat: asr_feats) {
        wav_feats.insert(wav_feats.end(), frame_feat.begin(), frame_feat.end());
    }

#ifdef _WIN_X86
        Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
#else
        Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
#endif

    // speech tensor: [1, num_frames, feat_dim]
    const int64_t input_shape_[3] = {1, num_frames, feat_dim};
    Ort::Value onnx_feats = Ort::Value::CreateTensor<float>(m_memoryInfo,
        wav_feats.data(),
        wav_feats.size(),
        input_shape_,
        3);

    // speech_lengths tensor: [1]
    const int64_t paraformer_length_shape[1] = {1};
    std::vector<int32_t> paraformer_length;
    paraformer_length.emplace_back(num_frames);
    Ort::Value onnx_feats_len = Ort::Value::CreateTensor<int32_t>(
          m_memoryInfo, paraformer_length.data(), paraformer_length.size(), paraformer_length_shape, 1);

    std::vector<Ort::Value> input_onnx;
    input_onnx.emplace_back(std::move(onnx_feats));
    input_onnx.emplace_back(std::move(onnx_feats_len));

    // Optional third input: flattened hotword embedding matrix.
    // `embedding` is declared at this scope so its storage survives Run().
    std::vector<float> embedding;
    try{
        if (use_hotword) {
            if(hw_emb.size()<=0){
                LOG(ERROR) << "hw_emb is null";
                results.push_back(result);
                return results;
            }
            //PrintMat(hw_emb, "input_clas_emb");
            const int64_t hotword_shape[3] = {1, static_cast<int64_t>(hw_emb.size()), static_cast<int64_t>(hw_emb[0].size())};
            embedding.reserve(hw_emb.size() * hw_emb[0].size());
            for (auto item : hw_emb) {
                embedding.insert(embedding.end(), item.begin(), item.end());
            }
            //LOG(INFO) << "hotword shape " << hotword_shape[0] << " " << hotword_shape[1] << " " << hotword_shape[2] << " size " << embedding.size();
            Ort::Value onnx_hw_emb = Ort::Value::CreateTensor<float>(
                m_memoryInfo, embedding.data(), embedding.size(), hotword_shape, 3);

            input_onnx.emplace_back(std::move(onnx_hw_emb));
        }
    }catch (std::exception const &e)
    {
        LOG(ERROR)<<e.what();
        results.push_back(result);
        return results;
    }

    try {
        auto outputTensor = m_session_->Run(Ort::RunOptions{nullptr}, m_szInputNames.data(), input_onnx.data(), input_onnx.size(), m_szOutputNames.data(), m_szOutputNames.size());
        std::vector<int64_t> outputShape = outputTensor[0].GetTensorTypeAndShapeInfo().GetShape();
        //LOG(INFO) << "paraformer out shape " << outputShape[0] << " " << outputShape[1] << " " << outputShape[2];

        int64_t outputCount = std::accumulate(outputShape.begin(), outputShape.end(), 1, std::multiplies<int64_t>());
        float* floatData = outputTensor[0].GetTensorMutableData<float>();
        auto encoder_out_lens = outputTensor[1].GetTensorMutableData<int64_t>();
        // timestamp
        // A 4-output model additionally provides CIF alphas (out 2) and
        // peaks (out 3) used for per-character timestamps.
        if(outputTensor.size() == 4){
            std::vector<int64_t> us_alphas_shape = outputTensor[2].GetTensorTypeAndShapeInfo().GetShape();
            float* us_alphas_data = outputTensor[2].GetTensorMutableData<float>();
            std::vector<float> us_alphas(us_alphas_shape[1]);
            for (int i = 0; i < us_alphas_shape[1]; i++) {
                us_alphas[i] = us_alphas_data[i];
            }

            std::vector<int64_t> us_peaks_shape = outputTensor[3].GetTensorTypeAndShapeInfo().GetShape();
            float* us_peaks_data = outputTensor[3].GetTensorMutableData<float>();
            std::vector<float> us_peaks(us_peaks_shape[1]);
            for (int i = 0; i < us_peaks_shape[1]; i++) {
                us_peaks[i] = us_peaks_data[i];
            }
            // Greedy decode without an LM, beam search (plus finalize at end
            // of input) when an LM was loaded via InitLm.
            if (lm_ == nullptr) {
                result = GreedySearch(floatData, *encoder_out_lens, outputShape[2], true, us_alphas, us_peaks);
            } else {
                result = BeamSearch(wfst_decoder, floatData, *encoder_out_lens, outputShape[2]);
                if (input_finished) {
                    result = FinalizeDecode(wfst_decoder, true, us_alphas, us_peaks);
                }
            }
        }else{
            if (lm_ == nullptr) {
                result = GreedySearch(floatData, *encoder_out_lens, outputShape[2]);
            } else {
                result = BeamSearch(wfst_decoder, floatData, *encoder_out_lens, outputShape[2]);
                if (input_finished) {
                    result = FinalizeDecode(wfst_decoder);
                }
            }
        }
    }
    catch (std::exception const &e)
    {
        // Inference failure is non-fatal: an empty result is returned.
        LOG(ERROR)<<e.what();
    }

    results.push_back(result);
    return results;
}
|
||||
|
||||
|
||||
// Turn a space-separated hotword string into one embedding vector per
// hotword (plus a trailing blank entry) by running the hotword-compiler
// model. For a non-hotword model, returns a single zero vector.
// Hotwords are tokenized per-character (Chinese) or via seg_dict (English),
// truncated/padded to max_hotword_len ids; OOV hotwords are skipped.
std::vector<std::vector<float>> Paraformer::CompileHotwordEmbedding(std::string &hotwords) {
    int embedding_dim = encoder_size;
    std::vector<std::vector<float>> hw_emb;
    if (!use_hotword) {
        // No hotword compiler loaded: a single zero embedding acts as "none".
        std::vector<float> vec(embedding_dim, 0);
        hw_emb.push_back(vec);
        return hw_emb;
    }
    int max_hotword_len = 10;
    std::vector<int32_t> hotword_matrix;   // row-major [hotword_size, max_hotword_len]
    std::vector<int32_t> lengths;          // real token count per row
    int hotword_size = 1;
    int real_hw_size = 0;
    if (!hotwords.empty()) {
      std::vector<std::string> hotword_array = split(hotwords, ' ');
      hotword_size = hotword_array.size() + 1;  // +1 for the trailing blank row
      hotword_matrix.reserve(hotword_size * max_hotword_len);
      for (auto hotword : hotword_array) {
        std::vector<std::string> chars;
        if (EncodeConverter::IsAllChineseCharactor((const U8CHAR_T*)hotword.c_str(), hotword.size())) {
          // Chinese: one token per character.
          KeepChineseCharacterAndSplit(hotword, chars);
        } else {
          // for english
          std::vector<std::string> words = split(hotword, ' ');
          for (auto word : words) {
            std::vector<string> tokens = seg_dict->GetTokensByWord(word);
            chars.insert(chars.end(), tokens.begin(), tokens.end());
          }
        }
        if(chars.size()==0){
            continue;
        }
        // Map tokens to phone ids, zero-padded to max_hotword_len.
        std::vector<int32_t> hw_vector(max_hotword_len, 0);
        int vector_len = std::min(max_hotword_len, (int)chars.size());
        int chs_oov = false;
        for (int i=0; i<vector_len; i++) {
          hw_vector[i] = phone_set_->String2Id(chars[i]);
          if(hw_vector[i] == -1){
            chs_oov = true;
            break;
          }
        }
        if(chs_oov){
          LOG(INFO) << "OOV: " << hotword;
          continue;
        }
        LOG(INFO) << hotword;
        lengths.push_back(vector_len);
        real_hw_size += 1;
        hotword_matrix.insert(hotword_matrix.end(), hw_vector.begin(), hw_vector.end());
      }
      hotword_size = real_hw_size + 1;  // recount after OOV/empty skips
    }
    // Trailing blank row (id 1 then zeros) terminates the hotword list.
    std::vector<int32_t> blank_vec(max_hotword_len, 0);
    blank_vec[0] = 1;
    hotword_matrix.insert(hotword_matrix.end(), blank_vec.begin(), blank_vec.end());
    lengths.push_back(1);

#ifdef _WIN_X86
        Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
#else
        Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
#endif

    const int64_t input_shape_[2] = {hotword_size, max_hotword_len};
    Ort::Value onnx_hotword = Ort::Value::CreateTensor<int32_t>(m_memoryInfo,
        (int32_t*)hotword_matrix.data(),
        hotword_size * max_hotword_len,
        input_shape_,
        2);
    LOG(INFO) << "clas shape " << hotword_size << " " << max_hotword_len << std::endl;

    std::vector<Ort::Value> input_onnx;
    input_onnx.emplace_back(std::move(onnx_hotword));

    std::vector<std::vector<float>> result;
    try {
        auto outputTensor = hw_m_session->Run(Ort::RunOptions{nullptr}, hw_m_szInputNames.data(), input_onnx.data(), input_onnx.size(), hw_m_szOutputNames.data(), hw_m_szOutputNames.size());
        std::vector<int64_t> outputShape = outputTensor[0].GetTensorTypeAndShapeInfo().GetShape();

        int64_t outputCount = std::accumulate(outputShape.begin(), outputShape.end(), 1, std::multiplies<int64_t>());
        float* floatData = outputTensor[0].GetTensorMutableData<float>(); // shape [max_hotword_len, hotword_size, dim]
        // get embedding by real hotword length
        assert(outputShape[0] == max_hotword_len);
        assert(outputShape[1] == hotword_size);
        embedding_dim = outputShape[2];

        // For row j, take the embedding at time step (lengths[j] - 1):
        // the output is laid out [time, hotword, dim], hence the stride math.
        for (int j = 0; j < hotword_size; j++)
        {
            int start_pos = hotword_size * (lengths[j] - 1) * embedding_dim + j * embedding_dim;
            std::vector<float> embedding;
            embedding.insert(embedding.begin(), floatData + start_pos, floatData + start_pos + embedding_dim);
            result.push_back(embedding);
        }
    }
    catch (std::exception const &e)
    {
        // Compiler failure is non-fatal: an empty result is returned.
        LOG(ERROR)<<e.what();
    }
    //PrintMat(result, "clas_embedding_output");
    return result;
}
|
||||
|
||||
// Accessor for the AM token vocabulary (owned by this object; may be
// nullptr before InitAsr).
Vocab* Paraformer::GetVocab()
{
    return vocab;
}
|
||||
|
||||
// Accessor for the LM vocabulary (owned by this object; nullptr unless
// InitLm succeeded).
Vocab* Paraformer::GetLmVocab()
{
    return lm_vocab;
}
|
||||
|
||||
// Accessor for the phone set used for hotword tokenization (owned by this
// object; may be nullptr before InitAsr).
PhoneSet* Paraformer::GetPhoneSet()
{
    return phone_set_;
}
|
||||
|
||||
// Rescoring is not implemented for this model; logs an error and returns
// an empty string (required by the Model interface).
string Paraformer::Rescoring()
{
    LOG(ERROR)<<"Not Imp!!!!!!";
    return "";
}
|
||||
} // namespace funasr
|
||||
117
modules/python/vendors/FunASR/runtime/onnxruntime/src/paraformer.h
vendored
Normal file
117
modules/python/vendors/FunASR/runtime/onnxruntime/src/paraformer.h
vendored
Normal file
@@ -0,0 +1,117 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "precomp.h"
|
||||
#include "fst/fstlib.h"
|
||||
#include "fst/symbol-table.h"
|
||||
#include "bias-lm.h"
|
||||
#include "phone-set.h"
|
||||
|
||||
namespace funasr {
|
||||
|
||||
class Paraformer : public Model {
/**
 * Author: Speech Lab of DAMO Academy, Alibaba Group
 * Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
 * https://arxiv.org/pdf/2206.08317.pdf
 *
 * ONNX Runtime implementation. Supports three modes (see the InitAsr
 * overloads): offline single-pass, online streaming encoder/decoder, and
 * 2pass (streaming + offline rescoring), plus optional hotword biasing and
 * WFST-LM beam search.
 */
private:
    // Owned raw pointers, released in ~Paraformer (delete on nullptr is safe).
    Vocab* vocab = nullptr;        // AM token vocabulary
    Vocab* lm_vocab = nullptr;     // LM vocabulary (set by InitLm)
    SegDict* seg_dict = nullptr;   // word -> token dict for English hotwords
    PhoneSet* phone_set_ = nullptr;
    //const float scale = 22.6274169979695;
    const float scale = 1.0;       // extra factor applied to CMVN variances

    void LoadConfigFromYaml(const char* filename);
    void LoadOnlineConfigFromYaml(const char* filename);
    void LoadCmvn(const char *filename);
    void LfrCmvn(std::vector<std::vector<float>> &asr_feats);

    // Hotword-compiler session (separate env/options from the AM session).
    std::shared_ptr<Ort::Session> hw_m_session = nullptr;
    Ort::Env hw_env_;
    Ort::SessionOptions hw_session_options;
    vector<string> hw_m_strInputNames, hw_m_strOutputNames;
    // c_str() views into the vectors above — valid while those are unchanged.
    vector<const char*> hw_m_szInputNames;
    vector<const char*> hw_m_szOutputNames;
    bool use_hotword;  // set by InitHwCompiler()

public:
    Paraformer();
    ~Paraformer();
    // offline
    void InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num);
    // online
    void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num);
    // 2pass
    void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num);
    void InitHwCompiler(const std::string &hw_model, int thread_num);
    void InitSegDict(const std::string &seg_dict_model);
    // One embedding per hotword (space-separated input), plus a blank entry.
    std::vector<std::vector<float>> CompileHotwordEmbedding(std::string &hotwords);
    void Reset();
    void FbankKaldi(float sample_rate, const float* waves, int len, std::vector<std::vector<float>> &asr_feats);
    // Recognize one utterance; returns exactly one result string.
    std::vector<std::string> Forward(float** din, int* len, bool input_finished=true, const std::vector<std::vector<float>> &hw_emb={{0.0}}, void* wfst_decoder=nullptr, int batch_in=1);
    string GreedySearch( float* in, int n_len, int64_t token_nums,
                         bool is_stamp=false, std::vector<float> us_alphas={0}, std::vector<float> us_cif_peak={0});

    string Rescoring();
    string GetLang(){return language;};
    int GetAsrSampleRate() { return asr_sample_rate; };
    int GetBatchSize() {return batch_size_;};
    void StartUtterance();
    void EndUtterance();
    void InitLm(const std::string &lm_file, const std::string &lm_cfg_file, const std::string &lex_file);
    string BeamSearch(WfstDecoder* &wfst_decoder, float* in, int n_len, int64_t token_nums);
    string FinalizeDecode(WfstDecoder* &wfst_decoder,
                          bool is_stamp=false, std::vector<float> us_alphas={0}, std::vector<float> us_cif_peak={0});
    Vocab* GetVocab();
    Vocab* GetLmVocab();
    PhoneSet* GetPhoneSet();

    // Fbank/frontend configuration, filled from the YAML config.
    knf::FbankOptions fbank_opts_;
    vector<float> means_list_;   // CMVN means
    vector<float> vars_list_;    // CMVN variances (times `scale`)
    int lfr_m = PARA_LFR_M;      // frames stacked per LFR output frame
    int lfr_n = PARA_LFR_N;      // LFR hop

    // paraformer-offline
    std::shared_ptr<Ort::Session> m_session_ = nullptr;
    Ort::Env env_;
    Ort::SessionOptions session_options_;

    vector<string> m_strInputNames, m_strOutputNames;
    vector<const char*> m_szInputNames;
    vector<const char*> m_szOutputNames;

    std::string language="zh-cn";

    // paraformer-online
    std::shared_ptr<Ort::Session> encoder_session_ = nullptr;
    std::shared_ptr<Ort::Session> decoder_session_ = nullptr;
    vector<string> en_strInputNames, en_strOutputNames;
    vector<const char*> en_szInputNames_;
    vector<const char*> en_szOutputNames_;
    vector<string> de_strInputNames, de_strOutputNames;
    vector<const char*> de_szInputNames_;
    vector<const char*> de_szOutputNames_;

    // lm
    std::shared_ptr<fst::Fst<fst::StdArc>> lm_ = nullptr;

    // Frontend / model hyperparameters (overwritten by the YAML configs).
    string window_type = "hamming";
    int frame_length = 25;   // ms
    int frame_shift = 10;    // ms
    int n_mels = 80;
    int encoder_size = 512;
    int fsmn_layers = 16;
    int fsmn_lorder = 10;    // decoder cache look-back (kernel_size - 1)
    int fsmn_dims = 512;
    float cif_threshold = 1.0;
    float tail_alphas = 0.45;
    int asr_sample_rate = MODEL_SAMPLE_RATE;
    int batch_size_ = 1;
};
|
||||
|
||||
} // namespace funasr
|
||||
95
modules/python/vendors/FunASR/runtime/onnxruntime/src/phone-set.cpp
vendored
Normal file
95
modules/python/vendors/FunASR/runtime/onnxruntime/src/phone-set.cpp
vendored
Normal file
@@ -0,0 +1,95 @@
|
||||
#include "phone-set.h"
|
||||
#include <yaml-cpp/yaml.h>
|
||||
#include <glog/logging.h>
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace funasr {
|
||||
// Builds the phone set by loading the JSON token list at `filename`.
// On an unreadable file the loader logs and terminates the process.
PhoneSet::PhoneSet(const char *filename) {
  // LoadPhoneSetFromJson opens and validates the file itself; the previous
  // extra `ifstream in(filename)` here was never read and has been removed.
  LoadPhoneSetFromJson(filename);
}
|
||||
// No owned resources beyond the standard containers; default destruction.
PhoneSet::~PhoneSet() = default;
|
||||
|
||||
void PhoneSet::LoadPhoneSetFromYaml(const char* filename) {
|
||||
YAML::Node config;
|
||||
try{
|
||||
config = YAML::LoadFile(filename);
|
||||
}catch(exception const &e){
|
||||
LOG(INFO) << "Error loading file, yaml file error or not exist.";
|
||||
exit(-1);
|
||||
}
|
||||
YAML::Node myList = config["token_list"];
|
||||
int id = 0;
|
||||
for (YAML::const_iterator it = myList.begin(); it != myList.end(); ++it, id++) {
|
||||
phone_.push_back(it->as<string>());
|
||||
phn2Id_.emplace(it->as<string>(), id);
|
||||
}
|
||||
}
|
||||
|
||||
void PhoneSet::LoadPhoneSetFromJson(const char* filename) {
|
||||
nlohmann::json json_array;
|
||||
std::ifstream file(filename);
|
||||
if (file.is_open()) {
|
||||
file >> json_array;
|
||||
file.close();
|
||||
} else {
|
||||
LOG(INFO) << "Error loading token file, token file error or not exist.";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
int id = 0;
|
||||
for (const auto& element : json_array) {
|
||||
phone_.push_back(element);
|
||||
phn2Id_.emplace(element, id);
|
||||
id++;
|
||||
}
|
||||
}
|
||||
|
||||
int PhoneSet::Size() const {
|
||||
return phone_.size();
|
||||
}
|
||||
|
||||
// Maps a phone/token string to its integer id; returns -1 when unknown.
int PhoneSet::String2Id(string phn_str) const {
  // Single hash lookup via find() instead of the former count()+at() pair,
  // which probed the table twice for every hit.
  const auto it = phn2Id_.find(phn_str);
  if (it != phn2Id_.end()) {
    return it->second;
  }
  //LOG(INFO) << "Phone unit not exist.";
  return -1;
}
|
||||
|
||||
// Maps an integer id back to its phone/token string; returns "" when the id
// is out of range.
string PhoneSet::Id2String(int id) const {
  // Valid indices are [0, Size()-1]. The previous check used `id > Size()`,
  // so id == Size() slipped through and indexed one past the end of phone_.
  if (id < 0 || id >= Size()) {
    //LOG(INFO) << "Phone id not exist.";
    return "";
  }
  return phone_[id];
}
|
||||
|
||||
// True when `phn_str` is a known phone/token unit.
bool PhoneSet::Find(string phn_str) const {
  return phn2Id_.find(phn_str) != phn2Id_.end();
}
|
||||
|
||||
// Id of the beginning-of-sentence silence symbol "<s>", or -1 if absent.
int PhoneSet::GetBegSilPhnId() const {
  return String2Id(UNIT_BEG_SIL_SYMBOL);
}

// Id of the end-of-sentence silence symbol "</s>", or -1 if absent.
int PhoneSet::GetEndSilPhnId() const {
  return String2Id(UNIT_END_SIL_SYMBOL);
}

// Id of the blank symbol "<blank>", or -1 if absent.
int PhoneSet::GetBlkPhnId() const {
  return String2Id(UNIT_BLK_SYMBOL);
}
|
||||
|
||||
}
|
||||
36
modules/python/vendors/FunASR/runtime/onnxruntime/src/phone-set.h
vendored
Normal file
36
modules/python/vendors/FunASR/runtime/onnxruntime/src/phone-set.h
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
#ifndef PHONESET_H
#define PHONESET_H

#include <stdint.h>
#include <string>
#include <vector>
#include <unordered_map>
#include "nlohmann/json.hpp"

// Special token symbols looked up in the phone set by the decoder.
#define UNIT_BEG_SIL_SYMBOL "<s>"
#define UNIT_END_SIL_SYMBOL "</s>"
#define UNIT_BLK_SYMBOL "<blank>"

// NOTE(review): `using namespace std;` at header scope leaks into every
// includer; kept because dependent translation units may rely on it.
using namespace std;

namespace funasr {
// Bidirectional mapping between phone/token strings and their integer ids,
// loaded at construction from a JSON token-list file (a YAML "token_list"
// loader is also available as a private alternative).
class PhoneSet {
  public:
    // Loads the token list from `filename` (JSON array of strings).
    PhoneSet(const char *filename);
    ~PhoneSet();
    // Number of units in the set.
    int Size() const;
    // String -> id; -1 when unknown.
    int String2Id(string str) const;
    // Id -> string; "" when out of range.
    string Id2String(int id) const;
    // True when `str` is a known unit.
    bool Find(string str) const;
    // Ids of the special symbols above; -1 when a symbol is absent.
    int GetBegSilPhnId() const;
    int GetEndSilPhnId() const;
    int GetBlkPhnId() const;

  private:
    vector<string> phone_;               // id -> token, in file order
    unordered_map<string, int> phn2Id_;  // token -> id
    void LoadPhoneSetFromYaml(const char* filename);
    void LoadPhoneSetFromJson(const char* filename);
};

} // namespace funasr
#endif
|
||||
75
modules/python/vendors/FunASR/runtime/onnxruntime/src/precomp.h
vendored
Normal file
75
modules/python/vendors/FunASR/runtime/onnxruntime/src/precomp.h
vendored
Normal file
@@ -0,0 +1,75 @@
|
||||
#pragma once
|
||||
// system
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <deque>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <iterator>
|
||||
#include <list>
|
||||
#include <locale.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <math.h>
|
||||
#include <numeric>
|
||||
#include <cstring>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <win_func.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
// third part
|
||||
#if defined(__APPLE__)
|
||||
#include <onnxruntime/onnxruntime_cxx_api.h>
|
||||
#else
|
||||
#include "onnxruntime_run_options_config_keys.h"
|
||||
#include "onnxruntime_cxx_api.h"
|
||||
#include "itn-model.h"
|
||||
#include "itn-processor.h"
|
||||
#endif
|
||||
|
||||
#include "kaldi-native-fbank/csrc/feature-fbank.h"
|
||||
#include "kaldi-native-fbank/csrc/online-feature.h"
|
||||
#include "kaldi/decoder/lattice-faster-online-decoder.h"
|
||||
// mine
|
||||
#include <glog/logging.h>
|
||||
|
||||
|
||||
#include "common-struct.h"
|
||||
#include "com-define.h"
|
||||
#include "commonfunc.h"
|
||||
#include "predefine-coe.h"
|
||||
#include "model.h"
|
||||
#include "vad-model.h"
|
||||
#include "punc-model.h"
|
||||
#include "tokenizer.h"
|
||||
#include "ct-transformer.h"
|
||||
#include "ct-transformer-online.h"
|
||||
#include "e2e-vad.h"
|
||||
#include "fsmn-vad.h"
|
||||
#include "encode_converter.h"
|
||||
#include "vocab.h"
|
||||
#include "phone-set.h"
|
||||
#include "wfst-decoder.h"
|
||||
#include "audio.h"
|
||||
#include "fsmn-vad-online.h"
|
||||
#include "tensor.h"
|
||||
#include "util.h"
|
||||
#include "seg_dict.h"
|
||||
#include "resample.h"
|
||||
#include "paraformer.h"
|
||||
#include "sensevoice-small.h"
|
||||
#ifdef USE_GPU
|
||||
#include "paraformer-torch.h"
|
||||
#endif
|
||||
#include "paraformer-online.h"
|
||||
#include "offline-stream.h"
|
||||
#include "tpass-stream.h"
|
||||
#include "tpass-online-stream.h"
|
||||
#include "funasrruntime.h"
|
||||
595
modules/python/vendors/FunASR/runtime/onnxruntime/src/predefine-coe.h
vendored
Normal file
595
modules/python/vendors/FunASR/runtime/onnxruntime/src/predefine-coe.h
vendored
Normal file
@@ -0,0 +1,595 @@
|
||||
#ifndef PREDEFINE_COE_H
|
||||
#define PREDEFINE_COE_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
namespace funasr {
|
||||
const int32_t melcoe_hex[] = {
|
||||
|
||||
0x3f01050c, 0x3e0afb11, 0x3f5d413c, 0x3f547fd0, 0x3e2e00c1, 0x3f132970,
|
||||
0x3ed9ad21, 0x3ebb8bb9, 0x3f223a24, 0x3e4de6f8, 0x3f4c8642, 0x3d9c0424,
|
||||
0x3f6c7f7c, 0x3f7d295a, 0x3c35a961, 0x3f6fd497, 0x3d815b45, 0x3f6af197,
|
||||
0x3da87344, 0x3f6dfce9, 0x3d9018b9, 0x3f787ebc, 0x3d2098fe, 0x3cf02873,
|
||||
0x3f75f670, 0x3e08e423, 0x3f5dc6f7, 0x3e8161eb, 0x3f3f4f0b, 0x3eca38e2,
|
||||
0x3f1ae38f, 0x3f0f2d23, 0x3ee1a5ba, 0x3f3e9a98, 0x3e82cad1, 0x3f7321ac,
|
||||
0x3e321028, 0x3d4de548, 0x3f537bf6, 0x3ed50f76, 0x3f157845, 0x3f2cf6bc,
|
||||
0x3ea61288, 0x3f739ea7, 0x3e794186, 0x3d461590, 0x3f41af9f, 0x3f0cdfd4,
|
||||
0x3ee64058, 0x3f5f23aa, 0x3e53d467, 0x3e037156, 0x3f4b0ae6, 0x3f0e2fac,
|
||||
0x3ee3a0a8, 0x3f6ab111, 0x3e94b1ed, 0x3daa7774, 0x3f35a70a, 0x3f2d08dc,
|
||||
0x3d951fb4, 0x3ea5ee48, 0x3f6d5c09, 0x3ef61e1a, 0x3f04f0f3, 0x3f66305c,
|
||||
0x3ea7def9, 0x3dce7d20, 0x3f2c1083, 0x3f44354b, 0x3e5baf49, 0x3e6f2ad2,
|
||||
0x3f49142e, 0x3f2bfe35, 0x3e0d627b, 0x3ea80396, 0x3f5ca761, 0x3f1ce830,
|
||||
0x3dc4d786, 0x3ec62fa0, 0x3f67650f, 0x3f165fc0, 0x3db1323f, 0x3ed34080,
|
||||
0x3f69d9b8, 0x3f17def1, 0x3ddbd6b6, 0x3ed0421e, 0x3f648529, 0x3f20ebbd,
|
||||
0x3e20901a, 0x3ebe2886, 0x3f57dbf9, 0x3f3116ac, 0x3e6edcc6, 0x3e9dd2a9,
|
||||
0x3f4448ce, 0x3f47f9a3, 0x3eaba511, 0x3e601974, 0x3f2a2d77, 0x3f6536e2,
|
||||
0x3eec3842, 0x3d0781f6, 0x3dd648ed, 0x3f09e3df, 0x3f7787e1, 0x3f1c411f,
|
||||
0x3e45b702, 0x3ec77dc2, 0x3f4e9240, 0x3f47f500, 0x3ebf9c61, 0x3e602c00,
|
||||
0x3f2031d0, 0x3f78f0f7, 0x3f135547, 0x3e3bcd78, 0x3ce1e12a, 0x3ed95573,
|
||||
0x3f510ca2, 0x3f4bc3c2, 0x3ed37e77, 0x3d0ded37, 0x3e50f0f8, 0x3f1640c5,
|
||||
0x3f77212d, 0x3f291bd1, 0x3e94df6c, 0x3eadc85e, 0x3f35904a, 0x3f6cd43b,
|
||||
0x3f104351, 0x3e52dc63, 0x3d995e26, 0x3edf795f, 0x3f4b48e7, 0x3f5a29e7,
|
||||
0x3f00963d, 0x3e1fdb2f, 0x3e175865, 0x3efed385, 0x3f580934, 0x3f50466d,
|
||||
0x3ef30046, 0x3e0e7c6b, 0x3e3ee64e, 0x3f067fdd, 0x3f5c60e5, 0x3f4e9ea4,
|
||||
0x3ef4f46a, 0x3e1cb596, 0x3e45856f, 0x3f0585cb, 0x3f58d29b, 0x3f54b3ef,
|
||||
0x3f0309ad, 0x3e48aa5b, 0x3e2d3042, 0x3ef9eca6, 0x3f4dd569, 0x3f6212c4,
|
||||
0x3f12be68, 0x3e8853a3, 0x3def69e0, 0x3eda8330, 0x3f3bd62e, 0x3f76516a,
|
||||
0x3f2931b5, 0x3eb98e9b, 0x3d88773c, 0x3d1ae95c, 0x3ead9c96, 0x3f2338b2,
|
||||
0x3f6ef119, 0x3f46054d, 0x3ef74eba, 0x3e47c83a, 0x3e67eace, 0x3f0458a3,
|
||||
0x3f4e0df1, 0x3f68e26b, 0x3f207590, 0x3eb1515d, 0x3d8bc852, 0x3db8eca9,
|
||||
0x3ebf14e0, 0x3f275751, 0x3f6e86f6, 0x3f4ae3f8, 0x3f04e6de, 0x3e7dfcce,
|
||||
0x3e547020, 0x3ef63244, 0x3f4080cd, 0x3f7aaa80, 0x3f366659, 0x3ee560cb,
|
||||
0x3e3e1967, 0x3caab00e, 0x3e93334e, 0x3f0d4f9a, 0x3f5079a6, 0x3f6ce5f8,
|
||||
0x3f2acd10, 0x3ed272ff, 0x3e20a4c5, 0x3d98d042, 0x3eaa65e0, 0x3f16c680,
|
||||
0x3f57d6cf, 0x3f679a1b, 0x3f278a40, 0x3ecfef5c, 0x3e2381fd, 0x3dc32f28,
|
||||
0x3eb0eb80, 0x3f180852, 0x3f571f81, 0x3f6a42d8, 0x3f2c1ce8, 0x3edcd9d1,
|
||||
0x3e44c475, 0x3dade93f, 0x3ea7c630, 0x3f119318, 0x3f4ecee3, 0x3f7467d4,
|
||||
0x3f380f62, 0x3ef84c54, 0x3e815525, 0x3cb361d7, 0x3d3982c8, 0x3e8fe13b,
|
||||
0x3f03d9d6, 0x3f3f556d, 0x3f7a64f1, 0x3f4af618, 0x3f10ba30, 0x3eadcbc5,
|
||||
0x3debbe02, 0x3e5427a0, 0x3ede8b9f, 0x3f291a1d, 0x3f628840, 0x3f646e63,
|
||||
0x3f2bc86b, 0x3ee70902, 0x3e6e854e, 0x3c83b300, 0x3ddc8cea, 0x3ea86f2a,
|
||||
0x3f0c7b7f, 0x3f445eac, 0x3f7be268, 0x3f4cf80b, 0x3f162f6e, 0x3ebf8516,
|
||||
0x3e26c0c2, 0x3e4c1fd5, 0x3ed3a124, 0x3f203d75, 0x3f564fd0, 0x3f73f733,
|
||||
0x3f3e966d, 0x3f098cbf, 0x3ea9b21c, 0x3e01e917, 0x3d408cd1, 0x3e82d326,
|
||||
0x3eece682, 0x3f2b26f2, 0x3f5f85ba, 0x3f6c6f56, 0x3f38b733, 0x3f0550d9,
|
||||
0x3ea47689, 0x3dfbabd7, 0x3d9c8552, 0x3e8e919a, 0x3ef55e4f, 0x3f2dc4bb,
|
||||
0x3f608a85, 0x3f6cfe84, 0x3f3ad56c, 0x3f08f945, 0x3eaed247, 0x3e189086,
|
||||
0x3d980be2, 0x3e8a5528, 0x3eee0d76, 0x3f2896dc, 0x3f59dbde, 0x3f75295d,
|
||||
0x3f4477f6, 0x3f140f14, 0x3ec7dbbd, 0x3e504e0f, 0x3c8fe67e, 0x3d2d6a38,
|
||||
0x3e6e2028, 0x3ed7e1d9, 0x3f1c1221, 0x3f4bec7c, 0x3f7b80cc, 0x3f553023,
|
||||
0x3f262589, 0x3eeebd40, 0x3e91b54d, 0x3dd4c6f8, 0x3e2b3f74, 0x3eb3b4ef,
|
||||
0x3f08a160, 0x3f372559, 0x3f656721, 0x3f6c988d, 0x3f3ed8f9, 0x3f11596d,
|
||||
0x3ec83270, 0x3e5c5ea4, 0x3d254149, 0x3d9b3b97, 0x3e824e0e, 0x3edd4d25,
|
||||
0x3f1be6c8, 0x3f48e857, 0x3f75abeb, 0x3f5dcdd1, 0x3f318436, 0x3f0576a0,
|
||||
0x3eb348db, 0x3e3833fb, 0x3c2bedc9, 0x3e08c8be, 0x3e9cf794, 0x3ef512c0,
|
||||
0x3f265b92, 0x3f51f301, 0x3f7d5049, 0x3f578bfc, 0x3f2ca136, 0x3f01eecf,
|
||||
0x3eaee867, 0x3e34c34c, 0x3c490794, 0x3e21d00f, 0x3ea6bd94, 0x3efc2262,
|
||||
0x3f288bcc, 0x3f52cf2d, 0x3f7cdbe2, 0x3f594d89, 0x3f2fac87, 0x3f064092,
|
||||
0x3eba1245, 0x3e5016cd, 0x3d335c27, 0x3e1ac9dd, 0x3ea0a6f1, 0x3ef37edc,
|
||||
0x3f22f6de, 0x3f4bfa4d, 0x3f74ca3e, 0x3f6298cf, 0x3f3a2e5b, 0x3f11f5e8,
|
||||
0x3ed3ddf9, 0x3e84323c, 0x3dd39eaa, 0x3deb3986, 0x3e8ba34a, 0x3edc142f,
|
||||
0x3f161103, 0x3f3de6e2, 0x3f658c2b, 0x3f72feac, 0x3f4bb92e, 0x3f24a2e9,
|
||||
0x3efb76d9, 0x3eae048f, 0x3e41dc34, 0x3d219509, 0x3d50153e, 0x3e511b46,
|
||||
0x3eb6ba2d, 0x3f024494, 0x3f28fdb9, 0x3f4f88f3, 0x3f75e6af, 0x3f63e8a7,
|
||||
0x3f3de4a8, 0x3f180cea, 0x3ee4c20e, 0x3e99c134, 0x3e1e2cfc, 0x3c1824f4,
|
||||
0x3de0bac6, 0x3e8436b1, 0x3ecfe62d, 0x3f0d9ef9, 0x3f331f66, 0x3f5874c1,
|
||||
0x3f7d9f6c, 0x3f5d6037, 0x3f3889c9, 0x3f13dcea, 0x3edeb27d, 0x3e95fcd3,
|
||||
0x3e1b303e, 0x3c3075cb, 0x3e0a7f24, 0x3e8eec6e, 0x3ed8462b, 0x3f10a6c1,
|
||||
0x3f350197, 0x3f5933f1, 0x3f7d3e29, 0x3f5edf68, 0x3f3b246a, 0x3f179088,
|
||||
0x3ee846d8, 0x3ea1b983, 0x3e36f0d8, 0x3d2c1773, 0x3e048260, 0x3e89b72b,
|
||||
0x3ed0def0, 0x3f0bdc94, 0x3f2f233e, 0x3f5243ca, 0x3f753e89, 0x3f67ec34,
|
||||
0x3f453c1d, 0x3f22b0e2, 0x3f004a36, 0x3ebc0f98, 0x3e6fa55d, 0x3dcf7467,
|
||||
0x3dc09e5f, 0x3e6b0f8d, 0x3eba9e3c, 0x3eff6b94, 0x3f21f834, 0x3f4416a9,
|
||||
0x3f661173, 0x3f781723, 0x3f5662cf, 0x3f34d14a, 0x3f13624c, 0x3ee42b1a,
|
||||
0x3ea1d591, 0x3e3f86e3, 0x3d6fa1a1, 0x3cfd1ba9, 0x3e2674c3, 0x3e965d6c,
|
||||
0x3ed93b69, 0x3f0dea73, 0x3f2f1538, 0x3f501e47, 0x3f7105e6, 0x3f6e33a9,
|
||||
0x3f4d8e22, 0x3f2d0944, 0x3f0ca4cd, 0x3ed8c0fd, 0x3e98782f, 0x3e30dd66,
|
||||
0x3d452061, 0x3d8e62bc, 0x3e49c779, 0x3ea5ed78, 0x3ee6b665, 0x3f139f81,
|
||||
0x3f33c3e8, 0x3f53c8a7, 0x3f73adfa, 0x3f6c8be0, 0x3f4ce4ab, 0x3f2d5c2a,
|
||||
0x3f0df223, 0x3edd4cb5, 0x3e9ef12d, 0x3e41a276, 0x3d8bb1ba, 0x3d9ba0ff,
|
||||
0x3e4c6d54, 0x3ea547ab, 0x3ee41bba, 0x3f1159a6, 0x3f30876a, 0x3f4f9762,
|
||||
0x3f6e89c9, 0x3f72a12b, 0x3f53e942, 0x3f354e46, 0x3f16cffe, 0x3ef0dc6f,
|
||||
0x3eb45177, 0x3e6ffd59, 0x3def8e9c
|
||||
|
||||
};
|
||||
|
||||
const int32_t window_hex[] = {
|
||||
0x00000000, 0x398b03f6, 0x3a61d1c5, 0x3ae0ee32, 0x3b37623a, 0x3b85f871,
|
||||
0x3bb69d19, 0x3bed453b, 0x3c14d40b, 0x3c35c45b, 0x3c59595d, 0x3c7f7c1d,
|
||||
0x3c940c13, 0x3ca98d81, 0x3cc039eb, 0x3cd8098d, 0x3cf0f52e, 0x3d057b06,
|
||||
0x3d1302e6, 0x3d210f33, 0x3d2f9d0e, 0x3d3ea9ba, 0x3d4e3293, 0x3d5e3510,
|
||||
0x3d6eaebd, 0x3d7f9d38, 0x3d887f19, 0x3d9167b5, 0x3d9a8756, 0x3da3dce9,
|
||||
0x3dad675d, 0x3db725ab, 0x3dc116cc, 0x3dcb39bf, 0x3dd58d86, 0x3de01126,
|
||||
0x3deac3a7, 0x3df5a413, 0x3e0058bb, 0x3e05f571, 0x3e0ba7b2, 0x3e116f08,
|
||||
0x3e174afe, 0x3e1d3b1c, 0x3e233ef0, 0x3e295605, 0x3e2f7fe7, 0x3e35bc23,
|
||||
0x3e3c0a46, 0x3e4269de, 0x3e48da79, 0x3e4f5ba5, 0x3e55ecf2, 0x3e5c8ded,
|
||||
0x3e633e26, 0x3e69fd2c, 0x3e70ca8f, 0x3e77a5de, 0x3e7e8eaa, 0x3e82c241,
|
||||
0x3e86437c, 0x3e89cacd, 0x3e8d57fc, 0x3e90ead3, 0x3e948319, 0x3e982097,
|
||||
0x3e9bc316, 0x3e9f6a5d, 0x3ea31636, 0x3ea6c66a, 0x3eaa7ac0, 0x3eae3303,
|
||||
0x3eb1eefa, 0x3eb5ae6f, 0x3eb9712a, 0x3ebd36f6, 0x3ec0ff9b, 0x3ec4cae2,
|
||||
0x3ec89895, 0x3ecc687d, 0x3ed03a64, 0x3ed40e13, 0x3ed7e354, 0x3edbb9f2,
|
||||
0x3edf91b5, 0x3ee36a69, 0x3ee743d7, 0x3eeb1dca, 0x3eeef80c, 0x3ef2d267,
|
||||
0x3ef6aca8, 0x3efa8698, 0x3efe6002, 0x3f011c59, 0x3f03083a, 0x3f04f389,
|
||||
0x3f06de2d, 0x3f08c80b, 0x3f0ab10a, 0x3f0c990f, 0x3f0e8001, 0x3f1065c6,
|
||||
0x3f124a45, 0x3f142d65, 0x3f160f0c, 0x3f17ef21, 0x3f19cd8b, 0x3f1baa32,
|
||||
0x3f1d84fb, 0x3f1f5dd0, 0x3f213498, 0x3f230939, 0x3f24db9d, 0x3f26abaa,
|
||||
0x3f28794a, 0x3f2a4464, 0x3f2c0ce1, 0x3f2dd2a9, 0x3f2f95a6, 0x3f3155bf,
|
||||
0x3f3312e0, 0x3f34ccef, 0x3f3683d8, 0x3f383784, 0x3f39e7dd, 0x3f3b94cc,
|
||||
0x3f3d3e3c, 0x3f3ee418, 0x3f40864a, 0x3f4224bd, 0x3f43bf5c, 0x3f455613,
|
||||
0x3f46e8cc, 0x3f487774, 0x3f4a01f6, 0x3f4b883f, 0x3f4d0a3b, 0x3f4e87d6,
|
||||
0x3f5000fe, 0x3f5175a0, 0x3f52e5a9, 0x3f545106, 0x3f55b7a5, 0x3f571975,
|
||||
0x3f587664, 0x3f59ce60, 0x3f5b2158, 0x3f5c6f3b, 0x3f5db7f9, 0x3f5efb80,
|
||||
0x3f6039c2, 0x3f6172af, 0x3f62a636, 0x3f63d448, 0x3f64fcd6, 0x3f661fd3,
|
||||
0x3f673d2e, 0x3f6854db, 0x3f6966ca, 0x3f6a72ef, 0x3f6b793d, 0x3f6c79a5,
|
||||
0x3f6d741d, 0x3f6e6896, 0x3f6f5706, 0x3f703f5f, 0x3f712198, 0x3f71fda4,
|
||||
0x3f72d379, 0x3f73a30c, 0x3f746c52, 0x3f752f43, 0x3f75ebd4, 0x3f76a1fc,
|
||||
0x3f7751b2, 0x3f77faee, 0x3f789da6, 0x3f7939d4, 0x3f79cf6e, 0x3f7a5e6f,
|
||||
0x3f7ae6cf, 0x3f7b6886, 0x3f7be38f, 0x3f7c57e4, 0x3f7cc57f, 0x3f7d2c5b,
|
||||
0x3f7d8c72, 0x3f7de5bf, 0x3f7e3840, 0x3f7e83ee, 0x3f7ec8c7, 0x3f7f06c7,
|
||||
0x3f7f3deb, 0x3f7f6e31, 0x3f7f9795, 0x3f7fba17, 0x3f7fd5b4, 0x3f7fea6b,
|
||||
0x3f7ff83b, 0x3f7fff23, 0x3f7fff23, 0x3f7ff83b, 0x3f7fea6b, 0x3f7fd5b4,
|
||||
0x3f7fba17, 0x3f7f9795, 0x3f7f6e31, 0x3f7f3deb, 0x3f7f06c7, 0x3f7ec8c7,
|
||||
0x3f7e83ee, 0x3f7e3840, 0x3f7de5bf, 0x3f7d8c72, 0x3f7d2c5b, 0x3f7cc57f,
|
||||
0x3f7c57e4, 0x3f7be38f, 0x3f7b6886, 0x3f7ae6cf, 0x3f7a5e6f, 0x3f79cf6e,
|
||||
0x3f7939d4, 0x3f789da6, 0x3f77faee, 0x3f7751b2, 0x3f76a1fc, 0x3f75ebd4,
|
||||
0x3f752f43, 0x3f746c52, 0x3f73a30c, 0x3f72d379, 0x3f71fda4, 0x3f712198,
|
||||
0x3f703f5f, 0x3f6f5706, 0x3f6e6896, 0x3f6d741d, 0x3f6c79a5, 0x3f6b793d,
|
||||
0x3f6a72ef, 0x3f6966ca, 0x3f6854db, 0x3f673d2e, 0x3f661fd3, 0x3f64fcd6,
|
||||
0x3f63d448, 0x3f62a636, 0x3f6172af, 0x3f6039c2, 0x3f5efb80, 0x3f5db7f9,
|
||||
0x3f5c6f3b, 0x3f5b2158, 0x3f59ce60, 0x3f587664, 0x3f571975, 0x3f55b7a5,
|
||||
0x3f545106, 0x3f52e5a9, 0x3f5175a0, 0x3f5000fe, 0x3f4e87d6, 0x3f4d0a3b,
|
||||
0x3f4b883f, 0x3f4a01f6, 0x3f487774, 0x3f46e8cc, 0x3f455613, 0x3f43bf5c,
|
||||
0x3f4224bd, 0x3f40864a, 0x3f3ee418, 0x3f3d3e3c, 0x3f3b94cc, 0x3f39e7dd,
|
||||
0x3f383784, 0x3f3683d8, 0x3f34ccef, 0x3f3312e0, 0x3f3155bf, 0x3f2f95a6,
|
||||
0x3f2dd2a9, 0x3f2c0ce1, 0x3f2a4464, 0x3f28794a, 0x3f26abaa, 0x3f24db9d,
|
||||
0x3f230939, 0x3f213498, 0x3f1f5dd0, 0x3f1d84fb, 0x3f1baa32, 0x3f19cd8b,
|
||||
0x3f17ef21, 0x3f160f0c, 0x3f142d65, 0x3f124a45, 0x3f1065c6, 0x3f0e8001,
|
||||
0x3f0c990f, 0x3f0ab10a, 0x3f08c80b, 0x3f06de2d, 0x3f04f389, 0x3f03083a,
|
||||
0x3f011c59, 0x3efe6002, 0x3efa8698, 0x3ef6aca8, 0x3ef2d267, 0x3eeef80c,
|
||||
0x3eeb1dca, 0x3ee743d7, 0x3ee36a69, 0x3edf91b5, 0x3edbb9f2, 0x3ed7e354,
|
||||
0x3ed40e13, 0x3ed03a64, 0x3ecc687d, 0x3ec89895, 0x3ec4cae2, 0x3ec0ff9b,
|
||||
0x3ebd36f6, 0x3eb9712a, 0x3eb5ae6f, 0x3eb1eefa, 0x3eae3303, 0x3eaa7ac0,
|
||||
0x3ea6c66a, 0x3ea31636, 0x3e9f6a5d, 0x3e9bc316, 0x3e982097, 0x3e948319,
|
||||
0x3e90ead3, 0x3e8d57fc, 0x3e89cacd, 0x3e86437c, 0x3e82c241, 0x3e7e8eaa,
|
||||
0x3e77a5de, 0x3e70ca8f, 0x3e69fd2c, 0x3e633e26, 0x3e5c8ded, 0x3e55ecf2,
|
||||
0x3e4f5ba5, 0x3e48da79, 0x3e4269de, 0x3e3c0a46, 0x3e35bc23, 0x3e2f7fe7,
|
||||
0x3e295605, 0x3e233ef0, 0x3e1d3b1c, 0x3e174afe, 0x3e116f08, 0x3e0ba7b2,
|
||||
0x3e05f571, 0x3e0058bb, 0x3df5a413, 0x3deac3a7, 0x3de01126, 0x3dd58d86,
|
||||
0x3dcb39bf, 0x3dc116cc, 0x3db725ab, 0x3dad675d, 0x3da3dce9, 0x3d9a8756,
|
||||
0x3d9167b5, 0x3d887f19, 0x3d7f9d38, 0x3d6eaebd, 0x3d5e3510, 0x3d4e3293,
|
||||
0x3d3ea9ba, 0x3d2f9d0e, 0x3d210f33, 0x3d1302e6, 0x3d057b06, 0x3cf0f52e,
|
||||
0x3cd8098d, 0x3cc039eb, 0x3ca98d81, 0x3c940c13, 0x3c7f7c1d, 0x3c59595d,
|
||||
0x3c35c45b, 0x3c14d40b, 0x3bed453b, 0x3bb69d19, 0x3b85f871, 0x3b37623a,
|
||||
0x3ae0ee32, 0x3a61d1c5, 0x398b03f6, 0x00000000
|
||||
|
||||
};
|
||||
|
||||
const int32_t window_hamm_hex[] = {
|
||||
0x3da3d70a, 0x3da3f4f1, 0x3da44ea4, 0x3da4e41d, 0x3da5b554, 0x3da6c239,
|
||||
0x3da80abd, 0x3da98ecb, 0x3dab4e4a, 0x3dad491d, 0x3daf7f25, 0x3db1f03d,
|
||||
0x3db49c3e, 0x3db782fd, 0x3dbaa449, 0x3dbdfff1, 0x3dc195be, 0x3dc56575,
|
||||
0x3dc96ed9, 0x3dcdb1a8, 0x3dd22d9d, 0x3dd6e26e, 0x3ddbcfd0, 0x3de0f572,
|
||||
0x3de65301, 0x3debe825, 0x3df1b484, 0x3df7b7c0, 0x3dfdf176, 0x3e0230a1,
|
||||
0x3e05835d, 0x3e08f0ba, 0x3e0c7880, 0x3e101a75, 0x3e13d65f, 0x3e17ac00,
|
||||
0x3e1b9b1b, 0x3e1fa36f, 0x3e23c4bc, 0x3e27febd, 0x3e2c512e, 0x3e30bbc9,
|
||||
0x3e353e46, 0x3e39d85c, 0x3e3e89c0, 0x3e435226, 0x3e483140, 0x3e4d26be,
|
||||
0x3e523251, 0x3e5753a7, 0x3e5c8a6b, 0x3e61d64a, 0x3e6736ec, 0x3e6cabfc,
|
||||
0x3e72351f, 0x3e77d1fd, 0x3e7d8239, 0x3e81a2bc, 0x3e848dae, 0x3e8781c3,
|
||||
0x3e8a7eca, 0x3e8d8495, 0x3e9092f0, 0x3e93a9ab, 0x3e96c894, 0x3e99ef77,
|
||||
0x3e9d1e22, 0x3ea05460, 0x3ea391ff, 0x3ea6d6c8, 0x3eaa2286, 0x3ead7505,
|
||||
0x3eb0ce0f, 0x3eb42d6c, 0x3eb792e6, 0x3ebafe46, 0x3ebe6f54, 0x3ec1e5d9,
|
||||
0x3ec5619c, 0x3ec8e264, 0x3ecc67f8, 0x3ecff220, 0x3ed380a2, 0x3ed71344,
|
||||
0x3edaa9cb, 0x3ede43fe, 0x3ee1e1a3, 0x3ee5827d, 0x3ee92653, 0x3eeccce9,
|
||||
0x3ef07604, 0x3ef42168, 0x3ef7ceda, 0x3efb7e1d, 0x3eff2ef7, 0x3f017096,
|
||||
0x3f034a3f, 0x3f052459, 0x3f06fec5, 0x3f08d967, 0x3f0ab41f, 0x3f0c8ed0,
|
||||
0x3f0e695b, 0x3f1043a2, 0x3f121d87, 0x3f13f6ec, 0x3f15cfb4, 0x3f17a7bf,
|
||||
0x3f197ef0, 0x3f1b5529, 0x3f1d2a4d, 0x3f1efe3d, 0x3f20d0db, 0x3f22a20b,
|
||||
0x3f2471ae, 0x3f263fa8, 0x3f280bda, 0x3f29d628, 0x3f2b9e74, 0x3f2d64a2,
|
||||
0x3f2f2895, 0x3f30ea30, 0x3f32a956, 0x3f3465ec, 0x3f361fd4, 0x3f37d6f3,
|
||||
0x3f398b2d, 0x3f3b3c66, 0x3f3cea83, 0x3f3e9569, 0x3f403cfb, 0x3f41e121,
|
||||
0x3f4381be, 0x3f451eb8, 0x3f46b7f6, 0x3f484d5d, 0x3f49ded3, 0x3f4b6c3f,
|
||||
0x3f4cf588, 0x3f4e7a94, 0x3f4ffb4c, 0x3f517796, 0x3f52ef5a, 0x3f546282,
|
||||
0x3f55d0f4, 0x3f573a9a, 0x3f589f5d, 0x3f59ff26, 0x3f5b59df, 0x3f5caf72,
|
||||
0x3f5dffc9, 0x3f5f4acf, 0x3f60906f, 0x3f61d093, 0x3f630b29, 0x3f64401b,
|
||||
0x3f656f57, 0x3f6698c9, 0x3f67bc5d, 0x3f68da03, 0x3f69f1a6, 0x3f6b0337,
|
||||
0x3f6c0ea3, 0x3f6d13d9, 0x3f6e12c9, 0x3f6f0b62, 0x3f6ffd95, 0x3f70e953,
|
||||
0x3f71ce8c, 0x3f72ad32, 0x3f738537, 0x3f74568d, 0x3f752127, 0x3f75e4f8,
|
||||
0x3f76a1f3, 0x3f77580d, 0x3f780739, 0x3f78af6e, 0x3f79509f, 0x3f79eac3,
|
||||
0x3f7a7dd1, 0x3f7b09be, 0x3f7b8e83, 0x3f7c0c15, 0x3f7c826e, 0x3f7cf187,
|
||||
0x3f7d5957, 0x3f7db9d8, 0x3f7e1305, 0x3f7e64d7, 0x3f7eaf4a, 0x3f7ef258,
|
||||
0x3f7f2dfe, 0x3f7f6237, 0x3f7f8f00, 0x3f7fb457, 0x3f7fd239, 0x3f7fe8a4,
|
||||
0x3f7ff797, 0x3f7fff11, 0x3f7fff11, 0x3f7ff797, 0x3f7fe8a4, 0x3f7fd239,
|
||||
0x3f7fb457, 0x3f7f8f00, 0x3f7f6237, 0x3f7f2dfe, 0x3f7ef258, 0x3f7eaf4a,
|
||||
0x3f7e64d7, 0x3f7e1305, 0x3f7db9d8, 0x3f7d5957, 0x3f7cf187, 0x3f7c826e,
|
||||
0x3f7c0c15, 0x3f7b8e83, 0x3f7b09be, 0x3f7a7dd1, 0x3f79eac3, 0x3f79509f,
|
||||
0x3f78af6e, 0x3f780739, 0x3f77580d, 0x3f76a1f3, 0x3f75e4f8, 0x3f752127,
|
||||
0x3f74568d, 0x3f738537, 0x3f72ad32, 0x3f71ce8c, 0x3f70e953, 0x3f6ffd95,
|
||||
0x3f6f0b62, 0x3f6e12c9, 0x3f6d13d9, 0x3f6c0ea3, 0x3f6b0337, 0x3f69f1a6,
|
||||
0x3f68da03, 0x3f67bc5d, 0x3f6698c9, 0x3f656f57, 0x3f64401b, 0x3f630b29,
|
||||
0x3f61d093, 0x3f60906f, 0x3f5f4acf, 0x3f5dffc9, 0x3f5caf72, 0x3f5b59df,
|
||||
0x3f59ff26, 0x3f589f5d, 0x3f573a9a, 0x3f55d0f4, 0x3f546282, 0x3f52ef5a,
|
||||
0x3f517796, 0x3f4ffb4c, 0x3f4e7a94, 0x3f4cf588, 0x3f4b6c3f, 0x3f49ded3,
|
||||
0x3f484d5d, 0x3f46b7f6, 0x3f451eb8, 0x3f4381be, 0x3f41e121, 0x3f403cfb,
|
||||
0x3f3e9569, 0x3f3cea83, 0x3f3b3c66, 0x3f398b2d, 0x3f37d6f3, 0x3f361fd4,
|
||||
0x3f3465ec, 0x3f32a956, 0x3f30ea30, 0x3f2f2895, 0x3f2d64a2, 0x3f2b9e74,
|
||||
0x3f29d628, 0x3f280bda, 0x3f263fa8, 0x3f2471ae, 0x3f22a20b, 0x3f20d0db,
|
||||
0x3f1efe3d, 0x3f1d2a4d, 0x3f1b5529, 0x3f197ef0, 0x3f17a7bf, 0x3f15cfb4,
|
||||
0x3f13f6ec, 0x3f121d87, 0x3f1043a2, 0x3f0e695b, 0x3f0c8ed0, 0x3f0ab41f,
|
||||
0x3f08d967, 0x3f06fec5, 0x3f052459, 0x3f034a3f, 0x3f017096, 0x3eff2ef7,
|
||||
0x3efb7e1d, 0x3ef7ceda, 0x3ef42168, 0x3ef07604, 0x3eeccce9, 0x3ee92653,
|
||||
0x3ee5827d, 0x3ee1e1a3, 0x3ede43fe, 0x3edaa9cb, 0x3ed71344, 0x3ed380a2,
|
||||
0x3ecff220, 0x3ecc67f8, 0x3ec8e264, 0x3ec5619c, 0x3ec1e5d9, 0x3ebe6f54,
|
||||
0x3ebafe46, 0x3eb792e6, 0x3eb42d6c, 0x3eb0ce0f, 0x3ead7505, 0x3eaa2286,
|
||||
0x3ea6d6c8, 0x3ea391ff, 0x3ea05460, 0x3e9d1e22, 0x3e99ef77, 0x3e96c894,
|
||||
0x3e93a9ab, 0x3e9092f0, 0x3e8d8495, 0x3e8a7eca, 0x3e8781c3, 0x3e848dae,
|
||||
0x3e81a2bc, 0x3e7d8239, 0x3e77d1fd, 0x3e72351f, 0x3e6cabfc, 0x3e6736ec,
|
||||
0x3e61d64a, 0x3e5c8a6b, 0x3e5753a7, 0x3e523251, 0x3e4d26be, 0x3e483140,
|
||||
0x3e435226, 0x3e3e89c0, 0x3e39d85c, 0x3e353e46, 0x3e30bbc9, 0x3e2c512e,
|
||||
0x3e27febd, 0x3e23c4bc, 0x3e1fa36f, 0x3e1b9b1b, 0x3e17ac00, 0x3e13d65f,
|
||||
0x3e101a75, 0x3e0c7880, 0x3e08f0ba, 0x3e05835d, 0x3e0230a1, 0x3dfdf176,
|
||||
0x3df7b7c0, 0x3df1b484, 0x3debe825, 0x3de65301, 0x3de0f572, 0x3ddbcfd0,
|
||||
0x3dd6e26e, 0x3dd22d9d, 0x3dcdb1a8, 0x3dc96ed9, 0x3dc56575, 0x3dc195be,
|
||||
0x3dbdfff1, 0x3dbaa449, 0x3db782fd, 0x3db49c3e, 0x3db1f03d, 0x3daf7f25,
|
||||
0x3dad491d, 0x3dab4e4a, 0x3da98ecb, 0x3da80abd, 0x3da6c239, 0x3da5b554,
|
||||
0x3da4e41d, 0x3da44ea4, 0x3da3f4f1, 0x3da3d70a
|
||||
|
||||
};
|
||||
|
||||
const int global_cmvn_mean_hex[] = {
|
||||
0x413d6566, 0x4147923f, 0x4156ab15, 0x41613d12, 0x416b155b, 0x41722783,
|
||||
0x4176cd05, 0x4178532a, 0x417aa3c3, 0x417aed19, 0x417d4d2c, 0x417e6abb,
|
||||
0x41805848, 0x418122ab, 0x41812b23, 0x418161a8, 0x41810ef9, 0x4180863a,
|
||||
0x41815d8f, 0x417ff8b2, 0x417de2aa, 0x4180a5f2, 0x417e8bd1, 0x418041ac,
|
||||
0x417f2d60, 0x4180487f, 0x417eb835, 0x418018d8, 0x417ef8c1, 0x417ea302,
|
||||
0x417f30cf, 0x417ea0bb, 0x417ebac2, 0x417faab6, 0x417fca4d, 0x41805e45,
|
||||
0x4180e308, 0x4180ef3e, 0x418109fc, 0x4180afa3, 0x418113e2, 0x4180c915,
|
||||
0x41819f86, 0x418190bf, 0x418220bd, 0x4182f2e5, 0x4183e1c7, 0x41843eec,
|
||||
0x4184b066, 0x418574db, 0x41852611, 0x4184fc81, 0x41851b2a, 0x4185a1c7,
|
||||
0x41861152, 0x41868c28, 0x41871930, 0x41871f83, 0x41868893, 0x4185d919,
|
||||
0x4185664b, 0x418480a6, 0x41840e3a, 0x41836ace, 0x4182b217, 0x4181cb79,
|
||||
0x4180fb13, 0x418098b9, 0x41805ded, 0x417ff69a, 0x417f49bd, 0x417ecef8,
|
||||
0x417e286c, 0x417d9135, 0x417cfff4, 0x417ca8f7, 0x417b2e8f, 0x41773788,
|
||||
0x4170b095, 0x4167417f};
|
||||
|
||||
const int global_cmvn_std_hex[] = {
|
||||
0x4040335e, 0x405235d3, 0x40589be4, 0x4054261f, 0x40544ba2, 0x40575418,
|
||||
0x405b6528, 0x40617999, 0x40605fcf, 0x405c9c6d, 0x40590796, 0x405899fc,
|
||||
0x405810b8, 0x40587c40, 0x40592b5e, 0x4057fb12, 0x4057028b, 0x405515d7,
|
||||
0x4053d714, 0x405418c7, 0x405536bc, 0x4052f54e, 0x4052d382, 0x4051201d,
|
||||
0x4050a8d2, 0x4050857f, 0x404ffe85, 0x4050a0da, 0x40517a8a, 0x40508862,
|
||||
0x40504f68, 0x404f3159, 0x404f0930, 0x404e8a2e, 0x404e7383, 0x404eb185,
|
||||
0x404edaa9, 0x404efed2, 0x404ea8f4, 0x404f6d0d, 0x404ee9d9, 0x404f4cca,
|
||||
0x404fb13f, 0x405051c5, 0x40503f5e, 0x4050df6e, 0x4052974e, 0x4053d421,
|
||||
0x40544d48, 0x40544ec8, 0x40550e57, 0x40558287, 0x4055d122, 0x4056b22a,
|
||||
0x4058ea5c, 0x405acbc3, 0x405a89e7, 0x405a88ed, 0x405afadb, 0x405a1c60,
|
||||
0x405a6f46, 0x405b0a24, 0x405b5f44, 0x405cc0a9, 0x405d984b, 0x405ef9b8,
|
||||
0x4061178a, 0x406262bf, 0x40644904, 0x40660b20, 0x4067f7f1, 0x406a35e5,
|
||||
0x406c1e97, 0x406e16a9, 0x406eadb1, 0x406d0cba, 0x406d9ca0, 0x406f5a14,
|
||||
0x406e84a7, 0x406cd985};
|
||||
|
||||
const int global_cmvn_mean_online_hex[] = {
|
||||
|
||||
0x413d5d27, 0x414785ae, 0x4156986a, 0x41612a4e, 0x416b063e, 0x41721c9b,
|
||||
0x4176c505, 0x41784b5b, 0x417a9575, 0x417adfb2, 0x417d4153, 0x417e611e,
|
||||
0x41805288, 0x41811c27, 0x4181250c, 0x41815cd4, 0x41810b77, 0x4180817c,
|
||||
0x41815881, 0x417feaf2, 0x417dd2bf, 0x41809f37, 0x417e7b47, 0x41803a6a,
|
||||
0x417f1ff4, 0x41804382, 0x417ead10, 0x41801220, 0x417eeb28, 0x417e9801,
|
||||
0x417f26b9, 0x417e95f9, 0x417eac06, 0x417f9aa5, 0x417fbb16, 0x41805651,
|
||||
0x4180daaa, 0x4180e84c, 0x41810566, 0x4180ab2c, 0x418111b0, 0x4180c6cc,
|
||||
0x41819e27, 0x418190cc, 0x4182205c, 0x4182f265, 0x4183e1a2, 0x41844012,
|
||||
0x4184b0cd, 0x41857447, 0x418527f7, 0x4184fdc6, 0x41851ad2, 0x4185a148,
|
||||
0x41860f8b, 0x41868888, 0x418712e4, 0x41871702, 0x41867ec3, 0x4185cc48,
|
||||
0x418559b4, 0x41847855, 0x418408f4, 0x418368f4, 0x4182b718, 0x4181d76d,
|
||||
0x41810e52, 0x4180b204, 0x418078a4, 0x41801179, 0x417f5579, 0x417e93b7,
|
||||
0x417d6f2c, 0x417c1a0b, 0x417a6c7a, 0x41787d18, 0x4174eceb, 0x416e3ed3,
|
||||
0x41644af8, 0x41566dd4
|
||||
|
||||
};
|
||||
|
||||
const int global_cmvn_std_online_hex[] = {
|
||||
|
||||
0x40408fdd, 0x405293b6, 0x4058f2d2, 0x40546ddb, 0x4054984c, 0x4057971b,
|
||||
0x405ba086, 0x4061afa7, 0x4060a24c, 0x405cbb7e, 0x405923f7, 0x4058c91f,
|
||||
0x40585cf3, 0x4058c22a, 0x40594960, 0x405824a6, 0x405703f3, 0x40556377,
|
||||
0x4053e02d, 0x40540a7e, 0x405553c7, 0x4052ead5, 0x4052d23d, 0x40510308,
|
||||
0x4050a2f3, 0x40505b81, 0x404fed20, 0x4050a372, 0x40515196, 0x40504810,
|
||||
0x40501fdd, 0x404f2225, 0x404f0931, 0x404e8a2b, 0x404e773b, 0x404ea782,
|
||||
0x404ee17d, 0x404ef49c, 0x404e884d, 0x404f696b, 0x404edd0e, 0x404f23cc,
|
||||
0x404f74d4, 0x40501e89, 0x405009f3, 0x4050c422, 0x4052902b, 0x4053987c,
|
||||
0x40542997, 0x40543695, 0x4054cbef, 0x40553947, 0x4055ab7c, 0x4056887c,
|
||||
0x4058b710, 0x405a8d28, 0x405a6a27, 0x405a6b3b, 0x405ac8d3, 0x405a031d,
|
||||
0x405a2158, 0x405abb1b, 0x405b1350, 0x405c98c0, 0x405d5cf9, 0x405ead5b,
|
||||
0x40609748, 0x4061dfb9, 0x4063aa9f, 0x40655831, 0x40671a35, 0x40694bf5,
|
||||
0x406b1f59, 0x406cb49b, 0x406cf19e, 0x406b592b, 0x406b757c, 0x406c866d,
|
||||
0x406ac24f, 0x406678d9
|
||||
|
||||
};
|
||||
|
||||
const unsigned int paraformer_cmvn_mean_hex[] = {
|
||||
|
||||
0xc104fd75, 0xc1099d56, 0xc119dad7, 0xc126f9a7, 0xc133681f, 0xc13e221f,
|
||||
0xc145cc83, 0xc14a3166, 0xc14e1bda, 0xc14d4a62, 0xc14e41a9, 0xc14f4e7b,
|
||||
0xc153297e, 0xc1567ee5, 0xc157dbab, 0xc158dfa4, 0xc158e6f9, 0xc1584e70,
|
||||
0xc15aecea, 0xc15886b8, 0xc156bcb4, 0xc15a7ba9, 0xc1581d34, 0xc15c0a48,
|
||||
0xc15c463f, 0xc15dfc3b, 0xc15bb28b, 0xc15b4413, 0xc158f8c0, 0xc1588ede,
|
||||
0xc158c880, 0xc158ff19, 0xc159815a, 0xc159ed72, 0xc15a458d, 0xc15a93d3,
|
||||
0xc15a06ec, 0xc15953d8, 0xc1592e92, 0xc1579518, 0xc1587d76, 0xc157bc56,
|
||||
0xc159c47c, 0xc15a5ac4, 0xc15b7286, 0xc15cab60, 0xc15e7f8d, 0xc1607ee5,
|
||||
0xc162e9ad, 0xc165bdb0, 0xc167bf3e, 0xc169a0a5, 0xc16b4b68, 0xc16d5682,
|
||||
0xc16ebd51, 0xc170197a, 0xc170d1cc, 0xc1707fc1, 0xc16fd830, 0xc16ec4b1,
|
||||
0xc16de888, 0xc16d3b06, 0xc16cc155, 0xc16c4e31, 0xc16b6abe, 0xc169cde8,
|
||||
0xc1684578, 0xc166c2a4, 0xc165d326, 0xc164df46, 0xc163b4ad, 0xc1632d19,
|
||||
0xc162a94a, 0xc16280fc, 0xc161ae3e, 0xc15fec42, 0xc15cbadc, 0xc15664c3,
|
||||
0xc14c6d5d, 0xc13b64ae, 0xc104fd75, 0xc1099d56, 0xc119dad7, 0xc126f9a7,
|
||||
0xc133681f, 0xc13e221f, 0xc145cc83, 0xc14a3166, 0xc14e1bda, 0xc14d4a62,
|
||||
0xc14e41a9, 0xc14f4e7b, 0xc153297e, 0xc1567ee5, 0xc157dbab, 0xc158dfa4,
|
||||
0xc158e6f9, 0xc1584e70, 0xc15aecea, 0xc15886b8, 0xc156bcb4, 0xc15a7ba9,
|
||||
0xc1581d34, 0xc15c0a48, 0xc15c463f, 0xc15dfc3b, 0xc15bb28b, 0xc15b4413,
|
||||
0xc158f8c0, 0xc1588ede, 0xc158c880, 0xc158ff19, 0xc159815a, 0xc159ed72,
|
||||
0xc15a458d, 0xc15a93d3, 0xc15a06ec, 0xc15953d8, 0xc1592e92, 0xc1579518,
|
||||
0xc1587d76, 0xc157bc56, 0xc159c47c, 0xc15a5ac4, 0xc15b7286, 0xc15cab60,
|
||||
0xc15e7f8d, 0xc1607ee5, 0xc162e9ad, 0xc165bdb0, 0xc167bf3e, 0xc169a0a5,
|
||||
0xc16b4b68, 0xc16d5682, 0xc16ebd51, 0xc170197a, 0xc170d1cc, 0xc1707fc1,
|
||||
0xc16fd830, 0xc16ec4b1, 0xc16de888, 0xc16d3b06, 0xc16cc155, 0xc16c4e31,
|
||||
0xc16b6abe, 0xc169cde8, 0xc1684578, 0xc166c2a4, 0xc165d326, 0xc164df46,
|
||||
0xc163b4ad, 0xc1632d19, 0xc162a94a, 0xc16280fc, 0xc161ae3e, 0xc15fec42,
|
||||
0xc15cbadc, 0xc15664c3, 0xc14c6d5d, 0xc13b64ae, 0xc104fd75, 0xc1099d56,
|
||||
0xc119dad7, 0xc126f9a7, 0xc133681f, 0xc13e221f, 0xc145cc83, 0xc14a3166,
|
||||
0xc14e1bda, 0xc14d4a62, 0xc14e41a9, 0xc14f4e7b, 0xc153297e, 0xc1567ee5,
|
||||
0xc157dbab, 0xc158dfa4, 0xc158e6f9, 0xc1584e70, 0xc15aecea, 0xc15886b8,
|
||||
0xc156bcb4, 0xc15a7ba9, 0xc1581d34, 0xc15c0a48, 0xc15c463f, 0xc15dfc3b,
|
||||
0xc15bb28b, 0xc15b4413, 0xc158f8c0, 0xc1588ede, 0xc158c880, 0xc158ff19,
|
||||
0xc159815a, 0xc159ed72, 0xc15a458d, 0xc15a93d3, 0xc15a06ec, 0xc15953d8,
|
||||
0xc1592e92, 0xc1579518, 0xc1587d76, 0xc157bc56, 0xc159c47c, 0xc15a5ac4,
|
||||
0xc15b7286, 0xc15cab60, 0xc15e7f8d, 0xc1607ee5, 0xc162e9ad, 0xc165bdb0,
|
||||
0xc167bf3e, 0xc169a0a5, 0xc16b4b68, 0xc16d5682, 0xc16ebd51, 0xc170197a,
|
||||
0xc170d1cc, 0xc1707fc1, 0xc16fd830, 0xc16ec4b1, 0xc16de888, 0xc16d3b06,
|
||||
0xc16cc155, 0xc16c4e31, 0xc16b6abe, 0xc169cde8, 0xc1684578, 0xc166c2a4,
|
||||
0xc165d326, 0xc164df46, 0xc163b4ad, 0xc1632d19, 0xc162a94a, 0xc16280fc,
|
||||
0xc161ae3e, 0xc15fec42, 0xc15cbadc, 0xc15664c3, 0xc14c6d5d, 0xc13b64ae,
|
||||
0xc104fd75, 0xc1099d56, 0xc119dad7, 0xc126f9a7, 0xc133681f, 0xc13e221f,
|
||||
0xc145cc83, 0xc14a3166, 0xc14e1bda, 0xc14d4a62, 0xc14e41a9, 0xc14f4e7b,
|
||||
0xc153297e, 0xc1567ee5, 0xc157dbab, 0xc158dfa4, 0xc158e6f9, 0xc1584e70,
|
||||
0xc15aecea, 0xc15886b8, 0xc156bcb4, 0xc15a7ba9, 0xc1581d34, 0xc15c0a48,
|
||||
0xc15c463f, 0xc15dfc3b, 0xc15bb28b, 0xc15b4413, 0xc158f8c0, 0xc1588ede,
|
||||
0xc158c880, 0xc158ff19, 0xc159815a, 0xc159ed72, 0xc15a458d, 0xc15a93d3,
|
||||
0xc15a06ec, 0xc15953d8, 0xc1592e92, 0xc1579518, 0xc1587d76, 0xc157bc56,
|
||||
0xc159c47c, 0xc15a5ac4, 0xc15b7286, 0xc15cab60, 0xc15e7f8d, 0xc1607ee5,
|
||||
0xc162e9ad, 0xc165bdb0, 0xc167bf3e, 0xc169a0a5, 0xc16b4b68, 0xc16d5682,
|
||||
0xc16ebd51, 0xc170197a, 0xc170d1cc, 0xc1707fc1, 0xc16fd830, 0xc16ec4b1,
|
||||
0xc16de888, 0xc16d3b06, 0xc16cc155, 0xc16c4e31, 0xc16b6abe, 0xc169cde8,
|
||||
0xc1684578, 0xc166c2a4, 0xc165d326, 0xc164df46, 0xc163b4ad, 0xc1632d19,
|
||||
0xc162a94a, 0xc16280fc, 0xc161ae3e, 0xc15fec42, 0xc15cbadc, 0xc15664c3,
|
||||
0xc14c6d5d, 0xc13b64ae, 0xc104fd75, 0xc1099d56, 0xc119dad7, 0xc126f9a7,
|
||||
0xc133681f, 0xc13e221f, 0xc145cc83, 0xc14a3166, 0xc14e1bda, 0xc14d4a62,
|
||||
0xc14e41a9, 0xc14f4e7b, 0xc153297e, 0xc1567ee5, 0xc157dbab, 0xc158dfa4,
|
||||
0xc158e6f9, 0xc1584e70, 0xc15aecea, 0xc15886b8, 0xc156bcb4, 0xc15a7ba9,
|
||||
0xc1581d34, 0xc15c0a48, 0xc15c463f, 0xc15dfc3b, 0xc15bb28b, 0xc15b4413,
|
||||
0xc158f8c0, 0xc1588ede, 0xc158c880, 0xc158ff19, 0xc159815a, 0xc159ed72,
|
||||
0xc15a458d, 0xc15a93d3, 0xc15a06ec, 0xc15953d8, 0xc1592e92, 0xc1579518,
|
||||
0xc1587d76, 0xc157bc56, 0xc159c47c, 0xc15a5ac4, 0xc15b7286, 0xc15cab60,
|
||||
0xc15e7f8d, 0xc1607ee5, 0xc162e9ad, 0xc165bdb0, 0xc167bf3e, 0xc169a0a5,
|
||||
0xc16b4b68, 0xc16d5682, 0xc16ebd51, 0xc170197a, 0xc170d1cc, 0xc1707fc1,
|
||||
0xc16fd830, 0xc16ec4b1, 0xc16de888, 0xc16d3b06, 0xc16cc155, 0xc16c4e31,
|
||||
0xc16b6abe, 0xc169cde8, 0xc1684578, 0xc166c2a4, 0xc165d326, 0xc164df46,
|
||||
0xc163b4ad, 0xc1632d19, 0xc162a94a, 0xc16280fc, 0xc161ae3e, 0xc15fec42,
|
||||
0xc15cbadc, 0xc15664c3, 0xc14c6d5d, 0xc13b64ae, 0xc104fd75, 0xc1099d56,
|
||||
0xc119dad7, 0xc126f9a7, 0xc133681f, 0xc13e221f, 0xc145cc83, 0xc14a3166,
|
||||
0xc14e1bda, 0xc14d4a62, 0xc14e41a9, 0xc14f4e7b, 0xc153297e, 0xc1567ee5,
|
||||
0xc157dbab, 0xc158dfa4, 0xc158e6f9, 0xc1584e70, 0xc15aecea, 0xc15886b8,
|
||||
0xc156bcb4, 0xc15a7ba9, 0xc1581d34, 0xc15c0a48, 0xc15c463f, 0xc15dfc3b,
|
||||
0xc15bb28b, 0xc15b4413, 0xc158f8c0, 0xc1588ede, 0xc158c880, 0xc158ff19,
|
||||
0xc159815a, 0xc159ed72, 0xc15a458d, 0xc15a93d3, 0xc15a06ec, 0xc15953d8,
|
||||
0xc1592e92, 0xc1579518, 0xc1587d76, 0xc157bc56, 0xc159c47c, 0xc15a5ac4,
|
||||
0xc15b7286, 0xc15cab60, 0xc15e7f8d, 0xc1607ee5, 0xc162e9ad, 0xc165bdb0,
|
||||
0xc167bf3e, 0xc169a0a5, 0xc16b4b68, 0xc16d5682, 0xc16ebd51, 0xc170197a,
|
||||
0xc170d1cc, 0xc1707fc1, 0xc16fd830, 0xc16ec4b1, 0xc16de888, 0xc16d3b06,
|
||||
0xc16cc155, 0xc16c4e31, 0xc16b6abe, 0xc169cde8, 0xc1684578, 0xc166c2a4,
|
||||
0xc165d326, 0xc164df46, 0xc163b4ad, 0xc1632d19, 0xc162a94a, 0xc16280fc,
|
||||
0xc161ae3e, 0xc15fec42, 0xc15cbadc, 0xc15664c3, 0xc14c6d5d, 0xc13b64ae,
|
||||
0xc104fd75, 0xc1099d56, 0xc119dad7, 0xc126f9a7, 0xc133681f, 0xc13e221f,
|
||||
0xc145cc83, 0xc14a3166, 0xc14e1bda, 0xc14d4a62, 0xc14e41a9, 0xc14f4e7b,
|
||||
0xc153297e, 0xc1567ee5, 0xc157dbab, 0xc158dfa4, 0xc158e6f9, 0xc1584e70,
|
||||
0xc15aecea, 0xc15886b8, 0xc156bcb4, 0xc15a7ba9, 0xc1581d34, 0xc15c0a48,
|
||||
0xc15c463f, 0xc15dfc3b, 0xc15bb28b, 0xc15b4413, 0xc158f8c0, 0xc1588ede,
|
||||
0xc158c880, 0xc158ff19, 0xc159815a, 0xc159ed72, 0xc15a458d, 0xc15a93d3,
|
||||
0xc15a06ec, 0xc15953d8, 0xc1592e92, 0xc1579518, 0xc1587d76, 0xc157bc56,
|
||||
0xc159c47c, 0xc15a5ac4, 0xc15b7286, 0xc15cab60, 0xc15e7f8d, 0xc1607ee5,
|
||||
0xc162e9ad, 0xc165bdb0, 0xc167bf3e, 0xc169a0a5, 0xc16b4b68, 0xc16d5682,
|
||||
0xc16ebd51, 0xc170197a, 0xc170d1cc, 0xc1707fc1, 0xc16fd830, 0xc16ec4b1,
|
||||
0xc16de888, 0xc16d3b06, 0xc16cc155, 0xc16c4e31, 0xc16b6abe, 0xc169cde8,
|
||||
0xc1684578, 0xc166c2a4, 0xc165d326, 0xc164df46, 0xc163b4ad, 0xc1632d19,
|
||||
0xc162a94a, 0xc16280fc, 0xc161ae3e, 0xc15fec42, 0xc15cbadc, 0xc15664c3,
|
||||
0xc14c6d5d, 0xc13b64ae};
|
||||
|
||||
// CMVN variance (normalization scale) table for the Paraformer frontend,
// stored as raw IEEE-754 single-precision bit patterns; consumers
// reinterpret each 32-bit word as a float at load time.  The same
// 80-entry sequence repeats seven times (560 entries total) —
// presumably one copy per LFR-stacked feature frame; TODO(review):
// confirm the repeat count against the frontend's LFR configuration.
const unsigned int paraformer_cmvn_var_hex[] = {
    0x40619618, 0x405fb77c, 0x405d3028, 0x405bef11, 0x405a189d, 0x4057aad5,
    0x4054f9cc, 0x40518e8c, 0x404fffdd, 0x40510d0d, 0x4052400d, 0x4052bab0,
    0x40526416, 0x40515cb8, 0x40506aee, 0x404fef8d, 0x404ff527, 0x40505b95,
    0x4050d61c, 0x4051d0a5, 0x4052abd2, 0x4052f14b, 0x4053d196, 0x4054800d,
    0x405545f2, 0x4055d71f, 0x40567588, 0x4056de4d, 0x40579b72, 0x40584d35,
    0x4058cd2f, 0x40594731, 0x4059a53f, 0x405a00ed, 0x405a34c1, 0x405a406e,
    0x405a1748, 0x405a0300, 0x405a1547, 0x405a66a7, 0x405a9be4, 0x405b04b2,
    0x405b5754, 0x405b9189, 0x405b9016, 0x405b7a07, 0x405b63f9, 0x405b3f45,
    0x405b0cb4, 0x405ac80b, 0x405ac1f7, 0x405abbd9, 0x405ac86a, 0x405ad72b,
    0x405af2f0, 0x405ab465, 0x405a6364, 0x405a1350, 0x4059baa3, 0x4059911d,
    0x40597921, 0x40595564, 0x40593b8d, 0x4059310f, 0x40594e46, 0x40599bae,
    0x4059e703, 0x4059feec, 0x405a053a, 0x4059feaa, 0x4059d7a0, 0x40599386,
    0x40592d0e, 0x4058ce4c, 0x40587335, 0x4058396a, 0x40584ee1, 0x4058925a,
    0x40592f6d, 0x405a9f0a, 0x40619618, 0x405fb77c, 0x405d3028, 0x405bef11,
    0x405a189d, 0x4057aad5, 0x4054f9cc, 0x40518e8c, 0x404fffdd, 0x40510d0d,
    0x4052400d, 0x4052bab0, 0x40526416, 0x40515cb8, 0x40506aee, 0x404fef8d,
    0x404ff527, 0x40505b95, 0x4050d61c, 0x4051d0a5, 0x4052abd2, 0x4052f14b,
    0x4053d196, 0x4054800d, 0x405545f2, 0x4055d71f, 0x40567588, 0x4056de4d,
    0x40579b72, 0x40584d35, 0x4058cd2f, 0x40594731, 0x4059a53f, 0x405a00ed,
    0x405a34c1, 0x405a406e, 0x405a1748, 0x405a0300, 0x405a1547, 0x405a66a7,
    0x405a9be4, 0x405b04b2, 0x405b5754, 0x405b9189, 0x405b9016, 0x405b7a07,
    0x405b63f9, 0x405b3f45, 0x405b0cb4, 0x405ac80b, 0x405ac1f7, 0x405abbd9,
    0x405ac86a, 0x405ad72b, 0x405af2f0, 0x405ab465, 0x405a6364, 0x405a1350,
    0x4059baa3, 0x4059911d, 0x40597921, 0x40595564, 0x40593b8d, 0x4059310f,
    0x40594e46, 0x40599bae, 0x4059e703, 0x4059feec, 0x405a053a, 0x4059feaa,
    0x4059d7a0, 0x40599386, 0x40592d0e, 0x4058ce4c, 0x40587335, 0x4058396a,
    0x40584ee1, 0x4058925a, 0x40592f6d, 0x405a9f0a, 0x40619618, 0x405fb77c,
    0x405d3028, 0x405bef11, 0x405a189d, 0x4057aad5, 0x4054f9cc, 0x40518e8c,
    0x404fffdd, 0x40510d0d, 0x4052400d, 0x4052bab0, 0x40526416, 0x40515cb8,
    0x40506aee, 0x404fef8d, 0x404ff527, 0x40505b95, 0x4050d61c, 0x4051d0a5,
    0x4052abd2, 0x4052f14b, 0x4053d196, 0x4054800d, 0x405545f2, 0x4055d71f,
    0x40567588, 0x4056de4d, 0x40579b72, 0x40584d35, 0x4058cd2f, 0x40594731,
    0x4059a53f, 0x405a00ed, 0x405a34c1, 0x405a406e, 0x405a1748, 0x405a0300,
    0x405a1547, 0x405a66a7, 0x405a9be4, 0x405b04b2, 0x405b5754, 0x405b9189,
    0x405b9016, 0x405b7a07, 0x405b63f9, 0x405b3f45, 0x405b0cb4, 0x405ac80b,
    0x405ac1f7, 0x405abbd9, 0x405ac86a, 0x405ad72b, 0x405af2f0, 0x405ab465,
    0x405a6364, 0x405a1350, 0x4059baa3, 0x4059911d, 0x40597921, 0x40595564,
    0x40593b8d, 0x4059310f, 0x40594e46, 0x40599bae, 0x4059e703, 0x4059feec,
    0x405a053a, 0x4059feaa, 0x4059d7a0, 0x40599386, 0x40592d0e, 0x4058ce4c,
    0x40587335, 0x4058396a, 0x40584ee1, 0x4058925a, 0x40592f6d, 0x405a9f0a,
    0x40619618, 0x405fb77c, 0x405d3028, 0x405bef11, 0x405a189d, 0x4057aad5,
    0x4054f9cc, 0x40518e8c, 0x404fffdd, 0x40510d0d, 0x4052400d, 0x4052bab0,
    0x40526416, 0x40515cb8, 0x40506aee, 0x404fef8d, 0x404ff527, 0x40505b95,
    0x4050d61c, 0x4051d0a5, 0x4052abd2, 0x4052f14b, 0x4053d196, 0x4054800d,
    0x405545f2, 0x4055d71f, 0x40567588, 0x4056de4d, 0x40579b72, 0x40584d35,
    0x4058cd2f, 0x40594731, 0x4059a53f, 0x405a00ed, 0x405a34c1, 0x405a406e,
    0x405a1748, 0x405a0300, 0x405a1547, 0x405a66a7, 0x405a9be4, 0x405b04b2,
    0x405b5754, 0x405b9189, 0x405b9016, 0x405b7a07, 0x405b63f9, 0x405b3f45,
    0x405b0cb4, 0x405ac80b, 0x405ac1f7, 0x405abbd9, 0x405ac86a, 0x405ad72b,
    0x405af2f0, 0x405ab465, 0x405a6364, 0x405a1350, 0x4059baa3, 0x4059911d,
    0x40597921, 0x40595564, 0x40593b8d, 0x4059310f, 0x40594e46, 0x40599bae,
    0x4059e703, 0x4059feec, 0x405a053a, 0x4059feaa, 0x4059d7a0, 0x40599386,
    0x40592d0e, 0x4058ce4c, 0x40587335, 0x4058396a, 0x40584ee1, 0x4058925a,
    0x40592f6d, 0x405a9f0a, 0x40619618, 0x405fb77c, 0x405d3028, 0x405bef11,
    0x405a189d, 0x4057aad5, 0x4054f9cc, 0x40518e8c, 0x404fffdd, 0x40510d0d,
    0x4052400d, 0x4052bab0, 0x40526416, 0x40515cb8, 0x40506aee, 0x404fef8d,
    0x404ff527, 0x40505b95, 0x4050d61c, 0x4051d0a5, 0x4052abd2, 0x4052f14b,
    0x4053d196, 0x4054800d, 0x405545f2, 0x4055d71f, 0x40567588, 0x4056de4d,
    0x40579b72, 0x40584d35, 0x4058cd2f, 0x40594731, 0x4059a53f, 0x405a00ed,
    0x405a34c1, 0x405a406e, 0x405a1748, 0x405a0300, 0x405a1547, 0x405a66a7,
    0x405a9be4, 0x405b04b2, 0x405b5754, 0x405b9189, 0x405b9016, 0x405b7a07,
    0x405b63f9, 0x405b3f45, 0x405b0cb4, 0x405ac80b, 0x405ac1f7, 0x405abbd9,
    0x405ac86a, 0x405ad72b, 0x405af2f0, 0x405ab465, 0x405a6364, 0x405a1350,
    0x4059baa3, 0x4059911d, 0x40597921, 0x40595564, 0x40593b8d, 0x4059310f,
    0x40594e46, 0x40599bae, 0x4059e703, 0x4059feec, 0x405a053a, 0x4059feaa,
    0x4059d7a0, 0x40599386, 0x40592d0e, 0x4058ce4c, 0x40587335, 0x4058396a,
    0x40584ee1, 0x4058925a, 0x40592f6d, 0x405a9f0a, 0x40619618, 0x405fb77c,
    0x405d3028, 0x405bef11, 0x405a189d, 0x4057aad5, 0x4054f9cc, 0x40518e8c,
    0x404fffdd, 0x40510d0d, 0x4052400d, 0x4052bab0, 0x40526416, 0x40515cb8,
    0x40506aee, 0x404fef8d, 0x404ff527, 0x40505b95, 0x4050d61c, 0x4051d0a5,
    0x4052abd2, 0x4052f14b, 0x4053d196, 0x4054800d, 0x405545f2, 0x4055d71f,
    0x40567588, 0x4056de4d, 0x40579b72, 0x40584d35, 0x4058cd2f, 0x40594731,
    0x4059a53f, 0x405a00ed, 0x405a34c1, 0x405a406e, 0x405a1748, 0x405a0300,
    0x405a1547, 0x405a66a7, 0x405a9be4, 0x405b04b2, 0x405b5754, 0x405b9189,
    0x405b9016, 0x405b7a07, 0x405b63f9, 0x405b3f45, 0x405b0cb4, 0x405ac80b,
    0x405ac1f7, 0x405abbd9, 0x405ac86a, 0x405ad72b, 0x405af2f0, 0x405ab465,
    0x405a6364, 0x405a1350, 0x4059baa3, 0x4059911d, 0x40597921, 0x40595564,
    0x40593b8d, 0x4059310f, 0x40594e46, 0x40599bae, 0x4059e703, 0x4059feec,
    0x405a053a, 0x4059feaa, 0x4059d7a0, 0x40599386, 0x40592d0e, 0x4058ce4c,
    0x40587335, 0x4058396a, 0x40584ee1, 0x4058925a, 0x40592f6d, 0x405a9f0a,
    0x40619618, 0x405fb77c, 0x405d3028, 0x405bef11, 0x405a189d, 0x4057aad5,
    0x4054f9cc, 0x40518e8c, 0x404fffdd, 0x40510d0d, 0x4052400d, 0x4052bab0,
    0x40526416, 0x40515cb8, 0x40506aee, 0x404fef8d, 0x404ff527, 0x40505b95,
    0x4050d61c, 0x4051d0a5, 0x4052abd2, 0x4052f14b, 0x4053d196, 0x4054800d,
    0x405545f2, 0x4055d71f, 0x40567588, 0x4056de4d, 0x40579b72, 0x40584d35,
    0x4058cd2f, 0x40594731, 0x4059a53f, 0x405a00ed, 0x405a34c1, 0x405a406e,
    0x405a1748, 0x405a0300, 0x405a1547, 0x405a66a7, 0x405a9be4, 0x405b04b2,
    0x405b5754, 0x405b9189, 0x405b9016, 0x405b7a07, 0x405b63f9, 0x405b3f45,
    0x405b0cb4, 0x405ac80b, 0x405ac1f7, 0x405abbd9, 0x405ac86a, 0x405ad72b,
    0x405af2f0, 0x405ab465, 0x405a6364, 0x405a1350, 0x4059baa3, 0x4059911d,
    0x40597921, 0x40595564, 0x40593b8d, 0x4059310f, 0x40594e46, 0x40599bae,
    0x4059e703, 0x4059feec, 0x405a053a, 0x4059feaa, 0x4059d7a0, 0x40599386,
    0x40592d0e, 0x4058ce4c, 0x40587335, 0x4058396a, 0x40584ee1, 0x4058925a,
    0x40592f6d, 0x405a9f0a

};
|
||||
|
||||
// Positional-encoding coefficient table, stored as IEEE-754
// single-precision bit patterns (first entry 0x3f800000 == 1.0f, values
// growing row by row).  NOTE(review): presumably the precomputed scale
// factors for Paraformer's sinusoidal position encoding — confirm
// against the encoder code that consumes pos_enc_coe_hex.
const int pos_enc_coe_hex[] = {
    0x3f800000, 0x3f84b063, 0x3f898cc0, 0x3f8e96b2, 0x3f93cfe5, 0x3f993a15,
    0x3f9ed70c, 0x3fa4a8a8, 0x3faab0d5, 0x3fb0f193, 0x3fb76cf5, 0x3fbe2520,
    0x3fc51c50, 0x3fcc54d2, 0x3fd3d10c, 0x3fdb9378, 0x3fe39ea9, 0x3febf549,
    0x3ff49a1b, 0x3ffd8ffe, 0x40036cf4, 0x40083d78, 0x400d3b22, 0x40126799,
    0x4017c496, 0x401d53df, 0x4023174b, 0x402910c4, 0x402f4244, 0x4035adda,
    0x403c55a4, 0x40433bd9, 0x404a62c2, 0x4051ccbd, 0x40597c3f, 0x406173d4,
    0x4069b621, 0x407245e2, 0x407b25ed, 0x40822c9a, 0x4086f161, 0x408be2e0,
    0x409102bc, 0x409652a6, 0x409bd461, 0x40a189c1, 0x40a774aa, 0x40ad9711,
    0x40b3f300, 0x40ba8a92, 0x40c15ff6, 0x40c8756f, 0x40cfcd58, 0x40d76a1e,
    0x40df4e48, 0x40e77c73, 0x40eff755, 0x40f8c1be, 0x4100ef4c, 0x4105a873,
    0x410a8de6, 0x410fa144, 0x4114e43b, 0x411a588a, 0x41200000, 0x4125dc7c,
    0x412beff0, 0x41323c5f, 0x4138c3df, 0x413f889a, 0x41468cd0, 0x414dd2d2,
    0x41555d0a, 0x415d2df7, 0x41654832, 0x416dae69, 0x41766364, 0x417f6a07,
    0x418462a7, 0x41893c2b, 0x418e432a, 0x4193794e, 0x4198e051, 0x419e79ff,
    0x41a44831, 0x41aa4cd6, 0x41b089ea, 0x41b70180, 0x41bdb5bc, 0x41c4a8d7,
    0x41cbdd1e, 0x41d354f5, 0x41db12d6, 0x41e31950, 0x41eb6b0d, 0x41f40ad0,
    0x41fcfb72, 0x42031ff6, 0x4207eda7, 0x420ce865, 0x421211d5, 0x42176bad,
    0x421cf7b4, 0x4222b7c0, 0x4228adb9, 0x422edb98, 0x4235436b, 0x423be74f,
    0x4242c979, 0x4249ec31, 0x425151d4, 0x4258fcd6, 0x4260efc0, 0x42692d37,
    0x4271b7f3, 0x427a92cb, 0x4281e057, 0x4286a253, 0x428b90ed, 0x4290adc8,
    0x4295fa95, 0x429b7917, 0x42a12b1f, 0x42a71290, 0x42ad3160, 0x42b38995,
    0x42ba1d4a, 0x42c0eead, 0x42c80000, 0x42cf539b, 0x42d6ebec, 0x42decb76,
    0x42e6f4d6, 0x42ef6ac1, 0x42f83003, 0x4300a3c3, 0x43055a26, 0x430a3cbb,
    0x430f4d1f, 0x43148d01, 0x4319fe1e, 0x431fa244, 0x43257b51, 0x432b8b36,
    0x4331d3f4, 0x433857a1, 0x433f1865, 0x4346187e, 0x434d5a3e, 0x4354e00b,
    0x435cac64, 0x4364c1e0, 0x436d232b, 0x4375d30c, 0x437ed466, 0x43841519,
    0x4388ebc5, 0x438defd2, 0x439322e8, 0x439886c2, 0x439e1d27, 0x43a3e7f3,
    0x43a9e911, 0x43b0227e, 0x43b6964a, 0x43bd4698, 0x43c435a1, 0x43cb65b0,
    0x43d2d927, 0x43da927e, 0x43e29445, 0x43eae123, 0x43f37bd8, 0x43fc673e,
    0x4402d325, 0x44079e06, 0x440c95d8, 0x4411bc42, 0x441712f8, 0x441c9bbf,
    0x4422586d, 0x44284ae8, 0x442e7528, 0x4434d93a, 0x443b793b, 0x4442575d,
    0x444975e6, 0x4450d734, 0x44587db7, 0x44606bfa, 0x4468a49c, 0x44712a58,
    0x447a0000, 0x44819441, 0x44865373, 0x448b3f2a, 0x44905906, 0x4495a2b9,
    0x449b1e02, 0x44a0ccb4, 0x44a6b0b0, 0x44accbe9, 0x44b32067, 0x44b9b042,
    0x44c07da6, 0x44c78ad5, 0x44ceda26, 0x44d66e03, 0x44de48f1, 0x44e66d89,
    0x44eede7f, 0x44f79e9e, 0x45005867, 0x45050c07, 0x4509ebbf, 0x450ef92c,
    0x451435fb, 0x4519a3e8, 0x451f44bf, 0x45251a60, 0x452b26b7, 0x45316bc7,
    0x4537eba3, 0x453ea872, 0x4545a471, 0x454ce1f0, 0x45546355, 0x455c2b1d,
    0x45643bdc, 0x456c983e, 0x45754309, 0x457e3f1c, 0x4583c7b8, 0x45889b8f,
    0x458d9cab, 0x4592ccb6, 0x45982d67, 0x459dc087, 0x45a387ee, 0x45a98587,
    0x45afbb4e, 0x45b62b53, 0x45bcd7b6, 0x45c3c2af, 0x45caee88, 0x45d25da1,
    0x45da1272, 0x45e20f88, 0x45ea5789, 0x45f2ed34, 0x45fbd360, 0x46028680,
    0x46074e93, 0x460c437c, 0x461166e2, 0x4616ba77};
|
||||
|
||||
// Divisor-term table for the sinusoidal positional encoding, stored as
// IEEE-754 single-precision bit patterns (first entry 1.0f, values
// strictly decreasing).  NOTE(review): shape matches the classic
// div_term = exp(-i * ln(10000) / d_model) sequence — confirm against
// the position-encoding code that consumes it.
const int pos_enc_div_term_hex[] = {
    0x3f800000, 0x3f76f410, 0x3f6e39f8, 0x3f65ced3, 0x3f5dafd7, 0x3f55da52,
    0x3f4e4bac, 0x3f470165, 0x3f3ff911, 0x3f39305c, 0x3f32a506, 0x3f2c54e5,
    0x3f263de0, 0x3f205df3, 0x3f1ab32b, 0x3f153ba8, 0x3f0ff59a, 0x3f0adf41,
    0x3f05f6ee, 0x3f013b01, 0x3ef953cf, 0x3ef0843c, 0x3ee80460, 0x3edfd167,
    0x3ed7e89b, 0x3ed0475c, 0x3ec8eb24, 0x3ec1d181, 0x3ebaf81a, 0x3eb45caa,
    0x3eadfcff, 0x3ea7d6fd, 0x3ea1e89b, 0x3e9c2fe1, 0x3e96aaea, 0x3e9157e1,
    0x3e8c3504, 0x3e87409d, 0x3e827909, 0x3e7bb965, 0x3e72d424, 0x3e6a3f5c,
    0x3e61f836, 0x3e59fbf3, 0x3e5247ed, 0x3e4ad998, 0x3e43ae7c, 0x3e3cc43a,
    0x3e361887, 0x3e2fa92d, 0x3e29740a, 0x3e23770f, 0x3e1db040, 0x3e181db4,
    0x3e12bd91, 0x3e0d8e0f, 0x3e088d77, 0x3e03ba20, 0x3dfe24e1, 0x3df529bb,
    0x3dec7fd5, 0x3de42450, 0x3ddc1466, 0x3dd44d6c, 0x3dcccccd, 0x3dc5900d,
    0x3dbe94c7, 0x3db7d8a9, 0x3db15978, 0x3dab150e, 0x3da50957, 0x3d9f3451,
    0x3d99940e, 0x3d9426b0, 0x3d8eea6c, 0x3d89dd84, 0x3d84fe4d, 0x3d804b29,
    0x3d778512, 0x3d6ec5da, 0x3d6655c3, 0x3d5e3202, 0x3d5657e4, 0x3d4ec4ce,
    0x3d47763f, 0x3d4069ca, 0x3d399d19, 0x3d330dec, 0x3d2cba15, 0x3d269f7d,
    0x3d20bc1d, 0x3d1b0e01, 0x3d159348, 0x3d104a21, 0x3d0b30cc, 0x3d064597,
    0x3d0186e2, 0x3cf9e635, 0x3cf11176, 0x3ce88c9c, 0x3ce054d2, 0x3cd86761,
    0x3cd0c1a8, 0x3cc9611d, 0x3cc24350, 0x3cbb65e3, 0x3cb4c691, 0x3cae6328,
    0x3ca8398b, 0x3ca247ad, 0x3c9c8b97, 0x3c970362, 0x3c91ad39, 0x3c8c8757,
    0x3c879008, 0x3c82c5a5, 0x3c7c4d33, 0x3c7362b9, 0x3c6ac8e7, 0x3c627ce5,
    0x3c5a7bf1, 0x3c52c366, 0x3c4b50b4, 0x3c442163, 0x3c3d3311, 0x3c368373,
    0x3c301052, 0x3c29d789, 0x3c23d70a, 0x3c1e0cd7, 0x3c187705, 0x3c1313ba,
    0x3c0de12d, 0x3c08dda5, 0x3c040779, 0x3bfeba1b, 0x3bf5b9b0, 0x3bed0ab3,
    0x3be4aa46, 0x3bdc95a0, 0x3bd4ca14, 0x3bcd450e, 0x3bc6040e, 0x3bbf04ae,
    0x3bb8449c, 0x3bb1c19b, 0x3bab7983, 0x3ba56a3f, 0x3b9f91cc, 0x3b99ee3b,
    0x3b947dae, 0x3b8f3e56, 0x3b8a2e77, 0x3b854c64, 0x3b80967d, 0x3b781668,
    0x3b6f520d, 0x3b66dd02, 0x3b5eb47a, 0x3b56d5bf, 0x3b4f3e37, 0x3b47eb5e,
    0x3b40dac5, 0x3b3a0a16, 0x3b33770f, 0x3b2d1f81, 0x3b270153, 0x3b211a7e,
    0x3b1b690d, 0x3b15eb1c, 0x3b109edb, 0x3b0b8287, 0x3b06946f, 0x3b01d2f1,
    0x3afa78f1, 0x3af19f03, 0x3ae91528, 0x3ae0d88b, 0x3ad8e673, 0x3ad13c3c,
    0x3ac9d75c, 0x3ac2b561, 0x3abbd3ec, 0x3ab530b7, 0x3aaec98e, 0x3aa89c52,
    0x3aa2a6f6, 0x3a9ce782, 0x3a975c0e, 0x3a9202c3, 0x3a8cd9db, 0x3a87dfa1,
    0x3a83126f, 0x3a7ce158, 0x3a73f1a2, 0x3a6b52c4, 0x3a6301e2, 0x3a5afc3b,
    0x3a533f27, 0x3a4bc816, 0x3a44948c, 0x3a3da229, 0x3a36ee9e, 0x3a3077b3,
    0x3a2a3b43, 0x3a24373e, 0x3a1e69a5, 0x3a18d08b, 0x3a136a16, 0x3a0e347c,
    0x3a092e02, 0x3a0454ff, 0x39ff4fad, 0x39f649f8, 0x39ed95e3, 0x39e5308a,
    0x39dd1726, 0x39d54706, 0x39cdbd95, 0x39c67853, 0x39bf74d7, 0x39b8b0cf,
    0x39b229fb, 0x39abde33, 0x39a5cb5f, 0x399fef7e, 0x399a489e, 0x3994d4df,
    0x398f9272, 0x398a7f9b, 0x39859aa9, 0x3980e1fe, 0x3978a814, 0x396fde93,
    0x39676491, 0x395f373e, 0x395753e5, 0x394fb7e7, 0x394860c1, 0x39414c02,
    0x393a7753, 0x3933e06f, 0x392d8529, 0x39276363, 0x39217917, 0x391bc44d,
    0x39164323, 0x3910f3c6, 0x390bd472, 0x3906e374, 0x39021f2b, 0x38fb0c03,
    0x38f22ce3, 0x38e99e04, 0x38e15c92, 0x38d965ce};
|
||||
#endif
|
||||
|
||||
} // namespace funasr
|
||||
30
modules/python/vendors/FunASR/runtime/onnxruntime/src/punc-model.cpp
vendored
Normal file
30
modules/python/vendors/FunASR/runtime/onnxruntime/src/punc-model.cpp
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
// Factory for punctuation models.
// @param model_path  configuration map; must contain MODEL_DIR and may
//                    contain QUANTIZE ("true" selects the quantized model).
// @param thread_num  number of threads handed to InitPunc.
// @param type        PUNC_OFFLINE -> CTTransformer,
//                    PUNC_ONLINE  -> CTTransformerOnline; anything else
//                    logs an error and exits.
// @return heap-allocated, initialized model; the caller owns it.
PuncModel *CreatePuncModel(std::map<std::string, std::string>& model_path, int thread_num, PUNC_TYPE type)
{
    // Fail fast with a clear log message instead of letting map::at throw
    // an opaque, uncaught std::out_of_range when MODEL_DIR is absent.
    if (model_path.find(MODEL_DIR) == model_path.end()) {
        LOG(ERROR) << "Missing model-dir in model_path";
        exit(-1);
    }
    const std::string &model_dir = model_path.at(MODEL_DIR);

    // Resolve every file path BEFORE allocating the model, so no model
    // object can leak if path resolution throws (the original allocated
    // first and would leak on a throwing map::at).
    std::string punc_model_path = PathAppend(model_dir, MODEL_NAME);
    if (model_path.find(QUANTIZE) != model_path.end() && model_path.at(QUANTIZE) == "true") {
        punc_model_path = PathAppend(model_dir, QUANT_MODEL_NAME);
    }
    std::string punc_config_path = PathAppend(model_dir, PUNC_CONFIG_NAME);
    std::string token_file = PathAppend(model_dir, TOKEN_PATH);

    PuncModel *mm = nullptr;
    if (type == PUNC_OFFLINE) {
        mm = new CTTransformer();
    } else if (type == PUNC_ONLINE) {
        mm = new CTTransformerOnline();
    } else {
        LOG(ERROR) << "Wrong PUNC TYPE";
        exit(-1);
    }

    mm->InitPunc(punc_model_path, punc_config_path, token_file, thread_num);
    return mm;
}
|
||||
|
||||
} // namespace funasr
|
||||
307
modules/python/vendors/FunASR/runtime/onnxruntime/src/resample.cpp
vendored
Normal file
307
modules/python/vendors/FunASR/runtime/onnxruntime/src/resample.cpp
vendored
Normal file
@@ -0,0 +1,307 @@
|
||||
/**
|
||||
* Copyright 2013 Pegah Ghahremani
|
||||
* 2014 IMSL, PKU-HKUST (author: Wei Shi)
|
||||
* 2014 Yanqing Sun, Junjie Wang
|
||||
* 2014 Johns Hopkins University (author: Daniel Povey)
|
||||
* Copyright 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
// this file is copied and modified from
|
||||
// kaldi/src/feat/resample.cc
|
||||
|
||||
#include "resample.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <cstdlib>
|
||||
#include <type_traits>
|
||||
|
||||
namespace funasr {
|
||||
#ifndef M_2PI
|
||||
#define M_2PI 6.283185307179586476925286766559005
|
||||
#endif
|
||||
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.1415926535897932384626433832795
|
||||
#endif
|
||||
|
||||
// Greatest common divisor of two integers; the result is always
// non-negative.  Mirrors kaldi/src/base/kaldi-math.h behavior: if both
// arguments are zero the gcd is undefined (every integer divides zero),
// so we print a diagnostic and abort.
template <class I>
I Gcd(I m, I n) {
  static_assert(std::is_integral<I>::value, "");
  if (m == 0 && n == 0) {
    fprintf(stderr, "Undefined GCD since m = 0, n = 0.\n");
    exit(-1);
  }
  // Work on magnitudes; the classic Euclidean algorithm then produces the
  // same non-negative result as the original sign-juggling loop, and the
  // "one argument is zero" case falls out naturally (gcd(0, b) == b).
  I a = (m < 0) ? -m : m;
  I b = (n < 0) ? -n : n;
  while (b != 0) {
    I r = a % b;
    a = b;
    b = r;
  }
  return a;
}
|
||||
|
||||
/// Returns the least common multiple of two integers.  Will
/// crash (assert) unless the inputs are strictly positive.
template <class I>
I Lcm(I m, I n) {
  // Same arithmetic as kaldi/src/base/kaldi-math.h: dividing each operand
  // by the gcd before multiplying keeps the intermediate products small.
  assert(m > 0 && n > 0);
  const I common = Gcd(m, n);
  return common * (m / common) * (n / common);
}
|
||||
|
||||
// Plain inner product of two length-n float buffers, accumulated in
// order (same summation order as before, so results are bit-identical).
static float DotProduct(const float *a, const float *b, int32_t n) {
  float acc = 0;
  for (int32_t k = 0; k < n; ++k) {
    acc += a[k] * b[k];
  }
  return acc;
}
|
||||
|
||||
// Builds a resampler converting samp_rate_in_hz -> samp_rate_out_hz.
// filter_cutoff_hz must be positive and no more than half of BOTH sample
// rates (anti-aliasing requirement, enforced by the assert below);
// num_zeros controls the sinc filter's sharpness.
LinearResample::LinearResample(int32_t samp_rate_in_hz,
                               int32_t samp_rate_out_hz, float filter_cutoff_hz,
                               int32_t num_zeros)
    : samp_rate_in_(samp_rate_in_hz),
      samp_rate_out_(samp_rate_out_hz),
      filter_cutoff_(filter_cutoff_hz),
      num_zeros_(num_zeros) {
  assert(samp_rate_in_hz > 0.0 && samp_rate_out_hz > 0.0 &&
         filter_cutoff_hz > 0.0 && filter_cutoff_hz * 2 <= samp_rate_in_hz &&
         filter_cutoff_hz * 2 <= samp_rate_out_hz && num_zeros > 0);

  // base_freq is the frequency of the repeating unit, which is the gcd
  // of the input frequencies.
  int32_t base_freq = Gcd(samp_rate_in_, samp_rate_out_);
  input_samples_in_unit_ = samp_rate_in_ / base_freq;
  output_samples_in_unit_ = samp_rate_out_ / base_freq;

  // Precompute the per-phase filter indexes/weights, then clear all
  // streaming state so the first Resample() call starts a fresh signal.
  SetIndexesAndWeights();
  Reset();
}
|
||||
|
||||
// For each of the output_samples_in_unit_ output phases, precomputes the
// first contributing input-sample index (first_index_) and the
// windowed-sinc filter weights (weights_) that Resample() later applies
// as a dot product.
void LinearResample::SetIndexesAndWeights() {
  first_index_.resize(output_samples_in_unit_);
  weights_.resize(output_samples_in_unit_);

  // Half-width of the filter's support, in seconds.
  double window_width = num_zeros_ / (2.0 * filter_cutoff_);

  for (int32_t i = 0; i < output_samples_in_unit_; i++) {
    // Time of this output sample, and the time range of input samples
    // that fall inside the filter's support around it.
    double output_t = i / static_cast<double>(samp_rate_out_);
    double min_t = output_t - window_width, max_t = output_t + window_width;
    // we do ceil on the min and floor on the max, because if we did it
    // the other way around we would unnecessarily include indexes just
    // outside the window, with zero coefficients.  It's possible
    // if the arguments to the ceil and floor expressions are integers
    // (e.g. if filter_cutoff_ has an exact ratio with the sample rates),
    // that we unnecessarily include something with a zero coefficient,
    // but this is only a slight efficiency issue.
    int32_t min_input_index = ceil(min_t * samp_rate_in_),
            max_input_index = floor(max_t * samp_rate_in_),
            num_indices = max_input_index - min_input_index + 1;
    first_index_[i] = min_input_index;
    weights_[i].resize(num_indices);
    for (int32_t j = 0; j < num_indices; j++) {
      int32_t input_index = min_input_index + j;
      double input_t = input_index / static_cast<double>(samp_rate_in_),
             delta_t = input_t - output_t;
      // sign of delta_t doesn't matter (FilterFunc is symmetric).
      // The 1/samp_rate_in_ factor normalizes the discrete convolution.
      weights_[i][j] = FilterFunc(delta_t) / samp_rate_in_;
    }
  }
}
|
||||
|
||||
/** Here, t is a time in seconds representing an offset from
    the center of the windowed filter function, and FilterFunction(t)
    returns the windowed filter function, described
    in the header as h(t) = f(t)g(t), evaluated at t.
*/
float LinearResample::FilterFunc(float t) const {
  float window,  // raised-cosine (Hanning) window of width
                 // num_zeros_/2*filter_cutoff_
      filter;    // sinc filter function
  // Hanning window g(t): nonzero only within num_zeros_/(2*filter_cutoff_)
  // seconds of the center.
  if (fabs(t) < num_zeros_ / (2.0 * filter_cutoff_))
    window = 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t));
  else
    window = 0.0;  // outside support of window function
  // Sinc low-pass filter f(t); the t == 0 branch is the analytic limit,
  // avoiding a 0/0 division.
  if (t != 0)
    filter = sin(M_2PI * filter_cutoff_ * t) / (M_PI * t);
  else
    filter = 2 * filter_cutoff_;  // limit of the function at t = 0
  return filter * window;
}
|
||||
|
||||
// Drops all buffered streaming state so the next Resample() call is
// treated as the start of a brand-new signal.
void LinearResample::Reset() {
  input_sample_offset_ = 0;
  output_sample_offset_ = 0;
  input_remainder_.clear();  // same effect as resize(0): size 0, capacity kept
}
|
||||
|
||||
// Resamples one chunk of audio.  With flush == false, trailing samples
// whose filter window extends past the available input are withheld and
// the tail of "input" is saved (SetRemainder) for the next call; with
// flush == true all remaining samples are emitted (zero-padding past the
// end) and the streaming state is reset.
void LinearResample::Resample(const float *input, int32_t input_dim, bool flush,
                              std::vector<float> *output) {
  int64_t tot_input_samp = input_sample_offset_ + input_dim,
          tot_output_samp = GetNumOutputSamples(tot_input_samp, flush);

  assert(tot_output_samp >= output_sample_offset_);

  // Only the not-yet-emitted part of the output signal is produced here.
  output->resize(tot_output_samp - output_sample_offset_);

  // samp_out is the index into the total output signal, not just the part
  // of it we are producing here.
  for (int64_t samp_out = output_sample_offset_; samp_out < tot_output_samp;
       samp_out++) {
    int64_t first_samp_in;
    int32_t samp_out_wrapped;
    GetIndexes(samp_out, &first_samp_in, &samp_out_wrapped);
    const std::vector<float> &weights = weights_[samp_out_wrapped];
    // first_input_index is the first index into "input" that we have a weight
    // for.
    int32_t first_input_index =
        static_cast<int32_t>(first_samp_in - input_sample_offset_);
    float this_output;
    if (first_input_index >= 0 &&
        first_input_index + static_cast<int32_t>(weights.size()) <= input_dim) {
      // Fast path: the whole filter window lies inside "input".
      this_output =
          DotProduct(input + first_input_index, weights.data(), weights.size());
    } else {  // Handle edge cases: window straddles the saved remainder
              // and/or runs past the end of "input".
      this_output = 0.0;
      for (int32_t i = 0; i < static_cast<int32_t>(weights.size()); i++) {
        float weight = weights[i];
        int32_t input_index = first_input_index + i;
        if (input_index < 0 &&
            static_cast<int32_t>(input_remainder_.size()) + input_index >= 0) {
          // Negative index: read from the remainder of the previous chunk.
          this_output +=
              weight * input_remainder_[input_remainder_.size() + input_index];
        } else if (input_index >= 0 && input_index < input_dim) {
          this_output += weight * input[input_index];
        } else if (input_index >= input_dim) {
          // We're past the end of the input and are adding zero; should only
          // happen if the user specified flush == true, or else we would not
          // be trying to output this sample.
          assert(flush);
        }
      }
    }
    int32_t output_index =
        static_cast<int32_t>(samp_out - output_sample_offset_);
    (*output)[output_index] = this_output;
  }

  if (flush) {
    Reset();  // Reset the internal state.
  } else {
    SetRemainder(input, input_dim);
    input_sample_offset_ = tot_input_samp;
    output_sample_offset_ = tot_output_samp;
  }
}
|
||||
|
||||
// Number of output samples that exist for the first input_num_samp input
// samples.  When flush == false, the filter's half-width is subtracted
// from the usable interval because those trailing outputs still depend
// on input we have not seen yet.  All arithmetic is exact (integer
// "ticks"), so streaming and one-shot use produce identical sample
// counts.
int64_t LinearResample::GetNumOutputSamples(int64_t input_num_samp,
                                            bool flush) const {
  // For exact computation, we measure time in "ticks" of 1.0 / tick_freq,
  // where tick_freq is the least common multiple of samp_rate_in_ and
  // samp_rate_out_.
  int32_t tick_freq = Lcm(samp_rate_in_, samp_rate_out_);
  int32_t ticks_per_input_period = tick_freq / samp_rate_in_;

  // work out the number of ticks in the time interval
  // [ 0, input_num_samp/samp_rate_in_ ).
  int64_t interval_length_in_ticks = input_num_samp * ticks_per_input_period;
  if (!flush) {
    float window_width = num_zeros_ / (2.0 * filter_cutoff_);
    // To count the window-width in ticks we take the floor.  This
    // is because since we're looking for the largest integer num-out-samp
    // that fits in the interval, which is open on the right, a reduction
    // in interval length of less than a tick will never make a difference.
    // For example, the largest integer in the interval [ 0, 2 ) and the
    // largest integer in the interval [ 0, 2 - 0.9 ) are the same (both one).
    // So when we're subtracting the window-width we can ignore the fractional
    // part.
    int32_t window_width_ticks = floor(window_width * tick_freq);
    // The time-period of the output that we can sample gets reduced
    // by the window-width (which is actually the distance from the
    // center to the edge of the windowing function) if we're not
    // "flushing the output".
    interval_length_in_ticks -= window_width_ticks;
  }
  if (interval_length_in_ticks <= 0) return 0;

  int32_t ticks_per_output_period = tick_freq / samp_rate_out_;
  // Get the last output-sample in the closed interval, i.e. replacing [ ) with
  // [ ].  Note: integer division rounds down.  See
  // http://en.wikipedia.org/wiki/Interval_(mathematics) for an explanation of
  // the notation.
  int64_t last_output_samp = interval_length_in_ticks / ticks_per_output_period;
  // We need the last output-sample in the open interval, so if it takes us to
  // the end of the interval exactly, subtract one.
  if (last_output_samp * ticks_per_output_period == interval_length_in_ticks)
    last_output_samp--;

  // First output-sample index is zero, so the number of output samples
  // is the last output-sample plus one.
  int64_t num_output_samp = last_output_samp + 1;
  return num_output_samp;
}
|
||||
|
||||
// inline
// Maps an absolute output-sample index to (a) the absolute index of the
// first input sample its filter touches and (b) the output's phase
// within the repeating unit, which indexes first_index_ / weights_.
void LinearResample::GetIndexes(int64_t samp_out, int64_t *first_samp_in,
                                int32_t *samp_out_wrapped) const {
  // A unit is the smallest nonzero amount of time that is an exact
  // multiple of the input and output sample periods.  The unit index
  // is the answer to "which numbered unit we are in".
  int64_t unit_index = samp_out / output_samples_in_unit_;
  // samp_out_wrapped is equal to samp_out % output_samples_in_unit_
  *samp_out_wrapped =
      static_cast<int32_t>(samp_out - unit_index * output_samples_in_unit_);
  *first_samp_in =
      first_index_[*samp_out_wrapped] + unit_index * input_samples_in_unit_;
}
|
||||
|
||||
// Saves (up to) a full filter-width's worth of trailing input samples so
// a later Resample(..., flush == false) call can compute outputs whose
// windows straddle the chunk boundary.  Slots with no available history
// are left at zero.
void LinearResample::SetRemainder(const float *input, int32_t input_dim) {
  std::vector<float> old_remainder(input_remainder_);
  // max_remainder_needed is the width of the filter from side to side,
  // measured in input samples.  you might think it should be half that,
  // but you have to consider that you might be wanting to output samples
  // that are "in the past" relative to the beginning of the latest
  // input... anyway, storing more remainder than needed is not harmful.
  int32_t max_remainder_needed =
      ceil(samp_rate_in_ * num_zeros_ / filter_cutoff_);
  input_remainder_.resize(max_remainder_needed);
  for (int32_t index = -static_cast<int32_t>(input_remainder_.size());
       index < 0; index++) {
    // we interpret "index" as an offset from the end of "input" and
    // from the end of input_remainder_.
    int32_t input_index = index + input_dim;
    if (input_index >= 0) {
      // Recent enough to come straight from the current chunk.
      input_remainder_[index + static_cast<int32_t>(input_remainder_.size())] =
          input[input_index];
    } else if (input_index + static_cast<int32_t>(old_remainder.size()) >= 0) {
      // Older than the current chunk: copy from the previous remainder.
      input_remainder_[index + static_cast<int32_t>(input_remainder_.size())] =
          old_remainder[input_index +
                        static_cast<int32_t>(old_remainder.size())];
      // else leave it at zero.
    }
  }
}
|
||||
} // namespace funasr
|
||||
138
modules/python/vendors/FunASR/runtime/onnxruntime/src/resample.h
vendored
Normal file
138
modules/python/vendors/FunASR/runtime/onnxruntime/src/resample.h
vendored
Normal file
@@ -0,0 +1,138 @@
|
||||
/**
|
||||
* Copyright 2013 Pegah Ghahremani
|
||||
* 2014 IMSL, PKU-HKUST (author: Wei Shi)
|
||||
* 2014 Yanqing Sun, Junjie Wang
|
||||
* 2014 Johns Hopkins University (author: Daniel Povey)
|
||||
* Copyright 2023 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
// this file is copied and modified from
|
||||
// kaldi/src/feat/resample.h
|
||||
#pragma once
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
namespace funasr {
|
||||
/*
|
||||
We require that the input and output sampling rate be specified as
|
||||
integers, as this is an easy way to specify that their ratio be rational.
|
||||
*/
|
||||
|
||||
/// Linear resampler between two integer sample rates, using a windowed-sinc
/// low-pass filter. Supports streaming input via repeated Resample() calls.
class LinearResample {
 public:
  /// Constructor. We make the input and output sample rates integers, because
  /// we are going to need to find a common divisor. This should just remind
  /// you that they need to be integers. The filter cutoff needs to be less
  /// than samp_rate_in_hz/2 and less than samp_rate_out_hz/2. num_zeros
  /// controls the sharpness of the filter, more == sharper but less efficient.
  /// We suggest around 4 to 10 for normal use.
  LinearResample(int32_t samp_rate_in_hz, int32_t samp_rate_out_hz,
                 float filter_cutoff_hz, int32_t num_zeros);

  /// Calling the function Reset() resets the state of the object prior to
  /// processing a new signal; it is only necessary if you have called
  /// Resample(x, x_size, false, y) for some signal, leading to a remainder of
  /// the signal being called, but then abandon processing the signal before
  /// calling Resample(x, x_size, true, y) for the last piece. Call it
  /// unnecessarily between signals will not do any harm.
  void Reset();

  /// This function does the resampling. If you call it with flush == true and
  /// you have never called it with flush == false, it just resamples the input
  /// signal (it resizes the output to a suitable number of samples).
  ///
  /// You can also use this function to process a signal a piece at a time.
  /// suppose you break it into piece1, piece2, ... pieceN. You can call
  /// \code{.cc}
  /// Resample(piece1, piece1_size, false, &output1);
  /// Resample(piece2, piece2_size, false, &output2);
  /// Resample(piece3, piece3_size, true, &output3);
  /// \endcode
  /// If you call it with flush == false, it won't output the last few samples
  /// but will remember them, so that if you later give it a second piece of
  /// the input signal it can process it correctly.
  /// If your most recent call to the object was with flush == false, it will
  /// have internal state; you can remove this by calling Reset().
  /// Empty input is acceptable.
  void Resample(const float *input, int32_t input_dim, bool flush,
                std::vector<float> *output);

  //// Return the input and output sampling rates (for checks, for example)
  int32_t GetInputSamplingRate() const { return samp_rate_in_; }
  int32_t GetOutputSamplingRate() const { return samp_rate_out_; }

 private:
  /// Precompute first_index_ and weights_ for one repeating unit of output
  /// samples.
  void SetIndexesAndWeights();

  /// Windowed-sinc filter response at a given time offset (seconds).
  float FilterFunc(float) const;

  /// This function outputs the number of output samples we will output
  /// for a signal with "input_num_samp" input samples. If flush == true,
  /// we return the largest n such that
  /// (n/samp_rate_out_) is in the interval [ 0, input_num_samp/samp_rate_in_ ),
  /// and note that the interval is half-open. If flush == false,
  /// define window_width as num_zeros / (2.0 * filter_cutoff_);
  /// we return the largest n such that (n/samp_rate_out_) is in the interval
  /// [ 0, input_num_samp/samp_rate_in_ - window_width ).
  int64_t GetNumOutputSamples(int64_t input_num_samp, bool flush) const;

  /// Given an output-sample index, this function outputs to *first_samp_in the
  /// first input-sample index that we have a weight on (may be negative),
  /// and to *samp_out_wrapped the index into weights_ where we can get the
  /// corresponding weights on the input.
  inline void GetIndexes(int64_t samp_out, int64_t *first_samp_in,
                         int32_t *samp_out_wrapped) const;

  /// Remember the trailing part of "input" needed to continue the convolution
  /// on the next streaming call.
  void SetRemainder(const float *input, int32_t input_dim);

 private:
  // The following variables are provided by the user.
  int32_t samp_rate_in_;
  int32_t samp_rate_out_;
  float filter_cutoff_;
  int32_t num_zeros_;

  int32_t input_samples_in_unit_;   ///< The number of input samples in the
                                    ///< smallest repeating unit: num_samp_in_ =
                                    ///< samp_rate_in_hz / Gcd(samp_rate_in_hz,
                                    ///< samp_rate_out_hz)

  int32_t output_samples_in_unit_;  ///< The number of output samples in the
                                    ///< smallest repeating unit: num_samp_out_
                                    ///< = samp_rate_out_hz /
                                    ///< Gcd(samp_rate_in_hz, samp_rate_out_hz)

  /// The first input-sample index that we sum over, for this output-sample
  /// index. May be negative; any truncation at the beginning is handled
  /// separately. This is just for the first few output samples, but we can
  /// extrapolate the correct input-sample index for arbitrary output samples.
  std::vector<int32_t> first_index_;

  /// Weights on the input samples, for this output-sample index.
  std::vector<std::vector<float>> weights_;

  // the following variables keep track of where we are in a particular signal,
  // if it is being provided over multiple calls to Resample().

  int64_t input_sample_offset_;   ///< The number of input samples we have
                                  ///< already received for this signal
                                  ///< (including anything in remainder_)
  int64_t output_sample_offset_;  ///< The number of samples we have already
                                  ///< output for this signal.
  std::vector<float> input_remainder_;  ///< A small trailing part of the
                                        ///< previously seen input signal.
};
|
||||
} // namespace funasr
|
||||
54
modules/python/vendors/FunASR/runtime/onnxruntime/src/seg_dict.cpp
vendored
Normal file
54
modules/python/vendors/FunASR/runtime/onnxruntime/src/seg_dict.cpp
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
#include "precomp.h"
|
||||
//#include "util.h"
|
||||
//#include "seg_dict.h"
|
||||
#include <glog/logging.h>
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace funasr {
|
||||
SegDict::SegDict(const char *filename)
{
    // Load a word -> token-sequence dictionary from a TSV file where each
    // line is "<word>\t<seg1> <seg2> ...". Lines without a tab-separated
    // second field are ignored.
    ifstream dict_file(filename);
    if (!dict_file) {
        LOG(ERROR) << filename << " open failed !!";
        return;
    }
    for (string row; getline(dict_file, row);) {
        const std::vector<string> fields = split(row, '\t');
        if (fields.size() < 2) {
            continue;  // malformed line: no segmentation column
        }
        seg_dict[fields[0]] = split(fields[1], ' ');
    }
    LOG(INFO) << "load seg dict successfully";
}
||||
std::vector<std::string> SegDict::GetTokensByWord(const std::string &word) {
|
||||
if (seg_dict.count(word))
|
||||
return seg_dict[word];
|
||||
else {
|
||||
LOG(INFO)<< word <<" is OOV!";
|
||||
std::vector<string> vec;
|
||||
return vec;
|
||||
}
|
||||
}
|
||||
|
||||
// Destructor: SegDict owns no raw resources (the map frees itself).
SegDict::~SegDict()
{
}
|
||||
|
||||
|
||||
} // namespace funasr
|
||||
26
modules/python/vendors/FunASR/runtime/onnxruntime/src/seg_dict.h
vendored
Normal file
26
modules/python/vendors/FunASR/runtime/onnxruntime/src/seg_dict.h
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
#ifndef SEG_DICT_H
|
||||
#define SEG_DICT_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
using namespace std;
|
||||
|
||||
namespace funasr {
|
||||
/// Word-segmentation dictionary: maps a word to its token (sub-word)
/// sequence, loaded from a tab-separated file at construction time.
class SegDict {
  private:
    // word -> ordered list of segment tokens
    std::map<string, std::vector<string>> seg_dict;

  public:
    /// Load the dictionary from `filename` (TSV: word<TAB>seg1 seg2 ...).
    SegDict(const char *filename);
    ~SegDict();
    /// Tokens for `word`; empty vector when the word is OOV.
    std::vector<std::string> GetTokensByWord(const std::string &word);
};
|
||||
|
||||
} // namespace funasr
|
||||
#endif
|
||||
359
modules/python/vendors/FunASR/runtime/onnxruntime/src/sensevoice-small.cpp
vendored
Normal file
359
modules/python/vendors/FunASR/runtime/onnxruntime/src/sensevoice-small.cpp
vendored
Normal file
@@ -0,0 +1,359 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#include "precomp.h"
|
||||
#include "sensevoice-small.h"
|
||||
#include <cstddef>
|
||||
|
||||
using namespace std;
|
||||
namespace funasr {
|
||||
|
||||
// Construct with an ONNX Runtime environment that only logs errors.
// Hotword support is disabled for this model (no hotword branch in
// SenseVoice-small).
SenseVoiceSmall::SenseVoiceSmall()
    :use_hotword(false),
    env_(ORT_LOGGING_LEVEL_ERROR, "sensevoice"),session_options_{} {
}
|
||||
|
||||
// offline
|
||||
/// Offline initialization: read the model YAML config, configure fbank
/// extraction, create the ONNX Runtime session, and load vocabulary + CMVN.
/// Exits the process if the ONNX model cannot be loaded.
void SenseVoiceSmall::InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){
    // Must run first: fills n_mels/window_type/frame_* used just below.
    LoadConfigFromYaml(am_config.c_str());
    // knf options — configure kaldi-native-fbank to match training-time features.
    fbank_opts_.frame_opts.dither = 0;
    fbank_opts_.mel_opts.num_bins = n_mels;
    fbank_opts_.frame_opts.samp_freq = asr_sample_rate;
    fbank_opts_.frame_opts.window_type = window_type;
    fbank_opts_.frame_opts.frame_shift_ms = frame_shift;
    fbank_opts_.frame_opts.frame_length_ms = frame_length;
    fbank_opts_.energy_floor = 0;
    fbank_opts_.mel_opts.debug_mel = false;

    // session_options_.SetInterOpNumThreads(1);
    session_options_.SetIntraOpNumThreads(thread_num);
    session_options_.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
    // DisableCpuMemArena can improve performance
    session_options_.DisableCpuMemArena();

    try {
        m_session_ = std::make_unique<Ort::Session>(env_, ORTSTRING(am_model).c_str(), session_options_);
        LOG(INFO) << "Successfully load model from " << am_model;
    } catch (std::exception const &e) {
        // Fatal: without a session nothing else can work.
        LOG(ERROR) << "Error when load am onnx model: " << e.what();
        exit(-1);
    }

    // Cache input/output tensor names once so Forward() can reuse them.
    GetInputNames(m_session_.get(), m_strInputNames, m_szInputNames);
    GetOutputNames(m_session_.get(), m_strOutputNames, m_szOutputNames);
    vocab = new Vocab(token_file.c_str());
    LoadCmvn(am_cmvn.c_str());
}
|
||||
|
||||
void SenseVoiceSmall::LoadConfigFromYaml(const char* filename){
|
||||
|
||||
YAML::Node config;
|
||||
try{
|
||||
config = YAML::LoadFile(filename);
|
||||
}catch(exception const &e){
|
||||
LOG(ERROR) << "Error loading file, yaml file error or not exist.";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
try{
|
||||
YAML::Node frontend_conf = config["frontend_conf"];
|
||||
YAML::Node encoder_conf = config["encoder_conf"];
|
||||
|
||||
this->window_type = frontend_conf["window"].as<string>();
|
||||
this->n_mels = frontend_conf["n_mels"].as<int>();
|
||||
this->frame_length = frontend_conf["frame_length"].as<int>();
|
||||
this->frame_shift = frontend_conf["frame_shift"].as<int>();
|
||||
this->lfr_m = frontend_conf["lfr_m"].as<int>();
|
||||
this->lfr_n = frontend_conf["lfr_n"].as<int>();
|
||||
|
||||
this->encoder_size = encoder_conf["output_size"].as<int>();
|
||||
this->fsmn_dims = encoder_conf["output_size"].as<int>();
|
||||
|
||||
this->asr_sample_rate = frontend_conf["fs"].as<int>();
|
||||
}catch(exception const &e){
|
||||
LOG(ERROR) << "Error when load argument from vad config YAML.";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
// Release owned raw pointers. `delete` on a null pointer is a no-op, so the
// per-pointer null checks of the previous version are unnecessary.
SenseVoiceSmall::~SenseVoiceSmall()
{
    delete vocab;
    delete lm_vocab;
    delete seg_dict;
    delete phone_set_;
}
|
||||
|
||||
// No per-utterance state to set up for this offline model; required by the
// Model interface.
void SenseVoiceSmall::StartUtterance()
{
}
|
||||
|
||||
// No per-utterance state to tear down; required by the Model interface.
void SenseVoiceSmall::EndUtterance()
{
}
|
||||
|
||||
// Nothing to reset: Forward() is stateless between calls for this model.
void SenseVoiceSmall::Reset()
{
}
|
||||
|
||||
/// Compute kaldi-style fbank features for a mono waveform.
/// @param sample_rate  sampling rate of `waves` (Hz)
/// @param waves        samples, assumed normalized to [-1, 1] — TODO confirm
/// @param len          number of samples
/// @param asr_feats    output: one vector of n_mels values per frame (appended)
void SenseVoiceSmall::FbankKaldi(float sample_rate, const float* waves, int len, std::vector<std::vector<float>> &asr_feats) {
    knf::OnlineFbank fbank_(fbank_opts_);
    std::vector<float> buf(len);
    // Rescale to 16-bit PCM range, which is what the fbank frontend expects.
    for (int32_t i = 0; i != len; ++i) {
        buf[i] = waves[i] * 32768;
    }
    fbank_.AcceptWaveform(sample_rate, buf.data(), buf.size());

    // Copy every ready frame out of the online extractor.
    int32_t frames = fbank_.NumFramesReady();
    for (int32_t i = 0; i != frames; ++i) {
        const float *frame = fbank_.GetFrame(i);
        std::vector<float> frame_vector(frame, frame + fbank_opts_.mel_opts.num_bins);
        asr_feats.emplace_back(frame_vector);
    }
}
|
||||
|
||||
void SenseVoiceSmall::LoadCmvn(const char *filename)
|
||||
{
|
||||
ifstream cmvn_stream(filename);
|
||||
if (!cmvn_stream.is_open()) {
|
||||
LOG(ERROR) << "Failed to open file: " << filename;
|
||||
exit(-1);
|
||||
}
|
||||
string line;
|
||||
|
||||
while (getline(cmvn_stream, line)) {
|
||||
istringstream iss(line);
|
||||
vector<string> line_item{istream_iterator<string>{iss}, istream_iterator<string>{}};
|
||||
if (line_item[0] == "<AddShift>") {
|
||||
getline(cmvn_stream, line);
|
||||
istringstream means_lines_stream(line);
|
||||
vector<string> means_lines{istream_iterator<string>{means_lines_stream}, istream_iterator<string>{}};
|
||||
if (means_lines[0] == "<LearnRateCoef>") {
|
||||
for (int j = 3; j < means_lines.size() - 1; j++) {
|
||||
means_list_.push_back(stof(means_lines[j]));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else if (line_item[0] == "<Rescale>") {
|
||||
getline(cmvn_stream, line);
|
||||
istringstream vars_lines_stream(line);
|
||||
vector<string> vars_lines{istream_iterator<string>{vars_lines_stream}, istream_iterator<string>{}};
|
||||
if (vars_lines[0] == "<LearnRateCoef>") {
|
||||
for (int j = 3; j < vars_lines.size() - 1; j++) {
|
||||
vars_list_.push_back(stof(vars_lines[j])*scale);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Greedy CTC decoding of the model's logits.
/// @param in                 logits, [T, vocab_size] row-major; advanced in place
/// @param paraformer_length  paraformer_length[0] = number of valid frames T
/// @param outputShape        output tensor shape; outputShape[2] = vocab size
/// @return "<lang><emo><event> text" — the first four collapsed tokens are the
///         language / emotion / event / itn special tags.
string SenseVoiceSmall::CTCSearch(float * in, std::vector<int32_t> paraformer_length, std::vector<int64_t> outputShape)
{
    // "▁" is the sentencepiece word-boundary marker (3 bytes in UTF-8).
    std::string unicodeChar = "▁";
    int32_t vocab_size = outputShape[2];

    std::vector<int64_t> tokens;
    std::string text="";
    int32_t prev_id = -1;
    for (int32_t t = 0; t != paraformer_length[0]; ++t) {
        // argmax over the vocabulary for frame t
        auto y = std::distance(
            static_cast<const float *>(in),
            std::max_element(
                static_cast<const float *>(in),
                static_cast<const float *>(in) + vocab_size));
        in += vocab_size;

        // standard CTC collapse: drop blanks and consecutive repeats
        if (y != blank_id && y != prev_id) {
            tokens.push_back(y);
        }
        prev_id = y;
    }
    string str_lang = "";
    string str_emo = "";
    string str_event = "";
    string str_itn = "";
    // BUGFIX: the original guard was tokens.size() >= 3 but tokens[3] is read
    // below — out-of-bounds access when exactly 3 tokens were emitted.
    if(tokens.size() >= 4){
        str_lang = vocab->Id2String(tokens[0]);
        str_emo = vocab->Id2String(tokens[1]);
        str_event = vocab->Id2String(tokens[2]);
        str_itn = vocab->Id2String(tokens[3]);
    }

    // Remaining tokens are text pieces; "▁"-prefixed pieces start a new word.
    for(size_t i = 4; i < tokens.size(); ++i){
        string word = vocab->Id2String(tokens[i]);
        size_t found = word.find(unicodeChar);
        if(found != std::string::npos){
            text += " " + word.substr(3);  // strip the 3-byte "▁" prefix
        }else{
            text += word;
        }
    }
    // When inverse text normalization is on, close the sentence with the
    // language-appropriate full stop.
    if(str_itn == "<|withitn|>"){
        if(str_lang == "<|zh|>"){
            text += "。";
        }else{
            text += ".";
        }
    }

    return str_lang + str_emo + str_event + " " + text;
}
|
||||
|
||||
/// Apply low-frame-rate (LFR) stacking and CMVN in place:
/// stacks lfr_m consecutive frames into one super-frame every lfr_n frames,
/// then normalizes each dimension with the loaded means/vars.
void SenseVoiceSmall::LfrCmvn(std::vector<std::vector<float>> &asr_feats) {

    std::vector<std::vector<float>> out_feats;
    int T = asr_feats.size();
    // Number of LFR output frames (ceiling division).
    int T_lrf = ceil(1.0 * T / lfr_n);

    // Pad frames at start(copy first frame)
    for (int i = 0; i < (lfr_m - 1) / 2; i++) {
        asr_feats.insert(asr_feats.begin(), asr_feats[0]);
    }
    // Merge lfr_m frames as one,lfr_n frames per window
    T = T + (lfr_m - 1) / 2;
    std::vector<float> p;
    for (int i = 0; i < T_lrf; i++) {
        if (lfr_m <= T - i * lfr_n) {
            // Full window available: concatenate lfr_m frames.
            for (int j = 0; j < lfr_m; j++) {
                p.insert(p.end(), asr_feats[i * lfr_n + j].begin(), asr_feats[i * lfr_n + j].end());
            }
            out_feats.emplace_back(p);
            p.clear();
        } else {
            // Fill to lfr_m frames at last window if less than lfr_m frames (copy last frame)
            int num_padding = lfr_m - (T - i * lfr_n);
            for (int j = 0; j < (asr_feats.size() - i * lfr_n); j++) {
                p.insert(p.end(), asr_feats[i * lfr_n + j].begin(), asr_feats[i * lfr_n + j].end());
            }
            for (int j = 0; j < num_padding; j++) {
                p.insert(p.end(), asr_feats[asr_feats.size() - 1].begin(), asr_feats[asr_feats.size() - 1].end());
            }
            out_feats.emplace_back(p);
            p.clear();
        }
    }
    // Apply cmvn
    for (auto &out_feat: out_feats) {
        // means_list_/vars_list_ length equals lfr_m * n_mels — TODO confirm
        // against the loaded cmvn file.
        for (int j = 0; j < means_list_.size(); j++) {
            out_feat[j] = (out_feat[j] + means_list_[j]) * vars_list_[j];
        }
    }
    asr_feats = out_feats;
}
|
||||
|
||||
std::vector<std::vector<float>> SenseVoiceSmall::CompileHotwordEmbedding(std::string &hotwords) {
|
||||
int embedding_dim = encoder_size;
|
||||
std::vector<std::vector<float>> hw_emb;
|
||||
std::vector<float> vec(embedding_dim, 0);
|
||||
hw_emb.push_back(vec);
|
||||
return hw_emb;
|
||||
}
|
||||
|
||||
/// Run one offline recognition pass.
/// @param din            array of waveform pointers (only din[0] is used)
/// @param len            per-waveform sample counts (only len[0] is used)
/// @param input_finished unused here (offline model)
/// @param svs_lang       language hint; key into lid_map, "auto" by default
/// @param svs_itn        whether to apply inverse text normalization
/// @param batch_in       must be 1 — larger batches return an empty result
/// @return one decoded string per input (always exactly one entry)
std::vector<std::string> SenseVoiceSmall::Forward(float** din, int* len, bool input_finished, std::string svs_lang, bool svs_itn, int batch_in)
{
    std::vector<std::string> results;
    string result="";
    int32_t in_feat_dim = fbank_opts_.mel_opts.num_bins;

    // Only batch size 1 is supported; return an empty transcript otherwise.
    if(batch_in != 1){
        results.push_back(result);
        return results;
    }

    // Feature extraction: fbank, then LFR stacking + CMVN.
    std::vector<std::vector<float>> asr_feats;
    FbankKaldi(asr_sample_rate, din[0], len[0], asr_feats);
    if(asr_feats.size() == 0){
        results.push_back(result);
        return results;
    }
    LfrCmvn(asr_feats);
    int32_t feat_dim = lfr_m*in_feat_dim;
    int32_t num_frames = asr_feats.size();

    // Flatten [num_frames][feat_dim] into one contiguous buffer for ORT.
    std::vector<float> wav_feats;
    for (const auto &frame_feat: asr_feats) {
        wav_feats.insert(wav_feats.end(), frame_feat.begin(), frame_feat.end());
    }

    //lid textnorm
    // Map the language hint and ITN flag to the model's special token ids
    // (14 = with-itn, 15 = without-itn).
    int svs_lid = 0;
    int svs_itnid = 15;
    if(lid_map.find(svs_lang) != lid_map.end()){
        svs_lid = lid_map[svs_lang];
    }
    if(svs_itn){
        svs_itnid = 14;
    }

#ifdef _WIN_X86
    Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
#else
    Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
#endif

    // Input 0: features [1, num_frames, feat_dim].
    const int64_t input_shape_[3] = {1, num_frames, feat_dim};
    Ort::Value onnx_feats = Ort::Value::CreateTensor<float>(m_memoryInfo,
        wav_feats.data(),
        wav_feats.size(),
        input_shape_,
        3);

    // Input 1: valid frame count [1].
    const int64_t paraformer_length_shape[1] = {1};
    std::vector<int32_t> paraformer_length;
    paraformer_length.emplace_back(num_frames);
    Ort::Value onnx_feats_len = Ort::Value::CreateTensor<int32_t>(
        m_memoryInfo, paraformer_length.data(), paraformer_length.size(), paraformer_length_shape, 1);

    // Input 2: language id [1].
    const int64_t lid_shape[1] = {1};
    std::vector<int32_t> lid_length;
    lid_length.emplace_back(svs_lid);
    Ort::Value onnx_lid = Ort::Value::CreateTensor<int32_t>(
        m_memoryInfo, lid_length.data(), lid_length.size(), lid_shape, 1);

    // Input 3: text-normalization id [1].
    const int64_t textnorm_shape[1] = {1};
    std::vector<int32_t> textnorm_length;
    textnorm_length.emplace_back(svs_itnid);
    Ort::Value onnx_itn = Ort::Value::CreateTensor<int32_t>(
        m_memoryInfo, textnorm_length.data(), textnorm_length.size(), textnorm_shape, 1);

    std::vector<Ort::Value> input_onnx;
    input_onnx.emplace_back(std::move(onnx_feats));
    input_onnx.emplace_back(std::move(onnx_feats_len));
    input_onnx.emplace_back(std::move(onnx_lid));
    input_onnx.emplace_back(std::move(onnx_itn));

    try {
        auto outputTensor = m_session_->Run(Ort::RunOptions{nullptr}, m_szInputNames.data(), input_onnx.data(), input_onnx.size(), m_szOutputNames.data(), m_szOutputNames.size());
        float* floatData = outputTensor[0].GetTensorMutableData<float>();
        std::vector<int64_t> outputShape = outputTensor[0].GetTensorTypeAndShapeInfo().GetShape();

        // Greedy CTC decode of the logits into the final transcript.
        result = CTCSearch(floatData, paraformer_length, outputShape);
    }
    catch (std::exception const &e)
    {
        // Best-effort: on inference failure return the (empty) result.
        LOG(ERROR)<<e.what();
    }

    results.push_back(result);
    return results;
}
|
||||
|
||||
// LM rescoring is not implemented for this model; logs and returns an empty
// string so callers degrade gracefully.
string SenseVoiceSmall::Rescoring()
{
    LOG(ERROR)<<"Not Imp!!!!!!";
    return "";
}
|
||||
} // namespace funasr
|
||||
116
modules/python/vendors/FunASR/runtime/onnxruntime/src/sensevoice-small.h
vendored
Normal file
116
modules/python/vendors/FunASR/runtime/onnxruntime/src/sensevoice-small.h
vendored
Normal file
@@ -0,0 +1,116 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "precomp.h"
|
||||
#include "phone-set.h"
|
||||
|
||||
namespace funasr {
|
||||
|
||||
/// Offline SenseVoice-small ASR model backed by ONNX Runtime.
/// Produces "<lang><emo><event> text" transcripts via greedy CTC decoding.
class SenseVoiceSmall : public Model {
private:
    // Owned resources, released in the destructor.
    Vocab* vocab = nullptr;
    Vocab* lm_vocab = nullptr;
    SegDict* seg_dict = nullptr;
    PhoneSet* phone_set_ = nullptr;
    const float scale = 1.0;  // extra factor applied to CMVN variances

    void LoadConfigFromYaml(const char* filename);
    void LoadCmvn(const char *filename);
    void LfrCmvn(std::vector<std::vector<float>> &asr_feats);

    // Hotword compiler session — unused by this model (use_hotword is false).
    std::shared_ptr<Ort::Session> hw_m_session = nullptr;
    Ort::Env hw_env_;
    Ort::SessionOptions hw_session_options;
    vector<string> hw_m_strInputNames, hw_m_strOutputNames;
    vector<const char*> hw_m_szInputNames;
    vector<const char*> hw_m_szOutputNames;
    bool use_hotword;

public:
    SenseVoiceSmall();
    ~SenseVoiceSmall();
    // Offline initialization: config, session, vocab, CMVN.
    void InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num);
    // online
    // void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num);
    // 2pass
    // void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num);
    // void InitHwCompiler(const std::string &hw_model, int thread_num);
    // void InitSegDict(const std::string &seg_dict_model);
    // Returns a single zero embedding (no hotword branch in this model).
    std::vector<std::vector<float>> CompileHotwordEmbedding(std::string &hotwords);
    void Reset();
    void FbankKaldi(float sample_rate, const float* waves, int len, std::vector<std::vector<float>> &asr_feats);
    // Main entry point: only batch_in == 1 is supported.
    std::vector<std::string> Forward(float** din, int* len, bool input_finished=true, std::string svs_lang="auto", bool svs_itn=true, int batch_in=1);
    string CTCSearch( float * in, std::vector<int32_t> paraformer_length, std::vector<int64_t> outputShape);

    string Rescoring();
    string GetLang(){return language;};
    int GetAsrSampleRate() { return asr_sample_rate; };
    int GetBatchSize() {return batch_size_;};
    void StartUtterance();
    void EndUtterance();
    // void InitLm(const std::string &lm_file, const std::string &lm_cfg_file, const std::string &lex_file);
    // string BeamSearch(WfstDecoder* &wfst_decoder, float* in, int n_len, int64_t token_nums);
    // string FinalizeDecode(WfstDecoder* &wfst_decoder,
    //                      bool is_stamp=false, std::vector<float> us_alphas={0}, std::vector<float> us_cif_peak={0});
    // Vocab* GetVocab();
    // Vocab* GetLmVocab();
    // PhoneSet* GetPhoneSet();

    // Feature-extraction configuration and CMVN statistics.
    knf::FbankOptions fbank_opts_;
    vector<float> means_list_;
    vector<float> vars_list_;
    int lfr_m = PARA_LFR_M;  // frames stacked per LFR super-frame
    int lfr_n = PARA_LFR_N;  // LFR hop

    // paraformer-offline
    std::shared_ptr<Ort::Session> m_session_ = nullptr;
    Ort::Env env_;
    Ort::SessionOptions session_options_;

    vector<string> m_strInputNames, m_strOutputNames;
    vector<const char*> m_szInputNames;
    vector<const char*> m_szOutputNames;

    std::string language="zh-cn";

    // paraformer-online (unused by the offline path)
    std::shared_ptr<Ort::Session> encoder_session_ = nullptr;
    std::shared_ptr<Ort::Session> decoder_session_ = nullptr;
    vector<string> en_strInputNames, en_strOutputNames;
    vector<const char*> en_szInputNames_;
    vector<const char*> en_szOutputNames_;
    vector<string> de_strInputNames, de_strOutputNames;
    vector<const char*> de_szInputNames_;
    vector<const char*> de_szOutputNames_;

    // lm
    std::shared_ptr<fst::Fst<fst::StdArc>> lm_ = nullptr;

    // Defaults; overwritten from the YAML config in LoadConfigFromYaml().
    string window_type = "hamming";
    int frame_length = 25;   // ms
    int frame_shift = 10;    // ms
    int n_mels = 80;
    int encoder_size = 512;
    int fsmn_layers = 16;
    int fsmn_lorder = 10;
    int fsmn_dims = 512;
    int asr_sample_rate = MODEL_SAMPLE_RATE;
    int batch_size_ = 1;
    int blank_id = 0;  // CTC blank token id
    //dict
    // Language hint -> model language-token id.
    std::map<std::string, int> lid_map = {
        {"auto", 0},
        {"zh", 3},
        {"en", 4},
        {"yue", 7},
        {"ja", 11},
        {"ko", 12},
        {"nospeech", 13}
    };

};
|
||||
|
||||
} // namespace funasr
|
||||
161
modules/python/vendors/FunASR/runtime/onnxruntime/src/tensor.h
vendored
Normal file
161
modules/python/vendors/FunASR/runtime/onnxruntime/src/tensor.h
vendored
Normal file
@@ -0,0 +1,161 @@
|
||||
#ifndef TENSOR_H
|
||||
#define TENSOR_H
|
||||
|
||||
#include "alignedmem.h"
|
||||
#include "stdio.h"
|
||||
#include <iostream>
|
||||
#include <cstring>
|
||||
using namespace std;
|
||||
|
||||
namespace funasr {
|
||||
|
||||
template <typename T> class Tensor {
|
||||
private:
|
||||
void alloc_buff();
|
||||
void free_buff();
|
||||
int mem_size;
|
||||
|
||||
public:
|
||||
T *buff;
|
||||
int size[4];
|
||||
int buff_size;
|
||||
Tensor(Tensor<T> *in);
|
||||
Tensor(int a);
|
||||
Tensor(int a, int b);
|
||||
Tensor(int a, int b, int c);
|
||||
Tensor(int a, int b, int c, int d);
|
||||
~Tensor();
|
||||
void zeros();
|
||||
void shape();
|
||||
void disp();
|
||||
void dump(const char *mode);
|
||||
void concat(Tensor<T> *din, int dim);
|
||||
void resize(int a, int b, int c, int d);
|
||||
void add(float coe, Tensor<T> *in);
|
||||
void add(Tensor<T> *in);
|
||||
void add(Tensor<T> *in1, Tensor<T> *in2);
|
||||
void reload(Tensor<T> *in);
|
||||
};
|
||||
|
||||
template <typename T> Tensor<T>::Tensor(int a) : size{1, 1, 1, a}
|
||||
{
|
||||
alloc_buff();
|
||||
}
|
||||
|
||||
template <typename T> Tensor<T>::Tensor(int a, int b) : size{1, 1, a, b}
|
||||
{
|
||||
alloc_buff();
|
||||
}
|
||||
|
||||
template <typename T> Tensor<T>::Tensor(int a, int b, int c) : size{1, a, b, c}
|
||||
{
|
||||
|
||||
alloc_buff();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Tensor<T>::Tensor(int a, int b, int c, int d) : size{a, b, c, d}
|
||||
{
|
||||
alloc_buff();
|
||||
}
|
||||
|
||||
template <typename T> Tensor<T>::Tensor(Tensor<T> *in)
|
||||
{
|
||||
memcpy(size, in->size, 4 * sizeof(int));
|
||||
alloc_buff();
|
||||
memcpy(buff, in->buff, in->buff_size * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T> Tensor<T>::~Tensor()
|
||||
{
|
||||
free_buff();
|
||||
}
|
||||
|
||||
template <typename T> void Tensor<T>::alloc_buff()
|
||||
{
|
||||
buff_size = size[0] * size[1] * size[2] * size[3];
|
||||
mem_size = buff_size;
|
||||
buff = (T *)AlignedMalloc(32, buff_size * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T> void Tensor<T>::free_buff()
|
||||
{
|
||||
aligned_free(buff);
|
||||
}
|
||||
|
||||
template <typename T> void Tensor<T>::zeros()
|
||||
{
|
||||
memset(buff, 0, buff_size * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T> void Tensor<T>::shape()
|
||||
{
|
||||
printf("(%d,%d,%d,%d)\n", size[0], size[1], size[2], size[3]);
|
||||
}
|
||||
|
||||
// TODO:: fix it!!!!
|
||||
template <typename T> void Tensor<T>::concat(Tensor<T> *din, int dim)
|
||||
{
|
||||
memcpy(buff + buff_size, din->buff, din->buff_size * sizeof(T));
|
||||
buff_size += din->buff_size;
|
||||
size[dim] += din->size[dim];
|
||||
}
|
||||
|
||||
// TODO:: fix it!!!!
|
||||
template <typename T> void Tensor<T>::resize(int a, int b, int c, int d)
|
||||
{
|
||||
size[0] = a;
|
||||
size[1] = b;
|
||||
size[2] = c;
|
||||
size[3] = d;
|
||||
buff_size = size[0] * size[1] * size[2] * size[3];
|
||||
}
|
||||
|
||||
template <typename T> void Tensor<T>::add(float coe, Tensor<T> *in)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < buff_size; i++) {
|
||||
buff[i] = buff[i] + coe * in->buff[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T> void Tensor<T>::add(Tensor<T> *in)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < buff_size; i++) {
|
||||
buff[i] = buff[i] + in->buff[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T> void Tensor<T>::add(Tensor<T> *in1, Tensor<T> *in2)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < buff_size; i++) {
|
||||
buff[i] = buff[i] + in1->buff[i] + in2->buff[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T> void Tensor<T>::reload(Tensor<T> *in)
|
||||
{
|
||||
memcpy(buff, in->buff, in->buff_size * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T> void Tensor<T>::disp()
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < buff_size; i++) {
|
||||
cout << buff[i] << " ";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
template <typename T> void Tensor<T>::dump(const char *mode)
|
||||
{
|
||||
FILE *fp;
|
||||
fp = fopen("tmp.bin", mode);
|
||||
fwrite(buff, 1, buff_size * sizeof(T), fp);
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
} // namespace funasr
|
||||
#endif
|
||||
351
modules/python/vendors/FunASR/runtime/onnxruntime/src/tokenizer.cpp
vendored
Normal file
351
modules/python/vendors/FunASR/runtime/onnxruntime/src/tokenizer.cpp
vendored
Normal file
@@ -0,0 +1,351 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
// Construct and immediately load the tokenizer config; OpenYaml() sets
// m_ready on success.
CTokenizer::CTokenizer(const char* sz_yamlfile):m_ready(false)
{
    OpenYaml(sz_yamlfile);
}
|
||||
|
||||
// Default-construct in the not-ready state; call OpenYaml() before use.
CTokenizer::CTokenizer():m_ready(false)
{
}
|
||||
|
||||
// Release the owned jieba resources. `delete` on a null pointer is a no-op,
// so no guards are needed.
CTokenizer::~CTokenizer()
{
    delete jieba_dict_trie_;
    delete jieba_model_;
}
|
||||
|
||||
// Hand the (non-owned here) jieba dictionary and HMM model to the internal
// segmenter.
void CTokenizer::SetJiebaRes(cppjieba::DictTrie *dict, cppjieba::HMMModel *hmm) {
    jieba_processor_.SetJiebaRes(dict, hmm);
}
|
||||
|
||||
/// Load the cppjieba dictionary, user dictionary and HMM model from the
/// directory containing `punc_config`, when jieba segmentation is enabled by
/// the config. Exits the process on load failure.
void CTokenizer::JiebaInit(std::string punc_config){
    if (seg_jieba){
        // Derive the model directory by stripping the config file name
        // (PUNC_CONFIG_NAME) from the end of the path.
        std::string model_path = punc_config.substr(0, punc_config.length() - (sizeof(PUNC_CONFIG_NAME)-1));
        std::string jieba_dict_file = PathAppend(model_path, JIEBA_DICT);
        std::string jieba_hmm_file = PathAppend(model_path, JIEBA_HMM_MODEL);
        std::string jieba_userdict_file = PathAppend(model_path, JIEBA_USERDICT);
        try{
            jieba_dict_trie_ = new cppjieba::DictTrie(jieba_dict_file, jieba_userdict_file);
            LOG(INFO) << "Successfully load file from " << jieba_dict_file << ", " << jieba_userdict_file;
        }catch(exception const &e){
            LOG(ERROR) << "Error loading file, Jieba dict file error or not exist.";
            exit(-1);
        }

        try{
            jieba_model_ = new cppjieba::HMMModel(jieba_hmm_file);
            LOG(INFO) << "Successfully load model from " << jieba_hmm_file;
        }catch(exception const &e){
            LOG(ERROR) << "Error loading file, Jieba hmm file error or not exist.";
            exit(-1);
        }

        SetJiebaRes(jieba_dict_trie_, jieba_model_);
    }else {
        // Jieba disabled: leave both resources unset.
        jieba_dict_trie_ = nullptr;
        jieba_model_ = nullptr;
    }
}
|
||||
|
||||
/// Debug helper: recursively walk a YAML node and log every scalar value.
/// (Original comments were mojibake-encoded GBK; translated to English.)
void CTokenizer::ReadYaml(const YAML::Node& node)
{
    if (node.IsMap())
    {   // map node: recurse into each value
        for (auto it = node.begin(); it != node.end(); ++it)
        {
            ReadYaml(it->second);
        }
    }
    if (node.IsSequence()) {  // sequence node: recurse into each element
        for (size_t i = 0; i < node.size(); ++i) {
            ReadYaml(node[i]);
        }
    }
    if (node.IsScalar()) {  // scalar leaf: log its value
        LOG(INFO) << node.as<string>();
    }
}
|
||||
|
||||
bool CTokenizer::OpenYaml(const char* sz_yamlfile)
|
||||
{
|
||||
YAML::Node m_Config;
|
||||
try{
|
||||
m_Config = YAML::LoadFile(sz_yamlfile);
|
||||
}catch(exception const &e){
|
||||
LOG(INFO) << "Error loading file, yaml file error or not exist.";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
YAML::Node conf_seg_jieba = m_Config["seg_jieba"];
|
||||
if (conf_seg_jieba.IsDefined()){
|
||||
seg_jieba = conf_seg_jieba.as<bool>();
|
||||
}
|
||||
|
||||
auto Tokens = m_Config["token_list"];
|
||||
if (Tokens.IsSequence())
|
||||
{
|
||||
for (size_t i = 0; i < Tokens.size(); ++i)
|
||||
{
|
||||
if (Tokens[i].IsScalar())
|
||||
{
|
||||
m_id2token.push_back(Tokens[i].as<string>());
|
||||
m_token2id.insert(make_pair<string, int>(Tokens[i].as<string>(), i));
|
||||
}
|
||||
}
|
||||
}
|
||||
auto Puncs = m_Config["punc_list"];
|
||||
if (Puncs.IsSequence())
|
||||
{
|
||||
for (size_t i = 0; i < Puncs.size(); ++i)
|
||||
{
|
||||
if (Puncs[i].IsScalar())
|
||||
{
|
||||
m_id2punc.push_back(Puncs[i].as<string>());
|
||||
m_punc2id.insert(make_pair<string, int>(Puncs[i].as<string>(), i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (YAML::BadFile& e) {
|
||||
LOG(ERROR) << "Read error!";
|
||||
return false;
|
||||
}
|
||||
m_ready = true;
|
||||
return m_ready;
|
||||
}
|
||||
|
||||
bool CTokenizer::OpenYaml(const char* sz_yamlfile, const char* token_file)
|
||||
{
|
||||
YAML::Node m_Config;
|
||||
try{
|
||||
m_Config = YAML::LoadFile(sz_yamlfile);
|
||||
}catch(exception const &e){
|
||||
LOG(INFO) << "Error loading file, yaml file error or not exist.";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
YAML::Node conf_seg_jieba = m_Config["seg_jieba"];
|
||||
if (conf_seg_jieba.IsDefined()){
|
||||
seg_jieba = conf_seg_jieba.as<bool>();
|
||||
}
|
||||
|
||||
auto Puncs = m_Config["model_conf"]["punc_list"];
|
||||
if (Puncs.IsSequence())
|
||||
{
|
||||
for (size_t i = 0; i < Puncs.size(); ++i)
|
||||
{
|
||||
if (Puncs[i].IsScalar())
|
||||
{
|
||||
m_id2punc.push_back(Puncs[i].as<string>());
|
||||
m_punc2id.insert(make_pair<string, int>(Puncs[i].as<string>(), i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
nlohmann::json json_array;
|
||||
std::ifstream file(token_file);
|
||||
if (file.is_open()) {
|
||||
file >> json_array;
|
||||
file.close();
|
||||
} else {
|
||||
LOG(INFO) << "Error loading token file, token file error or not exist.";
|
||||
return false;
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
for (const auto& element : json_array) {
|
||||
m_id2token.push_back(element);
|
||||
m_token2id[element] = i;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
catch (YAML::BadFile& e) {
|
||||
LOG(ERROR) << "Read error!";
|
||||
return false;
|
||||
}
|
||||
m_ready = true;
|
||||
return m_ready;
|
||||
}
|
||||
|
||||
// Map token ids back to their string forms.
// NOTE(review): ids are assumed to be valid indices into m_id2token —
// out-of-range ids are undefined behavior; confirm callers guarantee this.
vector<string> CTokenizer::Id2String(vector<int> input)
{
    vector<string> result;
    result.reserve(input.size());
    for (size_t idx = 0; idx < input.size(); ++idx)
    {
        result.push_back(m_id2token[input[idx]]);
    }
    return result;
}
|
||||
|
||||
// Map one token string to its id. Unknown tokens fall back to UNK_CHAR's id,
// or 0 (<blank>) if even UNK_CHAR is not in the vocabulary.
int CTokenizer::String2Id(string input)
{
    auto it = m_token2id.find(input);
    if (it != m_token2id.end())
        return it->second;
    // Bugfix: the old code used m_token2id[UNK_CHAR], which silently INSERTED
    // UNK_CHAR with value 0 into the map when absent; find() avoids the
    // mutation while returning the same value (0) in that case.
    auto unk = m_token2id.find(UNK_CHAR);
    return (unk != m_token2id.end()) ? unk->second : 0; // 0 == <blank>
}
|
||||
|
||||
// Map token strings to ids; tokens are lower-cased first (the vocabulary
// stores lower-case forms). Unknown tokens map to UNK_CHAR's id (0 if absent).
vector<int> CTokenizer::String2Ids(vector<string> input)
{
    // Resolve the fallback id once. Bugfix: the old code looked up via
    // m_token2id[UNK_CHAR], which could insert UNK_CHAR into the map.
    int unk_id = 0; // 0 == <blank>, the value operator[] would have produced
    auto unk = m_token2id.find(UNK_CHAR);
    if (unk != m_token2id.end())
        unk_id = unk->second;

    vector<int> result;
    result.reserve(input.size());
    for (auto& item : input)
    {
        transform(item.begin(), item.end(), item.begin(), ::tolower);
        // Single lookup instead of the old find() followed by operator[].
        auto it = m_token2id.find(item);
        result.push_back(it != m_token2id.end() ? it->second : unk_id);
    }
    return result;
}
|
||||
|
||||
// Map punctuation ids back to their string forms.
// NOTE(review): no bounds check — ids must be valid indices into m_id2punc.
vector<string> CTokenizer::Id2Punc(vector<int> input)
{
    vector<string> out;
    out.reserve(input.size());
    for (size_t k = 0; k < input.size(); ++k)
    {
        out.push_back(m_id2punc[input[k]]);
    }
    return out;
}
|
||||
|
||||
// Map one punctuation id to its string form.
// No bounds check: the caller must pass a valid index into m_id2punc.
string CTokenizer::Id2Punc(int n_punc_id)
{
    return m_id2punc[n_punc_id];
}
|
||||
|
||||
// Map punctuation strings to ids; unknown punctuation maps to 0.
vector<int> CTokenizer::Punc2Ids(vector<string> input)
{
    vector<int> result;
    result.reserve(input.size());
    for (const auto& item : input)
    {
        // Bugfix: the old m_punc2id[item] inserted unknown punctuation into
        // the map (value-initialized to 0). find() returns the same 0 for
        // unknowns without mutating the map.
        auto it = m_punc2id.find(item);
        result.push_back(it != m_punc2id.end() ? it->second : 0);
    }
    return result;
}
|
||||
|
||||
// True iff the given string is a known punctuation token.
bool CTokenizer::IsPunc(string& Punc)
{
    return m_punc2id.count(Punc) != 0;
}
|
||||
|
||||
// Split a UTF-8 string into individual characters (one string per code
// point). The byte length of each character is derived from its lead byte.
vector<string> CTokenizer::SplitChineseString(const string & str_info)
{
    vector<string> list;
    int strSize = str_info.size();
    int i = 0;

    while (i < strSize) {
        int len = 1; // ASCII lead byte (high bit clear) -> 1 byte
        // Count the consecutive high bits set in the lead byte; for a
        // multi-byte UTF-8 sequence that count equals its total byte length.
        for (int j = 0; j < 6 && (str_info[i] & (0x80 >> j)); j++) {
            len = j + 1;
        }
        list.push_back(str_info.substr(i, len));
        i += len;
    }
    return list;
}
|
||||
|
||||
// Segment a Chinese string into words with jieba (HMM mode disabled).
vector<string> CTokenizer::SplitChineseJieba(const string & str_info)
{
    vector<string> words;
    jieba_processor_.Cut(str_info, words, false);
    return words;
}
|
||||
|
||||
void CTokenizer::StrSplit(const string& str, const char split, vector<string>& res)
|
||||
{
|
||||
if (str == "")
|
||||
{
|
||||
return;
|
||||
}
|
||||
string&& strs = str + split;
|
||||
size_t pos = strs.find(split);
|
||||
|
||||
while (pos != string::npos)
|
||||
{
|
||||
res.emplace_back(strs.substr(0, pos));
|
||||
strs = move(strs.substr(pos + 1, strs.size()));
|
||||
pos = strs.find(split);
|
||||
}
|
||||
}
|
||||
|
||||
// Split a mixed Chinese/English input into tokens and map them to ids.
// The input is first split on spaces; within each piece, runs of ASCII bytes
// are collected as whole English words, and runs of non-ASCII (UTF-8) bytes
// are split per character — or by jieba word segmentation when seg_jieba is
// enabled. Ids are produced via String2Ids (unknown tokens -> UNK_CHAR).
void CTokenizer::Tokenize(const char* str_info, vector<string> & str_out, vector<int> & id_out)
{
    vector<string> strList;
    StrSplit(str_info,' ', strList);
    string current_eng,current_chinese;
    for (auto& item : strList)
    {
        current_eng = "";
        current_chinese = "";
        for (auto& ch : item)
        {
            if (!(ch& 0x80))
            { // ASCII byte: part of an English run
                if (current_chinese.size() > 0)
                {
                    // run switched Chinese -> English: flush the Chinese run
                    // for utf-8 chinese
                    vector<string> chineseList;
                    if(seg_jieba){
                        chineseList = SplitChineseJieba(current_chinese);
                    }else{
                        chineseList = SplitChineseString(current_chinese);
                    }
                    str_out.insert(str_out.end(), chineseList.begin(),chineseList.end());
                    current_chinese = "";
                }
                current_eng += ch;
            }
            else
            { // non-ASCII byte: part of a Chinese (UTF-8) run
                if (current_eng.size() > 0)
                {
                    // run switched English -> Chinese: flush the English word
                    str_out.push_back(current_eng);
                    current_eng = "";
                }
                current_chinese += ch;
            }
        }
        // end of a space-delimited piece: flush whichever run remains
        if (current_chinese.size() > 0)
        {
            // for utf-8 chinese
            vector<string> chineseList;
            if(seg_jieba){
                chineseList = SplitChineseJieba(current_chinese);
            }else{
                chineseList = SplitChineseString(current_chinese);
            }
            str_out.insert(str_out.end(), chineseList.begin(), chineseList.end());
            current_chinese = "";
        }
        if (current_eng.size() > 0)
        {
            str_out.push_back(current_eng);
        }
    }
    id_out= String2Ids(str_out);
}
|
||||
|
||||
} // namespace funasr
|
||||
49
modules/python/vendors/FunASR/runtime/onnxruntime/src/tokenizer.h
vendored
Normal file
49
modules/python/vendors/FunASR/runtime/onnxruntime/src/tokenizer.h
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
/**
|
||||
* Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
* MIT License (https://opensource.org/licenses/MIT)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <yaml-cpp/yaml.h>
|
||||
#include "cppjieba/DictTrie.hpp"
|
||||
#include "cppjieba/HMMModel.hpp"
|
||||
#include "cppjieba/Jieba.hpp"
|
||||
#include "nlohmann/json.hpp"
|
||||
|
||||
namespace funasr {
|
||||
/// Tokenizer used by the punctuation model: converts between token /
/// punctuation strings and integer ids, and splits mixed Chinese/English
/// text (per UTF-8 character, or with jieba when seg_jieba is enabled).
class CTokenizer {
private:

    bool m_ready = false;                  // set once OpenYaml() succeeded
    vector<string> m_id2token,m_id2punc;   // id -> token / punctuation string
    map<string, int> m_token2id,m_punc2id; // token / punctuation string -> id

    // Jieba resources: allocated in JiebaInit() and shared with
    // jieba_processor_ via SetJiebaRes(); released in the destructor.
    cppjieba::DictTrie *jieba_dict_trie_=nullptr;
    cppjieba::HMMModel *jieba_model_=nullptr;
    cppjieba::Jieba jieba_processor_;

public:

    CTokenizer(const char* sz_yamlfile);
    CTokenizer();
    ~CTokenizer();
    // Load token_list/punc_list from a model yaml; the two-argument overload
    // reads punc_list from the yaml and the token vocabulary from a json file.
    bool OpenYaml(const char* sz_yamlfile);
    bool OpenYaml(const char* sz_yamlfile, const char* token_file);
    // Recursively log every scalar value of a yaml node (debug helper).
    void ReadYaml(const YAML::Node& node);
    vector<string> Id2String(vector<int> input);
    vector<int> String2Ids(vector<string> input);
    int String2Id(string input);
    vector<string> Id2Punc(vector<int> input);
    string Id2Punc(int n_punc_id);
    vector<int> Punc2Ids(vector<string> input);
    // Split a UTF-8 run per character / per jieba word respectively.
    vector<string> SplitChineseString(const string& str_info);
    vector<string> SplitChineseJieba(const string& str_info);
    void StrSplit(const string& str, const char split, vector<string>& res);
    // Main entry point: raw text -> token strings + token ids.
    void Tokenize(const char* str_info, vector<string>& str_out, vector<int>& id_out);
    bool IsPunc(string& Punc);
    bool seg_jieba = false;  // use jieba segmentation for Chinese runs
    void SetJiebaRes(cppjieba::DictTrie *dict, cppjieba::HMMModel *hmm);
    void JiebaInit(std::string punc_config);
};
|
||||
|
||||
} // namespace funasr
|
||||
28
modules/python/vendors/FunASR/runtime/onnxruntime/src/tpass-online-stream.cpp
vendored
Normal file
28
modules/python/vendors/FunASR/runtime/onnxruntime/src/tpass-online-stream.cpp
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
// Build the online (streaming) stage on top of an existing TpassStream:
// wraps the parent's offline VAD/ASR handles with their online counterparts.
// Exits the process when the parent stream is missing either handle.
TpassOnlineStream::TpassOnlineStream(TpassStream* tpass_stream, std::vector<int> chunk_size){
    // Fix: the old code re-cast the already-typed parameter to its own type
    // (`TpassStream* tpass_obj = (TpassStream*)tpass_stream;`); use it directly.
    if(tpass_stream->vad_handle){
        vad_online_handle = make_unique<FsmnVadOnline>((FsmnVad*)(tpass_stream->vad_handle).get());
    }else{
        LOG(ERROR)<<"vad_handle is null";
        exit(-1);
    }

    if(tpass_stream->asr_handle){
        asr_online_handle = make_unique<ParaformerOnline>((Paraformer*)(tpass_stream->asr_handle).get(), chunk_size);
    }else{
        LOG(ERROR)<<"asr_handle is null";
        exit(-1);
    }
}
|
||||
|
||||
// Factory for an online 2-pass stream; the caller owns the returned object.
// tpass_stream must point to a TpassStream (void* for the C-style API).
TpassOnlineStream* CreateTpassOnlineStream(void* tpass_stream, std::vector<int> chunk_size)
{
    // static_cast documents the intended void* -> TpassStream* conversion
    // (the old C-style cast permitted unrelated conversions as well).
    return new TpassOnlineStream(static_cast<TpassStream*>(tpass_stream), chunk_size);
}
|
||||
|
||||
} // namespace funasr
|
||||
136
modules/python/vendors/FunASR/runtime/onnxruntime/src/tpass-stream.cpp
vendored
Normal file
136
modules/python/vendors/FunASR/runtime/onnxruntime/src/tpass-stream.cpp
vendored
Normal file
@@ -0,0 +1,136 @@
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
// Assemble the 2-pass (streaming + offline rescoring) pipeline from the
// model directories named in `model_path`. VAD, LM, punctuation and ITN
// stages are optional (skipped with a log message when files are missing);
// the offline+online AM pair is mandatory and its absence exits the process.
// NOTE(review): the hotword/token paths below use model_path.at(MODEL_DIR),
// which throws std::out_of_range if MODEL_DIR was not supplied — confirm
// callers always populate it alongside OFFLINE/ONLINE_MODEL_DIR.
TpassStream::TpassStream(std::map<std::string, std::string>& model_path, int thread_num)
{
    // VAD model (optional)
    if(model_path.find(VAD_DIR) != model_path.end()){
        string vad_model_path;
        string vad_cmvn_path;
        string vad_config_path;

        vad_model_path = PathAppend(model_path.at(VAD_DIR), MODEL_NAME);
        if(model_path.find(VAD_QUANT) != model_path.end() && model_path.at(VAD_QUANT) == "true"){
            // prefer the quantized onnx model when requested
            vad_model_path = PathAppend(model_path.at(VAD_DIR), QUANT_MODEL_NAME);
        }
        vad_cmvn_path = PathAppend(model_path.at(VAD_DIR), VAD_CMVN_NAME);
        vad_config_path = PathAppend(model_path.at(VAD_DIR), VAD_CONFIG_NAME);
        if (access(vad_model_path.c_str(), F_OK) != 0 ||
            access(vad_cmvn_path.c_str(), F_OK) != 0 ||
            access(vad_config_path.c_str(), F_OK) != 0 )
        {
            LOG(INFO) << "VAD model file is not exist, skip load vad model.";
        }else{
            vad_handle = make_unique<FsmnVad>();
            vad_handle->InitVad(vad_model_path, vad_cmvn_path, vad_config_path, thread_num);
            use_vad = true;
        }
    }

    // AM model (mandatory): needs both the offline model and the online
    // encoder/decoder pair for 2-pass decoding.
    if(model_path.find(OFFLINE_MODEL_DIR) != model_path.end() && model_path.find(ONLINE_MODEL_DIR) != model_path.end()){
        // 2pass
        string am_model_path;
        string en_model_path;
        string de_model_path;
        string am_cmvn_path;
        string am_config_path;
        string token_path;
        string hw_compile_model_path;
        string seg_dict_path;

        asr_handle = make_unique<Paraformer>();

        // NOTE(review): enable_hotword is set but never read afterwards here.
        bool enable_hotword = false;
        hw_compile_model_path = PathAppend(model_path.at(MODEL_DIR), MODEL_EB_NAME);
        seg_dict_path = PathAppend(model_path.at(MODEL_DIR), MODEL_SEG_DICT);
        if ((access(hw_compile_model_path.c_str(), F_OK) == 0) &&
            (access(seg_dict_path.c_str(), F_OK) == 0)) { // if model_eb.onnx exist, hotword enabled
            enable_hotword = true;
            asr_handle->InitHwCompiler(hw_compile_model_path, thread_num);
            asr_handle->InitSegDict(seg_dict_path);
        }

        am_model_path = PathAppend(model_path.at(OFFLINE_MODEL_DIR), MODEL_NAME);
        en_model_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), ENCODER_NAME);
        de_model_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), DECODER_NAME);
        if(model_path.find(QUANTIZE) != model_path.end() && model_path.at(QUANTIZE) == "true"){
            am_model_path = PathAppend(model_path.at(OFFLINE_MODEL_DIR), QUANT_MODEL_NAME);
            en_model_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), QUANT_ENCODER_NAME);
            de_model_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), QUANT_DECODER_NAME);
        }
        am_cmvn_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), AM_CMVN_NAME);
        am_config_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), AM_CONFIG_NAME);
        token_path = PathAppend(model_path.at(MODEL_DIR), TOKEN_PATH);

        asr_handle->InitAsr(am_model_path, en_model_path, de_model_path, am_cmvn_path, am_config_path, token_path, thread_num);
    }else{
        LOG(ERROR) <<"Can not find offline-model-dir or online-model-dir";
        exit(-1);
    }

    // Lm resource (optional): WFST language model for rescoring.
    if (model_path.find(LM_DIR) != model_path.end() && model_path.at(LM_DIR) != "") {
        string fst_path, lm_config_path, lex_path;
        fst_path = PathAppend(model_path.at(LM_DIR), LM_FST_RES);
        lm_config_path = PathAppend(model_path.at(LM_DIR), LM_CONFIG_NAME);
        lex_path = PathAppend(model_path.at(LM_DIR), LEX_PATH);
        if (access(lex_path.c_str(), F_OK) != 0 )
        {
            LOG(ERROR) << "Lexicon.txt file is not exist, please use the latest version. Skip load LM model.";
        }else{
            asr_handle->InitLm(fst_path, lm_config_path, lex_path);
        }
    }

    // PUNC model (optional): streaming punctuation restoration.
    if(model_path.find(PUNC_DIR) != model_path.end()){
        string punc_model_path;
        string punc_config_path;
        string token_path;

        punc_model_path = PathAppend(model_path.at(PUNC_DIR), MODEL_NAME);
        if(model_path.find(PUNC_QUANT) != model_path.end() && model_path.at(PUNC_QUANT) == "true"){
            punc_model_path = PathAppend(model_path.at(PUNC_DIR), QUANT_MODEL_NAME);
        }
        punc_config_path = PathAppend(model_path.at(PUNC_DIR), PUNC_CONFIG_NAME);
        token_path = PathAppend(model_path.at(PUNC_DIR), TOKEN_PATH);

        if (access(punc_model_path.c_str(), F_OK) != 0 ||
            access(punc_config_path.c_str(), F_OK) != 0 ||
            access(token_path.c_str(), F_OK) != 0)
        {
            LOG(INFO) << "PUNC model file is not exist, skip load punc model.";
        }else{
            punc_online_handle = make_unique<CTTransformerOnline>();
            punc_online_handle->InitPunc(punc_model_path, punc_config_path, token_path, thread_num);
            use_punc = true;
        }
    }
#if !defined(__APPLE__)
    // Optional: ITN, here we just support language_type=MandarinEnglish
    if(model_path.find(ITN_DIR) != model_path.end()){
        string itn_tagger_path = PathAppend(model_path.at(ITN_DIR), ITN_TAGGER_NAME);
        string itn_verbalizer_path = PathAppend(model_path.at(ITN_DIR), ITN_VERBALIZER_NAME);

        if (access(itn_tagger_path.c_str(), F_OK) != 0 ||
            access(itn_verbalizer_path.c_str(), F_OK) != 0 )
        {
            LOG(INFO) << "ITN model file is not exist, skip load ITN model.";
        }else{
            itn_handle = make_unique<ITNProcessor>();
            itn_handle->InitITN(itn_tagger_path, itn_verbalizer_path, thread_num);
            use_itn = true;
        }
    }
#endif

}
|
||||
|
||||
// Factory for a 2-pass stream; the caller owns the returned object.
TpassStream *CreateTpassStream(std::map<std::string, std::string>& model_path, int thread_num)
{
    return new TpassStream(model_path, thread_num);
}
|
||||
} // namespace funasr
|
||||
92
modules/python/vendors/FunASR/runtime/onnxruntime/src/utf8-string.cpp
vendored
Normal file
92
modules/python/vendors/FunASR/runtime/onnxruntime/src/utf8-string.cpp
vendored
Normal file
@@ -0,0 +1,92 @@
|
||||
// Acknowledgement: this code is adapted from
|
||||
// https://github.com/wenet-e2e/WeTextProcessing/blob/master/runtime/utils/string.cc
|
||||
// Retrieved in Aug 2023.
|
||||
|
||||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
||||
// 2023 Jing Du (thuduj12@163.com)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "utf8-string.h"
|
||||
#include <glog/logging.h>
|
||||
|
||||
namespace funasr {
|
||||
const char* WHITESPACE = " \n\r\t\f\v";
|
||||
|
||||
// Byte length (1..4) of the UTF-8 sequence whose lead byte is `ch`.
// Aborts via glog CHECK on an invalid lead byte (0xF8..0xFF); a stray
// continuation byte falls through all branches and is reported as length 1.
int char_length(char ch) {
  CHECK_LE((ch & 0xF8), 0xF0);
  if ((ch & 0x80) == 0x00) {
    return 1;  // US-ASCII: the first 128 characters need a single byte
  }
  if ((ch & 0xE0) == 0xC0) {
    return 2;  // covers the remainder of almost all Latin-script alphabets
  }
  if ((ch & 0xF0) == 0xE0) {
    return 3;  // rest of the Basic Multilingual Plane (most CJK characters)
  }
  if ((ch & 0xF8) == 0xF0) {
    return 4;  // other Unicode planes: rare CJK, historic scripts, emoji
  }
  return 1;
}
|
||||
|
||||
int string_length(const std::string& str) {
|
||||
int len = 0;
|
||||
int num_bytes = 1;
|
||||
for (size_t i = 0; i < str.length(); i += num_bytes) {
|
||||
num_bytes = char_length(str[i]);
|
||||
++len;
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
void string2chars(const std::string& str, std::vector<std::string>* chars) {
|
||||
chars->clear();
|
||||
int num_bytes = 1;
|
||||
for (size_t i = 0; i < str.length(); i += num_bytes) {
|
||||
num_bytes = char_length(str[i]);
|
||||
chars->push_back(str.substr(i, num_bytes));
|
||||
}
|
||||
}
|
||||
|
||||
std::string ltrim(const std::string& str) {
|
||||
size_t start = str.find_first_not_of(WHITESPACE);
|
||||
return (start == std::string::npos) ? "" : str.substr(start);
|
||||
}
|
||||
|
||||
std::string rtrim(const std::string& str) {
|
||||
size_t end = str.find_last_not_of(WHITESPACE);
|
||||
return end == std::string::npos ? "" : str.substr(0, end + 1);
|
||||
}
|
||||
|
||||
// Copy of `str` with whitespace stripped from both ends.
std::string trim(const std::string& str) {
  const std::string no_leading = ltrim(str);
  return rtrim(no_leading);
}
|
||||
|
||||
// Split `str` on every occurrence of `delim`, appending the pieces to
// *output. Empty pieces are kept, and the whole input is appended when the
// delimiter never occurs (so at least one element is always produced).
// Precondition: `delim` is non-empty (an empty delimiter loops forever,
// as it did in the original).
void split_string(const std::string& str, const std::string& delim,
                  std::vector<std::string>* output) {
  // Fix: scan with an offset instead of copying the input and repeatedly
  // erasing its front — the old approach was O(n^2) in the input length.
  size_t start = 0;
  size_t pos;
  while ((pos = str.find(delim, start)) != std::string::npos) {
    output->emplace_back(str.substr(start, pos - start));
    start = pos + delim.length();
  }
  output->emplace_back(str.substr(start));
}
|
||||
|
||||
} // namespace funasr
|
||||
46
modules/python/vendors/FunASR/runtime/onnxruntime/src/utf8-string.h
vendored
Normal file
46
modules/python/vendors/FunASR/runtime/onnxruntime/src/utf8-string.h
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
// Acknowledgement: this code is adapted from
|
||||
// https://github.com/wenet-e2e/WeTextProcessing/blob/master/runtime/utils/string.h
|
||||
// Retrieved in Aug 2023.
|
||||
|
||||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
||||
// 2023 Jing Du (thuduj12@163.com)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef UTILS_UTF8_STRING_H_
|
||||
#define UTILS_UTF8_STRING_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace funasr {
// Characters treated as whitespace by ltrim/rtrim/trim (" \n\r\t\f\v").
extern const char* WHITESPACE;

// Byte length (1..4) of the UTF-8 sequence starting with lead byte `ch`.
int char_length(char ch);

// Number of UTF-8 characters (code points) in `str`, not bytes.
int string_length(const std::string& str);

// Split `str` into its UTF-8 characters, replacing *chars.
void string2chars(const std::string& str, std::vector<std::string>* chars);

// Trimming helpers: each returns a copy of `str` with whitespace removed
// from the left, right, or both ends respectively.
std::string ltrim(const std::string& str);

std::string rtrim(const std::string& str);

std::string trim(const std::string& str);

// Split `str` on `delim`, appending the pieces (including empty ones)
// to *output.
void split_string(const std::string& str, const std::string& delim,
                  std::vector<std::string>* output);

} // namespace funasr
|
||||
|
||||
#endif // UTILS_UTF8_STRING_H_
|
||||
1101
modules/python/vendors/FunASR/runtime/onnxruntime/src/util.cpp
vendored
Normal file
1101
modules/python/vendors/FunASR/runtime/onnxruntime/src/util.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
74
modules/python/vendors/FunASR/runtime/onnxruntime/src/util.h
vendored
Normal file
74
modules/python/vendors/FunASR/runtime/onnxruntime/src/util.h
vendored
Normal file
@@ -0,0 +1,74 @@
|
||||
#ifndef UTIL_H
|
||||
#define UTIL_H
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <deque>
|
||||
#include "tensor.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace funasr {
// UTF-16 code unit used by the Timestamp* helpers below.
typedef unsigned short U16CHAR_T;
// Load a float parameter blob from file; caller owns the returned buffer.
extern float *LoadParams(const char *filename);

extern void SaveDataFile(const char *filename, void *data, uint32_t len);
// In-place element-wise activation functions over a tensor.
extern void Relu(Tensor<float> *din);
extern void Swish(Tensor<float> *din);
extern void Sigmoid(Tensor<float> *din);
extern void DoubleSwish(Tensor<float> *din);

// In-place softmax / log-softmax over a raw float buffer.
extern void Softmax(float *din, int mask, int len);

extern void LogSoftmax(float *din, int len);
// Round `val` up to a multiple of `align` (presumably -- verify in util.cpp).
extern int ValAlign(int val, int align);
// Debug print of a parameter buffer.
extern void DispParams(float *din, int size);

extern void BasicNorm(Tensor<float> *&din, float norm);

// Locate the maximum value and its index in a float buffer.
extern void FindMax(float *din, int len, float &max_val, int &max_idx);

extern void Glu(Tensor<float> *din, Tensor<float> *dout);

// Join two path components with the platform's path separator.
string PathAppend(const string &p1, const string &p2);
bool is_target_file(const std::string& filename, const std::string target);

// --- character splitting helpers for mixed Chinese/English text ---
void KeepChineseCharacterAndSplit(const std::string &input_str,
                                  std::vector<std::string> &chinese_characters);
void SplitChiEngCharacters(const std::string &input_str,
                           std::vector<std::string> &characters);
// --- timestamp post-processing helpers ---
void TimestampAdd(std::deque<string> &alignment_str1, std::string str_word);
vector<vector<int>> ParseTimestamps(const std::string& str);
bool TimestampIsDigit(U16CHAR_T &u16);
bool TimestampIsAlpha(U16CHAR_T &u16);
bool TimestampIsPunctuation(U16CHAR_T &u16);
bool TimestampIsPunctuation(const std::string& str);
void TimestampSplitChiEngCharacters(const std::string &input_str,
                                    std::vector<std::string> &characters);
std::string VectorToString(const std::vector<std::vector<int>>& vec, bool out_empty=true);
std::string TimestampSmooth(std::string &text, std::string &text_itn, std::string &str_time);
std::string TimestampSentence(std::string &text, std::string &str_time);
// --- generic string splitting ---
std::vector<std::string> split(const std::string &s, char delim);
std::vector<std::string> SplitStr(const std::string &s, string delimiter);

template<typename T>
void PrintMat(const std::vector<std::vector<T>> &mat, const std::string &name);
void Trim(std::string *str);
// Split a UTF-8 string into per-character strings; returns the count.
size_t Utf8ToCharset(const std::string &input, std::vector<std::string> &output);
void SplitStringToVector(const std::string &full, const char *delim,
                         bool omit_empty_strings,
                         std::vector<std::string> *out);
string PostProcess(std::vector<string> &raw_char,
                   std::vector<std::vector<float>> &timestamp_list);
// Derive per-token timestamps from CIF alpha/peak sequences.
void TimestampOnnx( std::vector<float>& us_alphas,
                    std::vector<float> us_cif_peak,
                    std::vector<string>& char_list,
                    std::string &res_str,
                    std::vector<std::vector<float>> &timestamp_vec,
                    float begin_time = 0.0,
                    float total_offset = -1.5);
bool IsTargetFile(const std::string& filename, const std::string target);
// Load a hotword file into a word -> weight map (and optionally the
// concatenated nn hotword string).
void ExtractHws(string hws_file, unordered_map<string, int> &hws_map);
void ExtractHws(string hws_file, unordered_map<string, int> &hws_map, string& nn_hotwords_);
} // namespace funasr
|
||||
#endif
|
||||
31
modules/python/vendors/FunASR/runtime/onnxruntime/src/vad-model.cpp
vendored
Normal file
31
modules/python/vendors/FunASR/runtime/onnxruntime/src/vad-model.cpp
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
#include "precomp.h"
|
||||
|
||||
namespace funasr {
|
||||
// Build an offline FSMN VAD model from the files under MODEL_DIR,
// preferring the quantized onnx model when model_path[QUANTIZE] == "true".
// The caller owns the returned object.
VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num)
{
    string vad_model_path = PathAppend(model_path.at(MODEL_DIR), MODEL_NAME);
    if(model_path.find(QUANTIZE) != model_path.end() && model_path.at(QUANTIZE) == "true"){
        vad_model_path = PathAppend(model_path.at(MODEL_DIR), QUANT_MODEL_NAME);
    }
    string vad_cmvn_path = PathAppend(model_path.at(MODEL_DIR), VAD_CMVN_NAME);
    string vad_config_path = PathAppend(model_path.at(MODEL_DIR), VAD_CONFIG_NAME);

    VadModel *model = new FsmnVad();
    model->InitVad(vad_model_path, vad_cmvn_path, vad_config_path, thread_num);
    return model;
}
|
||||
|
||||
// Wrap an existing offline FsmnVad handle in an online (streaming) VAD.
// The caller owns the returned wrapper; the underlying offline handle is
// borrowed, not owned.
VadModel *CreateVadModel(void* fsmnvad_handle)
{
    // static_cast documents the intended void* -> FsmnVad* conversion
    // (the old C-style cast would also have allowed unrelated conversions).
    return new FsmnVadOnline(static_cast<FsmnVad*>(fsmnvad_handle));
}
|
||||
|
||||
} // namespace funasr
|
||||
285
modules/python/vendors/FunASR/runtime/onnxruntime/src/vocab.cpp
vendored
Normal file
285
modules/python/vendors/FunASR/runtime/onnxruntime/src/vocab.cpp
vendored
Normal file
@@ -0,0 +1,285 @@
|
||||
#include "vocab.h"
|
||||
#include <yaml-cpp/yaml.h>
|
||||
#include <glog/logging.h>
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace funasr {
|
||||
// Build a vocabulary from a json token file (an array of token strings).
Vocab::Vocab(const char *filename)
{
    // Fix: removed the unused `ifstream in(filename)` — it only held the
    // file open while LoadVocabFromJson re-opened it independently.
    LoadVocabFromJson(filename);
}
|
||||
// Build a vocabulary from a yaml config plus a tab-separated lexicon file.
Vocab::Vocab(const char *filename, const char *lex_file)
{
    // Fix: removed the unused `ifstream in(filename)` — it only held the
    // file open while LoadVocabFromYaml re-opened it independently.
    LoadVocabFromYaml(filename);
    LoadLex(lex_file);
}
|
||||
// Nothing to release explicitly: all members are standard containers.
Vocab::~Vocab()
{
}
|
||||
|
||||
void Vocab::LoadVocabFromYaml(const char* filename){
|
||||
YAML::Node config;
|
||||
try{
|
||||
config = YAML::LoadFile(filename);
|
||||
}catch(exception const &e){
|
||||
LOG(INFO) << "Error loading file, yaml file error or not exist.";
|
||||
exit(-1);
|
||||
}
|
||||
YAML::Node myList = config["token_list"];
|
||||
int i = 0;
|
||||
for (YAML::const_iterator it = myList.begin(); it != myList.end(); ++it) {
|
||||
vocab.push_back(it->as<string>());
|
||||
token_id[it->as<string>()] = i;
|
||||
i ++;
|
||||
}
|
||||
}
|
||||
|
||||
void Vocab::LoadVocabFromJson(const char* filename){
|
||||
nlohmann::json json_array;
|
||||
std::ifstream file(filename);
|
||||
if (file.is_open()) {
|
||||
file >> json_array;
|
||||
file.close();
|
||||
} else {
|
||||
LOG(INFO) << "Error loading token file, token file error or not exist.";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
for (const auto& element : json_array) {
|
||||
vocab.push_back(element);
|
||||
token_id[element] = i;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
void Vocab::LoadLex(const char* filename){
|
||||
std::ifstream file(filename);
|
||||
std::string line;
|
||||
while (std::getline(file, line)) {
|
||||
std::string key, value;
|
||||
std::istringstream iss(line);
|
||||
std::getline(iss, key, '\t');
|
||||
std::getline(iss, value);
|
||||
|
||||
if (!key.empty() && !value.empty()) {
|
||||
lex_map[key] = value;
|
||||
}
|
||||
}
|
||||
|
||||
file.close();
|
||||
}
|
||||
|
||||
// Lexicon lookup: returns the stored pronunciation/lexicon entry for
// `word`, or "" when the word is unknown.
string Vocab::Word2Lex(const std::string &word) const {
    const auto entry = lex_map.find(word);
    return entry == lex_map.end() ? "" : entry->second;
}
|
||||
|
||||
int Vocab::GetIdByToken(const std::string &token) const {
|
||||
auto it = token_id.find(token);
|
||||
if (it != token_id.end()) {
|
||||
return it->second;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Append the string form of every id in `in` to `preds`.
// NOTE(review): ids are assumed to be valid indices into vocab.
void Vocab::Vector2String(vector<int> in, std::vector<std::string> &preds)
{
    for (size_t k = 0; k < in.size(); ++k) {
        preds.emplace_back(vocab[in[k]]);
    }
}
|
||||
|
||||
// Concatenate the string forms of all ids in `in` into one string.
string Vocab::Vector2String(vector<int> in)
{
    // Fix: removed an unused local (`int i;`) and the stringstream —
    // plain string append is equivalent and lighter.
    string joined;
    for (auto it = in.begin(); it != in.end(); ++it) {
        joined += vocab[*it];
    }
    return joined;
}
|
||||
|
||||
// Decode a 3-byte UTF-8 sequence into its Unicode code point.
// Returns 0 when `str` does not start with a well-formed 3-byte sequence
// (lead byte 1110xxxx followed by two 10xxxxxx continuation bytes).
int Str2Int(string str)
{
    const char *bytes = str.c_str();
    const bool valid = ((bytes[0] & 0xf0) == 0xe0) &&
                       ((bytes[1] & 0xc0) == 0x80) &&
                       ((bytes[2] & 0xc0) == 0x80);
    if (!valid)
        return 0;
    return ((bytes[0] & 0x0f) << 12) | ((bytes[1] & 0x3f) << 6) |
           (bytes[2] & 0x3f);
}
|
||||
|
||||
// Id -> token string; logs and returns "" for an out-of-range id.
string Vocab::Id2String(int id) const
{
    // Fix: cast avoids the signed/unsigned comparison warning; `id >= 0`
    // is checked first so the cast is safe.
    if (id < 0 || static_cast<size_t>(id) >= vocab.size()) {
        LOG(INFO) << "Error vocabulary id, this id do not exit.";
        return "";
    } else {
        return vocab[id];
    }
}
|
||||
|
||||
// True iff `ch` is exactly one CJK Unified Ideograph (U+4E00..U+9FFF),
// i.e. a single 3-byte UTF-8 sequence whose code point lies in that range.
bool Vocab::IsChinese(string ch)
{
    if (ch.size() != 3) {
        return false;
    }
    const int cp = Str2Int(ch);
    return cp >= 19968 && cp <= 40959;  // 0x4E00 .. 0x9FFF
}
|
||||
|
||||
// Capitalize the English pronoun "i" and its common contractions;
// every other word passes through unchanged.
string Vocab::WordFormat(std::string word)
{
    if (word == "i")    return "I";
    if (word == "i'm")  return "I'm";
    if (word == "i've") return "I've";
    if (word == "i'll") return "I'll";
    return word;
}
|
||||
|
||||
// Convert a sequence of token ids into readable text.
//
// language == "en-bpe": tokens are SentencePiece pieces; "▁" marks the
// start of a new word, so pieces are accumulated in `combine` and flushed
// (space-separated, pronoun-capitalized via WordFormat) when a new word
// starts or the input ends.
//
// Otherwise: tokens may be BPE sub-words ending in "@@", which are glued
// to the following token; spacing is inserted only between adjacent
// English words, while Chinese characters are concatenated directly.
string Vocab::Vector2StringV2(vector<int> in, std::string language)
{
    int i;
    list<string> words;
    int is_pre_english = false;     // previous emitted token was English
    int pre_english_len = 0;        // length of the previous English word
    int is_combining = false;       // currently gluing "@@" sub-words
    std::string combine = "";       // partial word being assembled
    std::string unicodeChar = "▁";  // SentencePiece word-start marker (3 bytes in UTF-8)

    for (i=0; i<in.size(); i++){
        string word = vocab[in[i]];
        // step1: skip sentence delimiters and the unknown token
        if (word == "<s>" || word == "</s>" || word == "<unk>")
            continue;
        if (language == "en-bpe"){
            size_t found = word.find(unicodeChar);
            if(found != std::string::npos){
                // Word-start marker seen: flush the accumulated word first.
                if (combine != ""){
                    combine = WordFormat(combine);
                    if (words.size() != 0){
                        combine = " " + combine;
                    }
                    words.push_back(combine);
                }
                // Drop the 3-byte "▁" prefix and start a new word.
                combine = word.substr(3);
            }else{
                combine += word;
            }
            continue;
        }
        // step2: combine "@@" sub-word pieces into a full word
        {
            int sub_word = !(word.find("@@") == string::npos);
            // process word start and middle part
            if (sub_word) {
                // Bad case "lo@@ <chinese>" (or a trailing "@@" piece):
                // strip "@@" and terminate the word with a space.
                if (i == in.size()-1 || i<in.size()-1 && IsChinese(vocab[in[i+1]])){
                    word = word.erase(word.length() - 2) + " ";
                    if (is_combining) {
                        combine += word;
                        is_combining = false;
                        word = combine;
                        combine = "";
                    }
                }else{
                    combine += word.erase(word.length() - 2);
                    is_combining = true;
                    continue;
                }
            }
            // process word end part: current token closes the glued word
            else if (is_combining) {
                combine += word;
                is_combining = false;
                word = combine;
                combine = "";
            }
        }

        // step3: insert spaces between English words (Chinese needs none)
        {
            // Chinese character: append directly, no spacing.
            if (IsChinese(word)) {
                words.push_back(word);
                is_pre_english = false;
            }
            // input word is an English word
            else {
                // previous token was Chinese (or start): no leading space
                if (!is_pre_english) {
                    // word[0] = word[0] - 32;
                    words.push_back(word);
                    pre_english_len = word.size();
                }
                // previous token was English
                else {
                    // single letter turn to upper case
                    // if (word.size() == 1) {
                    //     word[0] = word[0] - 32;
                    // }

                    if (pre_english_len > 1) {
                        words.push_back(" ");
                        words.push_back(word);
                        pre_english_len = word.size();
                    }
                    else {
                        // Previous word was a single letter (likely an
                        // abbreviation): only separate when the current
                        // word is longer than one letter.
                        if (word.size() > 1) {
                            words.push_back(" ");
                        }
                        words.push_back(word);
                        pre_english_len = word.size();
                    }
                }
                is_pre_english = true;
            }
        }
    }

    // Flush the final pending en-bpe word, if any.
    if (language == "en-bpe" && combine != ""){
        combine = WordFormat(combine);
        if (words.size() != 0){
            combine = " " + combine;
        }
        words.push_back(combine);
    }

    stringstream ss;
    for (auto it = words.begin(); it != words.end(); it++) {
        ss << *it;
    }

    return ss.str();
}
|
||||
|
||||
int Vocab::Size() const
|
||||
{
|
||||
return vocab.size();
|
||||
}
|
||||
|
||||
} // namespace funasr
|
||||
39
modules/python/vendors/FunASR/runtime/onnxruntime/src/vocab.h
vendored
Normal file
39
modules/python/vendors/FunASR/runtime/onnxruntime/src/vocab.h
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
|
||||
#ifndef VOCAB_H
|
||||
#define VOCAB_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include "nlohmann/json.hpp"
|
||||
using namespace std;
|
||||
|
||||
namespace funasr {
|
||||
// Vocabulary for decoding: maps token ids to strings (and back) and
// post-processes decoder id sequences into readable text. Loaded from a
// YAML or JSON model configuration, optionally with a lexicon file.
class Vocab {
  private:
    vector<string> vocab;              // token id -> token string
    std::map<string, int> token_id;    // token string -> token id
    std::map<string, string> lex_map;  // word -> lexicon entry; presumably filled by LoadLex — confirm
    bool IsEnglish(string ch);
    void LoadVocabFromYaml(const char* filename);
    void LoadVocabFromJson(const char* filename);
    void LoadLex(const char* filename);

  public:
    // Load the vocabulary from `filename` (YAML or JSON model config).
    Vocab(const char *filename);
    // Additionally load a lexicon from `lex_file`.
    Vocab(const char *filename, const char *lex_file);
    ~Vocab();
    // Number of tokens in the vocabulary.
    int Size() const;
    // True when `ch` is a 3-byte UTF-8 CJK Unified Ideograph.
    bool IsChinese(string ch);
    // Map each id to its token string, appended to `preds`.
    void Vector2String(vector<int> in, std::vector<std::string> &preds);
    // Concatenate the token strings for `in`, no separators.
    string Vector2String(vector<int> in);
    // Convert ids to readable text; handles "@@" sub-words and, when
    // language == "en-bpe", SentencePiece "▁" word boundaries.
    string Vector2StringV2(vector<int> in, std::string language="");
    // Token string for `id`, or "" when the id is out of range.
    string Id2String(int id) const;
    // Capitalize "i" and its contractions; other words unchanged.
    string WordFormat(std::string word);
    // Id for `token`, or -1 when the token is unknown.
    int GetIdByToken(const std::string &token) const;
    string Word2Lex(const std::string &word) const;
};
|
||||
|
||||
} // namespace funasr
|
||||
#endif
|
||||
114
modules/python/vendors/FunASR/runtime/onnxruntime/src/wfst-decoder.cpp
vendored
Normal file
114
modules/python/vendors/FunASR/runtime/onnxruntime/src/wfst-decoder.cpp
vendored
Normal file
@@ -0,0 +1,114 @@
|
||||
#include <wfst-decoder.h>
|
||||
namespace funasr {
|
||||
// Build a WFST decoder over the language-model FST `lm`.
// `glob_beam`/`lat_beam` control decoder pruning; `am_scale` scales
// acoustic scores. `lm`, `phone_set` and `vocab` are borrowed, not owned.
// (fixed: initializer list now matches member declaration order, and the
// raw `new` was replaced by std::make_shared.)
WfstDecoder::WfstDecoder(fst::Fst<fst::StdArc>* lm,
                         PhoneSet* phone_set, Vocab* vocab,
                         float glob_beam, float lat_beam, float am_scale)
    :vocab_(vocab), phone_set_(phone_set),
     dec_opts_(glob_beam, lat_beam, am_scale),
     decodable_(dec_opts_.acoustic_scale), lm_(lm) {
  // make_shared: single allocation and exception-safe (no raw new).
  decoder_ = std::make_shared<kaldi::LatticeFasterOnlineDecoder>(*lm_, dec_opts_);
}
|
||||
|
||||
// All members are value types or smart pointers; nothing to release by hand.
WfstDecoder::~WfstDecoder() = default;
|
||||
|
||||
void WfstDecoder::StartUtterance() {
|
||||
if (decoder_) {
|
||||
cur_frame_ = 0;
|
||||
cur_token_ = 0;
|
||||
decodable_.Reset();
|
||||
decoder_->InitDecoding();
|
||||
}
|
||||
}
|
||||
|
||||
// Intentionally empty: per-utterance state is reset in StartUtterance(),
// so no teardown is needed here; kept for API symmetry.
void WfstDecoder::EndUtterance() {
}
|
||||
|
||||
// Feed frames of per-token log-probabilities (`in`, row-major
// [len x token_num]) to the online decoder and return the current
// best-path text. Advances the internal frame/token counters; may be
// called repeatedly before FinalizeDecode() for partial results.
string WfstDecoder::Search(float *in, int len, int64_t token_num) {
  string result;
  if (len == 0) {
    return "";
  }
  std::vector<std::vector<float>> logp_vec;
  // NOTE(review): blk_phn_id is never used below — dead lookup? confirm.
  int blk_phn_id = phone_set_->GetBlkPhnId();
  // NOTE(review): only the first len-1 frames are consumed here —
  // presumably the last frame is reserved for finalization; confirm.
  for (int i = 0; i < len - 1; i++) {
    std::vector<float> tmp_logp;
    for (int j = 0; j < token_num; j++) {
      tmp_logp.push_back((in + i * token_num)[j]);
    }
    logp_vec.push_back(tmp_logp);
  }
  for (int i = 0; i < logp_vec.size(); i++) {
    cur_frame_++;
    decodable_.AcceptLoglikes(logp_vec[i]);
    // Advance by exactly one frame per accepted score vector.
    decoder_->AdvanceDecoding(&decodable_, 1);
    cur_token_++;
  }
  if (cur_token_ > 0) {
    std::vector<int> words;
    kaldi::Lattice lattice;
    // Best path so far without finalizing, so decoding can continue.
    decoder_->GetBestPath(&lattice, false);
    std::vector<int> alignment;
    kaldi::LatticeWeight weight;
    fst::GetLinearSymbolSequence(lattice, &alignment, &words, &weight);
    result = vocab_->Vector2StringV2(words);
  }
  return result;
}
|
||||
|
||||
// Flush the decoder and return the final recognition result.
// When `is_stamp` is true, additionally compute per-character timestamps
// from the CIF outputs (`us_alphas`, `us_cif_peak`) and return the
// post-processed text-plus-timestamp string. Returns "" when no tokens
// were decoded.
string WfstDecoder::FinalizeDecode(bool is_stamp, std::vector<float> us_alphas, std::vector<float> us_cif_peak) {
  string result;
  if (cur_token_ > 0) {
    std::vector<int> words;
    kaldi::Lattice lattice;
    decodable_.SetFinished();     // mark the current frame as the last one
    decoder_->FinalizeDecoding();
    // Second argument true: take final-state probabilities into account.
    decoder_->GetBestPath(&lattice, true);
    std::vector<int> alignment;
    kaldi::LatticeWeight weight;
    fst::GetLinearSymbolSequence(lattice, &alignment, &words, &weight);

    if(!is_stamp){
      return vocab_->Vector2StringV2(words);
    }else{
      std::vector<std::string> char_list;
      std::vector<std::vector<float>> timestamp_list;
      std::string res_str;
      vocab_->Vector2String(words, char_list);
      // Split multi-character tokens so timestamps align per character.
      std::vector<std::string> split_chars;
      for(auto& word:char_list){
        std::vector<std::string> word2char;
        SplitChiEngCharacters(word, word2char);
        split_chars.insert(split_chars.end(), word2char.begin(), word2char.end());
      }
      // std::vector<string> raw_char(char_list);
      TimestampOnnx(us_alphas, us_cif_peak, split_chars, res_str, timestamp_list);

      return PostProcess(split_chars, timestamp_list);
    }
  }
  return result;
}
|
||||
|
||||
void WfstDecoder::LoadHwsRes(int inc_bias, unordered_map<string, int> &hws_map) {
|
||||
try {
|
||||
if (!hws_map.empty()) {
|
||||
bias_lm_ = std::make_shared<BiasLm>(hws_map, inc_bias,
|
||||
*phone_set_, *vocab_);
|
||||
decoder_->SetBiasLm(bias_lm_);
|
||||
}
|
||||
} catch (std::exception const &e) {
|
||||
LOG(ERROR) << "Error when load wfst hotwords resource: " << e.what();
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
void WfstDecoder::UnloadHwsRes() {
|
||||
if (bias_lm_) {
|
||||
decoder_->ClearBiasLm();
|
||||
bias_lm_.reset();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace funasr
|
||||
86
modules/python/vendors/FunASR/runtime/onnxruntime/src/wfst-decoder.h
vendored
Normal file
86
modules/python/vendors/FunASR/runtime/onnxruntime/src/wfst-decoder.h
vendored
Normal file
@@ -0,0 +1,86 @@
|
||||
#ifndef WFST_DECODER_
|
||||
#define WFST_DECODER_
|
||||
#include "kaldi/decoder/lattice-faster-online-decoder.h"
|
||||
#include "model.h"
|
||||
#include "fst/fstlib.h"
|
||||
#include "fst/symbol-table.h"
|
||||
#include "bias-lm.h"
|
||||
#include "phone-set.h"
|
||||
#include "util.h"
|
||||
|
||||
#define MAX_SCORE 10.0f
|
||||
namespace funasr {
|
||||
// Adapter exposing per-frame acoustic log-probabilities to the Kaldi
// decoder. Only the most recently accepted frame's scores are stored.
class Decodable : public kaldi::DecodableInterface {
 public:
  // `scale` multiplies every log-likelihood (acoustic scale).
  Decodable(float scale = 1.0f) : scale_(scale) {
    Reset();
  }
  // Clear per-utterance state so the object can be reused.
  void Reset() {
    num_frames_ = 0;
    finished_ = false;
    logp_.clear();
  }

  int NumFramesReady() const { return num_frames_; }

  // A frame counts as last only after SetFinished() has been called.
  bool IsLastFrame(int frame) const {
    return finished_ && (frame == num_frames_ - 1);
  }

  // `id` is 1-based (hence the `id - 1` indexing; ids <= 0 are rejected).
  // NOTE(review): logp_ holds only the newest frame, so `frm` is assumed
  // to be the most recent frame even though only frm < num_frames_ is
  // checked — confirm callers never query older frames.
  float LogLikelihood(int frm, int id) {
    CHECK_GT(id, 0);
    CHECK_LT(frm, num_frames_);
    return scale_ * logp_[id - 1];
  }

  // Accept one frame of log-probabilities (replaces the previous frame).
  void AcceptLoglikes(const std::vector<float>& logp) {
    num_frames_++;
    logp_ = logp;
  }

  int NumIndices() const { return 0; }
  void SetFinished() { finished_ = true; }

 private:
  int num_frames_ = 0;       // frames accepted so far
  float scale_ = 1.0f;       // acoustic scale applied in LogLikelihood()
  bool finished_ = false;    // set once the utterance is complete
  std::vector<float> logp_;  // scores of the most recent frame only
};
|
||||
|
||||
// Decoder configuration: Kaldi lattice-decoder beam settings plus the
// acoustic scale applied by Decodable::LogLikelihood().
struct DecodeOptions : public kaldi::LatticeFasterDecoderConfig {
  DecodeOptions(float glob_beam = 3.0f, float lat_beam = 3.0f, float ac_sc = 10.0f) :
      kaldi::LatticeFasterDecoderConfig(glob_beam, lat_beam), acoustic_scale(ac_sc) {
  }
  float acoustic_scale;  // multiplier for acoustic log-likelihoods
};
|
||||
|
||||
// Online WFST decoder built on kaldi::LatticeFasterOnlineDecoder.
// Typical flow: StartUtterance() -> Search() per chunk of acoustic
// scores -> FinalizeDecode() -> EndUtterance().
// Pointers passed to the constructor are borrowed, not owned.
class WfstDecoder {
 public:
  WfstDecoder(fst::Fst<fst::StdArc>* lm,
              PhoneSet* phone_set,
              Vocab* vocab,
              float glob_beam,
              float lat_beam,
              float am_scale);
  ~WfstDecoder();
  // Reset state and initialize decoding for a new utterance.
  void StartUtterance();
  void EndUtterance();
  // Decode `len` frames of [len x token_nums] log-probs; returns the
  // current best-path text (partial result).
  string Search(float *in, int len, int64_t token_nums);
  // Flush and return the final text; with is_stamp also computes
  // per-character timestamps from the CIF outputs.
  string FinalizeDecode(bool is_stamp=false, std::vector<float> us_alphas={0}, std::vector<float> us_cif_peak={0});
  // Attach / detach a contextual-bias (hotword) language model.
  void LoadHwsRes(int inc_bias, unordered_map<string, int> &hws_map);
  void UnloadHwsRes();

 private:
  Vocab* vocab_ = nullptr;       // borrowed
  PhoneSet* phone_set_ = nullptr; // borrowed
  int cur_frame_ = 0;            // frames decoded in current utterance
  int cur_token_ = 0;            // tokens advanced in current utterance
  DecodeOptions dec_opts_;
  Decodable decodable_;          // feeds acoustic scores to decoder_
  fst::Fst<fst::StdArc>* lm_ = nullptr;  // borrowed decoding graph
  std::shared_ptr<kaldi::LatticeFasterOnlineDecoder> decoder_ = nullptr;
  std::shared_ptr<BiasLm> bias_lm_ = nullptr;  // optional hotword bias
};
|
||||
} // namespace funasr
|
||||
#endif // WFST_DECODER_
|
||||
38
modules/python/vendors/FunASR/runtime/onnxruntime/src/win_func.h
vendored
Normal file
38
modules/python/vendors/FunASR/runtime/onnxruntime/src/win_func.h
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
#ifndef WIN_FUNC_
|
||||
#define WIN_FUNC_
|
||||
#ifdef _WIN32
|
||||
#ifndef WIN32_LEAN_AND_MEAN
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <winsock.h>
|
||||
#include<io.h>
|
||||
|
||||
#ifndef R_OK
|
||||
#define R_OK 4
|
||||
#endif
|
||||
#ifndef W_OK
|
||||
#define W_OK 2
|
||||
#endif
|
||||
#ifndef X_OK
|
||||
#define X_OK 0
|
||||
#endif
|
||||
#ifndef F_OK
|
||||
#define F_OK 0
|
||||
#endif
|
||||
#define access _access
|
||||
|
||||
// Minimal gettimeofday() shim for Windows; the timezone argument is
// ignored. Converts the Win32 FILETIME epoch (1601-01-01, 100 ns ticks)
// to the Unix epoch (1970-01-01) as seconds + microseconds.
static inline int gettimeofday(struct timeval* tv, void* /*tz*/) {
  FILETIME ft;
  ULARGE_INTEGER li;
  ULONGLONG tt;
  GetSystemTimeAsFileTime(&ft);
  li.LowPart = ft.dwLowDateTime;
  li.HighPart = ft.dwHighDateTime;
  // 116444736000000000 = number of 100 ns intervals between 1601-01-01
  // and 1970-01-01; dividing by 10 converts 100 ns ticks to microseconds.
  tt = (li.QuadPart - 116444736000000000ULL) / 10;
  tv->tv_sec = tt / 1000000;
  tv->tv_usec = tt % 1000000;
  return 0;
}
|
||||
#endif
|
||||
#endif
|
||||
Reference in New Issue
Block a user