Sync from bytedesk-private: update

This commit is contained in:
jack ning
2024-12-14 10:43:18 +08:00
parent 476eebb101
commit 5e082909e4
3421 changed files with 812709 additions and 0 deletions

View File

@@ -0,0 +1,19 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="CommandLineParser" Version="2.9.1" />
<PackageReference Include="NAudio" Version="2.1.0" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\AliFsmnVadSharp\AliFsmnVadSharp.csproj" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,107 @@
using AliFsmnVadSharp;
using AliFsmnVadSharp.Model;
using CommandLine;
using NAudio.Wave;
internal static class Program
{
/// <summary>
/// Command-line options understood by the example program.
/// </summary>
public class ProgramParams
{
    /// <summary>Path of a single wav file, or of a folder that is scanned for <c>*.wav</c> files.</summary>
    // Initialised so the non-nullable property is never observed as null (CS8618); the parser
    // overwrites it because the option is required.
    [Option('i', "input", Required = true, HelpText = "Input wav file/folder path.")]
    public string WavFilePath { get; set; } = string.Empty;

    /// <summary>Model directory, either absolute/relative or a folder name next to the executable.</summary>
    // The effective default comes from the Option attribute; the initializer only keeps the
    // property non-null before parsing.
    [Option('m', "model", Default = "speech_fsmn_vad_zh-cn-16k-common-onnx", HelpText = "Model path.")]
    public string Model { get; set; } = string.Empty;
}
/// <summary>
/// Runs voice-activity detection over one wav file, or over every <c>*.wav</c> file in a folder,
/// prints the detected segments and finally prints timing statistics (elapsed time, audio
/// duration and their ratio).
/// </summary>
/// <param name="args">Command-line arguments, parsed into <see cref="ProgramParams"/>.</param>
[STAThread]
private static void Main(string[] args)
{
    // Value is null when the arguments could not be parsed (or only help/version was requested).
    var argParams = Parser.Default.ParseArguments<ProgramParams>(args).Value;
    if (argParams == null)
    {
        Environment.ExitCode = 1;
        return;
    }

    // Accept the model directory as given, or fall back to a folder next to the executable.
    string modelPath = argParams.Model;
    if (!Directory.Exists(modelPath))
    {
        modelPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, modelPath);
        if (!Directory.Exists(modelPath))
        {
            throw new DirectoryNotFoundException($"Model not found: {argParams.Model}");
        }
    }

    string modelFilePath = Path.Combine(modelPath, "model_quant.onnx");
    string configFilePath = Path.Combine(modelPath, "config.yaml");
    string mvnFilePath = Path.Combine(modelPath, "am.mvn");
    const int batchSize = 1;

    // The detector owns native resources, so release it deterministically.
    using AliFsmnVad aliFsmnVad = new AliFsmnVad(modelFilePath, configFilePath, mvnFilePath, batchSize);

    List<string> wavFiles = new List<string>();
    if (File.Exists(argParams.WavFilePath))
    {
        wavFiles.Add(argParams.WavFilePath);
    }
    else if (Directory.Exists(argParams.WavFilePath))
    {
        wavFiles.AddRange(Directory.GetFiles(argParams.WavFilePath, "*.wav"));
    }
    else
    {
        throw new FileNotFoundException($"Invalid wav input path. {argParams.WavFilePath}");
    }

    // Stopwatch is monotonic; wall-clock differences can jump when the system clock is adjusted.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
    TimeSpan total_duration = TimeSpan.Zero;
    for (int i = 0; i < wavFiles.Count; i += batchSize)
    {
        List<float[]> samples = new List<float[]>();
        foreach (var wavFile in wavFiles.Skip(i).Take(batchSize))
        {
            (var sample, var duration) = LoadWavFile(wavFile);
            samples.Add(sample);
            total_duration += duration;
        }

        SegmentEntity[] segments_duration = aliFsmnVad.GetSegments(samples);
        Console.WriteLine("vad infer result:");
        if (segments_duration == null)
        {
            // GetSegments returns null when inference failed for the batch.
            continue;
        }

        foreach (SegmentEntity segment in segments_duration)
        {
            Console.Write("[");
            // An entry can be null when nothing was collected for that waveform.
            if (segment != null)
            {
                foreach (var x in segment.Segment)
                {
                    Console.Write("[" + string.Join(",", x.ToArray()) + "]");
                }
            }
            Console.Write("]\r\n");
        }
    }
    stopwatch.Stop();

    double elapsed_milliseconds = stopwatch.Elapsed.TotalMilliseconds;
    // Real-time factor: processing time divided by audio duration.
    double rtf = elapsed_milliseconds / total_duration.TotalMilliseconds;
    Console.WriteLine("elapsed_milliseconds:{0}", elapsed_milliseconds.ToString());
    Console.WriteLine("total_duration:{0}", total_duration.TotalMilliseconds.ToString());
    Console.WriteLine("rtf:{0}", rtf.ToString());
    Console.WriteLine("------------------------");
}
/// <summary>
/// Reads an audio file through NAudio's <c>AudioFileReader</c> and returns its samples scaled
/// by 32768 together with the reader's reported total time.
/// </summary>
/// <param name="wavFilePath">Path of the audio file to read.</param>
/// <returns>The scaled samples and the duration reported by the reader.</returns>
private static (float[] sample, TimeSpan duration) LoadWavFile(string wavFilePath)
{
    // The reader holds an open file handle; dispose it as soon as the data is in memory.
    using AudioFileReader audioFileReader = new AudioFileReader(wavFilePath);
    TimeSpan duration = audioFileReader.TotalTime;

    byte[] buffer = new byte[audioFileReader.Length];
    int bytesRead = 0;
    // A single Read call is not guaranteed to fill the buffer, so keep reading until the
    // stream is exhausted or the buffer is full.
    while (bytesRead < buffer.Length)
    {
        int read = audioFileReader.Read(buffer, bytesRead, buffer.Length - bytesRead);
        if (read <= 0)
        {
            break;
        }
        bytesRead += read;
    }

    // Only whole 4-byte floats that were actually read are converted; this also keeps
    // BlockCopy from overrunning the destination when the byte count is not a multiple of 4.
    float[] sample = new float[bytesRead / sizeof(float)];
    Buffer.BlockCopy(buffer, 0, sample, 0, sample.Length * sizeof(float));
    for (int k = 0; k < sample.Length; k++)
    {
        sample[k] *= 32768f;
    }
    return (sample, duration);
}
}

View File

@@ -0,0 +1,37 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.1.32210.238
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AliFsmnVadSharp", "AliFsmnVadSharp\AliFsmnVadSharp.csproj", "{BFB82F2E-AD5B-405C-AAFF-3CE33C548748}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AliFsmnVadSharp.Examples", "AliFsmnVadSharp.Examples\AliFsmnVadSharp.Examples.csproj", "{2FFA4D03-A62B-435B-B57B-7E49209810E1}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{212561CC-9836-4F45-A31B-298EF576F519}"
ProjectSection(SolutionItems) = preProject
license = license
README.md = README.md
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{BFB82F2E-AD5B-405C-AAFF-3CE33C548748}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{BFB82F2E-AD5B-405C-AAFF-3CE33C548748}.Debug|Any CPU.Build.0 = Debug|Any CPU
{BFB82F2E-AD5B-405C-AAFF-3CE33C548748}.Release|Any CPU.ActiveCfg = Release|Any CPU
{BFB82F2E-AD5B-405C-AAFF-3CE33C548748}.Release|Any CPU.Build.0 = Release|Any CPU
{2FFA4D03-A62B-435B-B57B-7E49209810E1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{2FFA4D03-A62B-435B-B57B-7E49209810E1}.Debug|Any CPU.Build.0 = Debug|Any CPU
{2FFA4D03-A62B-435B-B57B-7E49209810E1}.Release|Any CPU.ActiveCfg = Release|Any CPU
{2FFA4D03-A62B-435B-B57B-7E49209810E1}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {FCC1BBCC-91A3-4223-B368-D272FB5108B6}
EndGlobalSection
EndGlobal

View File

@@ -0,0 +1,410 @@
using AliFsmnVadSharp.Model;
using AliFsmnVadSharp.Utils;
using Microsoft.Extensions.Logging;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
// Model files can be downloaded from https://modelscope.cn/models/iic/speech_fsmn_vad_zh-cn-16k-common-onnx/
namespace AliFsmnVadSharp
{
public class AliFsmnVad : IDisposable
{
private bool _disposed;
private InferenceSession _onnxSession;
private readonly ILogger _logger;
private string _frontend;
private WavFrontend _wavFrontend;
private int _batchSize = 1;
private int _max_end_sil = int.MinValue;
private EncoderConfEntity _encoderConfEntity;
private VadPostConfEntity _vad_post_conf;
/// <summary>
/// Loads the ONNX model, the YAML configuration and the CMVN statistics used by the detector.
/// </summary>
/// <param name="modelFilePath">Path of the ONNX model file.</param>
/// <param name="configFilePath">Path of the YAML configuration file.</param>
/// <param name="mvnFilePath">Path of the mean/variance file handed to the front end.</param>
/// <param name="batchSize">Maximum number of waveforms handled per batch; must be at least 1.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="batchSize"/> is less than 1.</exception>
public AliFsmnVad(string modelFilePath, string configFilePath, string mvnFilePath, int batchSize = 1)
{
    // A batch size of 0 or less would make the batching loops never advance.
    if (batchSize < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be at least 1.");
    }

    // The options object is only needed while the session is being created.
    using (SessionOptions options = new SessionOptions())
    {
        options.AppendExecutionProvider_CPU(0);
        options.InterOpNumThreads = 1;
        _onnxSession = new InferenceSession(modelFilePath, options);
    }

    VadYamlEntity vadYamlEntity = YamlHelper.ReadYaml<VadYamlEntity>(configFilePath);
    _wavFrontend = new WavFrontend(mvnFilePath, vadYamlEntity.frontend_conf);
    _frontend = vadYamlEntity.frontend;
    _vad_post_conf = vadYamlEntity.model_conf;
    _batchSize = batchSize;
    // The field still holds its int.MinValue sentinel here, so the configured value always applies.
    _max_end_sil = vadYamlEntity.model_conf.max_end_silence_time;
    _encoderConfEntity = vadYamlEntity.encoder_conf;

    ILoggerFactory loggerFactory = new LoggerFactory();
    _logger = new Logger<AliFsmnVad>(loggerFactory);
}
/// <summary>
/// Runs offline voice-activity detection on each waveform, feeding every waveform to the
/// model in one pass.
/// </summary>
/// <param name="samples">Waveforms to analyse, one array per utterance.</param>
/// <returns>
/// One entry per input waveform holding its detected segments, or <c>null</c> when ONNX
/// Runtime fails during inference.
/// </returns>
public SegmentEntity[] GetSegments(List<float[]> samples)
{
    int waveform_nums = samples.Count;
    SegmentEntity[] segments = new SegmentEntity[waveform_nums];

    // Work on a local batch size: writing the clamped value back to _batchSize would shrink
    // it permanently (down to 0 for an empty list, which stalls every later call).
    int batchSize = Math.Max(1, Math.Min(waveform_nums, _batchSize));
    for (int beg_idx = 0; beg_idx < waveform_nums; beg_idx += batchSize)
    {
        int end_idx = Math.Min(waveform_nums, beg_idx + batchSize);
        List<float[]> waveform_list = samples.GetRange(beg_idx, end_idx - beg_idx);
        List<VadInputEntity> vadInputEntitys = ExtractFeats(waveform_list);
        try
        {
            List<VadOutputEntity> vadOutputEntitys = Infer(vadInputEntitys);
            for (int batch_num = beg_idx; batch_num < end_idx; batch_num++)
            {
                // The per-batch lists are indexed from 0, not from the global waveform index.
                int local_idx = batch_num - beg_idx;
                var scores = vadOutputEntitys[local_idx].Scores;
                SegmentEntity[] segments_part = vadInputEntitys[local_idx].VadScorer.DefaultCall(
                    scores, waveform_list[local_idx], is_final: true, max_end_sil: _max_end_sil, online: false);
                if (segments_part.Length > 0)
                {
                    segments[batch_num] ??= new SegmentEntity();
                    // DefaultCall leaves the entry null when no segment was found.
                    if (segments_part[0] != null)
                    {
                        segments[batch_num].Segment.AddRange(segments_part[0].Segment);
                    }
                }
            }
        }
        catch (OnnxRuntimeException ex)
        {
            _logger.LogWarning(ex, "input wav is silence or noise");
            // Stop immediately: continuing with a null result array would only fail later.
            return null!;
        }
    }
    return segments;
}
/// <summary>
/// Runs voice-activity detection on each waveform by feeding the extracted features to the
/// model in steps and carrying the model caches from one step to the next.
/// </summary>
/// <param name="samples">Waveforms to analyse, one array per utterance.</param>
/// <returns>
/// One entry per input waveform holding its detected segments, or <c>null</c> when ONNX
/// Runtime fails during inference.
/// </returns>
public SegmentEntity[] GetSegmentsByStep(List<float[]> samples)
{
    int waveform_nums = samples.Count;
    // Local batch size: the instance field must not be shrunk permanently by one call.
    int batchSize = Math.Max(1, Math.Min(waveform_nums, _batchSize));
    SegmentEntity[] segments = new SegmentEntity[waveform_nums];
    for (int beg_idx = 0; beg_idx < waveform_nums; beg_idx += batchSize)
    {
        int end_idx = Math.Min(waveform_nums, beg_idx + batchSize);
        List<float[]> waveform_list = samples.GetRange(beg_idx, end_idx - beg_idx);
        List<VadInputEntity> vadInputEntitys = ExtractFeats(waveform_list);
        int feats_len = vadInputEntitys.Max(x => x.SpeechLength);
        // Fresh zero caches, used only for entries that have run out of features.
        List<float[]> in_cache = PrepareCache(new List<float[]>());
        try
        {
            int step = Math.Min(feats_len, 6000 * 400);
            for (int t_offset = 0; t_offset < feats_len; t_offset += Math.Min(step, feats_len - t_offset))
            {
                // The last step is shortened so that it ends exactly at the longest feature array.
                bool is_final = t_offset + step >= feats_len - 1;
                if (is_final)
                {
                    step = feats_len - t_offset;
                }

                List<VadInputEntity> vadInputEntitys_step = new List<VadInputEntity>();
                foreach (VadInputEntity vadInputEntity in vadInputEntitys)
                {
                    VadInputEntity vadInputEntity_step = new VadInputEntity();
                    float[]? feats = vadInputEntity.Speech;
                    int curr_step = Math.Min(feats.Length - t_offset, step);
                    if (curr_step <= 0)
                    {
                        // This entry is shorter than the longest one: feed a zero-filled placeholder.
                        vadInputEntity_step.Speech = new float[32000];
                        vadInputEntity_step.SpeechLength = 0;
                        vadInputEntity_step.InCaches = in_cache;
                        vadInputEntity_step.Waveform = new float[(((int)(32000) / 400 - 1) * 160 + 400)];
                        vadInputEntitys_step.Add(vadInputEntity_step);
                        continue;
                    }
                    float[]? feats_step = new float[curr_step];
                    Array.Copy(feats, t_offset, feats_step, 0, feats_step.Length);
                    float[]? waveform = vadInputEntity.Waveform;
                    float[]? waveform_step = new float[Math.Min(waveform.Length, ((int)(t_offset + step) / 400 - 1) * 160 + 400) - t_offset / 400 * 160];
                    Array.Copy(waveform, t_offset / 400 * 160, waveform_step, 0, waveform_step.Length);
                    vadInputEntity_step.Speech = feats_step;
                    vadInputEntity_step.SpeechLength = feats_step.Length;
                    vadInputEntity_step.InCaches = vadInputEntity.InCaches;
                    vadInputEntity_step.Waveform = waveform_step;
                    vadInputEntitys_step.Add(vadInputEntity_step);
                }

                List<VadOutputEntity> vadOutputEntitys = Infer(vadInputEntitys_step);
                // Bound by the real number of entries: the last batch can be smaller than batchSize.
                for (int batch_num = 0; batch_num < vadInputEntitys.Count; batch_num++)
                {
                    // Carry the model caches over to the next step.
                    vadInputEntitys[batch_num].InCaches = vadOutputEntitys[batch_num].OutCaches;
                    var scores = vadOutputEntitys[batch_num].Scores;
                    SegmentEntity[] segments_part = vadInputEntitys[batch_num].VadScorer.DefaultCall(
                        scores, vadInputEntitys_step[batch_num].Waveform, is_final: is_final, max_end_sil: _max_end_sil, online: false);
                    if (segments_part.Length > 0)
                    {
                        segments[beg_idx + batch_num] ??= new SegmentEntity();
                        if (segments_part[0] != null)
                        {
                            segments[beg_idx + batch_num].Segment.AddRange(segments_part[0].Segment);
                        }
                    }
                }
            }
        }
        catch (OnnxRuntimeException ex)
        {
            _logger.LogWarning(ex, "input wav is silence or noise");
            // Stop immediately: continuing with a null result array would only fail later.
            return null!;
        }
    }
    return segments;
}
/// <summary>
/// Fills an empty cache list with one zeroed buffer per FSMN layer; a list that already
/// contains entries is returned unchanged.
/// </summary>
/// <param name="in_cache">Cache list to populate in place.</param>
/// <returns>The same list instance that was passed in.</returns>
private List<float[]> PrepareCache(List<float[]> in_cache)
{
    if (in_cache.Count == 0)
    {
        int layerCount = _encoderConfEntity.fsmn_layers;
        int projectionDim = _encoderConfEntity.proj_dim;
        int order = _encoderConfEntity.lorder;
        int remaining = layerCount;
        while (remaining > 0)
        {
            // Each buffer holds 1 * proj_dim * (lorder - 1) * 1 zeros.
            in_cache.Add(new float[1 * projectionDim * (order - 1) * 1]);
            remaining--;
        }
    }
    return in_cache;
}
/// <summary>
/// Builds one model input per waveform: filter-bank features passed through
/// <c>LfrCmvn</c>, a shared set of zeroed caches and a fresh scorer.
/// </summary>
/// <param name="waveform_list">Waveforms of the current batch.</param>
/// <returns>Model inputs in the same order as <paramref name="waveform_list"/>.</returns>
private List<VadInputEntity> ExtractFeats(List<float[]> waveform_list)
{
    // Every entry of the batch starts out referencing the same cache list.
    List<float[]> sharedCache = PrepareCache(new List<float[]>());
    List<VadInputEntity> inputs = new List<VadInputEntity>();
    for (int idx = 0; idx < waveform_list.Count; idx++)
    {
        float[] wave = waveform_list[idx];
        float[] features = _wavFrontend.LfrCmvn(_wavFrontend.GetFbank(wave));
        inputs.Add(new VadInputEntity
        {
            Waveform = wave,
            Speech = features,
            SpeechLength = features.Length,
            InCaches = sharedCache,
            VadScorer = new E2EVadModel(_vad_post_conf),
        });
    }
    return inputs;
}
/// <summary>
/// Reshapes a one-dimensional array into a three-dimensional array, filling it in
/// row-major order (the last index varies fastest).
/// </summary>
/// <param name="oneDimObj">Flat source array.</param>
/// <param name="len">Size of the first dimension.</param>
/// <param name="wid">Size of the second dimension.</param>
/// <returns>
/// An array of shape [len, wid, oneDimObj.Length / (len * wid)], or <c>null</c> when the
/// source length is not a multiple of <c>len * wid</c>.
/// </returns>
public static T[,,] DimOneToThree<T>(T[] oneDimObj, int len, int wid)
{
    int planeSize = len * wid;
    if (oneDimObj.Length % planeSize != 0)
    {
        return null;
    }

    int depth = oneDimObj.Length / planeSize;
    T[,,] cube = new T[len, wid, depth];
    int source = 0;
    for (int a = 0; a < len; a++)
    {
        for (int b = 0; b < wid; b++)
        {
            for (int c = 0; c < depth; c++)
            {
                cube[a, b, c] = oneDimObj[source];
                source++;
            }
        }
    }
    return cube;
}
/// <summary>
/// Runs the ONNX model once per input entity and collects the frame scores and the
/// updated caches of each run.
/// </summary>
/// <param name="vadInputEntitys">Inputs carrying the features and the caches to feed in.</param>
/// <returns>One output per input, in the same order.</returns>
private List<VadOutputEntity> Infer(List<VadInputEntity> vadInputEntitys)
{
    List<VadOutputEntity> vadOutputEntities = new List<VadOutputEntity>();
    foreach (VadInputEntity vadInputEntity in vadInputEntitys)
    {
        // Entities are run one at a time, so the model always sees a batch of 1.
        int batchSize = 1;
        var container = new List<NamedOnnxValue>();
        int[] dim = new int[] { batchSize, vadInputEntity.Speech.Length / 400 / batchSize, 400 };
        var tensor = new DenseTensor<float>(vadInputEntity.Speech, dim, false);
        container.Add(NamedOnnxValue.CreateFromTensor<float>("speech", tensor));

        int i = 0;
        foreach (var cache in vadInputEntity.InCaches)
        {
            // NOTE(review): 128 presumably equals encoder_conf.proj_dim for this model - confirm.
            int[] cache_dim = new int[] { 1, 128, cache.Length / 128 / 1, 1 };
            var cache_tensor = new DenseTensor<float>(cache, cache_dim, false);
            container.Add(NamedOnnxValue.CreateFromTensor<float>("in_cache" + i.ToString(), cache_tensor));
            i++;
        }

        // The result collection wraps native memory; everything needed is copied into managed
        // arrays below, so it is released at the end of each iteration.
        using IDisposableReadOnlyCollection<DisposableNamedOnnxValue> results = _onnxSession.Run(container);
        VadOutputEntity vadOutputEntity = new VadOutputEntity();
        foreach (DisposableNamedOnnxValue result in results)
        {
            if (result.Name.Equals("logits"))
            {
                // Read the output that is actually named "logits", not blindly the first one.
                Tensor<float> tensors = result.AsTensor<float>();
                vadOutputEntity.Scores = DimOneToThree<float>(tensors.ToArray(), 1, tensors.Dimensions[1]);
            }
            else if (result.Name.StartsWith("out_cache", StringComparison.Ordinal))
            {
                vadOutputEntity.OutCaches.Add(result.AsEnumerable<float>().ToArray());
            }
        }
        vadOutputEntities.Add(vadOutputEntity);
    }
    return vadOutputEntities;
}
/// <summary>
/// Concatenates the feature arrays of all inputs into one flat array in which every input
/// occupies the same number of elements. Shorter inputs are padded with values derived from
/// the model's <c>neg_mean</c> and <c>inv_stddev</c> metadata at a randomly chosen index.
/// </summary>
/// <param name="modelInputs">Inputs whose features are padded to a common length.</param>
/// <returns>A flat array of <c>modelInputs.Count * max(SpeechLength)</c> elements.</returns>
private float[] PadSequence(List<VadInputEntity> modelInputs)
{
    int max_speech_length = modelInputs.Max(x => x.SpeechLength);
    float[] speech = new float[max_speech_length * modelInputs.Count];

    // Parsed lazily, and only once: the metadata is needed only when some input is padded.
    double[]? neg_mean = null;
    double[]? inv_stddev = null;

    for (int i = 0; i < modelInputs.Count; i++)
    {
        float[] curr_speech = modelInputs[i].Speech!;
        int row_start = i * max_speech_length;
        if (max_speech_length == modelInputs[i].SpeechLength)
        {
            Array.Copy(curr_speech, 0, speech, row_start, max_speech_length);
            continue;
        }
        if (curr_speech.Length > max_speech_length)
        {
            // Guard the row boundary so an inconsistent entry cannot spill into the next row.
            throw new ArgumentException("Speech is longer than the maximum SpeechLength.", nameof(modelInputs));
        }

        neg_mean ??= ReadMetadataVector("neg_mean");
        inv_stddev ??= ReadMetadataVector("inv_stddev");
        int dim = neg_mean.Length;

        Array.Copy(curr_speech, 0, speech, row_start, curr_speech.Length);
        // Only the tail needs padding values; the head holds the real features.
        for (int j = curr_speech.Length; j < max_speech_length; j++)
        {
            int k = Random.Shared.Next(0, dim);
            speech[row_start + j] = (float)((float)(0 + neg_mean[k]) * inv_stddev[k]);
        }
    }
    return speech;

    // Reads a comma-separated numeric vector from the model metadata. Parsing is
    // culture-invariant so that '.' is always the decimal separator.
    double[] ReadMetadataVector(string key) =>
        _onnxSession.ModelMetadata.CustomMetadataMap[key]
            .Split(',')
            .Select(x => double.Parse(x, System.Globalization.CultureInfo.InvariantCulture))
            .ToArray();
}
/// <summary>
/// Releases the resources held by this instance. Safe to call more than once.
/// </summary>
/// <param name="disposing">
/// <c>true</c> when called from <see cref="Dispose()"/>; <c>false</c> when called from the finalizer,
/// in which case other managed objects are not touched.
/// </param>
protected virtual void Dispose(bool disposing)
{
    if (_disposed)
    {
        return;
    }

    if (disposing)
    {
        _onnxSession?.Dispose();
        _wavFrontend?.Dispose();
        // Drop the configuration references, as before.
        _encoderConfEntity = null!;
        _vad_post_conf = null!;
    }
    _disposed = true;
}

/// <summary>
/// Releases the ONNX session and the front end, and suppresses finalization.
/// </summary>
public void Dispose()
{
    Dispose(disposing: true);
    GC.SuppressFinalize(this);
}

/// <summary>
/// Finalizer fallback for instances that were never disposed.
/// </summary>
~AliFsmnVad()
{
    // Always pass false here: forwarding the _disposed flag could hand disposing=true to an
    // override while running on the finalizer thread.
    Dispose(disposing: false);
}
}
}

View File

@@ -0,0 +1,34 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="KaldiNativeFbankSharp" Version="1.1.2" />
<PackageReference Include="Microsoft.Extensions.Logging" Version="7.0.0" />
<PackageReference Include="Microsoft.ML.OnnxRuntime" Version="1.15.0" />
<PackageReference Include="YamlDotNet" Version="13.1.0" />
</ItemGroup>
<ItemGroup>
<None Update="speech_fsmn_vad_zh-cn-16k-common-pytorch\example\0.wav">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="speech_fsmn_vad_zh-cn-16k-common-pytorch\example\1.wav">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="speech_fsmn_vad_zh-cn-16k-common-pytorch\model.onnx">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="speech_fsmn_vad_zh-cn-16k-common-pytorch\vad.mvn">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="speech_fsmn_vad_zh-cn-16k-common-pytorch\vad.yaml">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>
</Project>

View File

@@ -0,0 +1,724 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using AliFsmnVadSharp.Model;
namespace AliFsmnVadSharp
{
/// <summary>
/// States of the detector; E2EVadModel stores the current one as an int in _vad_state_machine.
/// </summary>
enum VadStateMachine
{
// Initial state: assigned by E2EVadModel.ResetDetection and AllResetDetection.
kVadInStateStartPointNotDetected = 1,
// NOTE(review): presumably entered once a speech start point is confirmed - the transition
// is not in the visible part of the file.
kVadInStateInSpeechSegment = 2,
// Assigned by MaybeOnVoiceEndIfLastFrame; DetectCommonFrames and DetectLastFrames return
// immediately while the detector is in this state.
kVadInStateEndPointDetected = 3,
}
/// <summary>
/// Detection modes.
/// NOTE(review): no use of this enum is visible in this part of the file; going by the
/// member names it presumably selects between stopping after one utterance and detecting
/// several - confirm against the code that reads it.
/// </summary>
enum VadDetectMode
{
kVadSingleUtteranceDetectMode = 0,
kVadMutipleUtteranceDetectMode = 1,
}
internal class E2EVadModel
{
private VadPostConfEntity _vad_opts = new VadPostConfEntity();
private WindowDetector _windows_detector = new WindowDetector();
private bool _is_final = false;
private int _data_buf_start_frame = 0;
private int _frm_cnt = 0;
private int _latest_confirmed_speech_frame = 0;
private int _lastest_confirmed_silence_frame = -1;
private int _continous_silence_frame_count = 0;
private int _vad_state_machine = (int)VadStateMachine.kVadInStateStartPointNotDetected;
private int _confirmed_start_frame = -1;
private int _confirmed_end_frame = -1;
private int _number_end_time_detected = 0;
private int _sil_frame = 0;
private int[] _sil_pdf_ids = new int[0];
private double _noise_average_decibel = -100.0D;
private bool _pre_end_silence_detected = false;
private bool _next_seg = true;
private List<E2EVadSpeechBufWithDoaEntity> _output_data_buf;
private int _output_data_buf_offset = 0;
private List<E2EVadFrameProbEntity> _frame_probs = new List<E2EVadFrameProbEntity>();
private int _max_end_sil_frame_cnt_thresh = 800 - 150;
private float _speech_noise_thres = 0.6F;
private float[,,] _scores = null;
private int _idx_pre_chunk = 0;
private bool _max_time_out = false;
private List<double> _decibel = new List<double>();
private int _data_buf_size = 0;
private int _data_buf_all_size = 0;
/// <summary>
/// Creates a scorer for the given post-processing options, builds its window detector from
/// them and resets all detection state.
/// </summary>
/// <param name="vadPostConfEntity">Post-processing options used by this scorer.</param>
public E2EVadModel(VadPostConfEntity vadPostConfEntity)
{
    VadPostConfEntity options = vadPostConfEntity;
    _vad_opts = options;
    _windows_detector = new WindowDetector(
        options.window_size_ms,
        options.sil_to_speech_time_thres,
        options.speech_to_sil_time_thres,
        options.frame_in_ms);
    AllResetDetection();
}
/// <summary>
/// Restores the complete detector state to its initial values, re-reading the configurable
/// thresholds from <c>_vad_opts</c>, and finishes with <c>ResetDetection</c>.
/// </summary>
private void AllResetDetection()
{
    // Values taken from the post-processing options.
    _sil_pdf_ids = _vad_opts.sil_pdf_ids;
    _speech_noise_thres = _vad_opts.speech_noise_thres;
    _max_end_sil_frame_cnt_thresh = _vad_opts.max_end_silence_time - _vad_opts.speech_to_sil_time_thres;

    // Flags.
    _is_final = false;
    _pre_end_silence_detected = false;
    _max_time_out = false;
    _next_seg = true;

    // Frame counters and markers.
    _data_buf_start_frame = 0;
    _frm_cnt = 0;
    _latest_confirmed_speech_frame = 0;
    _lastest_confirmed_silence_frame = -1;
    _continous_silence_frame_count = 0;
    _confirmed_start_frame = -1;
    _confirmed_end_frame = -1;
    _number_end_time_detected = 0;
    _sil_frame = 0;
    _idx_pre_chunk = 0;
    _vad_state_machine = (int)VadStateMachine.kVadInStateStartPointNotDetected;

    // Buffers and accumulated measurements.
    _noise_average_decibel = -100.0D;
    _output_data_buf = new List<E2EVadSpeechBufWithDoaEntity>();
    _output_data_buf_offset = 0;
    _frame_probs = new List<E2EVadFrameProbEntity>();
    _decibel = new List<double>();
    _scores = null;
    _data_buf_size = 0;
    _data_buf_all_size = 0;

    ResetDetection();
}
/// <summary>
/// Resets the per-segment state: the state machine goes back to "start point not detected",
/// the frame markers and counters are cleared and the window detector is reset.
/// </summary>
private void ResetDetection()
{
    _vad_state_machine = (int)VadStateMachine.kVadInStateStartPointNotDetected;

    _confirmed_start_frame = -1;
    _confirmed_end_frame = -1;
    _lastest_confirmed_silence_frame = -1;
    _latest_confirmed_speech_frame = 0;

    _continous_silence_frame_count = 0;
    _sil_frame = 0;

    _frame_probs = new List<E2EVadFrameProbEntity>();
    _windows_detector.Reset();
}
/// <summary>
/// Appends one decibel value per frame of <paramref name="waveform"/> to <c>_decibel</c>
/// and adds the waveform length to the running buffer size.
/// </summary>
/// <param name="waveform">Samples of the current chunk.</param>
private void ComputeDecibel(float[] waveform)
{
    int windowSamples = (int)(_vad_opts.frame_length_ms * _vad_opts.sample_rate / 1000);
    int hopSamples = (int)(_vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);

    // The very first chunk also initialises the current buffer size.
    bool isFirstChunk = _data_buf_all_size == 0;
    _data_buf_all_size += waveform.Length;
    if (isFirstChunk)
    {
        _data_buf_size = _data_buf_all_size;
    }

    int start = 0;
    while (start < waveform.Length - windowSamples + 1)
    {
        float[] window = new float[windowSamples];
        Array.Copy(waveform, start, window, 0, window.Length);
        // Frame energy: sum of the squared samples.
        float energy = window.Select(v => (float)Math.Pow((double)v, 2)).Sum();
        double decibel = 10 * Math.Log10(energy + 0.000001);
        _decibel.Add(decibel);
        start += hopSamples;
    }
}
/// <summary>
/// Stores the scores of the current chunk and advances the frame counter by the size of
/// their second dimension, which also becomes the evaluation block size.
/// </summary>
/// <param name="scores">Score array of the current chunk.</param>
private void ComputeScores(float[,,] scores)
{
    int frameCount = scores.GetLength(1);
    _vad_opts.nn_eval_block_size = frameCount;
    _frm_cnt += frameCount;
    _scores = scores;
}
/// <summary>
/// Advances the start of the data buffer frame by frame until it reaches
/// <paramref name="frame_idx"/>, shrinking the tracked buffer size accordingly.
/// </summary>
/// <param name="frame_idx">Frame index the buffer start should be advanced to.</param>
private void PopDataBufTillFrame(int frame_idx)
{
    // Number of samples covered by one frame shift.
    int frame_shift_samples = (int)(_vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);
    while (_data_buf_start_frame < frame_idx)
    {
        if (_data_buf_size < frame_shift_samples)
        {
            // Less than one frame left: nothing in the loop can change any more, so stop
            // instead of spinning forever.
            break;
        }
        _data_buf_start_frame += 1;
        _data_buf_size = _data_buf_all_size - _data_buf_start_frame * frame_shift_samples;
    }
}
/// <summary>
/// Extends the current output segment (or starts a new one) so that it covers
/// <paramref name="frm_cnt"/> frames beginning at <paramref name="start_frm"/>, and advances
/// the data-buffer start frame by the same number of frames. Only segment times and flags
/// are recorded; no samples are copied. Inconsistencies are reported on the console.
/// </summary>
/// <param name="start_frm">First frame to move to the output buffer.</param>
/// <param name="frm_cnt">Number of frames to move.</param>
/// <param name="first_frm_is_start_point">Starts a new segment and marks it as containing a start point.</param>
/// <param name="last_frm_is_end_point">Marks the segment as containing an end point.</param>
/// <param name="end_point_is_sent_end">When set, everything left in the data buffer is counted as expected.</param>
private void PopDataToOutputBuf(int start_frm, int frm_cnt, bool first_frm_is_start_point,
bool last_frm_is_end_point, bool end_point_is_sent_end)
{
// Drop everything in front of the first requested frame.
PopDataBufTillFrame(start_frm);
int expected_sample_number = (int)(frm_cnt * _vad_opts.sample_rate * _vad_opts.frame_in_ms / 1000);
if (last_frm_is_end_point)
{
// The closing frame spans a full frame length, which exceeds the frame shift by this amount.
int extra_sample = Math.Max(0, (int)(_vad_opts.frame_length_ms * _vad_opts.sample_rate / 1000 - _vad_opts.sample_rate * _vad_opts.frame_in_ms / 1000));
expected_sample_number += (int)(extra_sample);
}
if (end_point_is_sent_end)
{
expected_sample_number = Math.Max(expected_sample_number, _data_buf_size);
}
if (_data_buf_size < expected_sample_number)
{
Console.WriteLine("error in calling pop data_buf\n");
}
// Open a new segment when there is none yet or when this call marks a start point.
if (_output_data_buf.Count == 0 || first_frm_is_start_point)
{
_output_data_buf.Add(new E2EVadSpeechBufWithDoaEntity());
_output_data_buf.Last().Reset();
_output_data_buf.Last().start_ms = start_frm * _vad_opts.frame_in_ms;
_output_data_buf.Last().end_ms = _output_data_buf.Last().start_ms;
_output_data_buf.Last().doa = 0;
}
E2EVadSpeechBufWithDoaEntity cur_seg = _output_data_buf.Last();
// The new frames are expected to start exactly where the current segment ends.
if (cur_seg.end_ms != start_frm * _vad_opts.frame_in_ms)
{
Console.WriteLine("warning\n");
}
int out_pos = cur_seg.buffer.Length; // nothing is written to cur_seg.buffer at the moment
int data_to_pop = 0;
if (end_point_is_sent_end)
{
data_to_pop = expected_sample_number;
}
else
{
data_to_pop = (int)(frm_cnt * _vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);
}
if (data_to_pop > _data_buf_size)
{
Console.WriteLine("VAD data_to_pop is bigger than _data_buf_size!!!\n");
data_to_pop = _data_buf_size;
expected_sample_number = _data_buf_size;
}
cur_seg.doa = 0;
// The two loops below only advance out_pos, which is never read afterwards.
for (int sample_cpy_out = 0; sample_cpy_out < data_to_pop; sample_cpy_out++)
{
out_pos += 1;
}
for (int sample_cpy_out = data_to_pop; sample_cpy_out < expected_sample_number; sample_cpy_out++)
{
out_pos += 1;
}
if (cur_seg.end_ms != start_frm * _vad_opts.frame_in_ms)
{
Console.WriteLine("Something wrong with the VAD algorithm\n");
}
// Account for the moved frames and extend the segment to the end of the last one.
_data_buf_start_frame += frm_cnt;
cur_seg.end_ms = (start_frm + frm_cnt) * _vad_opts.frame_in_ms;
if (first_frm_is_start_point)
{
cur_seg.contain_seg_start_point = true;
}
if (last_frm_is_end_point)
{
cur_seg.contain_seg_end_point = true;
}
}
/// <summary>
/// Records the latest confirmed silence frame and, while no start point has been detected
/// yet, drops the buffered data in front of it.
/// </summary>
/// <param name="valid_frame">Frame confirmed as silence.</param>
private void OnSilenceDetected(int valid_frame)
{
    _lastest_confirmed_silence_frame = valid_frame;

    bool startPointPending = _vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected;
    if (!startPointPending)
    {
        return;
    }
    PopDataBufTillFrame(valid_frame);
}
/// <summary>
/// Records the latest confirmed speech frame and moves that single frame to the output
/// buffer as a plain (neither start nor end) frame.
/// </summary>
/// <param name="valid_frame">Frame confirmed as speech.</param>
private void OnVoiceDetected(int valid_frame)
{
    _latest_confirmed_speech_frame = valid_frame;
    PopDataToOutputBuf(
        valid_frame,
        1,
        first_frm_is_start_point: false,
        last_frm_is_end_point: false,
        end_point_is_sent_end: false);
}
/// <summary>
/// Confirms the start frame of a speech segment and, unless this is a fake result raised
/// after the start point was already handled, opens a new segment in the output buffer.
/// </summary>
/// <param name="start_frame">Frame at which speech starts.</param>
/// <param name="fake_result">Whether the start is a fake result.</param>
private void OnVoiceStart(int start_frame, bool fake_result = false)
{
    // The option is read but has no effect here.
    _ = _vad_opts.do_start_point_detection;

    if (_confirmed_start_frame == -1)
    {
        _confirmed_start_frame = start_frame;
    }
    else
    {
        Console.WriteLine("not reset vad properly\n");
    }

    bool startPointPending = _vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected;
    if (fake_result && !startPointPending)
    {
        return;
    }
    PopDataToOutputBuf(_confirmed_start_frame, 1, true, false, false);
}
/// <summary>
/// Flushes every frame between the latest confirmed speech frame and
/// <paramref name="end_frame"/> as speech, confirms the end frame and, for a real result,
/// closes the current segment in the output buffer.
/// </summary>
/// <param name="end_frame">Frame at which speech ends.</param>
/// <param name="fake_result">When set, the segment is not closed in the output buffer.</param>
/// <param name="is_last_frame">Whether this is the last frame of the input.</param>
private void OnVoiceEnd(int end_frame, bool fake_result, bool is_last_frame)
{
    int frame = _latest_confirmed_speech_frame + 1;
    while (frame < end_frame)
    {
        OnVoiceDetected(frame);
        frame++;
    }

    // The option is read but has no effect here.
    _ = _vad_opts.do_end_point_detection;

    if (_confirmed_end_frame == -1)
    {
        _confirmed_end_frame = end_frame;
    }
    else
    {
        Console.WriteLine("not reset vad properly\n");
    }

    if (!fake_result)
    {
        _sil_frame = 0;
        PopDataToOutputBuf(_confirmed_end_frame, 1, false, true, is_last_frame);
    }

    _number_end_time_detected++;
}
/// <summary>
/// When the given frame is the final one, ends the current speech segment at it and moves
/// the state machine to "end point detected"; otherwise does nothing.
/// </summary>
/// <param name="is_final_frame">Whether <paramref name="cur_frm_idx"/> is the final frame.</param>
/// <param name="cur_frm_idx">Index of the current frame.</param>
private void MaybeOnVoiceEndIfLastFrame(bool is_final_frame, int cur_frm_idx)
{
    if (!is_final_frame)
    {
        return;
    }

    OnVoiceEnd(cur_frm_idx, fake_result: false, is_last_frame: true);
    _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
}
/// <summary>
/// Converts the start-point latency from frames to milliseconds.
/// </summary>
/// <returns>Start-point latency multiplied by the frame shift in milliseconds.</returns>
private int GetLatency()
{
    int latencyFrames = LatencyFrmNumAtStartPoint();
    return (int)(latencyFrames * _vad_opts.frame_in_ms);
}
/// <summary>
/// Returns the start-point latency in frames: the window size of the window detector,
/// plus the look-back time expressed in frames when <c>do_extend</c> is non-zero.
/// </summary>
/// <returns>Latency in frames.</returns>
private int LatencyFrmNumAtStartPoint()
{
    int windowFrames = _windows_detector.GetWinSize();
    if (_vad_opts.do_extend == 0)
    {
        return windowFrames;
    }

    int lookbackFrames = (int)(_vad_opts.lookback_time_start_point / _vad_opts.frame_in_ms);
    return windowFrames + lookbackFrames;
}
/// <summary>
/// Classifies frame <paramref name="t"/> as speech or silence from its decibel level and
/// from the model scores of the silence pdf ids, and updates the running noise level when
/// the frame is classified as silence by the score test.
/// </summary>
/// <param name="t">Absolute frame index; used directly for _decibel and, offset by _idx_pre_chunk, for _scores.</param>
/// <returns>kFrameStateSpeech or kFrameStateSil.</returns>
private FrameState GetFrameState(int t)
{
FrameState frame_state = FrameState.kFrameStateInvalid;
double cur_decibel = _decibel[t];
// Signal-to-noise estimate: distance of this frame from the running noise average.
double cur_snr = cur_decibel - _noise_average_decibel;
if (cur_decibel < _vad_opts.decibel_thres)
{
// Too quiet to be speech. The frame is fed to DetectOneFrame right here; note that
// DetectCommonFrames and DetectLastFrames call DetectOneFrame again with the returned state.
frame_state = FrameState.kFrameStateSil;
DetectOneFrame(frame_state, t, false);
return frame_state;
}
double sum_score = 0.0D;
double noise_prob = 0.0D;
Trace.Assert(_sil_pdf_ids.Length == _vad_opts.silence_pdf_num, "");
if (_sil_pdf_ids.Length > 0)
{
Trace.Assert(_scores.GetLength(0) == 1, "只支持batch_size = 1的测试"); // only batch_size = 1 is supported
float[] sil_pdf_scores = new float[_sil_pdf_ids.Length];
int j = 0;
// Collect the scores of the silence pdf ids; _scores is indexed relative to the current chunk.
foreach (int sil_pdf_id in _sil_pdf_ids)
{
sil_pdf_scores[j] = _scores[0,t - _idx_pre_chunk,sil_pdf_id];
j++;
}
sum_score = sil_pdf_scores.Length == 0 ? 0 : sil_pdf_scores.Sum();
noise_prob = Math.Log(sum_score) * _vad_opts.speech_2_noise_ratio;
// The speech score is the complement of the summed silence scores.
double total_score = 1.0D;
sum_score = total_score - sum_score;
}
// Without silence pdf ids sum_score is still 0 here, so this evaluates to Math.Log(0).
double speech_prob = Math.Log(sum_score);
if (_vad_opts.output_frame_probs)
{
E2EVadFrameProbEntity frame_prob = new E2EVadFrameProbEntity();
frame_prob.noise_prob = noise_prob;
frame_prob.speech_prob = speech_prob;
frame_prob.score = sum_score;
frame_prob.frame_id = t;
_frame_probs.Add(frame_prob);
}
if (Math.Exp(speech_prob) >= Math.Exp(noise_prob) + _speech_noise_thres)
{
// The scores favour speech; the frame must also pass the SNR and decibel thresholds.
if (cur_snr >= _vad_opts.snr_thres && cur_decibel >= _vad_opts.decibel_thres)
{
frame_state = FrameState.kFrameStateSpeech;
}
else
{
frame_state = FrameState.kFrameStateSil;
}
}
else
{
frame_state = FrameState.kFrameStateSil;
// Update the noise level: the first noise frame replaces the -100 sentinel, later ones
// enter a running average weighted by noise_frame_num_used_for_snr.
if (_noise_average_decibel < -99.9)
{
_noise_average_decibel = cur_decibel;
}
else
{
_noise_average_decibel = (cur_decibel + _noise_average_decibel * (_vad_opts.noise_frame_num_used_for_snr - 1)) / _vad_opts.noise_frame_num_used_for_snr;
}
}
return frame_state;
}
/// <summary>
/// Processes one chunk of scores and samples, runs frame detection over it and returns the
/// segments (in milliseconds) that have become available.
/// </summary>
/// <param name="score">Model scores of the chunk; its first dimension sizes the returned array.</param>
/// <param name="waveform">Samples of the chunk, used for the per-frame decibel values.</param>
/// <param name="is_final">Whether this is the last chunk; all state is reset afterwards when set.</param>
/// <param name="max_end_sil">Maximum end silence; speech_to_sil_time_thres is subtracted to get the threshold.</param>
/// <param name="online">Selects online reporting, where a start or end can be reported as -1 when it is not part of this call's result.</param>
/// <returns>
/// An array with one entry per batch element; an entry stays null when no segment was
/// collected for it.
/// </returns>
public SegmentEntity[] DefaultCall(float[,,] score, float[] waveform,
bool is_final = false, int max_end_sil = 800, bool online = false
)
{
_max_end_sil_frame_cnt_thresh = max_end_sil - _vad_opts.speech_to_sil_time_thres;
// compute decibel for each frame
ComputeDecibel(waveform);
ComputeScores(score);
if (!is_final)
{
DetectCommonFrames();
}
else
{
DetectLastFrames();
}
int batchSize = score.GetLength(0);
SegmentEntity[] segments = new SegmentEntity[batchSize];
for (int batch_num = 0; batch_num < batchSize; batch_num++) // only support batch_size = 1 now
{
List<int[]> segment_batch = new List<int[]>();
if (_output_data_buf.Count > 0)
{
// Start at the first buffered segment that has not been reported yet.
for (int i = _output_data_buf_offset; i < _output_data_buf.Count; i++)
{
int start_ms;
int end_ms;
if (online)
{
// Online mode: a segment is reported once its start is known; -1 stands for the
// side that is not reported in this call.
if (!_output_data_buf[i].contain_seg_start_point)
{
continue;
}
if (!_next_seg && !_output_data_buf[i].contain_seg_end_point)
{
continue;
}
start_ms = _next_seg ? _output_data_buf[i].start_ms : -1;
if (_output_data_buf[i].contain_seg_end_point)
{
end_ms = _output_data_buf[i].end_ms;
_next_seg = true;
_output_data_buf_offset += 1;
}
else
{
end_ms = -1;
_next_seg = false;
}
}
else
{
// Offline mode: before the final chunk only segments with both a start and an end
// point are reported.
if (!is_final && (!_output_data_buf[i].contain_seg_start_point || !_output_data_buf[i].contain_seg_end_point))
{
continue;
}
start_ms = _output_data_buf[i].start_ms;
end_ms = _output_data_buf[i].end_ms;
_output_data_buf_offset += 1;
}
int[] segment_ms = new int[] { start_ms, end_ms };
segment_batch.Add(segment_ms);
}
}
if (segment_batch.Count > 0)
{
if (segments[batch_num] == null)
{
segments[batch_num] = new SegmentEntity();
}
segments[batch_num].Segment.AddRange(segment_batch);
}
}
if (is_final)
{
// reset class variables and clear the dict for the next query
AllResetDetection();
}
return segments;
}
/// <summary>
/// Feeds the most recent <c>nn_eval_block_size</c> frames (oldest first) through the frame
/// detector for a non-final chunk, then advances <c>_idx_pre_chunk</c>.
/// </summary>
/// <returns>Always 0.</returns>
private int DetectCommonFrames()
{
    if (_vad_state_machine != (int)VadStateMachine.kVadInStateEndPointDetected)
    {
        int back = _vad_opts.nn_eval_block_size - 1;
        while (back > -1)
        {
            FrameState state = GetFrameState(_frm_cnt - 1 - back);
            DetectOneFrame(state, _frm_cnt - 1 - back, false);
            back--;
        }
        _idx_pre_chunk += _scores.GetLength(1) * _scores.GetLength(0); //_scores.shape[1];
    }
    return 0;
}
/// <summary>
/// Feeds the most recent <c>nn_eval_block_size</c> frames (oldest first) through the frame
/// detector for the final chunk; the newest frame is flagged as the final frame.
/// Any exception raised while processing is swallowed.
/// </summary>
/// <returns>Always 0.</returns>
private int DetectLastFrames()
{
    if (_vad_state_machine != (int)VadStateMachine.kVadInStateEndPointDetected)
    {
        try
        {
            int back = _vad_opts.nn_eval_block_size - 1;
            while (back > -1)
            {
                FrameState state = GetFrameState(_frm_cnt - 1 - back);
                if (back == 0)
                {
                    DetectOneFrame(state, _frm_cnt - 1, true);
                }
                else
                {
                    DetectOneFrame(state, _frm_cnt - 1 - back, false);
                }
                back--;
            }
        }
        catch (Exception)
        {
            // NOTE(review): failures here are ignored on purpose in the existing code — confirm
            // whether this should be logged.
        }
    }
    return 0;
}
/// <summary>
/// Advances the VAD state machine by one frame. The raw frame state is passed through
/// <c>_windows_detector</c>, and the resulting change state (sil/speech transition) decides
/// which voice/silence callbacks fire and whether the end-point state is entered.
/// </summary>
/// <param name="cur_frm_state">Raw state of the frame (speech, silence or invalid).</param>
/// <param name="cur_frm_idx">Index of the frame being processed.</param>
/// <param name="is_final_frame">True for the last frame of the input; routes to <c>MaybeOnVoiceEndIfLastFrame</c> where applicable.</param>
private void DetectOneFrame(FrameState cur_frm_state, int cur_frm_idx, bool is_final_frame)
{
// Map the incoming state; anything other than speech/silence stays invalid.
FrameState tmp_cur_frm_state = FrameState.kFrameStateInvalid;
if (cur_frm_state == FrameState.kFrameStateSpeech)
{
// The left-hand side is the constant 1.0, so a speech frame is only demoted to silence
// when fe_prior_thres is >= 1.0.
if (Math.Abs(1.0) > _vad_opts.fe_prior_thres)//Fabs
{
tmp_cur_frm_state = FrameState.kFrameStateSpeech;
}
else
{
tmp_cur_frm_state = FrameState.kFrameStateSil;
}
}
else if (cur_frm_state == FrameState.kFrameStateSil)
{
tmp_cur_frm_state = FrameState.kFrameStateSil;
}
// Window-based smoothing turns the per-frame state into a transition.
AudioChangeState state_change = _windows_detector.DetectOneFrame(tmp_cur_frm_state, cur_frm_idx);
int frm_shift_in_ms = _vad_opts.frame_in_ms;
// ---- silence -> speech ----
if (AudioChangeState.kChangeStateSil2Speech == state_change)
{
int silence_frame_count = _continous_silence_frame_count; // unused local
_continous_silence_frame_count = 0;
_pre_end_silence_detected = false;
int start_frame = 0;
if (_vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected)
{
// Start point: step back by LatencyFrmNumAtStartPoint() frames, but never before the
// start of the data buffer, then report every frame up to the current one as voice.
start_frame = Math.Max(_data_buf_start_frame, cur_frm_idx - LatencyFrmNumAtStartPoint());
OnVoiceStart(start_frame);
_vad_state_machine = (int)VadStateMachine.kVadInStateInSpeechSegment;
for (int t = start_frame + 1; t < cur_frm_idx + 1; t++)
{
OnVoiceDetected(t);
}
}
else if (_vad_state_machine == (int)VadStateMachine.kVadInStateInSpeechSegment)
{
// Already inside a segment: fill the gap since the last confirmed speech frame.
for (int t = _latest_confirmed_speech_frame + 1; t < cur_frm_idx; t++)
{
OnVoiceDetected(t);
}
// Force an end point once the segment exceeds max_single_segment_time.
if (cur_frm_idx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / frm_shift_in_ms)
{
OnVoiceEnd(cur_frm_idx, false, false);
_vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
}
else if (!is_final_frame)
{
OnVoiceDetected(cur_frm_idx);
}
else
{
MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
}
}
else
{
// Returning here skips the reset check at the end of the method.
return;
}
}
// ---- speech -> silence ----
else if (AudioChangeState.kChangeStateSpeech2Sil == state_change)
{
_continous_silence_frame_count = 0;
if (_vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected)
{ return; }
else if (_vad_state_machine == (int)VadStateMachine.kVadInStateInSpeechSegment)
{
// Same max-length / final-frame handling as in the other in-segment branches.
if (cur_frm_idx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / frm_shift_in_ms)
{
OnVoiceEnd(cur_frm_idx, false, false);
_vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
}
else if (!is_final_frame)
{
OnVoiceDetected(cur_frm_idx);
}
else
{
MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
}
}
else
{
return;
}
}
// ---- speech -> speech ----
else if (AudioChangeState.kChangeStateSpeech2Speech == state_change)
{
_continous_silence_frame_count = 0;
if (_vad_state_machine == (int)VadStateMachine.kVadInStateInSpeechSegment)
{
if (cur_frm_idx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / frm_shift_in_ms)
{
// Only this branch records the timeout in _max_time_out.
_max_time_out = true;
OnVoiceEnd(cur_frm_idx, false, false);
_vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
}
else if (!is_final_frame)
{
OnVoiceDetected(cur_frm_idx);
}
else
{
MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
}
}
else
{
return;
}
}
// ---- silence -> silence ----
else if (AudioChangeState.kChangeStateSil2Sil == state_change)
{
_continous_silence_frame_count += 1;
if (_vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected)
{
// Silence timeout (single-utterance mode) or final frame with no end point detected so far:
// emit a zero-length decision.
if (((_vad_opts.detect_mode == (int)VadDetectMode.kVadSingleUtteranceDetectMode) && (
_continous_silence_frame_count * frm_shift_in_ms > _vad_opts.max_start_silence_time)) || (is_final_frame && _number_end_time_detected == 0))
{
for (int t = _lastest_confirmed_silence_frame + 1; t < cur_frm_idx; t++)
{
OnSilenceDetected(t);
}
OnVoiceStart(0, true);
OnVoiceEnd(0, true, false);
_vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
}
else
{
// Confirm silence for the frame LatencyFrmNumAtStartPoint() frames behind the current one.
if (cur_frm_idx >= LatencyFrmNumAtStartPoint())
{
OnSilenceDetected(cur_frm_idx - LatencyFrmNumAtStartPoint());
}
}
}
else if (_vad_state_machine == (int)VadStateMachine.kVadInStateInSpeechSegment)
{
// Enough trailing silence (in ms) has accumulated: close the segment.
if (_continous_silence_frame_count * frm_shift_in_ms >= _max_end_sil_frame_cnt_thresh)
{
// Place the end point lookback_frame frames before the current frame.
int lookback_frame = (int)(_max_end_sil_frame_cnt_thresh / frm_shift_in_ms);
if (_vad_opts.do_extend != 0)
{
// With do_extend, keep lookahead_time_end_point (plus one frame) of the trailing silence.
lookback_frame -= (int)(_vad_opts.lookahead_time_end_point / frm_shift_in_ms);
lookback_frame -= 1;
lookback_frame = Math.Max(0, lookback_frame);
}
OnVoiceEnd(cur_frm_idx - lookback_frame, false, false);
_vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
}
else if (cur_frm_idx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / frm_shift_in_ms)
{
OnVoiceEnd(cur_frm_idx, false, false);
_vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
}
else if (_vad_opts.do_extend != 0 && !is_final_frame)
{
// Silence frames within the look-ahead window are still reported as voice.
if (_continous_silence_frame_count <= (int)(_vad_opts.lookahead_time_end_point / frm_shift_in_ms))
{
OnVoiceDetected(cur_frm_idx);
}
}
else
{
MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
}
}
else
{
return;
}
}
// In multiple-utterance mode, reset after an end point so detection can continue.
if (_vad_state_machine == (int)VadStateMachine.kVadInStateEndPointDetected && _vad_opts.detect_mode == (int)VadDetectMode.kVadMutipleUtteranceDetectMode)
{
ResetDetection();
}
}
}
}

View File

@@ -0,0 +1,17 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AliFsmnVadSharp.Model
{
    /// <summary>
    /// Simple holder for two float lists, <see cref="Means"/> and <see cref="Vars"/>.
    /// Both start out as empty lists.
    /// </summary>
    internal class CmvnEntity
    {
        /// <summary>Mean values; empty by default.</summary>
        public List<float> Means { get; set; } = new List<float>();

        /// <summary>Variance-related values; empty by default.</summary>
        public List<float> Vars { get; set; } = new List<float>();
    }
}

View File

@@ -0,0 +1,23 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AliFsmnVadSharp.Model
{
    /// <summary>
    /// Per-frame probability record: noise/speech probabilities, a score, the frame id and a
    /// frame state. Every value defaults to zero.
    /// </summary>
    internal class E2EVadFrameProbEntity
    {
        /// <summary>Noise probability; 0 by default.</summary>
        public double noise_prob { get; set; } = 0.0;

        /// <summary>Speech probability; 0 by default.</summary>
        public double speech_prob { get; set; } = 0.0;

        /// <summary>Score; 0 by default.</summary>
        public double score { get; set; } = 0.0;

        /// <summary>Frame id; 0 by default.</summary>
        public int frame_id { get; set; } = 0;

        /// <summary>Frame state stored as an int; 0 by default.</summary>
        public int frm_state { get; set; } = 0;
    }
}

View File

@@ -0,0 +1,98 @@
// AliFsmnVadSharp, Version=1.0.0.0, Culture=neutral, PublicKeyToken=null
// AliFsmnVadSharp.Model.E2EVadSpeechBufWithDoaEntity
/// <summary>
/// One buffered speech segment: start/end time in milliseconds, an optional byte buffer,
/// flags telling whether the segment start and end points are contained, and a DOA value.
/// </summary>
internal class E2EVadSpeechBufWithDoaEntity
{
    /// <summary>Segment start in milliseconds.</summary>
    public int start_ms { get; set; } = 0;

    /// <summary>Segment end in milliseconds.</summary>
    public int end_ms { get; set; } = 0;

    /// <summary>Attached bytes; null until assigned or until <see cref="Reset"/> is called.</summary>
    public byte[]? buffer { get; set; }

    /// <summary>True when the segment start point is contained.</summary>
    public bool contain_seg_start_point { get; set; } = false;

    /// <summary>True when the segment end point is contained.</summary>
    public bool contain_seg_end_point { get; set; } = false;

    /// <summary>DOA value; 0 by default.</summary>
    public int doa { get; set; } = 0;

    /// <summary>
    /// Restores all members to their zero/false state and replaces the buffer with a new
    /// empty array (not null).
    /// </summary>
    public void Reset()
    {
        start_ms = 0;
        end_ms = 0;
        buffer = new byte[0];
        contain_seg_start_point = false;
        contain_seg_end_point = false;
        doa = 0;
    }
}

View File

@@ -0,0 +1,35 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AliFsmnVadSharp.Model
{
    /// <summary>
    /// Encoder configuration values with their built-in defaults. Property names are
    /// snake_case and must not be renamed, since callers refer to them by these names.
    /// </summary>
    public class EncoderConfEntity
    {
        /// <summary>Defaults to 400.</summary>
        public int input_dim { get; set; } = 400;

        /// <summary>Defaults to 140.</summary>
        public int input_affine_dim { get; set; } = 140;

        /// <summary>Defaults to 4.</summary>
        public int fsmn_layers { get; set; } = 4;

        /// <summary>Defaults to 250.</summary>
        public int linear_dim { get; set; } = 250;

        /// <summary>Defaults to 128.</summary>
        public int proj_dim { get; set; } = 128;

        /// <summary>Defaults to 20.</summary>
        public int lorder { get; set; } = 20;

        /// <summary>Defaults to 0.</summary>
        public int rorder { get; set; } = 0;

        /// <summary>Defaults to 1.</summary>
        public int lstride { get; set; } = 1;

        /// <summary>Defaults to 0.</summary>
        public int rstride { get; set; } = 0;

        /// <summary>Defaults to 140.</summary>
        public int output_affine_dim { get; set; } = 140;

        /// <summary>Defaults to 248.</summary>
        public int output_dim { get; set; } = 248;
    }
}

View File

@@ -0,0 +1,29 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AliFsmnVadSharp.Model
{
    /// <summary>
    /// Front-end (feature extraction) configuration values with their built-in defaults.
    /// Property names are snake_case and must not be renamed.
    /// </summary>
    public class FrontendConfEntity
    {
        /// <summary>Defaults to 16000.</summary>
        public int fs { get; set; } = 16000;

        /// <summary>Defaults to "hamming".</summary>
        public string window { get; set; } = "hamming";

        /// <summary>Defaults to 80.</summary>
        public int n_mels { get; set; } = 80;

        /// <summary>Defaults to 25.</summary>
        public int frame_length { get; set; } = 25;

        /// <summary>Defaults to 10.</summary>
        public int frame_shift { get; set; } = 10;

        /// <summary>Defaults to 0.</summary>
        public float dither { get; set; } = 0.0F;

        /// <summary>Defaults to 5.</summary>
        public int lfr_m { get; set; } = 5;

        /// <summary>Defaults to 1.</summary>
        public int lfr_n { get; set; } = 1;
    }
}

View File

@@ -0,0 +1,22 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AliFsmnVadSharp.Model
{
    /// <summary>
    /// Result container holding a list of int arrays (<see cref="Segment"/>) and a list of
    /// float arrays (<see cref="Waveform"/>). Both lists start out empty.
    /// </summary>
    public class SegmentEntity
    {
        /// <summary>Segment entries; empty by default.</summary>
        public List<int[]> Segment { get; set; } = new List<int[]>();

        /// <summary>Waveform entries; empty by default.</summary>
        public List<float[]> Waveform { get; set; } = new List<float[]>();
    }
}

View File

@@ -0,0 +1,23 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AliFsmnVadSharp.Model
{
    /// <summary>
    /// Bundle of inputs for one VAD pass: speech data and its length, input caches,
    /// the waveform and the scorer instance.
    /// </summary>
    internal class VadInputEntity
    {
        /// <summary>Speech data; null until assigned.</summary>
        public float[]? Speech { get; set; }

        /// <summary>Speech length; 0 by default.</summary>
        public int SpeechLength { get; set; }

        /// <summary>Input caches; empty list by default.</summary>
        public List<float[]> InCaches { get; set; } = new List<float[]>();

        // NOTE(review): declared non-nullable, but the value is null until a caller assigns it.
        /// <summary>Waveform samples.</summary>
        public float[] Waveform { get; set; } = null!;

        // NOTE(review): also null until a caller assigns it.
        /// <summary>Scorer associated with this input.</summary>
        internal E2EVadModel VadScorer { get; set; } = null!;
    }
}

View File

@@ -0,0 +1,19 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AliFsmnVadSharp.Model
{
    /// <summary>
    /// Bundle of outputs from one VAD pass: the 3-D score array, output caches and the waveform.
    /// </summary>
    internal class VadOutputEntity
    {
        /// <summary>Scores; null until assigned.</summary>
        public float[,,]? Scores { get; set; }

        /// <summary>Output caches; empty list by default.</summary>
        public List<float[]> OutCaches { get; set; } = new List<float[]>();

        // NOTE(review): declared non-nullable, but the value is null until a caller assigns it.
        /// <summary>Waveform samples.</summary>
        public float[] Waveform { get; set; } = null!;
    }
}

View File

@@ -0,0 +1,72 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AliFsmnVadSharp.Model
{
    /// <summary>
    /// VAD post-processing configuration values with their built-in defaults.
    /// Property names are snake_case and must not be renamed.
    /// </summary>
    public class VadPostConfEntity
    {
        public int sample_rate { get; set; } = 16000;
        public int detect_mode { get; set; } = 1;
        public int snr_mode { get; set; } = 0;

        // Time values below carry "time"/"ms" in their names and default to millisecond-scale numbers.
        public int max_end_silence_time { get; set; } = 800;
        public int max_start_silence_time { get; set; } = 3000;
        public bool do_start_point_detection { get; set; } = true;
        public bool do_end_point_detection { get; set; } = true;
        public int window_size_ms { get; set; } = 200;
        public int sil_to_speech_time_thres { get; set; } = 150;
        public int speech_to_sil_time_thres { get; set; } = 150;
        public float speech_2_noise_ratio { get; set; } = 1.0F;
        public int do_extend { get; set; } = 1;
        public int lookback_time_start_point { get; set; } = 200;
        public int lookahead_time_end_point { get; set; } = 100;
        public int max_single_segment_time { get; set; } = 60000;
        public int nn_eval_block_size { get; set; } = 8;
        public int dcd_block_size { get; set; } = 4;

        // Thresholds.
        public float snr_thres { get; set; } = -100.0F;
        public int noise_frame_num_used_for_snr { get; set; } = 100;
        public float decibel_thres { get; set; } = -100.0F;
        public float speech_noise_thres { get; set; } = 0.6F;
        public float fe_prior_thres { get; set; } = 0.0001F;
        public int silence_pdf_num { get; set; } = 1;
        public int[] sil_pdf_ids { get; set; } = new int[] { 0 };
        public float speech_noise_thresh_low { get; set; } = -0.1F;
        public float speech_noise_thresh_high { get; set; } = 0.3F;
        public bool output_frame_probs { get; set; } = false;

        // Frame geometry.
        public int frame_in_ms { get; set; } = 10;
        public int frame_length_ms { get; set; } = 25;
    }
}

View File

@@ -0,0 +1,27 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AliFsmnVadSharp.Model
{
    /// <summary>
    /// Top-level configuration object: input size, component names and the three nested
    /// configuration sections (front end, encoder, VAD post-processing).
    /// Property names are snake_case and must not be renamed.
    /// </summary>
    internal class VadYamlEntity
    {
        /// <summary>Input size; 0 by default.</summary>
        public int input_size { get; set; }

        /// <summary>Front-end name; defaults to "wav_frontend".</summary>
        public string frontend { get; set; } = "wav_frontend";

        /// <summary>Model name; defaults to "e2evad".</summary>
        public string model { get; set; } = "e2evad";

        /// <summary>Encoder name; defaults to "fsmn".</summary>
        public string encoder { get; set; } = "fsmn";

        /// <summary>Front-end settings; a default-constructed instance unless replaced.</summary>
        public FrontendConfEntity frontend_conf { get; set; } = new FrontendConfEntity();

        /// <summary>Encoder settings; a default-constructed instance unless replaced.</summary>
        public EncoderConfEntity encoder_conf { get; set; } = new EncoderConfEntity();

        /// <summary>VAD post-processing settings; a default-constructed instance unless replaced.</summary>
        public VadPostConfEntity model_conf { get; set; } = new VadPostConfEntity();
    }
}

View File

@@ -0,0 +1,28 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Text.Json;
using YamlDotNet.Serialization;
namespace AliFsmnVadSharp.Utils
{
    /// <summary>
    /// Helper for reading a YAML file into an object.
    /// </summary>
    internal class YamlHelper
    {
        /// <summary>
        /// Deserializes the YAML file at <paramref name="yamlFilePath"/> into <typeparamref name="T"/>.
        /// </summary>
        /// <typeparam name="T">Target type of the deserialization.</typeparam>
        /// <param name="yamlFilePath">Path of the YAML file to read.</param>
        /// <returns>
        /// The deserialized object, or <c>default(T)</c> (null for reference types) when the
        /// file does not exist.
        /// </returns>
        public static T ReadYaml<T>(string yamlFilePath)
        {
            if (!File.Exists(yamlFilePath))
            {
#pragma warning disable CS8603 // Possible null reference return.
                return default(T);
#pragma warning restore CS8603 // Possible null reference return.
            }

            // The using declaration closes the reader even when deserialization throws.
            using StreamReader yamlReader = File.OpenText(yamlFilePath);
            Deserializer yamlDeserializer = new Deserializer();
            return yamlDeserializer.Deserialize<T>(yamlReader);
        }
    }
}

View File

@@ -0,0 +1,180 @@
using AliFsmnVadSharp.Model;
using KaldiNativeFbankSharp;
namespace AliFsmnVadSharp
{
    /// <summary>
    /// Feature front end: computes fbank features through <c>OnlineFbank</c>, then optionally
    /// stacks frames (LFR) and applies the shift/scale vectors loaded from the mvn file.
    /// </summary>
    internal class WavFrontend : IDisposable
    {
        private bool _disposed;
        private FrontendConfEntity _frontendConfEntity;
        OnlineFbank _onlineFbank;
        private CmvnEntity _cmvnEntity;

        /// <summary>
        /// Creates the fbank extractor from the front-end configuration and loads the mvn file.
        /// </summary>
        /// <param name="mvnFilePath">Path of the mvn text file read by <c>LoadCmvn</c>.</param>
        /// <param name="frontendConfEntity">Front-end configuration (sample rate, mel bins, dither, LFR factors).</param>
        public WavFrontend(string mvnFilePath, FrontendConfEntity frontendConfEntity)
        {
            _frontendConfEntity = frontendConfEntity;
            _onlineFbank = new OnlineFbank(
                dither: _frontendConfEntity.dither,
                snip_edges: true,
                sample_rate: _frontendConfEntity.fs,
                num_bins: _frontendConfEntity.n_mels
                );
            _cmvnEntity = LoadCmvn(mvnFilePath);
        }

        /// <summary>
        /// Multiplies every sample by 32768 and returns the fbank features for the result.
        /// The input array is not modified.
        /// </summary>
        /// <param name="samples">Audio samples (presumably normalized floats — confirm with the caller).</param>
        /// <returns>Flattened fbank features as returned by <c>OnlineFbank.GetFbank</c>.</returns>
        public float[] GetFbank(float[] samples)
        {
            float[] scaled = samples.Select((float x) => x * 32768f).ToArray();
            return _onlineFbank.GetFbank(scaled);
        }

        /// <summary>
        /// Applies LFR stacking (unless both <c>lfr_m</c> and <c>lfr_n</c> are 1) followed by CMVN.
        /// </summary>
        /// <param name="fbanks">Flattened fbank features.</param>
        /// <returns>The processed features.</returns>
        public float[] LfrCmvn(float[] fbanks)
        {
            float[] features = fbanks;
            if (_frontendConfEntity.lfr_m != 1 || _frontendConfEntity.lfr_n != 1)
            {
                features = ApplyLfr(fbanks, _frontendConfEntity.lfr_m, _frontendConfEntity.lfr_n);
            }
            if (_cmvnEntity != null)
            {
                features = ApplyCmvn(features);
            }
            return features;
        }

        /// <summary>
        /// Normalizes <paramref name="inputs"/> in place: each value has the matching Means entry
        /// added and is then multiplied by the matching Vars entry.
        /// </summary>
        /// <param name="inputs">Flattened features; modified in place.</param>
        /// <returns>The same array instance that was passed in.</returns>
        private float[] ApplyCmvn(float[] inputs)
        {
            float[] neg_mean = _cmvnEntity.Means.ToArray();
            float[] inv_stddev = _cmvnEntity.Vars.ToArray();
            int dim = neg_mean.Length;
            int num_frames = inputs.Length / dim;
            for (int i = 0; i < num_frames; i++)
            {
                int rowStart = dim * i;
                for (int k = 0; k < dim; k++)
                {
                    inputs[rowStart + k] = (inputs[rowStart + k] + neg_mean[k]) * inv_stddev[k];
                }
            }
            return inputs;
        }

        /// <summary>
        /// Stacks <paramref name="lfr_m"/> consecutive frames into one output frame, advancing
        /// <paramref name="lfr_n"/> frames per output. The sequence is left-padded with
        /// <c>(lfr_m - 1) / 2</c> copies of the first frame, and the tail is padded with copies
        /// of the last frame when fewer than <paramref name="lfr_m"/> frames remain.
        /// </summary>
        /// <param name="inputs">Flattened features, <c>n_mels</c> values per frame.</param>
        /// <param name="lfr_m">Number of frames stacked per output frame.</param>
        /// <param name="lfr_n">Frame stride between output frames.</param>
        /// <returns>Flattened stacked features; empty when the input holds no complete frame.</returns>
        public float[] ApplyLfr(float[] inputs, int lfr_m, int lfr_n)
        {
            // Frame width follows the configured mel-bin count (80 with the default configuration).
            int dim = _frontendConfEntity.n_mels;
            int t = inputs.Length / dim;
            if (t == 0)
            {
                return Array.Empty<float>();
            }
            int t_lfr = t / lfr_n;

            // Left padding: replicate the first frame tile_x times in front of the data.
            int tile_x = (lfr_m - 1) / 2;
            t = t + tile_x;
            float[] padded = new float[t * dim];
            for (int i = 0; i < tile_x; i++)
            {
                Array.Copy(inputs, 0, padded, i * dim, dim);
            }
            Array.Copy(inputs, 0, padded, tile_x * dim, inputs.Length);

            float[] LFR_outputs = new float[t_lfr * lfr_m * dim];
            for (int i = 0; i < t_lfr; i++)
            {
                int remaining = t - i * lfr_n;
                if (lfr_m <= remaining)
                {
                    Array.Copy(padded, i * lfr_n * dim, LFR_outputs, i * lfr_m * dim, lfr_m * dim);
                }
                else
                {
                    // process last LFR frame: copy what is left, then repeat the final frame
                    int num_padding = lfr_m - remaining;
                    float[] frame = new float[lfr_m * dim];
                    Array.Copy(padded, i * lfr_n * dim, frame, 0, remaining * dim);
                    for (int j = 0; j < num_padding; j++)
                    {
                        Array.Copy(padded, (t - 1) * dim, frame, (lfr_m - num_padding + j) * dim, dim);
                    }
                    Array.Copy(frame, 0, LFR_outputs, i * lfr_m * dim, frame.Length);
                }
            }
            return LFR_outputs;
        }

        /// <summary>
        /// Reads the mvn text file. The bracketed numbers on the <c>&lt;LearnRateCoef&gt;</c> line
        /// following <c>&lt;AddShift&gt;</c> become Means; those following <c>&lt;Rescale&gt;</c>
        /// become Vars.
        /// </summary>
        /// <param name="mvnFilePath">Path of the mvn file.</param>
        /// <returns>The parsed entity; a list stays empty when its section is missing.</returns>
        private CmvnEntity LoadCmvn(string mvnFilePath)
        {
            List<float> means_list = new List<float>();
            List<float> vars_list = new List<float>();
            FileStreamOptions options = new FileStreamOptions();
            options.Access = FileAccess.Read;
            options.Mode = FileMode.Open;

            // 0 = no section yet, 1 = inside <AddShift>, 2 = inside <Rescale>.
            int section = 0;
            using (StreamReader srtReader = new StreamReader(mvnFilePath, options))
            {
                while (!srtReader.EndOfStream)
                {
                    string? strLine = srtReader.ReadLine();
                    if (string.IsNullOrEmpty(strLine))
                    {
                        continue;
                    }
                    if (strLine.StartsWith("<AddShift>", StringComparison.Ordinal))
                    {
                        section = 1;
                    }
                    else if (strLine.StartsWith("<Rescale>", StringComparison.Ordinal))
                    {
                        section = 2;
                    }
                    else if (strLine.StartsWith("<LearnRateCoef>", StringComparison.Ordinal))
                    {
                        if (section == 1)
                        {
                            means_list = ParseBracketedFloats(strLine);
                        }
                        else if (section == 2)
                        {
                            vars_list = ParseBracketedFloats(strLine);
                        }
                    }
                }
            }

            CmvnEntity cmvnEntity = new CmvnEntity();
            cmvnEntity.Means = means_list;
            cmvnEntity.Vars = vars_list;
            return cmvnEntity;
        }

        /// <summary>
        /// Parses the space-separated numbers between the first '[' and the last ']' of a line.
        /// </summary>
        private static List<float> ParseBracketedFloats(string line)
        {
            int open = line.IndexOf('[');
            int close = line.LastIndexOf(']');
            string inner = line.Substring(open + 1, close - open - 1);
            // The file is machine-written, so numbers are parsed culture-independently.
            return inner.Split(' ')
                .Where(x => !string.IsNullOrEmpty(x))
                .Select(x => float.Parse(x.Trim(), System.Globalization.CultureInfo.InvariantCulture))
                .ToList();
        }

        /// <summary>
        /// Releases the fbank extractor when called with <paramref name="disposing"/> = true.
        /// </summary>
        /// <param name="disposing">True when called from <see cref="Dispose()"/>, false from the finalizer.</param>
        protected virtual void Dispose(bool disposing)
        {
            if (_disposed)
            {
                return;
            }
            if (disposing)
            {
                _onlineFbank?.Dispose();
            }
            _disposed = true;
        }

        /// <summary>Disposes managed resources and suppresses finalization.</summary>
        public void Dispose()
        {
            Dispose(disposing: true);
            GC.SuppressFinalize(this);
        }

        ~WavFrontend()
        {
            // Finalizer path: managed members must not be touched.
            Dispose(disposing: false);
        }
    }
}

View File

@@ -0,0 +1,156 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AliFsmnVadSharp
{
    /// <summary>
    /// Per-frame classification: speech, silence, or invalid.
    /// </summary>
    public enum FrameState
    {
        kFrameStateInvalid = -1,
        kFrameStateSil = 0,
        kFrameStateSpeech = 1
    }

    /// <summary>
    /// Transition reported by <c>WindowDetector.DetectOneFrame</c> for a frame.
    /// </summary>
    public enum AudioChangeState
    {
        kChangeStateSpeech2Speech = 0,
        kChangeStateSpeech2Sil = 1,
        kChangeStateSil2Sil = 2,
        kChangeStateSil2Speech = 3,
        kChangeStateNoBegin = 4,
        kChangeStateInvalid = 5
    }

    /// <summary>
    /// Sliding-window smoother over per-frame speech/silence decisions. It keeps the last
    /// <c>window_size_ms / frame_size_ms</c> frame states in a ring buffer and switches between
    /// silence and speech when the number of speech frames in the window crosses a threshold.
    /// </summary>
    internal class WindowDetector
    {
        // Constructor arguments, kept as given.
        private int _window_size_ms;
        private int _sil_to_speech_time;
        private int _speech_to_sil_time;
        private int _frame_size_ms;

        // Ring buffer of 0/1 frame states and the running count of speech frames in it.
        private int _win_size_frame;
        private int _win_sum;
        private int[] _win_state = new int[0];
        private int _cur_win_pos;

        private int _pre_frame_state = (int)FrameState.kFrameStateSil;
        private int _cur_frame_state = (int)FrameState.kFrameStateSil;

        // Thresholds expressed in frames.
        private int _sil_to_speech_frmcnt_thres;
        private int _speech_to_sil_frmcnt_thres;

        private int _voice_last_frame_count;
        private int _noise_last_frame_count;
        private int _hydre_frame_count;

        /// <summary>Creates a detector with a zero-length window.</summary>
        public WindowDetector()
        {
        }

        /// <summary>
        /// Creates a detector; all time arguments are divided by <paramref name="frame_size_ms"/>
        /// (integer division) to obtain frame counts.
        /// </summary>
        public WindowDetector(int window_size_ms, int sil_to_speech_time, int speech_to_sil_time, int frame_size_ms)
        {
            _window_size_ms = window_size_ms;
            _sil_to_speech_time = sil_to_speech_time;
            _speech_to_sil_time = speech_to_sil_time;
            _frame_size_ms = frame_size_ms;

            _win_size_frame = window_size_ms / frame_size_ms;
            _sil_to_speech_frmcnt_thres = sil_to_speech_time / frame_size_ms;
            _speech_to_sil_frmcnt_thres = speech_to_sil_time / frame_size_ms;

            // Window contents, position, states and counters all start from their reset values.
            Reset();
        }

        /// <summary>Clears the window and returns the detector to the silence state.</summary>
        public void Reset()
        {
            _win_state = new int[_win_size_frame];
            _win_sum = 0;
            _cur_win_pos = 0;
            _pre_frame_state = (int)FrameState.kFrameStateSil;
            _cur_frame_state = (int)FrameState.kFrameStateSil;
            _voice_last_frame_count = 0;
            _noise_last_frame_count = 0;
            _hydre_frame_count = 0;
        }

        /// <summary>Returns the window length in frames.</summary>
        public int GetWinSize() => _win_size_frame;

        /// <summary>
        /// Pushes one frame state into the window and reports the resulting transition.
        /// </summary>
        /// <param name="frameState">Raw state of the frame; an invalid state leaves the window untouched.</param>
        /// <param name="frame_count">Not used by this method.</param>
        /// <returns>The transition for this frame, or <c>kChangeStateInvalid</c> for an invalid input.</returns>
        public AudioChangeState DetectOneFrame(FrameState frameState, int frame_count)
        {
            switch (frameState)
            {
                case FrameState.kFrameStateSpeech:
                    _cur_frame_state = 1;
                    break;
                case FrameState.kFrameStateSil:
                    _cur_frame_state = 0;
                    break;
                default:
                    _cur_frame_state = (int)FrameState.kFrameStateSil;
                    return AudioChangeState.kChangeStateInvalid;
            }

            // Replace the oldest slot and keep the running sum in step.
            _win_sum += _cur_frame_state - _win_state[_cur_win_pos];
            _win_state[_cur_win_pos] = _cur_frame_state;
            _cur_win_pos = (_cur_win_pos + 1) % _win_size_frame;

            if (_pre_frame_state == (int)FrameState.kFrameStateSil)
            {
                if (_win_sum >= _sil_to_speech_frmcnt_thres)
                {
                    _pre_frame_state = (int)FrameState.kFrameStateSpeech;
                    return AudioChangeState.kChangeStateSil2Speech;
                }
                return AudioChangeState.kChangeStateSil2Sil;
            }

            if (_pre_frame_state == (int)FrameState.kFrameStateSpeech)
            {
                if (_win_sum <= _speech_to_sil_frmcnt_thres)
                {
                    _pre_frame_state = (int)FrameState.kFrameStateSil;
                    return AudioChangeState.kChangeStateSpeech2Sil;
                }
                return AudioChangeState.kChangeStateSpeech2Speech;
            }

            return AudioChangeState.kChangeStateInvalid;
        }

        private int FrameSizeMs() => _frame_size_ms;
    }
}

View File

@@ -0,0 +1,59 @@
# AliFsmnVadSharp
##### 简介:
项目中使用的VAD模型是阿里巴巴达摩院提供的FSMN-Monophone VAD模型。
**项目基于 .NET 6.0，使用 C# 编写，调用 Microsoft.ML.OnnxRuntime 对 onnx 模型进行解码，支持跨平台编译。项目以库的形式进行调用，部署非常方便。**
VAD整体流程的rtf在0.008左右。
##### 用途:
16k中文通用VAD模型可用于检测长语音片段中有效语音的起止时间点.
FSMN-Monophone VAD是达摩院语音团队提出的高效语音端点检测模型，用于检测输入音频中有效语音的起止时间点信息，并将检测出来的有效音频片段输入识别引擎进行识别，减少无效语音带来的识别错误。
##### VAD常用参数调整说明（参考vad.yaml文件）：
max_end_silence_time：尾部连续检测到多长时间静音进行尾点判停，参数范围500ms～6000ms，默认值800ms（该值过低容易出现语音提前截断的情况）。
speech_noise_thres：speech的得分减去noise的得分大于此值则判断为speech，参数范围：（-1,1）。
取值越趋于-1，噪音被误判定为语音的概率越大，FA越高；
取值越趋于+1，语音被误判定为噪音的概率越大，Pmiss越高。
通常情况下，该值会根据当前模型在长语音测试集上的效果取balance。
##### 模型获取
##### 调用方式:
###### 1.添加项目引用
using AliFsmnVadSharp;
###### 2.初始化模型和配置
```csharp
string applicationBase = AppDomain.CurrentDomain.BaseDirectory;
string modelFilePath = applicationBase + "./speech_fsmn_vad_zh-cn-16k-common-pytorch/model.onnx";
string configFilePath = applicationBase + "./speech_fsmn_vad_zh-cn-16k-common-pytorch/vad.yaml";
string mvnFilePath = applicationBase + "./speech_fsmn_vad_zh-cn-16k-common-pytorch/vad.mvn";
int batchSize = 2;//批量解码
AliFsmnVad aliFsmnVad = new AliFsmnVad(modelFilePath, configFilePath, mvnFilePath, batchSize);
```
###### 3.调用
方法一(适用于小文件)
```csharp
SegmentEntity[] segments_duration = aliFsmnVad.GetSegments(samples);
```
方法二(适用于大文件)
```csharp
SegmentEntity[] segments_duration = aliFsmnVad.GetSegmentsByStep(samples);
```
###### 4.输出结果:
```
load model and init config elapsed_milliseconds:463.5390625
vad infer result:
[[70,2340][2620,6200][6480,23670][23950,26250][26780,28990][29950,31430][31750,37600][38210,46900][47310,49630][49910,56460][56740,59540][59820,70450]]
elapsed_milliseconds:662.796875
total_duration:70470.625
rtf:0.009405292985552491
```
输出的数据，例如：[70,2340]，是以毫秒为单位的segment的起止时间，可以以此为依据对音频进行分片。其中静音、噪音部分已被去除。
其他说明:
测试用例：AliFsmnVadSharp.Examples。
测试环境：windows11。
测试用例中samples的计算，使用的是NAudio库。
通过以下链接了解更多:
https://www.modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary