Real-time speech recognition from microphone for .Net (#192)
* Real-time speech recognition from a microphone for .Net, supporting Linux, macOS, and Windows
* Use PortAudioSharp2; we will maintain it ourselves. The project is at https://github.com/csukuangfj/PortAudioSharp2
* Minor fixes
1 parent 513dfaa, commit 0dd2d41
Showing 5 changed files with 279 additions and 1 deletion.
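At its core, the new example wires a PortAudio input callback to a sherpa-onnx OnlineStream and polls the recognizer for results. Below is a condensed sketch of that flow using the same SherpaOnnx and PortAudioSharp2 calls as the full Program.cs that follows; the model paths and the fixed 16 kHz sample rate are placeholders, and device selection, command-line options, endpointing rules, and error handling are omitted here.

using PortAudioSharp;
using SherpaOnnx;
using System;
using System.Runtime.InteropServices;
using System.Threading;

// Hypothetical model paths; a 16 kHz streaming zipformer transducer is assumed.
var config = new OnlineRecognizerConfig();
config.FeatConfig.SampleRate = 16000;
config.FeatConfig.FeatureDim = 80;
config.TransducerModelConfig.Encoder = "encoder.onnx";
config.TransducerModelConfig.Decoder = "decoder.onnx";
config.TransducerModelConfig.Joiner = "joiner.onnx";
config.TransducerModelConfig.Tokens = "tokens.txt";
var recognizer = new OnlineRecognizer(config);
var s = recognizer.CreateStream();

// Capture mono float32 audio from the default input device.
PortAudio.Initialize();
StreamParameters param = new StreamParameters();
param.device = PortAudio.DefaultInputDevice;
param.channelCount = 1;
param.sampleFormat = SampleFormat.Float32;
param.suggestedLatency = PortAudio.GetDeviceInfo(param.device).defaultLowInputLatency;
param.hostApiSpecificStreamInfo = IntPtr.Zero;

PortAudioSharp.Stream.Callback callback = (IntPtr input, IntPtr output, UInt32 frameCount,
    ref StreamCallbackTimeInfo timeInfo, StreamCallbackFlags statusFlags, IntPtr userData) =>
{
  var samples = new float[frameCount];
  Marshal.Copy(input, samples, 0, (Int32)frameCount);
  s.AcceptWaveform(16000, samples);                    // push captured audio to the recognizer
  return StreamCallbackResult.Continue;
};

var stream = new PortAudioSharp.Stream(inParams: param, outParams: null, sampleRate: 16000,
    framesPerBuffer: 0, streamFlags: StreamFlags.ClipOff, callback: callback, userData: IntPtr.Zero);
stream.Start();

while (true)
{
  while (recognizer.IsReady(s)) recognizer.Decode(s);  // drain buffered audio
  Console.Write($"\r{recognizer.GetResult(s).Text}");
  if (recognizer.IsEndpoint(s)) recognizer.Reset(s);   // start a new recognition segment
  Thread.Sleep(200);
}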
234 changes: 234 additions & 0 deletions
dotnet-examples/speech-recognition-from-microphone/Program.cs
// Copyright (c) 2023 Xiaomi Corporation
//
// This file shows how to use a streaming model for real-time speech
// recognition from a microphone.
// Please refer to
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html
// to download streaming models

using CommandLine.Text;
using CommandLine;
using PortAudioSharp;
using System.Threading;
using SherpaOnnx;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System;

class OnlineDecodeFiles
{
  class Options
  {
    [Option(Required = true, HelpText = "Path to tokens.txt")]
    public string Tokens { get; set; }

    [Option(Required = false, Default = "cpu", HelpText = "Provider, e.g., cpu, coreml")]
    public string Provider { get; set; }

    [Option(Required = true, HelpText = "Path to encoder.onnx")]
    public string Encoder { get; set; }

    [Option(Required = true, HelpText = "Path to decoder.onnx")]
    public string Decoder { get; set; }

    [Option(Required = true, HelpText = "Path to joiner.onnx")]
    public string Joiner { get; set; }

    [Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")]
    public int NumThreads { get; set; }

    [Option("decoding-method", Required = false, Default = "greedy_search",
        HelpText = "Valid decoding methods are: greedy_search, modified_beam_search")]
    public string DecodingMethod { get; set; }

    [Option(Required = false, Default = false, HelpText = "True to show model info during loading")]
    public bool Debug { get; set; }

    [Option("sample-rate", Required = false, Default = 16000, HelpText = "Sample rate of the data used to train the model")]
    public int SampleRate { get; set; }

    [Option("max-active-paths", Required = false, Default = 4,
        HelpText = @"Used only when --decoding-method is modified_beam_search.
It specifies the number of active paths to keep during the search")]
    public int MaxActivePaths { get; set; }

    [Option("enable-endpoint", Required = false, Default = true,
        HelpText = "True to enable endpoint detection.")]
    public bool EnableEndpoint { get; set; }

    [Option("rule1-min-trailing-silence", Required = false, Default = 2.4F,
        HelpText = @"An endpoint is detected if trailing silence in seconds is
larger than this value even if nothing has been decoded. Used only when --enable-endpoint is true.")]
    public float Rule1MinTrailingSilence { get; set; }

    [Option("rule2-min-trailing-silence", Required = false, Default = 0.8F,
        HelpText = @"An endpoint is detected if trailing silence in seconds is
larger than this value after something that is not blank has been decoded. Used
only when --enable-endpoint is true.")]
    public float Rule2MinTrailingSilence { get; set; }

    [Option("rule3-min-utterance-length", Required = false, Default = 20.0F,
        HelpText = @"An endpoint is detected if the utterance in seconds is
larger than this value. Used only when --enable-endpoint is true.")]
    public float Rule3MinUtteranceLength { get; set; }
  }

  static void Main(string[] args)
  {
    var parser = new CommandLine.Parser(with => with.HelpWriter = null);
    var parserResult = parser.ParseArguments<Options>(args);

    parserResult
      .WithParsed<Options>(options => Run(options))
      .WithNotParsed(errs => DisplayHelp(parserResult, errs));
  }

  private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
  {
    string usage = @"
dotnet run -c Release \
--tokens ./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt \
--encoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx \
--decoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx \
--joiner ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx

Please refer to
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
to download pre-trained streaming models.
";

    var helpText = HelpText.AutoBuild(result, h =>
    {
      h.AdditionalNewLineAfterOption = false;
      h.Heading = usage;
      h.Copyright = "Copyright (c) 2023 Xiaomi Corporation";
      return HelpText.DefaultParsingErrorsHandler(result, h);
    }, e => e);
    Console.WriteLine(helpText);
  }

  private static void Run(Options options)
  {
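    // Map the parsed command-line options onto the sherpa-onnx online recognizer config.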
    OnlineRecognizerConfig config = new OnlineRecognizerConfig();
    config.FeatConfig.SampleRate = options.SampleRate;

    // All models from icefall use a feature dim of 80.
    // You can change it if your model has a different feature dim.
    config.FeatConfig.FeatureDim = 80;

    config.TransducerModelConfig.Encoder = options.Encoder;
    config.TransducerModelConfig.Decoder = options.Decoder;
    config.TransducerModelConfig.Joiner = options.Joiner;
    config.TransducerModelConfig.Tokens = options.Tokens;
    config.TransducerModelConfig.Provider = options.Provider;
    config.TransducerModelConfig.NumThreads = options.NumThreads;
    config.TransducerModelConfig.Debug = options.Debug ? 1 : 0;

    config.DecodingMethod = options.DecodingMethod;
    config.MaxActivePaths = options.MaxActivePaths;
    config.EnableEndpoint = options.EnableEndpoint ? 1 : 0;

    config.Rule1MinTrailingSilence = options.Rule1MinTrailingSilence;
    config.Rule2MinTrailingSilence = options.Rule2MinTrailingSilence;
    config.Rule3MinUtteranceLength = options.Rule3MinUtteranceLength;

    OnlineRecognizer recognizer = new OnlineRecognizer(config);

    OnlineStream s = recognizer.CreateStream();

    Console.WriteLine(PortAudio.VersionInfo.versionText);
    PortAudio.Initialize();

    Console.WriteLine($"Number of devices: {PortAudio.DeviceCount}");
    for (int i = 0; i != PortAudio.DeviceCount; ++i)
    {
      Console.WriteLine($" Device {i}");
      DeviceInfo deviceInfo = PortAudio.GetDeviceInfo(i);
      Console.WriteLine($"   Name: {deviceInfo.name}");
      Console.WriteLine($"   Max input channels: {deviceInfo.maxInputChannels}");
      Console.WriteLine($"   Default sample rate: {deviceInfo.defaultSampleRate}");
    }

    int deviceIndex = PortAudio.DefaultInputDevice;
    if (deviceIndex == PortAudio.NoDevice)
    {
      Console.WriteLine("No default input device found");
      Environment.Exit(1);
    }

    DeviceInfo info = PortAudio.GetDeviceInfo(deviceIndex);

    Console.WriteLine();
    Console.WriteLine($"Use default device {deviceIndex} ({info.name})");

    StreamParameters param = new StreamParameters();
    param.device = deviceIndex;
    param.channelCount = 1;
    param.sampleFormat = SampleFormat.Float32;
    param.suggestedLatency = info.defaultLowInputLatency;
    param.hostApiSpecificStreamInfo = IntPtr.Zero;

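    // This callback runs on PortAudio's audio thread: it copies each captured
    // buffer of 32-bit float samples out of unmanaged memory and feeds it to
    // the online stream for decoding.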
    PortAudioSharp.Stream.Callback callback = (IntPtr input, IntPtr output,
        UInt32 frameCount,
        ref StreamCallbackTimeInfo timeInfo,
        StreamCallbackFlags statusFlags,
        IntPtr userData
        ) =>
    {
      float[] samples = new float[frameCount];
      Marshal.Copy(input, samples, 0, (Int32)frameCount);

      s.AcceptWaveform(options.SampleRate, samples);

      return StreamCallbackResult.Continue;
    };

    PortAudioSharp.Stream stream = new PortAudioSharp.Stream(inParams: param, outParams: null, sampleRate: options.SampleRate,
        framesPerBuffer: 0,
        streamFlags: StreamFlags.ClipOff,
        callback: callback,
        userData: IntPtr.Zero
        );

    Console.WriteLine(param);

    stream.Start();

    String lastText = "";
    int segmentIndex = 0;

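    // Poll the recognizer: drain any frames that are ready, print the current
    // partial result, and start a new segment whenever an endpoint is detected.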
    while (true)
    {
      while (recognizer.IsReady(s))
      {
        recognizer.Decode(s);
      }

      var text = recognizer.GetResult(s).Text;
      bool isEndpoint = recognizer.IsEndpoint(s);
      if (!string.IsNullOrWhiteSpace(text) && lastText != text)
      {
        lastText = text;
        Console.Write($"\r{segmentIndex}: {lastText}");
      }

      if (isEndpoint)
      {
        if (!string.IsNullOrWhiteSpace(text))
        {
          ++segmentIndex;
          Console.WriteLine();
        }
        recognizer.Reset(s);
      }

      Thread.Sleep(200); // ms
    }

    PortAudio.Terminate();
  }
}
21 changes: 21 additions & 0 deletions
#!/usr/bin/env bash

# Please refer to
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english
# to download the model files
#
export LD_LIBRARY_PATH=$PWD:$LD_LIBRARY_PATH
export DYLD_LIBRARY_PATH=$PWD:$DYLD_LIBRARY_PATH

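# Clone without downloading LFS blobs, then fetch only the *.onnx model files.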
if [ ! -d ./icefall-asr-zipformer-streaming-wenetspeech-20230615 ]; then
  GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615
  cd icefall-asr-zipformer-streaming-wenetspeech-20230615
  git lfs pull --include "*.onnx"
  cd ..
fi

dotnet run -c Release \
  --tokens ./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt \
  --encoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx \
  --decoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx \
  --joiner ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx
17 changes: 17 additions & 0 deletions
dotnet-examples/speech-recognition-from-microphone/speech-recognition-from-microphone.csproj
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
    <RootNamespace>speech_recognition_from_microphone</RootNamespace>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="CommandLineParser" Version="2.9.1" />
    <PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
    <PackageReference Include="PortAudioSharp2" Version="*" />
  </ItemGroup>

</Project>