diff --git a/examples/net_framework/DeepSpeechWPF/DeepSpeech.WPF.csproj b/examples/net_framework/DeepSpeechWPF/DeepSpeech.WPF.csproj index d2184abc98..db4d56afb0 100644 --- a/examples/net_framework/DeepSpeechWPF/DeepSpeech.WPF.csproj +++ b/examples/net_framework/DeepSpeechWPF/DeepSpeech.WPF.csproj @@ -14,26 +14,8 @@ 4 true true - - - AnyCPU - true - full - false - bin\Debug\ - DEBUG;TRACE - prompt - 4 - - - AnyCPU - pdbonly - true - bin\Release\ - TRACE - prompt - 4 - true + + true @@ -59,13 +41,14 @@ - ..\packages\CSCore.1.2.1.2\lib\net35-client\CSCore.dll + packages\CSCore.1.2.1.2\lib\net35-client\CSCore.dll - - ..\packages\NAudio.1.8.5\lib\net35\NAudio.dll + + packages\NAudio.1.9.0\lib\net35\NAudio.dll + @@ -125,7 +108,7 @@ - + {56de4091-bbbe-47e4-852d-7268b33b971f} DeepSpeechClient diff --git a/examples/net_framework/DeepSpeechWPF/DeepSpeech.WPF.sln b/examples/net_framework/DeepSpeechWPF/DeepSpeech.WPF.sln new file mode 100644 index 0000000000..96b4e6bc0e --- /dev/null +++ b/examples/net_framework/DeepSpeechWPF/DeepSpeech.WPF.sln @@ -0,0 +1,31 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.421 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeech.WPF", "DeepSpeech.WPF.csproj", "{54BFD766-4305-4F4C-BA59-AF45505DF3C1}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechClient", "..\..\..\native_client\dotnet\DeepSpeechClient\DeepSpeechClient.csproj", "{56DE4091-BBBE-47E4-852D-7268B33B971F}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {54BFD766-4305-4F4C-BA59-AF45505DF3C1}.Debug|x64.ActiveCfg = Debug|x64 + {54BFD766-4305-4F4C-BA59-AF45505DF3C1}.Debug|x64.Build.0 = Debug|x64 + {54BFD766-4305-4F4C-BA59-AF45505DF3C1}.Release|x64.ActiveCfg = 
Release|x64 + {54BFD766-4305-4F4C-BA59-AF45505DF3C1}.Release|x64.Build.0 = Release|x64 + {56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.ActiveCfg = Debug|x64 + {56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.Build.0 = Debug|x64 + {56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.ActiveCfg = Release|x64 + {56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {19C58802-CCEC-4FD1-8D17-A6EB766116F7} + EndGlobalSection +EndGlobal diff --git a/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs b/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs index aeb4b31582..e855f181c6 100644 --- a/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs +++ b/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs @@ -79,14 +79,8 @@ private void Window_Loaded(object sender, RoutedEventArgs e) { try { - if (_sttClient.CreateModel("output_graph.pbmm", N_CEP, N_CONTEXT, "alphabet.txt", BEAM_WIDTH) == 0) - { - Dispatcher.Invoke(() => { EnableControls(); }); - } - else - { - MessageBox.Show("Model load failed."); - } + _sttClient.CreateModel("output_graph.pbmm", N_CEP, N_CONTEXT, "alphabet.txt", BEAM_WIDTH); + Dispatcher.Invoke(() => { EnableControls(); }); } catch (Exception ex) { @@ -161,18 +155,12 @@ await Task.Run(() => { try { - if (_sttClient.EnableDecoderWithLM("alphabet.txt", "lm.binary", "trie", LM_ALPHA, LM_BETA) != 0) - { - MessageBox.Show("Error loading LM."); - Dispatcher.Invoke(() => btnEnableLM.IsEnabled = true); - } - else - { - Dispatcher.Invoke(() => lblStatus.Content = "LM loaded."); - } + _sttClient.EnableDecoderWithLM("alphabet.txt", "lm.binary", "trie", LM_ALPHA, LM_BETA); + Dispatcher.Invoke(() => lblStatus.Content = "LM loaded."); } catch (Exception ex) { + Dispatcher.Invoke(() => btnEnableLM.IsEnabled = true); MessageBox.Show(ex.Message); } }); diff 
--git a/examples/net_framework/DeepSpeechWPF/packages.config b/examples/net_framework/DeepSpeechWPF/packages.config index 6cab284370..4b03fe6aa5 100644 --- a/examples/net_framework/DeepSpeechWPF/packages.config +++ b/examples/net_framework/DeepSpeechWPF/packages.config @@ -1,5 +1,5 @@  - + \ No newline at end of file diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index 255d26f834..5271e2e70c 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -5,6 +5,7 @@ using System; using System.IO; using System.Runtime.InteropServices; +using DeepSpeechClient.Enums; namespace DeepSpeechClient { @@ -35,8 +36,8 @@ public DeepSpeech() /// The context window the model was trained with. /// The path to the configuration file specifying the alphabet used by the network. /// The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time. - /// Zero on success, non-zero on failure. - public unsafe int CreateModel(string aModelPath, uint aNCep, + /// Thrown when the native binary failed to create the model. + public unsafe void CreateModel(string aModelPath, uint aNCep, uint aNContext, string aAlphabetConfigPath, uint aBeamWidth) { string exceptionMessage = null; @@ -61,16 +62,53 @@ public unsafe int CreateModel(string aModelPath, uint aNCep, { throw new FileNotFoundException(exceptionMessage); } - int result = NativeImp.DS_CreateModel(aModelPath, + var resultCode = NativeImp.DS_CreateModel(aModelPath, aNCep, aNContext, aAlphabetConfigPath, aBeamWidth, ref _modelStatePP); + EvaluateResultCode(resultCode); _modelStateP = *_modelStatePP; - return result; - + } + /// + /// Evaluate the result code and will raise an exception if necessary. + /// + /// Native result code. 
+ private void EvaluateResultCode(ErrorCodes resultCode) + { + switch (resultCode) + { + case ErrorCodes.DS_ERR_OK: + break; + case ErrorCodes.DS_ERR_NO_MODEL: + throw new ArgumentException("Missing model information."); + case ErrorCodes.DS_ERR_INVALID_ALPHABET: + throw new ArgumentException("Invalid alphabet file or invalid alphabet size."); + case ErrorCodes.DS_ERR_INVALID_SHAPE: + throw new ArgumentException("Invalid model shape."); + case ErrorCodes.DS_ERR_INVALID_LM: + throw new ArgumentException("Invalid language model file."); + case ErrorCodes.DS_ERR_FAIL_INIT_MMAP: + throw new ArgumentException("Failed to initialize memory mapped model."); + case ErrorCodes.DS_ERR_FAIL_INIT_SESS: + throw new ArgumentException("Failed to initialize the session."); + case ErrorCodes.DS_ERR_FAIL_INTERPRETER: + throw new ArgumentException("Interpreter failed."); + case ErrorCodes.DS_ERR_FAIL_RUN_SESS: + throw new ArgumentException("Failed to run the session."); + case ErrorCodes.DS_ERR_FAIL_CREATE_STREAM: + throw new ArgumentException("Error creating the stream."); + case ErrorCodes.DS_ERR_FAIL_READ_PROTOBUF: + throw new ArgumentException("Error reading the proto buffer model file."); + case ErrorCodes.DS_ERR_FAIL_CREATE_SESS: + throw new ArgumentException("Error failed to create session."); + case ErrorCodes.DS_ERR_MODEL_INCOMPATIBLE: + throw new ArgumentException("Error incompatible model."); + default: + throw new ArgumentException("Unknown error, please make sure you are using the correct native binary."); + } } /// @@ -89,8 +127,8 @@ public unsafe void Dispose() /// The path to the trie file build from the same vocabulary as the language model binary. /// The alpha hyperparameter of the CTC decoder. Language Model weight. /// The beta hyperparameter of the CTC decoder. Word insertion weight. - /// Zero on success, non-zero on failure (invalid arguments). 
- public unsafe int EnableDecoderWithLM(string aAlphabetConfigPath, + /// Thrown when the native binary failed to enable decoding with a language model. + public unsafe void EnableDecoderWithLM(string aAlphabetConfigPath, string aLMPath, string aTriePath, float aLMAlpha, float aLMBeta) { @@ -109,12 +147,13 @@ public unsafe int EnableDecoderWithLM(string aAlphabetConfigPath, throw new FileNotFoundException(exceptionMessage); } - return NativeImp.DS_EnableDecoderWithLM(_modelStatePP, + var resultCode = NativeImp.DS_EnableDecoderWithLM(_modelStatePP, aAlphabetConfigPath, aLMPath, aTriePath, aLMAlpha, aLMBeta); + EvaluateResultCode(resultCode); } /// @@ -169,10 +208,11 @@ public unsafe void PrintVersions() /// One timestep is equivalent to two window lengths(20ms). /// If set to 0 we reserve enough frames for 3 seconds of audio(150). /// The sample-rate of the audio signal - /// Zero for success, non-zero on failure - public unsafe int SetupStream(uint aPreAllocFrames, uint aSampleRate) + /// Thrown when the native binary failed to initialize the streaming mode. 
+ public unsafe void SetupStream(uint aPreAllocFrames, uint aSampleRate) { - return NativeImp.DS_SetupStream(_modelStatePP, aPreAllocFrames, aSampleRate, ref _streamingStatePP); + var resultCode = NativeImp.DS_SetupStream(_modelStatePP, aPreAllocFrames, aSampleRate, ref _streamingStatePP); + EvaluateResultCode(resultCode); } /// diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj index ded5102860..bd5a5a13de 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj @@ -45,6 +45,7 @@ + diff --git a/native_client/dotnet/DeepSpeechClient/Enums/ErrorCodes.cs b/native_client/dotnet/DeepSpeechClient/Enums/ErrorCodes.cs new file mode 100644 index 0000000000..019564c279 --- /dev/null +++ b/native_client/dotnet/DeepSpeechClient/Enums/ErrorCodes.cs @@ -0,0 +1,29 @@ +namespace DeepSpeechClient.Enums +{ + /// + /// Error codes from the native DeepSpeech binary. 
+ /// + internal enum ErrorCodes + { + // OK + DS_ERR_OK = 0x0000, + + // Missing information + DS_ERR_NO_MODEL = 0x1000, + + // Invalid parameters + DS_ERR_INVALID_ALPHABET = 0x2000, + DS_ERR_INVALID_SHAPE = 0x2001, + DS_ERR_INVALID_LM = 0x2002, + DS_ERR_MODEL_INCOMPATIBLE = 0x2003, + + // Runtime failures + DS_ERR_FAIL_INIT_MMAP = 0x3000, + DS_ERR_FAIL_INIT_SESS = 0x3001, + DS_ERR_FAIL_INTERPRETER = 0x3002, + DS_ERR_FAIL_RUN_SESS = 0x3003, + DS_ERR_FAIL_CREATE_STREAM = 0x3004, + DS_ERR_FAIL_READ_PROTOBUF = 0x3005, + DS_ERR_FAIL_CREATE_SESS = 0x3006, + } +} diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs index 5139cdfc1d..3d27a56cbc 100644 --- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs @@ -21,8 +21,8 @@ public interface IDeepSpeech : IDisposable /// The context window the model was trained with. /// The path to the configuration file specifying the alphabet used by the network. /// The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time. - /// Zero on success, non-zero on failure. - unsafe int CreateModel(string aModelPath, uint aNCep, + /// Thrown when the native binary failed to create the model. + unsafe void CreateModel(string aModelPath, uint aNCep, uint aNContext, string aAlphabetConfigPath, uint aBeamWidth); @@ -35,8 +35,8 @@ unsafe int CreateModel(string aModelPath, uint aNCep, /// The path to the trie file build from the same vocabulary as the language model binary. /// The alpha hyperparameter of the CTC decoder. Language Model weight. /// The beta hyperparameter of the CTC decoder. Word insertion weight. - /// Zero on success, non-zero on failure (invalid arguments). - unsafe int EnableDecoderWithLM(string aAlphabetConfigPath, + /// Thrown when the native binary failed to enable decoding with a language model. 
+ unsafe void EnableDecoderWithLM(string aAlphabetConfigPath, string aLMPath, string aTriePath, float aLMAlpha, @@ -88,8 +88,8 @@ unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, /// One timestep is equivalent to two window lengths(20ms). /// If set to 0 we reserve enough frames for 3 seconds of audio(150). /// The sample-rate of the audio signal - /// Zero for success, non-zero on failure - unsafe int SetupStream(uint aPreAllocFrames, uint aSampleRate); + /// Thrown when the native binary failed to initialize the streaming mode. + unsafe void SetupStream(uint aPreAllocFrames, uint aSampleRate); /// /// Feeds audio samples to an ongoing streaming inference. diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs index 52d9044c76..ec7d527b29 100644 --- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs +++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs @@ -1,4 +1,5 @@ -using DeepSpeechClient.Structs; +using DeepSpeechClient.Enums; +using DeepSpeechClient.Structs; using System; using System.Runtime.InteropServices; @@ -15,7 +16,7 @@ internal static class NativeImp internal static extern void DS_PrintVersions(); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal unsafe static extern int DS_CreateModel(string aModelPath, + internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath, uint aNCep, uint aNContext, string aAlphabetConfigPath, @@ -23,7 +24,7 @@ internal unsafe static extern int DS_CreateModel(string aModelPath, ref ModelState** pint); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern int DS_EnableDecoderWithLM(ModelState** aCtx, + internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(ModelState** aCtx, string aAlphabetConfigPath, string aLMPath, string aTriePath, @@ -47,7 +48,7 @@ internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(ModelState** aC 
internal static unsafe extern void DS_DestroyModel(ModelState** aCtx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern int DS_SetupStream(ModelState** aCtx, + internal static unsafe extern ErrorCodes DS_SetupStream(ModelState** aCtx, uint aPreAllocFrames, uint aSampleRate, ref StreamingState** retval); diff --git a/native_client/dotnet/DeepSpeechConsole/Program.cs b/native_client/dotnet/DeepSpeechConsole/Program.cs index 8f76457182..315a1a4084 100644 --- a/native_client/dotnet/DeepSpeechConsole/Program.cs +++ b/native_client/dotnet/DeepSpeechConsole/Program.cs @@ -25,7 +25,7 @@ static string MetadataToString(Metadata meta) { var nl = Environment.NewLine; string retval = - Environment.NewLine +$"Recognized text: {string.Join("", meta?.Items?.Select(x=>x.Character))} {nl}" + Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}" + $"Prob: {meta?.Probability} {nl}" + $"Item count: {meta?.Items?.Length} {nl}" + string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}")); @@ -60,43 +60,27 @@ static void Main(string[] args) using (IDeepSpeech sttClient = new DeepSpeech()) { - var result = 1; - Console.WriteLine("Loading model..."); - stopwatch.Start(); try { - result = sttClient.CreateModel( + Console.WriteLine("Loading model..."); + stopwatch.Start(); + sttClient.CreateModel( model ?? "output_graph.pbmm", N_CEP, N_CONTEXT, alphabet ?? "alphabet.txt", BEAM_WIDTH); - } - catch (IOException ex) - { - Console.WriteLine("Error loading lm."); - Console.WriteLine(ex.Message); - } - stopwatch.Stop(); - if (result == 0) - { + stopwatch.Stop(); + Console.WriteLine($"Model loaded - {stopwatch.Elapsed.Milliseconds} ms"); stopwatch.Reset(); if (lm != null) { Console.WriteLine("Loadin LM..."); - try - { - result = sttClient.EnableDecoderWithLM( - alphabet ?? "alphabet.txt", - lm ?? "lm.binary", - trie ?? 
"trie", - LM_ALPHA, LM_BETA); - } - catch (IOException ex) - { - Console.WriteLine("Error loading lm."); - Console.WriteLine(ex.Message); - } + sttClient.EnableDecoderWithLM( + alphabet ?? "alphabet.txt", + lm ?? "lm.binary", + trie ?? "trie", + LM_ALPHA, LM_BETA); } @@ -123,15 +107,15 @@ static void Main(string[] args) Console.WriteLine($"Audio duration: {waveInfo.TotalTime.ToString()}"); Console.WriteLine($"Inference took: {stopwatch.Elapsed.ToString()}"); - Console.WriteLine((extended ? $"Extended result: ": "Recognized text: ") + speechResult); + Console.WriteLine((extended ? $"Extended result: " : "Recognized text: ") + speechResult); } waveBuffer.Clear(); } - else + catch (Exception ex) { - Console.WriteLine("Error loding the model."); + Console.WriteLine(ex.Message); } } } } -} +} \ No newline at end of file