Audio buffer fix #47
base: development
Changes from all commits: 3ce7254, e2fafa5, 30ae093, db28784, 5dccbfb, d938fdc, 4d36d58, 7983dd5, b5e84ea
@@ -5,11 +5,14 @@
 //  Created by Bruno Berisso on 5/29/15.
 //  Copyright (c) 2015 Bruno Berisso. All rights reserved.
 //
+//  Updated by mainvolume. Copyright © 2018. All rights reserved.

 import Foundation
 import AVFoundation
 import Sphinx

+// A global buffer size for the decoder, used when initializing the model's -nfft parameter.
+public let bufferSize = 16384

 fileprivate enum SpeechStateEnum : CustomStringConvertible {
     case silence
@@ -123,7 +126,7 @@ public final class Decoder {

     fileprivate func hypotesisForSpeech (inFile fileHandle: FileHandle) -> Hypothesis? {

-        start_utt()
+        startUtterence()

         let hypothesis = fileHandle.reduceChunks(2048, initial: nil, reducer: {
             (data: Data, partialHyp: Hypothesis?) -> Hypothesis? in
@@ -133,15 +136,15 @@ public final class Decoder {
             var resultantHyp = partialHyp
             if speechState == .utterance {

-                end_utt()
+                endUtterence()
                 resultantHyp = partialHyp + get_hyp()
-                start_utt()
+                startUtterence()
             }

             return resultantHyp
         })

-        end_utt()
+        endUtterence()

         //Process any pending speech
         if speechState == .speech {
@@ -165,33 +168,32 @@ public final class Decoder {
             }
         }
     }
-    public func startDecodingSpeech (_ utteranceComplete: @escaping (Hypothesis?) -> ()) throws {
-
+    public func startDecodingSpeech (_ audioSessionCategoryOptions: AVAudioSessionCategoryOptions = [.mixWithOthers, .allowBluetoothA2DP], utteranceComplete: @escaping (Hypothesis?) -> ()) throws {
         do {
-            try AVAudioSession.sharedInstance().setCategory(AVAudioSessionCategoryRecord)
+            try AVAudioSession.sharedInstance().setCategory(AVAudioSessionCategoryPlayAndRecord, with: audioSessionCategoryOptions)
         } catch let error as NSError {
             print("Error setting the shared AVAudioSession: \(error)")
             throw DecodeErrors.CantSetAudioSession(error)
         }

         engine = AVAudioEngine()

         let input = engine.inputNode
         let mixer = AVAudioMixerNode()
         engine.attach(mixer)
         engine.connect(input, to: mixer, format: input.outputFormat(forBus: 0))

         // We force-unwrap this because the AVAudioFormat docs specify that this constructor
         // returns nil only when the channel count is greater than 2.
-        let formatIn = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: 16000, channels: 1, interleaved: false)!
-        let formatOut = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 16000, channels: 1, interleaved: false)!
+        let formatIn = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: input.outputFormat(forBus: 0).sampleRate, channels: 1, interleaved: false)!
+        let formatOut = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: input.outputFormat(forBus: 0).sampleRate, channels: 1, interleaved: false)!

         guard let bufferMapper = AVAudioConverter(from: formatIn, to: formatOut) else {
             // Returns nil if the format conversion is not possible.
             throw DecodeErrors.CantConvertAudioFormat
         }

-        mixer.installTap(onBus: 0, bufferSize: 2048, format: formatIn, block: {
+        mixer.installTap(onBus: 0, bufferSize: AVAudioFrameCount(bufferSize), format: formatIn, block: {
             [unowned self] (buffer: AVAudioPCMBuffer!, time: AVAudioTime!) in

             guard let sphinxBuffer = AVAudioPCMBuffer(pcmFormat: formatOut, frameCapacity: buffer.frameCapacity) else {
@@ -218,27 +220,28 @@ public final class Decoder {
             let audioData = sphinxBuffer.toData()
             self.process_raw(audioData)

-            print("Process: \(buffer.frameLength) frames - \(audioData.count) bytes - sample time: \(time.sampleTime)")
+            // Uncomment for frame monitoring:
+            //print("Process: \(buffer.frameLength) frames - \(audioData.count) bytes - sample time: \(time.sampleTime)")

             if self.speechState == .utterance {

-                self.end_utt()
+                self.endUtterence()
                 let hypothesis = self.get_hyp()

                 DispatchQueue.main.async {
                     utteranceComplete(hypothesis)
                 }

-                self.start_utt()
+                self.startUtterence()
             }
         })

-        start_utt()
+        startUtterence()

         do {
             try engine.start()
         } catch let error as NSError {
-            end_utt()
+            endUtterence()
             print("Can't start AVAudioEngine: \(error)")
             throw DecodeErrors.CantStartAudioEngine(error)
         }
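The new default argument keeps existing call sites working while letting callers tune the audio session, and deriving the sample rate from input.outputFormat(forBus: 0) keeps the tap format in line with the hardware rate instead of hard-coding 16 kHz. A hypothetical call site (decoder stands in for an already-configured Decoder instance):

    do {
        // Mix with other audio and allow Bluetooth A2DP routing while recording.
        try decoder.startDecodingSpeech([.mixWithOthers, .allowBluetoothA2DP]) { hypothesis in
            // Delivered on the main queue each time an utterance completes.
            print("Heard: \(hypothesis?.text ?? "<nothing>")")
        }
    } catch {
        print("Decoder failed to start: \(error)")
    }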
@@ -248,7 +251,33 @@ public final class Decoder {
         engine.stop()
         engine = nil
     }

+    public func startUtterence() {
+        self.start_utt()
+    }
+
+    public func startDecodingBuffer(buffer: AVAudioPCMBuffer!, time: AVAudioTime!, utteranceComplete: @escaping (Hypothesis?) -> ()) throws {
Review discussion on startDecodingBuffer:

"👏🏻👏🏻👏🏻 nice!"

"Also, something was wrong with the tabs? jaja"

"The tabs... I'm editing in GitHub, since the codebase is at home and I'm at work right now. 😂 I haven't written any tests yet, but to bypass microphone usage in the thinking-machine implementation, a synthesized continuous buffer is passed to this function, and that works quite nicely (see the sketch after the diff below). The function is based on the streaming function, but the buffer is created before being passed in instead of being captured by the tap inside the function."

"Hold on, fixing the tabs."
+
+        let audioData = buffer.toData()
+        self.process_raw(audioData)
+
+        if self.speechState == .utterance {
+
+            self.endUtterence()
+            let hypothesis = self.get_hyp()
+
+            DispatchQueue.main.async {
+                utteranceComplete(hypothesis)
+            }
+
+            self.startUtterence()
+        }
+    }
+
+    public func endUtterence() {
+        self.end_utt()
+    }

     public func add(words: Array<(word: String, phones: String)>) throws {

         guard engine == nil || !engine.isRunning else {
Review discussion:

"I think this and endUtterence shouldn't be public. My understanding is that we needed public only because you should call startUtterance() before startDecodingBuffer, right?"

"That is accurate. Do you mean we should make endUtterence private?"
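If the helpers do end up non-public, one way to keep the call-before-decode contract without exposing them is to have the decoder open the utterance lazily. This is a sketch of that idea, not part of the PR; utteranceInProgress and startUtteranceIfNeeded are hypothetical names:

    // Hypothetical sketch: the decoder tracks utterance state itself, so
    // startUtterence()/endUtterence() could become private.
    private var utteranceInProgress = false

    private func startUtteranceIfNeeded() {
        guard !utteranceInProgress else { return }
        start_utt()
        utteranceInProgress = true
    }

    public func startDecodingBuffer(buffer: AVAudioPCMBuffer!, time: AVAudioTime!,
                                    utteranceComplete: @escaping (Hypothesis?) -> ()) throws {
        startUtteranceIfNeeded()
        // ...same body as in the diff above...
    }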