Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed tokenizer and audio processing logic #214

Open
wants to merge 27 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
c53d130
Update dependencies and fix language detection typo
1amageek Oct 3, 2024
2608340
Update AudioEncoder shape access and add tokenizer methods
1amageek Oct 3, 2024
b8029fb
Add `Sendable` conformance to several structs and enums
1amageek Oct 4, 2024
2af7d50
Refactor AudioProcessor to use actor model and async
1amageek Oct 5, 2024
dd1c4d5
Add Sendable conformance to various types and protocols
1amageek Oct 5, 2024
208893d
Update development team and package dependencies
1amageek Oct 5, 2024
9539e8b
Update package version and clean up code formatting
1amageek Oct 5, 2024
769dc29
Refactor audio energy calculations and buffer conversion
1amageek Oct 5, 2024
c8219d3
Refactor calculateRelativeEnergy method for clarity
1amageek Oct 5, 2024
2c0549c
Optimize audio buffer processing with vDSP_mmov
1amageek Oct 5, 2024
22aaa70
Refactor audio sample access methods in AudioProcessor
1amageek Oct 5, 2024
8ceaa0a
Remove unnecessary weak self references in closure
1amageek Oct 5, 2024
4d4233e
Refactor audio processing to use async/await methods
1amageek Oct 5, 2024
f9bcd1d
Use weak self in audio tap closure to prevent retain cycle
1amageek Oct 5, 2024
ea5d853
Log file name in error message for transcriber
1amageek Oct 5, 2024
5909d11
Refactor VADAudioChunker to a struct from a class
1amageek Oct 5, 2024
184b990
Refactor voice activity detection to use protocols
1amageek Oct 5, 2024
933b71b
Add audio converter initialization in resampling process
1amageek Oct 5, 2024
2dbb87f
Refactor AudioProcessor to use SampleRange type
1amageek Oct 6, 2024
368333f
Make AudioProcessing conform to Actor protocol
1amageek Oct 6, 2024
c41fb22
Refactor SegmentSeeker to improve readability and performance
1amageek Oct 6, 2024
621b1f3
Refactor SegmentSeeker to improve clarity and efficiency
1amageek Oct 6, 2024
f646268
Refactor SegmentSeeker to simplify alignment handling
1amageek Oct 6, 2024
db66166
Refactor SegmentSeeker to handle Float16 data type
1amageek Oct 6, 2024
28f34c3
Remove unnecessary comments in SegmentSeeker.swift
1amageek Oct 6, 2024
bb66ae1
Refactor SegmentSeeker for improved clarity and performance
1amageek Oct 6, 2024
8bfdd88
Refactor audio processor deinit and improve memory management
1amageek Oct 23, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions Package.resolved
Original file line number Diff line number Diff line change
@@ -1,21 +1,30 @@
{
"pins" : [
{
"identity" : "jinja",
"kind" : "remoteSourceControl",
"location" : "https://github.com/maiqingqiang/Jinja",
"state" : {
"revision" : "4ffa95ce02e013c992287e19e3bbd620b6cc233a",
"version" : "1.0.4"
}
},
{
"identity" : "swift-argument-parser",
"kind" : "remoteSourceControl",
"location" : "https://github.com/apple/swift-argument-parser.git",
"state" : {
"revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
"version" : "1.3.0"
"revision" : "41982a3656a71c768319979febd796c6fd111d5c",
"version" : "1.5.0"
}
},
{
"identity" : "swift-transformers",
"kind" : "remoteSourceControl",
"location" : "https://github.com/huggingface/swift-transformers.git",
"state" : {
"revision" : "74b94211bdc741694ed7e700a1104c72e5ba68fe",
"version" : "0.1.7"
"revision" : "0f2306713d48a75b862026ebb291926793773f52",
"version" : "0.1.12"
}
}
],
Expand Down
4 changes: 2 additions & 2 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ let package = Package(
),
],
dependencies: [
.package(url: "https://github.com/huggingface/swift-transformers.git", exact: "0.1.7"),
.package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.3.0"),
.package(url: "https://github.com/huggingface/swift-transformers.git", exact: "0.1.12"),
.package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.5.0"),
],
targets: [
.target(
Expand Down
2 changes: 1 addition & 1 deletion Sources/WhisperKit/Core/Audio/AudioChunker.swift
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ open class VADAudioChunker: AudioChunking {
var startIndex = seekClipStart
while startIndex < seekClipEnd - windowPadding {
let currentFrameLength = startIndex - seekClipStart
if startIndex >= currentFrameLength, startIndex < 0 {
if startIndex >= currentFrameLength || startIndex < 0 {
throw WhisperError.audioProcessingFailed("startIndex is outside the buffer size")
}

Expand Down
2 changes: 1 addition & 1 deletion Sources/WhisperKit/Core/Audio/AudioProcessor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ public extension AudioProcessing {
static func padOrTrimAudio(fromArray audioArray: [Float], startAt startIndex: Int = 0, toLength frameLength: Int = 480_000, saveSegment: Bool = false) -> MLMultiArray? {
let currentFrameLength = audioArray.count

if startIndex >= currentFrameLength, startIndex < 0 {
if startIndex >= currentFrameLength || startIndex < 0 {
Logging.error("startIndex is outside the buffer size")
return nil
}
Expand Down
6 changes: 2 additions & 4 deletions Sources/WhisperKit/Core/AudioEncoder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,14 @@ public class AudioEncoder: AudioEncoding, WhisperMLModel {
guard let inputDescription = model?.modelDescription.outputDescriptionsByName["encoder_output_embeds"] else { return nil }
guard inputDescription.type == .multiArray else { return nil }
guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil }
let shape = shapeConstraint.shape.map { $0.intValue }
return shape[1]
return shapeConstraint.shape[0].intValue
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shape[0]: Batch size
shape[1]: Sequence length
shape[2]: Embedding dimension

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}

public var sequenceLength: Int? {
guard let inputDescription = model?.modelDescription.outputDescriptionsByName["encoder_output_embeds"] else { return nil }
guard inputDescription.type == .multiArray else { return nil }
guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil }
let shape = shapeConstraint.shape.map { $0.intValue }
return shape[3]
return shapeConstraint.shape[1].intValue
}

public init() {}
Expand Down
13 changes: 13 additions & 0 deletions Sources/WhisperKit/Core/Models.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1155,6 +1155,15 @@ struct WhisperTokenizerWrapper: WhisperTokenizer {
}

extension WhisperTokenizerWrapper: Tokenizer {

/// Forwards to `tokenizer.applyChatTemplate(messages:)` and returns its result unchanged.
///
/// - Parameter messages: The chat messages, each a `[String: String]` dictionary.
/// - Returns: The `[Int]` produced by `tokenizer`.
/// - Throws: Any error thrown by `tokenizer.applyChatTemplate(messages:)`.
func applyChatTemplate(messages: [[String : String]]) throws -> [Int] {
    let templated = try tokenizer.applyChatTemplate(messages: messages)
    return templated
}

/// Forwards every argument to the matching `tokenizer.applyChatTemplate` overload
/// and returns its result unchanged.
///
/// - Parameters:
///   - messages: The chat messages, each a `[String: String]` dictionary.
///   - chatTemplate: Optional template string, passed through as-is.
///   - addGenerationPrompt: Passed through as-is.
///   - truncation: Passed through as-is.
///   - maxLength: Optional length limit, passed through as-is.
/// - Returns: The `[Int]` produced by `tokenizer`.
/// - Throws: Any error thrown by the underlying `tokenizer` call.
func applyChatTemplate(messages: [[String : String]], chatTemplate: String?, addGenerationPrompt: Bool, truncation: Bool, maxLength: Int?) throws -> [Int] {
    let templated = try tokenizer.applyChatTemplate(
        messages: messages,
        chatTemplate: chatTemplate,
        addGenerationPrompt: addGenerationPrompt,
        truncation: truncation,
        maxLength: maxLength
    )
    return templated
}

/// Forwards to `tokenizer.tokenize(text:)` and returns its result unchanged.
///
/// - Parameter text: The string to tokenize.
/// - Returns: The `[String]` produced by `tokenizer`.
func tokenize(text: String) -> [String] {
    let pieces = tokenizer.tokenize(text: text)
    return pieces
}
Expand All @@ -1166,6 +1175,10 @@ extension WhisperTokenizerWrapper: Tokenizer {
/// Forwards to `tokenizer.decode(tokens:)` and returns its result unchanged.
///
/// - Parameter tokens: The integer tokens to decode.
/// - Returns: The `String` produced by `tokenizer`.
func decode(tokens: [Int]) -> String {
    let decoded = tokenizer.decode(tokens: tokens)
    return decoded
}

/// Forwards to `tokenizer.encode(text:addSpecialTokens:)` and returns its result unchanged.
///
/// - Parameters:
///   - text: The string to encode.
///   - addSpecialTokens: Passed through as-is to `tokenizer`.
/// - Returns: The `[Int]` produced by `tokenizer`.
func encode(text: String, addSpecialTokens: Bool) -> [Int] {
    let encoded = tokenizer.encode(text: text, addSpecialTokens: addSpecialTokens)
    return encoded
}

func convertTokenToId(_ token: String) -> Int? {
tokenizer.convertTokenToId(token)
Expand Down
4 changes: 2 additions & 2 deletions Sources/WhisperKit/Core/WhisperKit.swift
Original file line number Diff line number Diff line change
Expand Up @@ -417,14 +417,14 @@ open class WhisperKit {
) async throws -> (language: String, langProbs: [String: Float]) {
let audioBuffer = try AudioProcessor.loadAudio(fromPath: audioPath)
let audioArray = AudioProcessor.convertBufferToArray(buffer: audioBuffer)
return try await detectLangauge(audioArray: audioArray)
return try await detectLanguage(audioArray: audioArray)
}

/// Detects the language of the audio samples in the provided array.
///
/// - Parameter audioArray: An array of audio samples.
/// - Returns: A tuple containing the detected language and the language log probabilities.
open func detectLangauge(
open func detectLanguage(
audioArray: [Float]
) async throws -> (language: String, langProbs: [String: Float]) {
if modelState != .loaded {
Expand Down
Loading