argmaxinc · 1amageek · Oct 3, 2024 · Oct 3, 2024 · Oct 4, 2024 · Oct 5, 2024
diff --git a/Package.resolved b/Package.resolved
@@ -1,21 +1,30 @@
 {
   "pins" : [
+    {
+      "identity" : "jinja",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/maiqingqiang/Jinja",
+      "state" : {
+        "revision" : "4ffa95ce02e013c992287e19e3bbd620b6cc233a",
+        "version" : "1.0.4"
+      }
+    },
     {
       "identity" : "swift-argument-parser",
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-argument-parser.git",
       "state" : {
-        "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
-        "version" : "1.3.0"
+        "revision" : "41982a3656a71c768319979febd796c6fd111d5c",
+        "version" : "1.5.0"
       }
     },
     {
       "identity" : "swift-transformers",
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/huggingface/swift-transformers.git",
       "state" : {
-        "revision" : "74b94211bdc741694ed7e700a1104c72e5ba68fe",
-        "version" : "0.1.7"
+        "revision" : "0f2306713d48a75b862026ebb291926793773f52",
+        "version" : "0.1.12"
       }
     }
   ],

diff --git a/Package.swift b/Package.swift
@@ -20,8 +20,8 @@ let package = Package(
         ),
     ],
     dependencies: [
-        .package(url: "https://github.com/huggingface/swift-transformers.git", exact: "0.1.7"),
-        .package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.3.0"),
+        .package(url: "https://github.com/huggingface/swift-transformers.git", exact: "0.1.12"),
+        .package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.5.0"),
     ],
     targets: [
         .target(

diff --git a/Sources/WhisperKit/Core/Audio/AudioChunker.swift b/Sources/WhisperKit/Core/Audio/AudioChunker.swift
@@ -82,7 +82,10 @@ open class VADAudioChunker: AudioChunking {
             var startIndex = seekClipStart
             while startIndex < seekClipEnd - windowPadding {
                 let currentFrameLength = startIndex - seekClipStart
-                if startIndex >= currentFrameLength, startIndex < 0 {
+                // `currentFrameLength` is an offset from `seekClipStart`, so `startIndex >= currentFrameLength`
+                // is true whenever `seekClipStart` is non-negative and must not be part of an OR-ed failure check.
+                // NOTE(review): bounds are checked against the clip end; use the buffer length if it is in scope here.
+                if startIndex < 0 || startIndex >= seekClipEnd {
                     throw WhisperError.audioProcessingFailed("startIndex is outside the buffer size")
                 }
 

diff --git a/Sources/WhisperKit/Core/Audio/AudioProcessor.swift b/Sources/WhisperKit/Core/Audio/AudioProcessor.swift
@@ -95,7 +95,8 @@ public extension AudioProcessing {
     static func padOrTrimAudio(fromArray audioArray: [Float], startAt startIndex: Int = 0, toLength frameLength: Int = 480_000, saveSegment: Bool = false) -> MLMultiArray? {
         let currentFrameLength = audioArray.count
 
-        if startIndex >= currentFrameLength, startIndex < 0 {
+        // The start offset must be a valid index into `audioArray`.
+        guard startIndex >= 0, startIndex < currentFrameLength else {
             Logging.error("startIndex is outside the buffer size")
             return nil
         }

diff --git a/Sources/WhisperKit/Core/AudioEncoder.swift b/Sources/WhisperKit/Core/AudioEncoder.swift
@@ -22,16 +22,22 @@ public class AudioEncoder: AudioEncoding, WhisperMLModel {
         guard let inputDescription = model?.modelDescription.outputDescriptionsByName["encoder_output_embeds"] else { return nil }
         guard inputDescription.type == .multiArray else { return nil }
         guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil }
-        let shape = shapeConstraint.shape.map { $0.intValue }
-        return shape[1]
+        // Embedding size is read from axis 1 of the output shape.
+        // NOTE(review): axis kept identical to the previous implementation; confirm against the exported model.
+        let shape = shapeConstraint.shape
+        guard shape.count > 1 else { return nil }
+        return shape[1].intValue
     }
 
     public var sequenceLength: Int? {
         guard let inputDescription = model?.modelDescription.outputDescriptionsByName["encoder_output_embeds"] else { return nil }
         guard inputDescription.type == .multiArray else { return nil }
         guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil }
-        let shape = shapeConstraint.shape.map { $0.intValue }
-        return shape[3]
+        // Sequence length is read from axis 3 of the output shape.
+        // NOTE(review): axis kept identical to the previous implementation; confirm against the exported model.
+        let shape = shapeConstraint.shape
+        guard shape.count > 3 else { return nil }
+        return shape[3].intValue
     }
 
     public init() {}

diff --git a/Sources/WhisperKit/Core/Models.swift b/Sources/WhisperKit/Core/Models.swift
@@ -1155,6 +1155,28 @@ struct WhisperTokenizerWrapper: WhisperTokenizer {
 }
 
 extension WhisperTokenizerWrapper: Tokenizer {
+    /// Forwards to the wrapped tokenizer's `applyChatTemplate(messages:)`.
+    func applyChatTemplate(messages: [[String: String]]) throws -> [Int] {
+        try tokenizer.applyChatTemplate(messages: messages)
+    }
+
+    /// Forwards to the wrapped tokenizer, passing every template option through unchanged.
+    func applyChatTemplate(
+        messages: [[String: String]],
+        chatTemplate: String?,
+        addGenerationPrompt: Bool,
+        truncation: Bool,
+        maxLength: Int?
+    ) throws -> [Int] {
+        try tokenizer.applyChatTemplate(
+            messages: messages,
+            chatTemplate: chatTemplate,
+            addGenerationPrompt: addGenerationPrompt,
+            truncation: truncation,
+            maxLength: maxLength
+        )
+    }
+
     func tokenize(text: String) -> [String] {
         tokenizer.tokenize(text: text)
     }
@@ -1166,6 +1188,11 @@ extension WhisperTokenizerWrapper: Tokenizer {
     func decode(tokens: [Int]) -> String {
         tokenizer.decode(tokens: tokens)
     }
+
+    /// Forwards to the wrapped tokenizer's `encode(text:addSpecialTokens:)`.
+    func encode(text: String, addSpecialTokens: Bool) -> [Int] {
+        tokenizer.encode(text: text, addSpecialTokens: addSpecialTokens)
+    }
 
     func convertTokenToId(_ token: String) -> Int? {
         tokenizer.convertTokenToId(token)

diff --git a/Sources/WhisperKit/Core/WhisperKit.swift b/Sources/WhisperKit/Core/WhisperKit.swift
@@ -417,14 +417,24 @@ open class WhisperKit {
     ) async throws -> (language: String, langProbs: [String: Float]) {
         let audioBuffer = try AudioProcessor.loadAudio(fromPath: audioPath)
         let audioArray = AudioProcessor.convertBufferToArray(buffer: audioBuffer)
-        return try await detectLangauge(audioArray: audioArray)
+        return try await detectLanguage(audioArray: audioArray)
     }
 
+    /// Misspelled former name of `detectLanguage(audioArray:)`.
+    ///
+    /// Kept as a forwarding shim so existing callers of this `open` API keep compiling after the rename.
+    @available(*, deprecated, renamed: "detectLanguage(audioArray:)")
+    open func detectLangauge(
+        audioArray: [Float]
+    ) async throws -> (language: String, langProbs: [String: Float]) {
+        try await detectLanguage(audioArray: audioArray)
+    }
+
     /// Detects the language of the audio samples in the provided array.
     ///
     /// - Parameter audioArray: An array of audio samples.
     /// - Returns: A tuple containing the detected language and the language log probabilities.
-    open func detectLangauge(
+    open func detectLanguage(
         audioArray: [Float]
     ) async throws -> (language: String, langProbs: [String: Float]) {
         if modelState != .loaded {