From 880f95e88f4c234042d275ef1edb82ecc57507f7 Mon Sep 17 00:00:00 2001
From: Zach Nagengast
Date: Wed, 6 Mar 2024 17:45:03 -0800
Subject: [PATCH] macOS 13 support (#40)

* Initial macOS 13 support

* Handle watchos case

* Test on macos 13 and 14 runners

* Update test script for macos 13 simulators

* Use m1 macos 13 runners

* Use m1 macos 13 runners with appropriate matrix

* Use m1 macos 13 runners with appropriate matrix

* Reduce ios requirement to 16

* Only test watchos on macos 14

* Add ios to the avaudioapplication test

* Add note on version choices

* Remove missing os versions from Package.swift

* Add missing os checks

* Remove excess os checks causing warnings

* Test mel with cpuonly on simulator
---
 .github/workflows/unit-tests.yml              | 16 ++++++++--
 Package.swift                                 | 15 +++++----
 Sources/WhisperKit/Core/AudioEncoder.swift    |  2 +-
 Sources/WhisperKit/Core/AudioProcessor.swift  | 31 +++++++++++++++++--
 .../Core/AudioStreamTranscriber.swift         |  3 ++
 .../WhisperKit/Core/FeatureExtractor.swift    |  2 +-
 Sources/WhisperKit/Core/LogitsFilter.swift    |  6 ++--
 Sources/WhisperKit/Core/Models.swift          | 16 +++++-----
 Sources/WhisperKit/Core/SegmentSeeker.swift   |  4 +--
 Sources/WhisperKit/Core/TextDecoder.swift     |  6 ++--
 Sources/WhisperKit/Core/TokenSampler.swift    |  2 +-
 Sources/WhisperKit/Core/Utils.swift           |  1 +
 Sources/WhisperKit/Core/WhisperKit.swift      |  3 +-
 Sources/WhisperKitCLI/transcribe.swift        |  2 +-
 Tests/WhisperKitTests/FunctionalTests.swift   |  2 +-
 Tests/WhisperKitTests/UnitTests.swift         | 11 ++++---
 16 files changed, 82 insertions(+), 40 deletions(-)

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 7526d95..5167fed 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -9,7 +9,15 @@ on:
 
 jobs:
   build-and-test:
-    runs-on: macos-14
+    strategy:
+      matrix:
+        os: [macos-13-xlarge, macos-14]
+        include:
+          - os: macos-13-xlarge
+            ios-version: "16.1" # oldest available version
+          - os: macos-14
+            ios-version: "17.2" # latest available version
+    runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
       - uses: maxim-lobanov/setup-xcode@v1
@@ -40,14 +48,16 @@ jobs:
         run: |
           set -o pipefail
           xcodebuild clean build-for-testing -scheme whisperkit-Package -destination generic/platform=iOS | xcpretty
-          xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination "platform=iOS Simulator,OS=17.2,name=iPhone 15" | xcpretty
+          xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination "platform=iOS Simulator,OS=${{ matrix.ios-version }},name=iPhone 15" | xcpretty
       - name: Build and Test - watchOS
+        if: matrix.os == 'macos-14'
         run: |
           set -o pipefail
           xcodebuild clean build-for-testing -scheme whisperkit-Package -destination generic/platform=watchOS | xcpretty
           xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination "platform=watchOS Simulator,OS=10.2,name=Apple Watch Ultra 2 (49mm)" | xcpretty
       - name: Build and Test - visionOS
+        if: matrix.os == 'macos-14'
         run: |
           set -o pipefail
           xcodebuild clean build-for-testing -scheme whisperkit-Package -destination generic/platform=visionOS | xcpretty
-          xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination "platform=visionOS Simulator,name=Apple Vision Pro" | xcpretty
\ No newline at end of file
+          xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination "platform=visionOS Simulator,name=Apple Vision Pro" | xcpretty
diff --git a/Package.swift b/Package.swift
index 057482c..b488719 100644
--- a/Package.swift
+++ b/Package.swift
@@ -6,10 +6,8 @@ import PackageDescription
 let package = Package(
     name: "whisperkit",
     platforms: [
-        .iOS(.v17),
-        .macOS(.v14),
-        .watchOS(.v10),
-        .visionOS(.v1)
+        .iOS(.v16),
+        .macOS(.v13),
     ],
     products: [
         .library(
@@ -18,7 +16,8 @@ let package = Package(
         ),
         .executable(
             name: "transcribe",
-            targets: ["WhisperKitCLI"])
+            targets: ["WhisperKitCLI"]
+        ),
     ],
     dependencies: [
         .package(url: "https://github.com/huggingface/swift-transformers.git", exact: "0.1.2"),
@@ -35,7 +34,7 @@ let package = Package(
             name: "WhisperKitCLI",
             dependencies: [
                 "WhisperKit",
-                .product(name: "ArgumentParser", package: "swift-argument-parser")
+                .product(name: "ArgumentParser", package: "swift-argument-parser"),
             ]
         ),
         .testTarget(
@@ -51,11 +50,11 @@ let package = Package(
                 "Makefile",
                 "README.md",
                 "LICENSE",
-                "CONTRIBUTING.md"
+                "CONTRIBUTING.md",
             ],
             resources: [
                 .process("Tests/WhisperKitTests/Resources"),
-                .copy("Models/whisperkit-coreml")
+                .copy("Models/whisperkit-coreml"),
             ]
         ),
     ]
diff --git a/Sources/WhisperKit/Core/AudioEncoder.swift b/Sources/WhisperKit/Core/AudioEncoder.swift
index bd32fad..7306115 100644
--- a/Sources/WhisperKit/Core/AudioEncoder.swift
+++ b/Sources/WhisperKit/Core/AudioEncoder.swift
@@ -14,7 +14,7 @@ public protocol AudioEncoding {
     func encodeFeatures(_ features: MLMultiArray) async throws -> MLMultiArray?
 }
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class AudioEncoder: AudioEncoding, WhisperMLModel {
     public var model: MLModel?
 
diff --git a/Sources/WhisperKit/Core/AudioProcessor.swift b/Sources/WhisperKit/Core/AudioProcessor.swift
index 4909f26..0231fba 100644
--- a/Sources/WhisperKit/Core/AudioProcessor.swift
+++ b/Sources/WhisperKit/Core/AudioProcessor.swift
@@ -143,7 +143,7 @@ public extension AudioProcessing {
     }
 }
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class AudioProcessor: NSObject, AudioProcessing {
     public var audioEngine: AVAudioEngine?
     public var audioSamples: ContiguousArray<Float> = []
@@ -314,7 +314,32 @@ public class AudioProcessor: NSObject, AudioProcessing {
     }
 
     public static func requestRecordPermission() async -> Bool {
-        await AVAudioApplication.requestRecordPermission()
+        if #available(macOS 14, iOS 17, *) {
+            return await AVAudioApplication.requestRecordPermission()
+        } else {
+            #if os(watchOS)
+            // watchOS does not support AVCaptureDevice
+            return true
+            #else
+            let microphoneStatus = AVCaptureDevice.authorizationStatus(for: .audio)
+            switch microphoneStatus {
+                case .notDetermined:
+                    return await withCheckedContinuation { continuation in
+                        AVCaptureDevice.requestAccess(for: .audio) { granted in
+                            continuation.resume(returning: granted)
+                        }
+                    }
+                case .restricted, .denied:
+                    Logging.error("Microphone access denied")
+                    return false
+                case .authorized:
+                    return true
+                @unknown default:
+                    Logging.error("Unknown authorization status")
+                    return false
+            }
+            #endif
+        }
     }
 
     #if os(macOS)
@@ -412,7 +437,7 @@ public class AudioProcessor: NSObject, AudioProcessing {
 
 // MARK: - Streaming
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public extension AudioProcessor {
     /// We have a new buffer, process and store it.
     /// NOTE: Assumes audio is 16khz mono
diff --git a/Sources/WhisperKit/Core/AudioStreamTranscriber.swift b/Sources/WhisperKit/Core/AudioStreamTranscriber.swift
index c3c158f..779a25d 100644
--- a/Sources/WhisperKit/Core/AudioStreamTranscriber.swift
+++ b/Sources/WhisperKit/Core/AudioStreamTranscriber.swift
@@ -3,6 +3,7 @@
 
 import Foundation
 
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public extension AudioStreamTranscriber {
     struct State {
         public var isRecording: Bool = false
@@ -17,9 +18,11 @@ public extension AudioStreamTranscriber {
     }
 }
 
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public typealias AudioStreamTranscriberCallback = (AudioStreamTranscriber.State, AudioStreamTranscriber.State) -> Void
 
 /// Responsible for streaming audio from the microphone, processing it, and transcribing it in real-time.
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public actor AudioStreamTranscriber {
     private var state: AudioStreamTranscriber.State = .init() {
         didSet {
diff --git a/Sources/WhisperKit/Core/FeatureExtractor.swift b/Sources/WhisperKit/Core/FeatureExtractor.swift
index 78544f7..c44e56a 100644
--- a/Sources/WhisperKit/Core/FeatureExtractor.swift
+++ b/Sources/WhisperKit/Core/FeatureExtractor.swift
@@ -12,7 +12,7 @@ public protocol FeatureExtracting {
     func logMelSpectrogram(fromAudio inputAudio: MLMultiArray) async throws -> MLMultiArray?
 }
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class FeatureExtractor: FeatureExtracting, WhisperMLModel {
     public var model: MLModel?
 
diff --git a/Sources/WhisperKit/Core/LogitsFilter.swift b/Sources/WhisperKit/Core/LogitsFilter.swift
index e71f018..6bd60a3 100644
--- a/Sources/WhisperKit/Core/LogitsFilter.swift
+++ b/Sources/WhisperKit/Core/LogitsFilter.swift
@@ -9,7 +9,7 @@ public protocol LogitsFiltering {
     func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray
 }
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class SuppressTokensFilter: LogitsFiltering {
     let suppressTokens: [Int]
     private let suppressTokenIndexes: [[NSNumber]]
@@ -25,7 +25,7 @@ public class SuppressTokensFilter: LogitsFiltering {
     }
 }
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class SuppressBlankFilter: LogitsFiltering {
     let suppressBlankTokens: [Int]
     let sampleBegin: Int
@@ -46,7 +46,7 @@ public class SuppressBlankFilter: LogitsFiltering {
     }
 }
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class TimestampRulesFilter: LogitsFiltering {
     let tokenizer: Tokenizer
     let sampleBegin: Int
diff --git a/Sources/WhisperKit/Core/Models.swift b/Sources/WhisperKit/Core/Models.swift
index 2048089..a1d8196 100644
--- a/Sources/WhisperKit/Core/Models.swift
+++ b/Sources/WhisperKit/Core/Models.swift
@@ -7,7 +7,7 @@ import NaturalLanguage
 import Tokenizers
 
 #if os(watchOS) || arch(arm64)
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public typealias FloatType = Float16
 #else
 public typealias FloatType = Float
@@ -200,7 +200,7 @@ public struct DecodingCache {
 /// - logProbThreshold: If the average log probability over sampled tokens is below this value, treat as failed.
 /// - noSpeechThreshold: If the no speech probability is higher than this value AND the average log
 /// probability over sampled tokens is below `logProbThreshold`, consider the segment as silent.
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public struct DecodingOptions {
     public var verbose: Bool
     public var task: DecodingTask
@@ -489,7 +489,7 @@ public class MelSpectrogramInput: MLFeatureProvider {
 }
 
 /// Model Prediction Output Type
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class MelSpectrogramOutput: MLFeatureProvider {
     /// Source provided by CoreML
     private let provider: MLFeatureProvider
@@ -526,7 +526,7 @@ public class MelSpectrogramOutput: MLFeatureProvider {
 // MARK: AudioEncoder
 
 /// Model Prediction Input Type
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class AudioEncoderInput: MLFeatureProvider {
     /// melspectrogram_features as 1 × {80,128} × 1 × 3000 4-dimensional array of floats
     public var melspectrogram_features: MLMultiArray
@@ -552,7 +552,7 @@ public class AudioEncoderInput: MLFeatureProvider {
 }
 
 /// Model Prediction Output Type
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class AudioEncoderOutput: MLFeatureProvider {
     /// Source provided by CoreML
     private let provider: MLFeatureProvider
@@ -589,7 +589,7 @@ public class AudioEncoderOutput: MLFeatureProvider {
 // MARK: TextDecoder
 
 /// Model Prediction Input Type
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class TextDecoderInput: MLFeatureProvider {
     /// input_ids as 1 element vector of 32-bit integers
    public var input_ids: MLMultiArray
@@ -657,7 +657,7 @@ public class TextDecoderInput: MLFeatureProvider {
 }
 
 /// Model Prediction Output Type
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class TextDecoderOutput: MLFeatureProvider {
     /// Source provided by CoreML
     private let provider: MLFeatureProvider
@@ -764,7 +764,7 @@ public class TextDecoderCachePrefillInput: MLFeatureProvider {
 }
 
 /// Model Prediction Output Type
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class TextDecoderCachePrefillOutput: MLFeatureProvider {
     /// Source provided by CoreML
     private let provider: MLFeatureProvider
diff --git a/Sources/WhisperKit/Core/SegmentSeeker.swift b/Sources/WhisperKit/Core/SegmentSeeker.swift
index 8e78415..e37f8e6 100644
--- a/Sources/WhisperKit/Core/SegmentSeeker.swift
+++ b/Sources/WhisperKit/Core/SegmentSeeker.swift
@@ -6,7 +6,7 @@ import CoreML
 import Foundation
 import Tokenizers
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public protocol SegmentSeeking {
     func findSeekPointAndSegments(
         decodingResult: DecodingResult,
@@ -34,7 +34,7 @@ public protocol SegmentSeeking {
     ) throws -> [TranscriptionSegment]?
 }
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class SegmentSeeker: SegmentSeeking {
     public init() {}
 
diff --git a/Sources/WhisperKit/Core/TextDecoder.swift b/Sources/WhisperKit/Core/TextDecoder.swift
index 97f8732..6575b32 100644
--- a/Sources/WhisperKit/Core/TextDecoder.swift
+++ b/Sources/WhisperKit/Core/TextDecoder.swift
@@ -5,7 +5,7 @@ import Accelerate
 import CoreML
 import Tokenizers
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public protocol TextDecoding {
     var tokenizer: Tokenizer? { get set }
     var prefillData: WhisperMLModel? { get set }
@@ -43,7 +43,7 @@ public protocol TextDecoding {
     )
 }
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public extension TextDecoding {
     func prepareDecoderInputs(withPrompt initialPrompt: [Int]) -> DecodingInputs? {
         let tokenShape = [NSNumber(value: 1), NSNumber(value: initialPrompt.count)]
@@ -234,7 +234,7 @@ public class TextDecoderContextPrefill: WhisperMLModel {
     public var model: MLModel?
 }
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class TextDecoder: TextDecoding, WhisperMLModel {
     public var model: MLModel?
     public var tokenizer: Tokenizer?
diff --git a/Sources/WhisperKit/Core/TokenSampler.swift b/Sources/WhisperKit/Core/TokenSampler.swift
index d06b69d..1947054 100644
--- a/Sources/WhisperKit/Core/TokenSampler.swift
+++ b/Sources/WhisperKit/Core/TokenSampler.swift
@@ -16,7 +16,7 @@ public struct SamplingResult {
     public var completed: Bool
 }
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class GreedyTokenSampler: TokenSampling {
     public var temperature: FloatType
     public var eotToken: Int
diff --git a/Sources/WhisperKit/Core/Utils.swift b/Sources/WhisperKit/Core/Utils.swift
index e90bcc6..193b216 100644
--- a/Sources/WhisperKit/Core/Utils.swift
+++ b/Sources/WhisperKit/Core/Utils.swift
@@ -39,6 +39,7 @@ extension MLMultiArray {
     }
 }
 
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 func initMLMultiArray(shape: [NSNumber], dataType: MLMultiArrayDataType, initialValue: Any) -> MLMultiArray {
     let multiArray = try! MLMultiArray(shape: shape, dataType: dataType)
 
diff --git a/Sources/WhisperKit/Core/WhisperKit.swift b/Sources/WhisperKit/Core/WhisperKit.swift
index 649674b..9628a9f 100644
--- a/Sources/WhisperKit/Core/WhisperKit.swift
+++ b/Sources/WhisperKit/Core/WhisperKit.swift
@@ -9,12 +9,13 @@ import Hub
 import TensorUtils
 import Tokenizers
 
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public protocol Transcriber {
     func transcribe(audioPath: String, decodeOptions: DecodingOptions?, callback: TranscriptionCallback) async throws -> TranscriptionResult?
     func transcribe(audioArray: [Float], decodeOptions: DecodingOptions?, callback: TranscriptionCallback) async throws -> TranscriptionResult?
 }
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 public class WhisperKit: Transcriber {
     /// Models
     public var modelVariant: ModelVariant = .tiny
diff --git a/Sources/WhisperKitCLI/transcribe.swift b/Sources/WhisperKitCLI/transcribe.swift
index d2d73a6..7a1c571 100644
--- a/Sources/WhisperKitCLI/transcribe.swift
+++ b/Sources/WhisperKitCLI/transcribe.swift
@@ -7,7 +7,7 @@ import Foundation
 
 import WhisperKit
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 @main
 struct WhisperKitCLI: AsyncParsableCommand {
     @Option(help: "Path to audio file")
diff --git a/Tests/WhisperKitTests/FunctionalTests.swift b/Tests/WhisperKitTests/FunctionalTests.swift
index e783adf..44fd92f 100644
--- a/Tests/WhisperKitTests/FunctionalTests.swift
+++ b/Tests/WhisperKitTests/FunctionalTests.swift
@@ -5,7 +5,7 @@ import CoreML
 @testable import WhisperKit
 import XCTest
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 final class FunctionalTests: XCTestCase {
     func testInitLarge() async {
         let modelPath = largev3ModelPath()
diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift
index 96f02e9..c6781d2 100644
--- a/Tests/WhisperKitTests/UnitTests.swift
+++ b/Tests/WhisperKitTests/UnitTests.swift
@@ -7,7 +7,7 @@ import Tokenizers
 @testable import WhisperKit
 import XCTest
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 final class UnitTests: XCTestCase {
     func testInit() async {
         let whisperKit = try? await WhisperKit(prewarm: false, load: false, download: false)
@@ -226,7 +226,9 @@ final class UnitTests: XCTestCase {
     }
 
     func testWindowing() async {
-        let computeOptions = ModelComputeOptions()
+        let computeOptions = ModelComputeOptions(
+            melCompute: .cpuOnly
+        )
         let whisperKit = try? await WhisperKit(modelFolder: tinyModelPath(), computeOptions: computeOptions, verbose: true, logLevel: .debug)
 
         guard let audioFilePath = Bundle.module.path(forResource: "jfk", ofType: "wav") else {
@@ -373,7 +375,7 @@ final class UnitTests: XCTestCase {
             XCTFail("Failed to transcribe")
             return
         }
-        XCTAssertEqual(result.text.prefix(4), "東京は晴")
+        XCTAssertEqual(result.text.prefix(3), "東京は")
     }
 
     func testNoTimestamps() async {
@@ -831,6 +833,7 @@ final class UnitTests: XCTestCase {
 
 // MARK: Helpers
 
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 extension MLMultiArray {
     /// Create `MLMultiArray` of shape [1, 1, arr.count] and fill up the last
     /// dimension with with values from arr.
@@ -858,7 +861,7 @@ extension MLMultiArray {
     }
 }
 
-@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 extension XCTestCase {
     func transcribe(with variant: ModelVariant, options: DecodingOptions, audioFile: String = "jfk.wav", file: StaticString = #file, line: UInt = #line) async throws -> TranscriptionResult? {
         var modelPath = tinyModelPath()