Skip to content

Commit 78f5bab

Browse files
authored
Bump version to 1.9.6 with upToNextMajorVersion and add support for custom vocabulary (#12)
1 parent b11179a commit 78f5bab

13 files changed

+891
-165
lines changed

Playground.xcodeproj/project.pbxproj

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
1677AFC42B57618A008C61C0 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 1677AFAE2B57618A008C61C0 /* Preview Assets.xcassets */; };
1212
1677AFE12B57678E008C61C0 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 1677AFE02B57678E008C61C0 /* Assets.xcassets */; };
1313
1677AFE62B57704E008C61C0 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1677AFE52B57704E008C61C0 /* ContentView.swift */; };
14+
5539A56B2EA719360020D5CE /* CustomVocabularySheet.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5539A56A2EA719360020D5CE /* CustomVocabularySheet.swift */; };
15+
5539A56D2EA71B2A0020D5CE /* HighlightedTextView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5539A56C2EA71B2A0020D5CE /* HighlightedTextView.swift */; };
1416
740F6DA12E2CD3ED00429FE9 /* AudioDeviceDiscoverer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 740F6DA02E2CD3ED00429FE9 /* AudioDeviceDiscoverer.swift */; };
1517
740F6DA42E2DB07400429FE9 /* ArgmaxSDKCoordinator.swift in Sources */ = {isa = PBXBuildFile; fileRef = 740F6DA32E2DB07400429FE9 /* ArgmaxSDKCoordinator.swift */; };
1618
74312CDC2E1D02E3000D994A /* MacAudioDevicesView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 74312CDB2E1D02E3000D994A /* MacAudioDevicesView.swift */; };
@@ -88,6 +90,8 @@
8890
1677AFE02B57678E008C61C0 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
8991
1677AFE52B57704E008C61C0 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
9092
167B345E2B05431E0076F261 /* Playground.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Playground.app; sourceTree = BUILT_PRODUCTS_DIR; };
93+
5539A56A2EA719360020D5CE /* CustomVocabularySheet.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CustomVocabularySheet.swift; sourceTree = "<group>"; };
94+
5539A56C2EA71B2A0020D5CE /* HighlightedTextView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = HighlightedTextView.swift; sourceTree = "<group>"; };
9195
740F6DA02E2CD3ED00429FE9 /* AudioDeviceDiscoverer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AudioDeviceDiscoverer.swift; sourceTree = "<group>"; };
9296
740F6DA32E2DB07400429FE9 /* ArgmaxSDKCoordinator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ArgmaxSDKCoordinator.swift; sourceTree = "<group>"; };
9397
74312CDB2E1D02E3000D994A /* MacAudioDevicesView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MacAudioDevicesView.swift; sourceTree = "<group>"; };
@@ -193,6 +197,8 @@
193197
1677AFE42B5769E5008C61C0 /* Views */ = {
194198
isa = PBXGroup;
195199
children = (
200+
5539A56C2EA71B2A0020D5CE /* HighlightedTextView.swift */,
201+
5539A56A2EA719360020D5CE /* CustomVocabularySheet.swift */,
196202
74F897782E4F9B130045252E /* TranscriptionModeSelection.swift */,
197203
74312CDD2E1DA46C000D994A /* StreamResultView.swift */,
198204
1677AFE52B57704E008C61C0 /* ContentView.swift */,
@@ -394,6 +400,7 @@
394400
isa = PBXSourcesBuildPhase;
395401
buildActionMask = 2147483647;
396402
files = (
403+
5539A56D2EA71B2A0020D5CE /* HighlightedTextView.swift in Sources */,
397404
747E67082E300F780061E778 /* TranscribeResultView.swift in Sources */,
398405
74312CDC2E1D02E3000D994A /* MacAudioDevicesView.swift in Sources */,
399406
74F3B7BE2E1CF44F00C544D1 /* AudioProcessDiscoverer.swift in Sources */,
@@ -413,6 +420,7 @@
413420
74F860962E2B19060007163C /* CoreAudioUtils.swift in Sources */,
414421
747E67062E3008000061E778 /* TranscribeViewModel.swift in Sources */,
415422
74F860942E29A9D20007163C /* ProcessTapper.swift in Sources */,
423+
5539A56B2EA719360020D5CE /* CustomVocabularySheet.swift in Sources */,
416424
740F6DA42E2DB07400429FE9 /* ArgmaxSDKCoordinator.swift in Sources */,
417425
);
418426
runOnlyForDeploymentPostprocessing = 0;
@@ -758,7 +766,7 @@
758766
repositoryURL = "argmaxinc.argmax-sdk-swift-alpha";
759767
requirement = {
760768
kind = upToNextMajorVersion;
761-
minimumVersion = 1.7.11;
769+
minimumVersion = 1.9.6;
762770
};
763771
};
764772
/* End XCRemoteSwiftPackageReference section */

Playground/Playground.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ struct Playground: App {
9292
}
9393

9494
var body: some Scene {
95-
WindowGroup {
95+
WindowGroup("Argmax Playground") {
9696
ContentView(analyticsLogger: analyticsLogger)
9797
#if os(macOS)
9898
.environmentObject(audioProcessDiscoverer)

Playground/Services/ArgmaxSDKCoordinator.swift

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,8 @@ public final class ArgmaxSDKCoordinator: ObservableObject {
159159
public func prepare(modelName: String,
160160
repository: String? = nil,
161161
config: WhisperKitProConfig,
162-
redownload: Bool = false) async throws {
162+
redownload: Bool = false,
163+
clustererVersion: ClustererVersion) async throws {
163164
guard let apiKey = apiKey, !apiKey.isEmpty else {
164165
self.whisperKitModelState = .unloaded
165166
self.speakerKitModelState = .unloaded
@@ -197,7 +198,7 @@ public final class ArgmaxSDKCoordinator: ObservableObject {
197198
self.whisperKit = whisperKitPro
198199

199200
// --- Then prepare SpeakerKit
200-
let speakerKitPro = try await initializeSpeakerKitPro()
201+
let speakerKitPro = try await initializeSpeakerKitPro(clustererVersion: clustererVersion)
201202
self.speakerKit = speakerKitPro
202203
self.speakerKitModelState = speakerKitPro.modelState
203204

@@ -210,6 +211,20 @@ public final class ArgmaxSDKCoordinator: ObservableObject {
210211
throw error
211212
}
212213
}
214+
215+
@MainActor
216+
public func updateCustomVocabulary(words: [String]) throws {
217+
guard let whisperKit else {
218+
throw ArgmaxError.modelUnavailable("WhisperKit model is not loaded")
219+
}
220+
221+
do {
222+
try whisperKit.setCustomVocabulary(words)
223+
} catch {
224+
Logging.error("Failed to update custom vocabulary: \(error)")
225+
throw error
226+
}
227+
}
213228

214229
public func delete(modelName: String,
215230
repository: String? = nil,
@@ -226,6 +241,12 @@ public final class ArgmaxSDKCoordinator: ObservableObject {
226241
throw ArgmaxError.generic("Failed to delete model")
227242
}
228243
}
244+
245+
public func deleteCustomVocabularyModels() async throws {
246+
for model in ["canary-1b-v2", "parakeet-tdt_ctc-110m"] {
247+
try await modelStore.deleteModel(variant: model, from: "argmaxinc/ctckit-pro")
248+
}
249+
}
229250

230251
public func reset() async {
231252
modelStore.cancelDownload()
@@ -322,8 +343,8 @@ public final class ArgmaxSDKCoordinator: ObservableObject {
322343
}
323344

324345
/// Initializes and loads SpeakerKitPro
325-
private func initializeSpeakerKitPro() async throws -> SpeakerKitPro {
326-
var config = SpeakerKitProConfig(load: true)
346+
private func initializeSpeakerKitPro(clustererVersion: ClustererVersion) async throws -> SpeakerKitPro {
347+
var config = SpeakerKitProConfig(load: true, clustererVersion: clustererVersion)
327348
let connected = await ArgmaxSDK.isConnected()
328349
if !connected {
329350
config.download = false

Playground/TranscriptionLiveActivity/TranscriptionAttributes.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ struct TranscriptionAttributes: ActivityAttributes {
1212
/// Static configuration that doesn't change during the live activity session
1313
public struct ContentState: Codable, Hashable {
1414
/// Current transcription hypothesis text being processed
15-
var currentHypothesis: String
15+
var currentHypothesis: AttributedString
1616

1717
/// Duration of audio processed in seconds
1818
var audioSeconds: Double

Playground/TranscriptionLiveActivity/TranscriptionLiveActivity.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,10 @@ struct TranscriptionLiveActivity: Widget {
4646
.font(.caption)
4747
.foregroundColor(.secondary)
4848
.frame(minHeight: 32, alignment: .topLeading)
49-
} else if !context.state.currentHypothesis.isEmpty {
49+
} else if !context.state.currentHypothesis.characters.isEmpty {
5050
Text(context.state.currentHypothesis)
5151
.font(.caption)
52-
.lineLimit(nil)
52+
.lineLimit(3)
5353
.truncationMode(.head)
5454
.frame(minHeight: 32, alignment: .topLeading)
5555
.fixedSize(horizontal: false, vertical: true)
@@ -107,7 +107,7 @@ struct LockScreenLiveActivityView: View {
107107
Text("Microphone session interrupted. Restart transcription from the app.")
108108
.font(.subheadline)
109109
.foregroundColor(.secondary)
110-
} else if !context.state.currentHypothesis.isEmpty {
110+
} else if !context.state.currentHypothesis.characters.isEmpty {
111111
Text(context.state.currentHypothesis)
112112
.font(.subheadline)
113113
.lineLimit(3)

Playground/ViewModels/StreamViewModel.swift

Lines changed: 39 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ import ActivityKit
4242
/// - **`AudioProcessDiscoverer` / `AudioDeviceDiscoverer`:** (macOS only) These are used to determine which
4343
/// audio sources are available for streaming.
4444
class StreamViewModel: ObservableObject {
45+
#if os(macOS)
46+
private let energyHistoryLimit = 512
47+
#else
48+
private let energyHistoryLimit = 256
49+
#endif
4550
// Stream Results - per-stream data for UI
4651
@Published var deviceResult: StreamResult?
4752
@Published var systemResult: StreamResult?
@@ -137,12 +142,11 @@ class StreamViewModel: ObservableObject {
137142
/// Contains all transcription data for a single stream including text results, timing information, and audio energy data
138143
struct StreamResult {
139144
var title: String = ""
140-
var confirmedText: String = ""
141-
var hypothesisText: String = ""
145+
var confirmedSegments: [TranscriptionSegment] = []
146+
var hypothesisSegments: [TranscriptionSegment] = []
147+
var customVocabularyResults: [WordTiming: [WordTiming]] = [:]
142148
var streamEndSeconds: Float?
143149
var bufferEnergy: [Float] = []
144-
var bufferSeconds: Double = 0
145-
var transcribeResult: TranscriptionResultPro? = nil
146150
var streamTimestampText: String {
147151
guard let end = streamEndSeconds else {
148152
return ""
@@ -311,20 +315,37 @@ class StreamViewModel: ObservableObject {
311315
}
312316
}
313317

318+
private func mergeVocabularyResults(
319+
existing: inout [WordTiming: [WordTiming]],
320+
newResults: [WordTiming: [WordTiming]]
321+
) {
322+
guard !newResults.isEmpty else { return }
323+
for (key, occurrences) in newResults {
324+
if var stored = existing[key] {
325+
stored.append(contentsOf: occurrences)
326+
existing[key] = stored
327+
} else {
328+
existing[key] = occurrences
329+
}
330+
}
331+
}
332+
314333
@MainActor
315334
private func handleResult(_ result: LiveResult, for sourceId: String) {
316335
switch result {
317-
case .hypothesis(let text, _, _):
336+
case .hypothesis(let text, _, let hypothesisResult):
318337
let now = Date().timeIntervalSince1970
319338
let last = lastHypothesisUpdateAtBySource[sourceId] ?? 0
320339
// Update at most 10 times per second per source
321340
guard now - last >= 0.1 else { return }
322341
lastHypothesisUpdateAtBySource[sourceId] = now
323-
let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
324-
guard trimmed != (isDeviceSource(sourceId) ? deviceResult?.hypothesisText : systemResult?.hypothesisText) else { return }
325342
updateStreamResult(sourceId: sourceId) { oldResult in
326343
var newResult = oldResult
327-
newResult.hypothesisText = trimmed
344+
newResult.hypothesisSegments = hypothesisResult.hypothesisSegments
345+
mergeVocabularyResults(
346+
existing: &newResult.customVocabularyResults,
347+
newResults: hypothesisResult.customVocabularyResults
348+
)
328349
return newResult
329350
}
330351

@@ -333,28 +354,26 @@ class StreamViewModel: ObservableObject {
333354
Task {
334355
await liveActivityManager.updateContentState { oldState in
335356
var state = oldState
336-
state.currentHypothesis = trimmed
357+
let highlightedHypothesis = HighlightedTextView.createHighlightedAttributedString(segments: deviceResult?.hypothesisSegments ?? [], customVocabularyResults: deviceResult?.customVocabularyResults ?? [:], font: .body, foregroundColor: .primary)
358+
state.currentHypothesis = highlightedHypothesis
337359
return state
338360
}
339361
}
340362
#endif
341363

342-
case .confirm(let text, let seconds, let transcriptionResult):
364+
case .confirm(_, let seconds, let confirmedResult):
343365
updateStreamResult(sourceId: sourceId) { oldResult in
344366
var newResult = oldResult
345-
let newText = text.trimmingCharacters(in: .whitespaces)
346-
if !newText.isEmpty {
347-
if !newResult.confirmedText.isEmpty {
348-
newResult.confirmedText += " "
349-
}
350-
newResult.confirmedText += newText
351-
}
367+
newResult.confirmedSegments += confirmedResult.segments
352368
newResult.streamEndSeconds = Float(seconds)
353-
newResult.transcribeResult = transcriptionResult
369+
mergeVocabularyResults(
370+
existing: &newResult.customVocabularyResults,
371+
newResults: confirmedResult.customVocabularyResults
372+
)
354373
return newResult
355374
}
356375
if let confirmedresultCallback = self.confirmedresultCallback {
357-
confirmedresultCallback(sourceId, transcriptionResult)
376+
confirmedresultCallback(sourceId, confirmedResult)
358377
}
359378
}
360379
}
@@ -373,18 +392,13 @@ class StreamViewModel: ObservableObject {
373392

374393
// Limit the amount of energy samples passed to the UI for performance
375394
let energies = whisperKitPro.audioProcessor.relativeEnergy
376-
#if os(iOS)
377-
let newBufferEnergy = Array(energies.suffix(256))
378-
#else
379-
let newBufferEnergy = energies
380-
#endif
395+
let newBufferEnergy = Array(energies.suffix(self.energyHistoryLimit))
381396
let sampleCount = whisperKitPro.audioProcessor.audioSamples.count
382397
let audioSeconds = Double(sampleCount) / Double(WhisperKit.sampleRate)
383398

384399
updateStreamResult(sourceId: source.id) { oldResult in
385400
var newResult = oldResult
386401
newResult.bufferEnergy = newBufferEnergy
387-
newResult.bufferSeconds = audioSeconds
388402
return newResult
389403
}
390404

0 commit comments

Comments (0)