Skip to content

Commit 02763ca

Browse files
Fix resampling large files (#183)
* Update resampling logic to handle chunking properly
* Cleanup logging
* Optimize memory usage when resampling
* Add filter to input prompt text
* Correct timestamp filter logic for #170
* Filter out zero length segments - when calculating word timestamps - resolves #170
* Add method for async audio loading
* Fix async load audio function
* Fix tests
* Fix tests
* Fix tests
* Revert timestamp filter changes
* Temporarily remove xcpretty for tests
* Check suspected test crash
* Remove errant test case for japanese options
* Add bigger range for early stopping test
* Reset progress between runs
* Fix progress resetting and improve example app transcription handling
* Update tests
* Minimize crash risk for early stop checks
* Fix finalize text
* Add source text to language label
1 parent 3186ca6 commit 02763ca

File tree

13 files changed

+536
-169
lines changed

13 files changed

+536
-169
lines changed

.github/workflows/unit-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -77,4 +77,4 @@ jobs:
7777
run: |
7878
set -o pipefail
7979
xcodebuild clean build-for-testing -scheme whisperkit-Package -destination '${{ matrix.run-config['clean-destination'] }}' | xcpretty
80-
xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination '${{ matrix.run-config['test-destination'] }}' | xcpretty
80+
xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination '${{ matrix.run-config['test-destination'] }}'

Examples/WhisperAX/WhisperAX.xcodeproj/project.pbxproj

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -890,7 +890,7 @@
890890
LD_RUNPATH_SEARCH_PATHS = "@executable_path/Frameworks";
891891
"LD_RUNPATH_SEARCH_PATHS[sdk=macosx*]" = "@executable_path/../Frameworks";
892892
MACOSX_DEPLOYMENT_TARGET = 14.0;
893-
MARKETING_VERSION = 0.3.1;
893+
MARKETING_VERSION = 0.3.2;
894894
PRODUCT_BUNDLE_IDENTIFIER = "com.argmax.whisperkit.WhisperAX${DEVELOPMENT_TEAM}";
895895
PRODUCT_NAME = "$(TARGET_NAME)";
896896
SDKROOT = auto;
@@ -936,7 +936,7 @@
936936
LD_RUNPATH_SEARCH_PATHS = "@executable_path/Frameworks";
937937
"LD_RUNPATH_SEARCH_PATHS[sdk=macosx*]" = "@executable_path/../Frameworks";
938938
MACOSX_DEPLOYMENT_TARGET = 14.0;
939-
MARKETING_VERSION = 0.3.1;
939+
MARKETING_VERSION = 0.3.2;
940940
PRODUCT_BUNDLE_IDENTIFIER = com.argmax.whisperkit.WhisperAX;
941941
PRODUCT_NAME = "$(TARGET_NAME)";
942942
SDKROOT = auto;

Examples/WhisperAX/WhisperAX/Views/ContentView.swift

Lines changed: 76 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -97,7 +97,7 @@ struct ContentView: View {
9797
@State private var showAdvancedOptions: Bool = false
9898
@State private var transcriptionTask: Task<Void, Never>? = nil
9999
@State private var selectedCategoryId: MenuItem.ID?
100-
@State private var transcribeFileTask: Task<Void, Never>? = nil
100+
@State private var transcribeTask: Task<Void, Never>? = nil
101101

102102
struct MenuItem: Identifiable, Hashable {
103103
var id = UUID()
@@ -122,7 +122,7 @@ struct ContentView: View {
122122
// MARK: Views
123123

124124
func resetState() {
125-
transcribeFileTask?.cancel()
125+
transcribeTask?.cancel()
126126
isRecording = false
127127
isTranscribing = false
128128
whisperKit?.audioProcessor.stopRecording()
@@ -311,15 +311,27 @@ struct ContentView: View {
311311
.textSelection(.enabled)
312312
.padding()
313313
if let whisperKit,
314-
!isRecording,
315-
!isTranscribing,
316-
whisperKit.progress.fractionCompleted > 0,
314+
!isStreamMode,
315+
isTranscribing,
316+
let task = transcribeTask,
317+
!task.isCancelled,
317318
whisperKit.progress.fractionCompleted < 1
318319
{
319-
ProgressView(whisperKit.progress)
320-
.progressViewStyle(.linear)
321-
.labelsHidden()
322-
.padding(.horizontal)
320+
HStack {
321+
ProgressView(whisperKit.progress)
322+
.progressViewStyle(.linear)
323+
.labelsHidden()
324+
.padding(.horizontal)
325+
326+
Button {
327+
transcribeTask?.cancel()
328+
transcribeTask = nil
329+
} label: {
330+
Image(systemName: "xmark.circle.fill")
331+
.foregroundColor(.secondary)
332+
}
333+
.buttonStyle(BorderlessButtonStyle())
334+
}
323335
}
324336
}
325337
}
@@ -706,7 +718,7 @@ struct ContentView: View {
706718
}
707719
.disabled(!(whisperKit?.modelVariant.isMultilingual ?? false))
708720
} label: {
709-
Label("Language", systemImage: "globe")
721+
Label("Source Language", systemImage: "globe")
710722
}
711723
.padding(.horizontal)
712724
.padding(.top)
@@ -1149,12 +1161,14 @@ struct ContentView: View {
11491161
func transcribeFile(path: String) {
11501162
resetState()
11511163
whisperKit?.audioProcessor = AudioProcessor()
1152-
self.transcribeFileTask = Task {
1164+
self.transcribeTask = Task {
1165+
isTranscribing = true
11531166
do {
11541167
try await transcribeCurrentFile(path: path)
11551168
} catch {
11561169
print("File selection error: \(error.localizedDescription)")
11571170
}
1171+
isTranscribing = false
11581172
}
11591173
}
11601174

@@ -1218,21 +1232,49 @@ struct ContentView: View {
12181232

12191233
// If not looping, transcribe the full buffer
12201234
if !loop {
1221-
Task {
1235+
self.transcribeTask = Task {
1236+
isTranscribing = true
12221237
do {
12231238
try await transcribeCurrentBuffer()
12241239
} catch {
12251240
print("Error: \(error.localizedDescription)")
12261241
}
1242+
finalizeText()
1243+
isTranscribing = false
1244+
}
1245+
}
1246+
1247+
finalizeText()
1248+
}
1249+
1250+
func finalizeText() {
1251+
// Finalize unconfirmed text
1252+
Task {
1253+
await MainActor.run {
1254+
if hypothesisText != "" {
1255+
confirmedText += hypothesisText
1256+
hypothesisText = ""
1257+
}
1258+
1259+
if unconfirmedSegments.count > 0 {
1260+
confirmedSegments.append(contentsOf: unconfirmedSegments)
1261+
unconfirmedSegments = []
1262+
}
12271263
}
12281264
}
12291265
}
12301266

12311267
// MARK: - Transcribe Logic
12321268

12331269
func transcribeCurrentFile(path: String) async throws {
1234-
let audioFileBuffer = try AudioProcessor.loadAudio(fromPath: path)
1235-
let audioFileSamples = AudioProcessor.convertBufferToArray(buffer: audioFileBuffer)
1270+
// Load and convert buffer in a limited scope
1271+
let audioFileSamples = try await Task {
1272+
try autoreleasepool {
1273+
let audioFileBuffer = try AudioProcessor.loadAudio(fromPath: path)
1274+
return AudioProcessor.convertBufferToArray(buffer: audioFileBuffer)
1275+
}
1276+
}.value
1277+
12361278
let transcription = try await transcribeAudioSamples(audioFileSamples)
12371279

12381280
await MainActor.run {
@@ -1258,7 +1300,7 @@ struct ContentView: View {
12581300

12591301
let languageCode = Constants.languages[selectedLanguage, default: Constants.defaultLanguageCode]
12601302
let task: DecodingTask = selectedTask == "transcribe" ? .transcribe : .translate
1261-
let seekClip: [Float] = []
1303+
let seekClip: [Float] = [lastConfirmedSegmentEndSeconds]
12621304

12631305
let options = DecodingOptions(
12641306
verbose: true,
@@ -1271,6 +1313,7 @@ struct ContentView: View {
12711313
usePrefillCache: enableCachePrefill,
12721314
skipSpecialTokens: !enableSpecialCharacters,
12731315
withoutTimestamps: !enableTimestamps,
1316+
wordTimestamps: true,
12741317
clipTimestamps: seekClip,
12751318
chunkingStrategy: chunkingStrategy
12761319
)
@@ -1279,7 +1322,7 @@ struct ContentView: View {
12791322
let decodingCallback: ((TranscriptionProgress) -> Bool?) = { (progress: TranscriptionProgress) in
12801323
DispatchQueue.main.async {
12811324
let fallbacks = Int(progress.timings.totalDecodingFallbacks)
1282-
let chunkId = progress.windowId
1325+
let chunkId = isStreamMode ? 0 : progress.windowId
12831326

12841327
// First check if this is a new window for the same chunk, append if so
12851328
var updatedChunk = (chunkText: [progress.text], fallbacks: fallbacks)
@@ -1292,7 +1335,7 @@ struct ContentView: View {
12921335
// This is either a new window or a fallback (only in streaming mode)
12931336
if fallbacks == currentChunk.fallbacks && isStreamMode {
12941337
// New window (since fallbacks haven't changed)
1295-
updatedChunk.chunkText = currentChunk.chunkText + [progress.text]
1338+
updatedChunk.chunkText = [updatedChunk.chunkText.first ?? "" + progress.text]
12961339
} else {
12971340
// Fallback, overwrite the previous bad text
12981341
updatedChunk.chunkText[currentChunk.chunkText.endIndex - 1] = progress.text
@@ -1419,6 +1462,7 @@ struct ContentView: View {
14191462
// Run realtime transcribe using word timestamps for segmentation
14201463
let transcription = try await transcribeEagerMode(Array(currentBuffer))
14211464
await MainActor.run {
1465+
currentText = ""
14221466
self.tokensPerSecond = transcription?.timings.tokensPerSecond ?? 0
14231467
self.firstTokenTime = transcription?.timings.firstTokenTime ?? 0
14241468
self.pipelineStart = transcription?.timings.pipelineStart ?? 0
@@ -1464,10 +1508,13 @@ struct ContentView: View {
14641508
// Update lastConfirmedSegmentEnd based on the last confirmed segment
14651509
if let lastConfirmedSegment = confirmedSegmentsArray.last, lastConfirmedSegment.end > lastConfirmedSegmentEndSeconds {
14661510
lastConfirmedSegmentEndSeconds = lastConfirmedSegment.end
1511+
print("Last confirmed segment end: \(lastConfirmedSegmentEndSeconds)")
14671512

14681513
// Add confirmed segments to the confirmedSegments array
1469-
if !self.confirmedSegments.contains(confirmedSegmentsArray) {
1470-
self.confirmedSegments.append(contentsOf: confirmedSegmentsArray)
1514+
for segment in confirmedSegmentsArray {
1515+
if !self.confirmedSegments.contains(segment: segment) {
1516+
self.confirmedSegments.append(segment)
1517+
}
14711518
}
14721519
}
14731520

@@ -1584,18 +1631,20 @@ struct ContentView: View {
15841631
eagerResults.append(transcription)
15851632
}
15861633
}
1634+
1635+
await MainActor.run {
1636+
let finalWords = confirmedWords.map { $0.word }.joined()
1637+
confirmedText = finalWords
1638+
1639+
// Accept the final hypothesis because it is the last of the available audio
1640+
let lastHypothesis = lastAgreedWords + findLongestDifferentSuffix(prevWords, hypothesisWords)
1641+
hypothesisText = lastHypothesis.map { $0.word }.joined()
1642+
}
15871643
} catch {
15881644
Logging.error("[EagerMode] Error: \(error)")
1645+
finalizeText()
15891646
}
15901647

1591-
await MainActor.run {
1592-
let finalWords = confirmedWords.map { $0.word }.joined()
1593-
confirmedText = finalWords
1594-
1595-
// Accept the final hypothesis because it is the last of the available audio
1596-
let lastHypothesis = lastAgreedWords + findLongestDifferentSuffix(prevWords, hypothesisWords)
1597-
hypothesisText = lastHypothesis.map { $0.word }.joined()
1598-
}
15991648

16001649
let mergedResult = mergeTranscriptionResults(eagerResults, confirmedWords: confirmedWords)
16011650

0 commit comments

Comments
 (0)