@@ -97,7 +97,7 @@ struct ContentView: View {
     @State private var showAdvancedOptions: Bool = false
     @State private var transcriptionTask: Task<Void, Never>? = nil
     @State private var selectedCategoryId: MenuItem.ID?
-    @State private var transcribeFileTask: Task<Void, Never>? = nil
+    @State private var transcribeTask: Task<Void, Never>? = nil
 
     struct MenuItem: Identifiable, Hashable {
         var id = UUID()
@@ -122,7 +122,7 @@ struct ContentView: View {
     // MARK: Views
 
     func resetState() {
-        transcribeFileTask?.cancel()
+        transcribeTask?.cancel()
         isRecording = false
         isTranscribing = false
         whisperKit?.audioProcessor.stopRecording()
@@ -311,15 +311,27 @@ struct ContentView: View {
                     .textSelection(.enabled)
                     .padding()
                 if let whisperKit,
-                   !isRecording,
-                   !isTranscribing,
-                   whisperKit.progress.fractionCompleted > 0,
+                   !isStreamMode,
+                   isTranscribing,
+                   let task = transcribeTask,
+                   !task.isCancelled,
                    whisperKit.progress.fractionCompleted < 1
                 {
-                    ProgressView(whisperKit.progress)
-                        .progressViewStyle(.linear)
-                        .labelsHidden()
-                        .padding(.horizontal)
+                    HStack {
+                        ProgressView(whisperKit.progress)
+                            .progressViewStyle(.linear)
+                            .labelsHidden()
+                            .padding(.horizontal)
+
+                        Button {
+                            transcribeTask?.cancel()
+                            transcribeTask = nil
+                        } label: {
+                            Image(systemName: "xmark.circle.fill")
+                                .foregroundColor(.secondary)
+                        }
+                        .buttonStyle(BorderlessButtonStyle())
+                    }
                 }
             }
         }
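Note: the cancel button above only works because the transcription runs inside a stored Task handle and Swift cancellation is cooperative, so the long-running work has to notice the request itself. A minimal sketch of that pattern outside WhisperKit (all names below are illustrative):

import SwiftUI

struct CancellableWorkView: View {
    // Keep a handle to the running task so a UI control can cancel it later.
    @State private var workTask: Task<Void, Never>? = nil
    @State private var isWorking = false

    var body: some View {
        HStack {
            Button("Start") {
                workTask = Task {
                    isWorking = true
                    // Cooperative cancellation: the work must check the flag itself.
                    for _ in 0..<100 where !Task.isCancelled {
                        try? await Task.sleep(nanoseconds: 50_000_000)
                    }
                    isWorking = false
                }
            }

            if isWorking {
                Button {
                    workTask?.cancel() // Requests cancellation; the task decides when to stop.
                    workTask = nil
                } label: {
                    Image(systemName: "xmark.circle.fill")
                        .foregroundColor(.secondary)
                }
                .buttonStyle(BorderlessButtonStyle())
            }
        }
    }
}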
@@ -706,7 +718,7 @@ struct ContentView: View {
                     }
                     .disabled(!(whisperKit?.modelVariant.isMultilingual ?? false))
                 } label: {
-                    Label("Language", systemImage: "globe")
+                    Label("Source Language", systemImage: "globe")
                 }
                 .padding(.horizontal)
                 .padding(.top)
@@ -1149,12 +1161,14 @@ struct ContentView: View {
     func transcribeFile(path: String) {
         resetState()
         whisperKit?.audioProcessor = AudioProcessor()
-        self.transcribeFileTask = Task {
+        self.transcribeTask = Task {
+            isTranscribing = true
             do {
                 try await transcribeCurrentFile(path: path)
             } catch {
                 print("File selection error: \(error.localizedDescription)")
             }
+            isTranscribing = false
         }
     }
 
@@ -1218,21 +1232,49 @@ struct ContentView: View {
 
         // If not looping, transcribe the full buffer
        if !loop {
-            Task {
+            self.transcribeTask = Task {
+                isTranscribing = true
                 do {
                     try await transcribeCurrentBuffer()
                 } catch {
                     print("Error: \(error.localizedDescription)")
                 }
+                finalizeText()
+                isTranscribing = false
+            }
+        }
+
+        finalizeText()
+    }
+
+    func finalizeText() {
+        // Finalize unconfirmed text
+        Task {
+            await MainActor.run {
+                if hypothesisText != "" {
+                    confirmedText += hypothesisText
+                    hypothesisText = ""
+                }
+
+                if unconfirmedSegments.count > 0 {
+                    confirmedSegments.append(contentsOf: unconfirmedSegments)
+                    unconfirmedSegments = []
+                }
             }
         }
     }
 
     // MARK: - Transcribe Logic
 
     func transcribeCurrentFile(path: String) async throws {
-        let audioFileBuffer = try AudioProcessor.loadAudio(fromPath: path)
-        let audioFileSamples = AudioProcessor.convertBufferToArray(buffer: audioFileBuffer)
+        // Load and convert buffer in a limited scope
+        let audioFileSamples = try await Task {
+            try autoreleasepool {
+                let audioFileBuffer = try AudioProcessor.loadAudio(fromPath: path)
+                return AudioProcessor.convertBufferToArray(buffer: audioFileBuffer)
+            }
+        }.value
+
         let transcription = try await transcribeAudioSamples(audioFileSamples)
 
         await MainActor.run {
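Note: loading the audio inside a short-lived Task wrapped in autoreleasepool bounds the lifetime of the intermediate audio buffer, so the full-size buffer can be released as soon as the Float array exists rather than at the end of the surrounding async function. A standalone sketch of that scoping pattern with a hypothetical loader (not the WhisperKit API):

import Foundation

// Hypothetical loader, used only to illustrate the scoping pattern.
func loadSamples(fromPath path: String) throws -> [Float] {
    return [Float](repeating: 0, count: 16_000)
}

func samplesForFile(path: String) async throws -> [Float] {
    // The inner Task gives the load its own scope; autoreleasepool drains any
    // autoreleased Objective-C objects (such as an AVAudioPCMBuffer) as soon as
    // the closure returns, instead of keeping them alive for the whole call.
    return try await Task {
        try autoreleasepool { () throws -> [Float] in
            let rawSamples = try loadSamples(fromPath: path)
            return rawSamples
        }
    }.value
}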
@@ -1258,7 +1300,7 @@ struct ContentView: View {
 
         let languageCode = Constants.languages[selectedLanguage, default: Constants.defaultLanguageCode]
         let task: DecodingTask = selectedTask == "transcribe" ? .transcribe : .translate
-        let seekClip: [Float] = []
+        let seekClip: [Float] = [lastConfirmedSegmentEndSeconds]
 
         let options = DecodingOptions(
             verbose: true,
@@ -1271,6 +1313,7 @@ struct ContentView: View {
             usePrefillCache: enableCachePrefill,
             skipSpecialTokens: !enableSpecialCharacters,
             withoutTimestamps: !enableTimestamps,
+            wordTimestamps: true,
             clipTimestamps: seekClip,
             chunkingStrategy: chunkingStrategy
         )
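Note: seeding clipTimestamps with lastConfirmedSegmentEndSeconds makes the next decode pick up after the audio that is already confirmed instead of re-transcribing it, and wordTimestamps: true is what gives the eager streaming path word-level boundaries to confirm against. A hedged sketch of building such options, assuming DecodingOptions defaults every parameter not passed explicitly (as the full call in this diff suggests):

import WhisperKit

// Sketch only: parameter defaults are assumed, not confirmed by this diff.
let lastConfirmedEnd: Float = 12.5
let options = DecodingOptions(
    wordTimestamps: true,               // word-level timing for eager confirmation
    clipTimestamps: [lastConfirmedEnd]  // resume decoding after the confirmed audio
)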
@@ -1279,7 +1322,7 @@ struct ContentView: View {
         let decodingCallback: ((TranscriptionProgress) -> Bool?) = { (progress: TranscriptionProgress) in
             DispatchQueue.main.async {
                 let fallbacks = Int(progress.timings.totalDecodingFallbacks)
-                let chunkId = progress.windowId
+                let chunkId = isStreamMode ? 0 : progress.windowId
 
                 // First check if this is a new window for the same chunk, append if so
                 var updatedChunk = (chunkText: [progress.text], fallbacks: fallbacks)
@@ -1292,7 +1335,7 @@ struct ContentView: View {
                     // This is either a new window or a fallback (only in streaming mode)
                     if fallbacks == currentChunk.fallbacks && isStreamMode {
                         // New window (since fallbacks havent changed)
-                        updatedChunk.chunkText = currentChunk.chunkText + [progress.text]
+                        updatedChunk.chunkText = [updatedChunk.chunkText.first ?? "" + progress.text]
                     } else {
                         // Fallback, overwrite the previous bad text
                         updatedChunk.chunkText[currentChunk.chunkText.endIndex - 1] = progress.text
@@ -1419,6 +1462,7 @@ struct ContentView: View {
             // Run realtime transcribe using word timestamps for segmentation
             let transcription = try await transcribeEagerMode(Array(currentBuffer))
             await MainActor.run {
+                currentText = ""
                 self.tokensPerSecond = transcription?.timings.tokensPerSecond ?? 0
                 self.firstTokenTime = transcription?.timings.firstTokenTime ?? 0
                 self.pipelineStart = transcription?.timings.pipelineStart ?? 0
@@ -1464,10 +1508,13 @@ struct ContentView: View {
                 // Update lastConfirmedSegmentEnd based on the last confirmed segment
                 if let lastConfirmedSegment = confirmedSegmentsArray.last, lastConfirmedSegment.end > lastConfirmedSegmentEndSeconds {
                     lastConfirmedSegmentEndSeconds = lastConfirmedSegment.end
+                    print("Last confirmed segment end: \(lastConfirmedSegmentEndSeconds)")
 
                     // Add confirmed segments to the confirmedSegments array
-                    if !self.confirmedSegments.contains(confirmedSegmentsArray) {
-                        self.confirmedSegments.append(contentsOf: confirmedSegmentsArray)
+                    for segment in confirmedSegmentsArray {
+                        if !self.confirmedSegments.contains(segment: segment) {
+                            self.confirmedSegments.append(segment)
+                        }
                     }
                 }
 
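Note: the old check treated the incoming segments as one unit, so a batch containing both confirmed and new segments could be appended again in full or skipped entirely; checking segment by segment appends each new segment exactly once. A generic sketch of the same dedup-append with a stand-in segment type (plain Equatable here, in place of the app's contains(segment:) helper):

// Stand-in for a transcription segment.
struct Segment: Equatable {
    var start: Float
    var end: Float
    var text: String
}

var confirmed: [Segment] = [Segment(start: 0, end: 2, text: "hello")]
let incoming: [Segment] = [
    Segment(start: 0, end: 2, text: "hello"), // already confirmed
    Segment(start: 2, end: 4, text: "world"), // new
]

// Append only the segments that are not already confirmed.
for segment in incoming where !confirmed.contains(segment) {
    confirmed.append(segment)
}
// confirmed now holds both segments exactly once.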
@@ -1584,18 +1631,20 @@ struct ContentView: View {
                     eagerResults.append(transcription)
                 }
             }
+
+            await MainActor.run {
+                let finalWords = confirmedWords.map { $0.word }.joined()
+                confirmedText = finalWords
+
+                // Accept the final hypothesis because it is the last of the available audio
+                let lastHypothesis = lastAgreedWords + findLongestDifferentSuffix(prevWords, hypothesisWords)
+                hypothesisText = lastHypothesis.map { $0.word }.joined()
+            }
         } catch {
             Logging.error("[EagerMode] Error: \(error)")
+            finalizeText()
         }
 
-        await MainActor.run {
-            let finalWords = confirmedWords.map { $0.word }.joined()
-            confirmedText = finalWords
-
-            // Accept the final hypothesis because it is the last of the available audio
-            let lastHypothesis = lastAgreedWords + findLongestDifferentSuffix(prevWords, hypothesisWords)
-            hypothesisText = lastHypothesis.map { $0.word }.joined()
-        }
 
         let mergedResult = mergeTranscriptionResults(eagerResults, confirmedWords: confirmedWords)
 
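Note: moving the MainActor.run finalization inside the do block and calling finalizeText() from the catch means the UI is flushed both on success and on failure (including cancellation), rather than publishing words after an error has already occurred. A reduced sketch of that shape with illustrative names, not the app's actual types:

// Illustrative only: mirrors the success/failure flush used in the change above.
@MainActor
final class Transcript {
    var confirmedText = ""
    var hypothesisText = ""

    // Promote whatever hypothesis is left so nothing is lost.
    func finalize() {
        confirmedText += hypothesisText
        hypothesisText = ""
    }

    func runEagerPass(work: () async throws -> String) async {
        do {
            hypothesisText = try await work() // publish the last hypothesis on success
        } catch {
            print("[EagerMode] Error: \(error)")
            finalize()                        // flush partial results on failure too
        }
    }
}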