Commit a68c8e5

Implement lazy loading for Whisper model
Refactor language detection to use a cached Whisper model and add lazy loading for improved performance.
1 parent bc8d6e6 commit a68c8e5
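
The change applies the standard lazy-initialization pattern: keep a None placeholder, construct the expensive object on first access, and return the cached instance on every later call. A minimal standalone sketch of that pattern, assuming faster-whisper's WhisperModel (the class and parameter names below are illustrative, not this module's API):

from faster_whisper import WhisperModel

class LazyWhisper:
    """Illustrative sketch: defer model construction until first use."""

    def __init__(self, model_size="small", device="cpu"):
        self.model_size = model_size
        self.device = device
        self._model = None  # placeholder; nothing loaded yet

    @property
    def model(self):
        # First access pays the load cost; later accesses reuse the cache.
        if self._model is None:
            self._model = WhisperModel(self.model_size, device=self.device)
        return self._model

The diff below applies the same idea inside the checker: __init__ sets the placeholder, and _lazy_load_whisper() does the one-time construction.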

File tree

1 file changed: +28 -19 lines

AudioMediaChecker/AudioMediaChecker.py

Lines changed: 28 additions & 19 deletions
@@ -108,8 +108,8 @@ def __init__(self, file_path, check_all_tracks=False, verbose=False, dry_run=Fal
         self.gpu = gpu
         self.logger = logger if logger else _setup_logger(verbose)
 
-        # if self.file_path.suffix.lower() != '.mkv':
-        #     raise ValueError(f"Unsupported file format: {self.file_path}")
+        # Whisper model cache (lazy-loaded on first use)
+        self._whisper = None
 
         # In dry-run mode I accept all video formats, otherwise only mkv
         if not self.dry_run and self.file_path.suffix.lower() != '.mkv':
@@ -121,6 +121,30 @@ def __init__(self, file_path, check_all_tracks=False, verbose=False, dry_run=Fal
 
         self._validate_model_ram()
 
+    def _lazy_load_whisper(self):
+        """
+        Instantiate and cache the Whisper model on first use.
+        Reuses the same instance for subsequent transcriptions.
+        """
+        if self._whisper is None:
+            device = 'cuda' if self.gpu else 'cpu'
+            compute_type = self._best_compute_type()
+            cpu_threads = self._optimal_cpu_threads() if device == 'cpu' else 0
+
+            self.logger.info(f"Loading Whisper model '{self.whisper_model_size}' on {device}...")
+            self.logger.debug(f"Whisper configuration: compute_type={compute_type}, threads={cpu_threads or 'auto'}")
+
+            self._whisper = WhisperModel(
+                self.whisper_model_size,
+                device=device,
+                compute_type=compute_type,
+                cpu_threads=cpu_threads,
+                download_root="/models"  # persist weights if volume-mapped
+            )
+
+            self.logger.info("Whisper model loaded.")
+        return self._whisper
+
     def _validate_model_ram(self):
         """
         Verify that the available RAM is sufficient for the selected model.
@@ -484,39 +508,24 @@ def update_language_tag(self, stream_index, language):
 
     def detect_language(self, audio_file):
         """
-        Performs language detection using the Whisper model.
+        Performs language detection using the (cached) Whisper model.
 
         Arguments:
             audio_file (file-like): audio sample in BytesIO format.
 
         Returns:
             tuple: (language detected (str), confidence (float))
         """
-        model_size = self.whisper_model_size
-        device = 'cuda' if self.gpu else 'cpu'
-        compute_type = self._best_compute_type()
-        cpu_threads = self._optimal_cpu_threads() if device == 'cpu' else 0
-
         self.logger.info("Beginning language detection")
-        self.logger.debug(f"Whisper configuration for {device}: compute_type={compute_type}, threads={cpu_threads or 'auto'}")
-
-        # NOTE: here the model is loaded on every call. It could be improved by loading it only once,
-        # e.g., by saving it in self.whisper_model on first use.
-        model = WhisperModel(model_size,
-                             device=device,
-                             compute_type=compute_type,
-                             cpu_threads=cpu_threads,
-                             download_root="/models")
 
+        model = self._lazy_load_whisper()
         segments, info = model.transcribe(audio_file, language=None, beam_size=5)
        detected_language = info.language
 
        if self.verbose:
            self.logger.debug("Recognized text:")
            for segment in segments:
                self.logger.debug(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
-
-        if self.verbose:
            self.logger.info(f"Detected language: {detected_language} with confidence: {info.language_probability:.2f}")
 
        return detected_language, info.language_probability
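
With the cache in place, only the first detection pays the model-load cost. A hypothetical usage, assuming the class is named AudioMediaChecker after its file (the diff shows only the methods, and the file paths below are placeholders):

from io import BytesIO

from AudioMediaChecker.AudioMediaChecker import AudioMediaChecker

checker = AudioMediaChecker("movie.mkv", gpu=False, verbose=True)

# Placeholder for an audio sample extracted elsewhere, passed as
# BytesIO per the detect_language docstring.
with open("track_0.wav", "rb") as f:
    sample = BytesIO(f.read())

lang, conf = checker.detect_language(sample)  # first call loads Whisper
# Any further detect_language() call on this checker reuses the cached
# self._whisper instance instead of constructing WhisperModel again.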
