@@ -108,8 +108,8 @@ def __init__(self, file_path, check_all_tracks=False, verbose=False, dry_run=Fal
108108 self .gpu = gpu
109109 self .logger = logger if logger else _setup_logger (verbose )
110110
111- # if self.file_path.suffix.lower() != '.mkv':
112- # raise ValueError(f"Formato file non supportato: { self.file_path}")
111+ # Whisper model cache (lazy-loaded on first use)
112+ self ._whisper = None
113113
114114 # In dry-run mode I accept all video formats, otherwise only mkv
115115 if not self .dry_run and self .file_path .suffix .lower () != '.mkv' :
@@ -121,6 +121,30 @@ def __init__(self, file_path, check_all_tracks=False, verbose=False, dry_run=Fal
121121
122122 self ._validate_model_ram ()
123123
def _lazy_load_whisper(self):
    """
    Return the cached Whisper model, creating it on the first call.

    The model is built from ``self.whisper_model_size`` and stored in
    ``self._whisper``; every later call returns that same instance.

    Returns:
        The WhisperModel instance held in ``self._whisper``.
    """
    # Fast path: a previous call already built the model.
    if self._whisper is not None:
        return self._whisper

    target = 'cuda' if self.gpu else 'cpu'
    precision = self._best_compute_type()
    # The thread count is only computed for CPU runs; otherwise 0 is passed
    # (shown as 'auto' in the debug log below).
    if target == 'cpu':
        threads = self._optimal_cpu_threads()
    else:
        threads = 0

    self.logger.info(f"Loading Whisper model '{self.whisper_model_size}' on {target}...")
    self.logger.debug(f"Whisper configuration: compute_type={precision}, threads={threads or 'auto'}")

    options = {
        'device': target,
        'compute_type': precision,
        'cpu_threads': threads,
        'download_root': "/models",  # persist weights if volume-mapped
    }
    self._whisper = WhisperModel(self.whisper_model_size, **options)

    self.logger.info("Whisper model loaded.")
    return self._whisper
147+
124148 def _validate_model_ram (self ):
125149 """
126150 Verify that the available RAM is sufficient for the selected model.
@@ -484,39 +508,24 @@ def update_language_tag(self, stream_index, language):
484508
def detect_language(self, audio_file):
    """
    Detect the language of an audio sample with the cached Whisper model.

    The model is obtained through ``self._lazy_load_whisper()``. When
    ``self.verbose`` is set, the recognized segments and the detection
    result are also written to the logger.

    Arguments:
        audio_file (file-like): audio sample in BytesIO format.

    Returns:
        tuple: (language detected (str), confidence (float))
    """
    log = self.logger
    log.info("Beginning language detection")

    whisper = self._lazy_load_whisper()
    # language=None asks the model to detect the language itself.
    transcript, details = whisper.transcribe(audio_file, language=None, beam_size=5)
    language = details.language

    # Non-verbose runs skip the transcript dump and the result log entirely.
    if not self.verbose:
        return language, details.language_probability

    log.debug("Recognized text:")
    for piece in transcript:
        log.debug(f"[{piece.start:.2f}s -> {piece.end:.2f}s] {piece.text}")
    log.info(f"Detected language: {language} with confidence: {details.language_probability:.2f}")

    return language, details.language_probability
0 commit comments