Add audio speedup feature with ffmpeg and pydub options

cursoragent · calmmage · cursoragent · commit b251a5446822 · 2025-07-20T23:13:45.000Z
Co-authored-by: petr.b.lavrov <petr.b.lavrov@gmail.com>
diff --git a/src/app.py b/src/app.py
@@ -1,6 +1,7 @@
 import asyncio
 import datetime
 import time
+from enum import Enum
 from io import BytesIO
 from pathlib import Path
 from typing import (
@@ -44,6 +45,11 @@
 )
 
 
+class SpeedupMode(Enum):  # Audio speedup implementations selectable through AppConfig.speedup_mode
+    FFMPEG = "ffmpeg"  # speedup via ffmpeg's atempo audio filter during mp3 conversion
+    PYDUB = "pydub"  # speedup via pydub (AudioSegment.speedup) for in-memory audio
+
+
 class AppConfig(BaseSettings):
     telegram_api_id: int
     telegram_api_hash: SecretStr
@@ -92,6 +98,9 @@ class AppConfig(BaseSettings):
         10000  # Skip formatting if total chunks length exceeds this
     )
 
+    # Audio speedup configuration
+    speedup_mode: SpeedupMode = SpeedupMode.FFMPEG  # Default speedup implementation
+
     class Config:
         env_file = ".env"
         env_file_encoding = "utf-8"
@@ -460,6 +469,7 @@ async def process_message(
         message: AiogramMessage,
         whisper_model: Optional[str] = None,
         language: Optional[str] = None,
+        speedup: Optional[float] = None,
         status_callback: Optional[Callable[[str], Awaitable[None]]] = None,
     ) -> str:
         """
@@ -502,7 +512,7 @@ async def process_message(
             message, status_callback=status_callback
         )
 
-        parts = await self.prepare_parts(media_file, status_callback=status_callback)
+        parts = await self.prepare_parts(media_file, speedup=speedup, status_callback=status_callback)
 
         # Store message_id per user for async safety
         self._user_message_ids[username] = message_id
@@ -576,31 +586,43 @@ async def download_attachment(
     async def prepare_parts(
         self,
         media_file: Union[BinaryIO, Path],
+        speedup: Optional[float] = None,
         status_callback: Optional[Callable[[str], Awaitable[None]]] = None,
     ) -> Sequence[Audio]:
         if isinstance(media_file, Path):
             # process file on disk - with
-            parts = await self.process_file_on_disk(media_file, status_callback=status_callback)
+            parts = await self.process_file_on_disk(media_file, speedup=speedup, status_callback=status_callback)
             if self.config.cleanup_downloads:
                 if media_file != parts[0]:
                     media_file.unlink(missing_ok=True)
             return parts
         else:
             # process file in memory - with pydub
             assert isinstance(media_file, (BinaryIO, BytesIO))
+            if speedup is not None and self.config.speedup_mode == SpeedupMode.PYDUB:
+                if status_callback is not None:
+                    await status_callback(f"Applying {speedup}x speedup to audio...")
+                # NOTE(review): in-memory audio is sped up only in PYDUB mode; in any other mode the requested speedup is ignored here
+                media_file = await self._apply_speedup_pydub(media_file, speedup)
             if status_callback is not None:
                 await status_callback(
                     "Just one part to process\n<b>Estimated processing time: Should be up to 1 minute</b>"
                 )
             return [media_file]
 
-    async def process_file_on_disk(self, media_file: Path, status_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> Sequence[Audio]:
-        if media_file.suffix != ".mp3":
+    async def process_file_on_disk(self, media_file: Path, speedup: Optional[float] = None, status_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> Sequence[Audio]:
+        # Skip conversion only if it's already MP3 AND there's no speedup
+        skip_conversion = media_file.suffix == ".mp3" and speedup is None
+        
+        if not skip_conversion:
             if status_callback is not None:
-                await status_callback(
-                    "Preparing audio - converting to mp3.."
-                )
-            mp3_file = await convert_to_mp3_ffmpeg(media_file)
+                status_msg = "Preparing audio - converting to mp3"
+                if speedup is not None:
+                    status_msg += f" with {speedup}x speedup"
+                status_msg += ".."
+                await status_callback(status_msg)
+            
+            mp3_file = await convert_to_mp3_ffmpeg(media_file, speedup=speedup)
             # delete original file
             if self.config.cleanup_downloads:
                 media_file.unlink(missing_ok=True)
@@ -635,6 +657,27 @@ async def cut_audio_with_ffmpeg(self, media_file, status_callback: Optional[Call
     def _get_file_size(media_file):
         return media_file.stat().st_size
 
+    async def _apply_speedup_pydub(self, media_file: Union[BinaryIO, BytesIO], speedup: float) -> BytesIO:
+        """Speed up ``media_file`` with pydub and return the result as an in-memory MP3."""
+
+        def _speedup_audio() -> BytesIO:
+            # Decode the audio from the binary stream
+            audio = AudioSegment.from_file(media_file)
+
+            # NOTE(review): pydub's speedup effect appears to drop short chunks and crossfade
+            # the remainder (shorter duration, no resampling) - confirm against pydub docs
+            faster_audio = audio.speedup(playback_speed=speedup)
+
+            # Encode as MP3 into a fresh buffer, rewound so the next reader starts at 0
+            output_buffer = BytesIO()
+            faster_audio.export(output_buffer, format="mp3")
+            output_buffer.seek(0)
+            return output_buffer
+
+        # Decoding/encoding is blocking work; run it on the default executor so the
+        # event loop stays responsive (asyncio is already imported at module level)
+        return await asyncio.get_running_loop().run_in_executor(None, _speedup_audio)
+
     # endregion prepare_parts
 
     # region process_parts
diff --git a/src/router.py b/src/router.py
@@ -152,6 +152,8 @@ async def media_handler(message: Message, app: App, state: FSMContext):
 
     language_code = await ask_user_language(message, app, state)
 
+    speedup = await ask_user_speedup(message, app, state)
+
     # Send a processing message
     notif = await reply_safe(
         message, "Processing your media file. Estimating transcription time..."
@@ -162,6 +164,7 @@ async def media_handler(message: Message, app: App, state: FSMContext):
         message,
         whisper_model=model,
         language=language_code,
+        speedup=speedup,
         status_callback=create_notification_callback(notif),
     )
     await reply_safe(message, transcription)
@@ -263,6 +266,29 @@ async def ask_user_language(message: Message, app: App, state: FSMContext):
     return language_code
 
 
+async def ask_user_speedup(message: Message, app: App, state: FSMContext):
+    """Ask the user for a speedup factor; return it as a float, or None for "none"."""
+    speed_options = {
+        "none": "No speedup (original speed)",
+        "2": "2x speed (default)",
+        "3": "3x speed",
+        "4": "4x speed",
+        "5": "5x speed",
+    }
+    # "2" is passed as the default choice; the prompt is given a timeout of 10
+    choice = await ask_user_choice(
+        message.chat.id,
+        "Please choose audio speedup:",
+        speed_options,
+        state=state,
+        default_choice="2",
+        timeout=10,
+        cleanup=app.config.cleanup_messages,
+    )
+    # "none" means no speedup; every other option key is a numeric string
+    return None if choice == "none" else float(choice)
+
+
 @router.message()
 async def chat_handler(message: Message, app: App):
     if message.reply_to_message:
diff --git a/src/utils/convert_to_mp3_ffmpeg.py b/src/utils/convert_to_mp3_ffmpeg.py
@@ -12,6 +12,7 @@ async def convert_to_mp3_ffmpeg(
     output_path: Optional[Path] = None,
     format: str = "mp3",
     use_memory_profiler: bool = False,
+    speedup: Optional[float] = None,
 ) -> Path:
     """
     Convert video file to audio using ffmpeg.
@@ -21,15 +22,16 @@ async def convert_to_mp3_ffmpeg(
         output_path: Path to save the audio file (optional)
         format: Audio format (default: mp3)
         use_memory_profiler: Whether to use memory profiler implementation
+        speedup: Audio speedup factor (e.g., 2.0 for 2x speed)
 
     Returns:
         Path to the converted audio file
     """
-    # If the file is already an audio file, return it as is
+    # If the file is already an audio file and no speedup is requested, return it as is
     # if source_path.suffix.lower() in [".mp3", ".wav", ".ogg", ".m4a", ".flac"]:
     #     logger.info(f"File {source_path} is already an audio file, skipping conversion")
     #     return source_path
-    if source_path.suffix.lower() == ".mp3":
+    if source_path.suffix.lower() == ".mp3" and speedup is None:
         logger.info(f"File {source_path} is already an audio file, skipping conversion")
         return source_path
 
@@ -42,13 +44,13 @@ async def convert_to_mp3_ffmpeg(
 
     # Choose the appropriate implementation based on the flag
     if use_memory_profiler:
-        return await _convert_to_mp3_with_profiler(source_path, output_path, format)
+        return await _convert_to_mp3_with_profiler(source_path, output_path, format, speedup)
     else:
-        return await _convert_to_mp3_standard(source_path, output_path, format)
+        return await _convert_to_mp3_standard(source_path, output_path, format, speedup)
 
 
 async def _convert_to_mp3_standard(
-    source_path: Path, output_path: Path, format: str = "mp3"
+    source_path: Path, output_path: Path, format: str = "mp3", speedup: Optional[float] = None
 ) -> Path:
     """
     Standard implementation of video to audio conversion using ffmpeg.
@@ -57,6 +59,7 @@ async def _convert_to_mp3_standard(
         source_path: Path to the video file
         output_path: Path to save the audio file
         format: Audio format (default: mp3)
+        speedup: Audio speedup factor (e.g., 2.0 for 2x speed)
 
     Returns:
         Path to the converted audio file
@@ -72,13 +75,32 @@ async def _convert_to_mp3_standard(
             "-i",
             str(source_path),  # Input file
             "-vn",  # Disable video
+        ]
+        
+        # Add speedup filter if requested
+        if speedup is not None:
+            # For speedup > 2.0, chain multiple atempo filters for better quality
+            if speedup > 2.0:
+                # Calculate how many atempo filters we need
+                temp_speedup = speedup
+                filters = []
+                while temp_speedup > 2.0:
+                    filters.append("atempo=2.0")
+                    temp_speedup /= 2.0
+                if temp_speedup > 1.0:
+                    filters.append(f"atempo={temp_speedup}")
+                cmd.extend(["-filter:a", ",".join(filters)])
+            else:
+                cmd.extend(["-filter:a", f"atempo={speedup}"])
+        
+        cmd.extend([
             "-acodec",
             "libmp3lame" if format == "mp3" else format,  # Audio codec
             "-q:a",
             "2",  # Audio quality (0-9, 0=best)
             "-y",  # Overwrite output file if it exists
             str(output_path),  # Output file
-        ]
+        ])
 
         # Run the command
         process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -99,7 +121,7 @@ async def _convert_to_mp3_standard(
 
 
 async def _convert_to_mp3_with_profiler(
-    source_path: Path, output_path: Path, format: str = "mp3"
+    source_path: Path, output_path: Path, format: str = "mp3", speedup: Optional[float] = None
 ) -> Path:
     """
     Memory-profiled implementation of video to audio conversion using ffmpeg.
@@ -108,6 +130,7 @@ async def _convert_to_mp3_with_profiler(
         source_path: Path to the video file
         output_path: Path to save the audio file
         format: Audio format (default: mp3)
+        speedup: Audio speedup factor (e.g., 2.0 for 2x speed)
 
     Returns:
         Path to the converted audio file
@@ -121,13 +144,32 @@ async def _convert_to_mp3_with_profiler(
             "-i",
             str(source_path),  # Input file
             "-vn",  # Disable video
+        ]
+        
+        # Add speedup filter if requested
+        if speedup is not None:
+            # For speedup > 2.0, chain multiple atempo filters for better quality
+            if speedup > 2.0:
+                # Calculate how many atempo filters we need
+                temp_speedup = speedup
+                filters = []
+                while temp_speedup > 2.0:
+                    filters.append("atempo=2.0")
+                    temp_speedup /= 2.0
+                if temp_speedup > 1.0:
+                    filters.append(f"atempo={temp_speedup}")
+                cmd.extend(["-filter:a", ",".join(filters)])
+            else:
+                cmd.extend(["-filter:a", f"atempo={speedup}"])
+        
+        cmd.extend([
             "-acodec",
             "libmp3lame" if format == "mp3" else format,  # Audio codec
             "-q:a",
             "2",  # Audio quality (0-9, 0=best)
             "-y",  # Overwrite output file if it exists
             str(output_path),  # Output file
-        ]
+        ])
 
         # Start memory profiling
         memory_stats = []