Skip to content

Commit b251a54

Browse files
cursoragent and calmmage committed
Add audio speedup feature with ffmpeg and pydub options
Co-authored-by: petr.b.lavrov <petr.b.lavrov@gmail.com>
1 parent 1795d5c commit b251a54

File tree

3 files changed

+127
-16
lines changed

3 files changed

+127
-16
lines changed

src/app.py

Lines changed: 51 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import asyncio
22
import datetime
33
import time
4+
from enum import Enum
45
from io import BytesIO
56
from pathlib import Path
67
from typing import (
@@ -44,6 +45,11 @@
4445
)
4546

4647

48+
class SpeedupMode(Enum):
    """Names of the available audio speedup implementations.

    Each member's value is the lowercase string form of the backend name.
    """

    # Speedup performed through ffmpeg.
    FFMPEG = "ffmpeg"

    # Speedup performed through pydub.
    PYDUB = "pydub"
51+
52+
4753
class AppConfig(BaseSettings):
4854
telegram_api_id: int
4955
telegram_api_hash: SecretStr
@@ -92,6 +98,9 @@ class AppConfig(BaseSettings):
9298
10000 # Skip formatting if total chunks length exceeds this
9399
)
94100

101+
# Audio speedup configuration
102+
speedup_mode: SpeedupMode = SpeedupMode.FFMPEG # Default speedup implementation
103+
95104
class Config:
96105
env_file = ".env"
97106
env_file_encoding = "utf-8"
@@ -460,6 +469,7 @@ async def process_message(
460469
message: AiogramMessage,
461470
whisper_model: Optional[str] = None,
462471
language: Optional[str] = None,
472+
speedup: Optional[float] = None,
463473
status_callback: Optional[Callable[[str], Awaitable[None]]] = None,
464474
) -> str:
465475
"""
@@ -502,7 +512,7 @@ async def process_message(
502512
message, status_callback=status_callback
503513
)
504514

505-
parts = await self.prepare_parts(media_file, status_callback=status_callback)
515+
parts = await self.prepare_parts(media_file, speedup=speedup, status_callback=status_callback)
506516

507517
# Store message_id per user for async safety
508518
self._user_message_ids[username] = message_id
@@ -576,31 +586,43 @@ async def download_attachment(
576586
async def prepare_parts(
    self,
    media_file: Union[BinaryIO, Path],
    speedup: Optional[float] = None,
    status_callback: Optional[Callable[[str], Awaitable[None]]] = None,
) -> Sequence[Audio]:
    """Turn a downloaded media file into the list of parts to process.

    Args:
        media_file: Either a ``Path`` to a file on disk or an in-memory
            binary stream.
        speedup: Optional playback speed factor; ``None`` keeps the
            original speed.
        status_callback: Optional coroutine function that receives
            progress messages.

    Returns:
        For a ``Path``, whatever ``process_file_on_disk`` returns. For an
        in-memory stream, a single-element list holding the (possibly
        sped-up) stream.
    """
    if isinstance(media_file, Path):
        # On-disk media: conversion and speedup are delegated to
        # process_file_on_disk.
        parts = await self.process_file_on_disk(
            media_file, speedup=speedup, status_callback=status_callback
        )
        if self.config.cleanup_downloads:
            # Only delete the download when it is not itself the first
            # returned part.
            if media_file != parts[0]:
                media_file.unlink(missing_ok=True)
        return parts

    # In-memory media is processed with pydub.
    assert isinstance(media_file, (BinaryIO, BytesIO))
    if speedup is not None:
        # pydub is the only speedup implementation that works on an
        # in-memory stream (the ffmpeg converter takes a file path), so it
        # is used regardless of config.speedup_mode. Gating this on the
        # PYDUB mode silently dropped the requested speedup under the
        # default FFMPEG mode.
        if status_callback is not None:
            await status_callback(f"Applying {speedup}x speedup to audio...")
        media_file = await self._apply_speedup_pydub(media_file, speedup)

    if status_callback is not None:
        await status_callback(
            "Just one part to process\n<b>Estimated processing time: Should be up to 1 minute</b>"
        )
    return [media_file]
596612

597-
async def process_file_on_disk(self, media_file: Path, status_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> Sequence[Audio]:
598-
if media_file.suffix != ".mp3":
613+
async def process_file_on_disk(self, media_file: Path, speedup: Optional[float] = None, status_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> Sequence[Audio]:
614+
# Skip conversion only if it's already MP3 AND there's no speedup
615+
skip_conversion = media_file.suffix == ".mp3" and speedup is None
616+
617+
if not skip_conversion:
599618
if status_callback is not None:
600-
await status_callback(
601-
"Preparing audio - converting to mp3.."
602-
)
603-
mp3_file = await convert_to_mp3_ffmpeg(media_file)
619+
status_msg = "Preparing audio - converting to mp3"
620+
if speedup is not None:
621+
status_msg += f" with {speedup}x speedup"
622+
status_msg += ".."
623+
await status_callback(status_msg)
624+
625+
mp3_file = await convert_to_mp3_ffmpeg(media_file, speedup=speedup)
604626
# delete original file
605627
if self.config.cleanup_downloads:
606628
media_file.unlink(missing_ok=True)
@@ -635,6 +657,27 @@ async def cut_audio_with_ffmpeg(self, media_file, status_callback: Optional[Call
635657
def _get_file_size(media_file):
    """Return the ``st_size`` reported by ``media_file.stat()``."""
    file_stat = media_file.stat()
    return file_stat.st_size
637659

660+
async def _apply_speedup_pydub(self, media_file: Union[BinaryIO, BytesIO], speedup: float) -> BytesIO:
    """Speed up in-memory audio with pydub and return it re-encoded as MP3.

    Args:
        media_file: Binary stream containing the source audio.
        speedup: Playback speed factor passed to pydub as
            ``playback_speed``.

    Returns:
        A new ``BytesIO`` holding the MP3 export, rewound to position 0.
    """

    def _speedup_audio() -> BytesIO:
        # Decode the binary stream into a pydub segment.
        audio = AudioSegment.from_file(media_file)

        # pydub's speedup effect shortens the audio by trimming small
        # chunks and crossfading the remainder rather than resampling, so
        # pitch is largely kept while the duration shrinks.
        faster_audio = audio.speedup(playback_speed=speedup)

        # Re-encode into a fresh buffer and rewind it for the next reader.
        output_buffer = BytesIO()
        faster_audio.export(output_buffer, format="mp3")
        output_buffer.seek(0)
        return output_buffer

    # Decoding and encoding are blocking calls, so run them in the default
    # executor. get_running_loop() is the supported way to obtain the loop
    # from inside a coroutine.
    return await asyncio.get_running_loop().run_in_executor(None, _speedup_audio)
680+
638681
# endregion prepare_parts
639682

640683
# region process_parts

src/router.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,8 @@ async def media_handler(message: Message, app: App, state: FSMContext):
152152

153153
language_code = await ask_user_language(message, app, state)
154154

155+
speedup = await ask_user_speedup(message, app, state)
156+
155157
# Send a processing message
156158
notif = await reply_safe(
157159
message, "Processing your media file. Estimating transcription time..."
@@ -162,6 +164,7 @@ async def media_handler(message: Message, app: App, state: FSMContext):
162164
message,
163165
whisper_model=model,
164166
language=language_code,
167+
speedup=speedup,
165168
status_callback=create_notification_callback(notif),
166169
)
167170
await reply_safe(message, transcription)
@@ -263,6 +266,29 @@ async def ask_user_language(message: Message, app: App, state: FSMContext):
263266
return language_code
264267

265268

269+
async def ask_user_speedup(message: Message, app: App, state: FSMContext):
    """Ask the user which audio speedup factor to apply.

    The question is posed through ``ask_user_choice`` in the chat the
    message came from, with ``"2"`` passed as the default choice and a
    timeout of 10 (presumably seconds; confirm against
    ``ask_user_choice``).

    Returns:
        ``None`` when the "none" option is selected, otherwise the selected
        option key converted to ``float``.
    """
    speedup_options = {
        "none": "No speedup (original speed)",
        "2": "2x speed (default)",
        "3": "3x speed",
        "4": "4x speed",
        "5": "5x speed",
    }
    choice = await ask_user_choice(
        message.chat.id,
        "Please choose audio speedup:",
        speedup_options,
        state=state,
        default_choice="2",
        timeout=10,
        cleanup=app.config.cleanup_messages,
    )

    # "none" is the only non-numeric key; every other key parses as a float.
    return None if choice == "none" else float(choice)
290+
291+
266292
@router.message()
267293
async def chat_handler(message: Message, app: App):
268294
if message.reply_to_message:

src/utils/convert_to_mp3_ffmpeg.py

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ async def convert_to_mp3_ffmpeg(
1212
output_path: Optional[Path] = None,
1313
format: str = "mp3",
1414
use_memory_profiler: bool = False,
15+
speedup: Optional[float] = None,
1516
) -> Path:
1617
"""
1718
Convert video file to audio using ffmpeg.
@@ -21,15 +22,16 @@ async def convert_to_mp3_ffmpeg(
2122
output_path: Path to save the audio file (optional)
2223
format: Audio format (default: mp3)
2324
use_memory_profiler: Whether to use memory profiler implementation
25+
speedup: Audio speedup factor (e.g., 2.0 for 2x speed)
2426
2527
Returns:
2628
Path to the converted audio file
2729
"""
28-
# If the file is already an audio file, return it as is
30+
# If the file is already an audio file and no speedup is requested, return it as is
2931
# if source_path.suffix.lower() in [".mp3", ".wav", ".ogg", ".m4a", ".flac"]:
3032
# logger.info(f"File {source_path} is already an audio file, skipping conversion")
3133
# return source_path
32-
if source_path.suffix.lower() == ".mp3":
34+
if source_path.suffix.lower() == ".mp3" and speedup is None:
3335
logger.info(f"File {source_path} is already an audio file, skipping conversion")
3436
return source_path
3537

@@ -42,13 +44,13 @@ async def convert_to_mp3_ffmpeg(
4244

4345
# Choose the appropriate implementation based on the flag
4446
if use_memory_profiler:
45-
return await _convert_to_mp3_with_profiler(source_path, output_path, format)
47+
return await _convert_to_mp3_with_profiler(source_path, output_path, format, speedup)
4648
else:
47-
return await _convert_to_mp3_standard(source_path, output_path, format)
49+
return await _convert_to_mp3_standard(source_path, output_path, format, speedup)
4850

4951

5052
async def _convert_to_mp3_standard(
51-
source_path: Path, output_path: Path, format: str = "mp3"
53+
source_path: Path, output_path: Path, format: str = "mp3", speedup: Optional[float] = None
5254
) -> Path:
5355
"""
5456
Standard implementation of video to audio conversion using ffmpeg.
@@ -57,6 +59,7 @@ async def _convert_to_mp3_standard(
5759
source_path: Path to the video file
5860
output_path: Path to save the audio file
5961
format: Audio format (default: mp3)
62+
speedup: Audio speedup factor (e.g., 2.0 for 2x speed)
6063
6164
Returns:
6265
Path to the converted audio file
@@ -72,13 +75,32 @@ async def _convert_to_mp3_standard(
7275
"-i",
7376
str(source_path), # Input file
7477
"-vn", # Disable video
78+
]
79+
80+
# Add speedup filter if requested
81+
if speedup is not None:
82+
# For speedup > 2.0, chain multiple atempo filters for better quality
83+
if speedup > 2.0:
84+
# Calculate how many atempo filters we need
85+
temp_speedup = speedup
86+
filters = []
87+
while temp_speedup > 2.0:
88+
filters.append("atempo=2.0")
89+
temp_speedup /= 2.0
90+
if temp_speedup > 1.0:
91+
filters.append(f"atempo={temp_speedup}")
92+
cmd.extend(["-filter:a", ",".join(filters)])
93+
else:
94+
cmd.extend(["-filter:a", f"atempo={speedup}"])
95+
96+
cmd.extend([
7597
"-acodec",
7698
"libmp3lame" if format == "mp3" else format, # Audio codec
7799
"-q:a",
78100
"2", # Audio quality (0-9, 0=best)
79101
"-y", # Overwrite output file if it exists
80102
str(output_path), # Output file
81-
]
103+
])
82104

83105
# Run the command
84106
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -99,7 +121,7 @@ async def _convert_to_mp3_standard(
99121

100122

101123
async def _convert_to_mp3_with_profiler(
102-
source_path: Path, output_path: Path, format: str = "mp3"
124+
source_path: Path, output_path: Path, format: str = "mp3", speedup: Optional[float] = None
103125
) -> Path:
104126
"""
105127
Memory-profiled implementation of video to audio conversion using ffmpeg.
@@ -108,6 +130,7 @@ async def _convert_to_mp3_with_profiler(
108130
source_path: Path to the video file
109131
output_path: Path to save the audio file
110132
format: Audio format (default: mp3)
133+
speedup: Audio speedup factor (e.g., 2.0 for 2x speed)
111134
112135
Returns:
113136
Path to the converted audio file
@@ -121,13 +144,32 @@ async def _convert_to_mp3_with_profiler(
121144
"-i",
122145
str(source_path), # Input file
123146
"-vn", # Disable video
147+
]
148+
149+
# Add speedup filter if requested
150+
if speedup is not None:
151+
# For speedup > 2.0, chain multiple atempo filters for better quality
152+
if speedup > 2.0:
153+
# Calculate how many atempo filters we need
154+
temp_speedup = speedup
155+
filters = []
156+
while temp_speedup > 2.0:
157+
filters.append("atempo=2.0")
158+
temp_speedup /= 2.0
159+
if temp_speedup > 1.0:
160+
filters.append(f"atempo={temp_speedup}")
161+
cmd.extend(["-filter:a", ",".join(filters)])
162+
else:
163+
cmd.extend(["-filter:a", f"atempo={speedup}"])
164+
165+
cmd.extend([
124166
"-acodec",
125167
"libmp3lame" if format == "mp3" else format, # Audio codec
126168
"-q:a",
127169
"2", # Audio quality (0-9, 0=best)
128170
"-y", # Overwrite output file if it exists
129171
str(output_path), # Output file
130-
]
172+
])
131173

132174
# Start memory profiling
133175
memory_stats = []

0 commit comments

Comments
 (0)