@@ -43,6 +43,15 @@ def __init__(
43
43
* ,
44
44
allowed_local_media_path : str = "" ,
45
45
) -> None :
46
+ """
47
+ Args:
48
+ media_io_kwargs: Additional args passed to process media
49
+ inputs, keyed by modalities. For example,
50
+ to set num_frames for video, set
51
+ `--media-io-kwargs '{"video": {"num_frames": 40} }'`
52
+ connection: HTTP connection client to download media contents.
53
+ allowed_local_media_path: A local directory to load media files from.
54
+ """
46
55
super ().__init__ ()
47
56
48
57
self .media_io_kwargs : dict [str , dict [
@@ -277,15 +286,6 @@ def fetch_image_embedding(
277
286
return image_embedding_io .load_base64 ("" , data )
278
287
279
288
280
- global_media_connector = MediaConnector ()
281
- """The global [`MediaConnector`][vllm.multimodal.utils.MediaConnector]
282
- instance used by vLLM."""
283
-
284
- fetch_audio = global_media_connector .fetch_audio
285
- fetch_image = global_media_connector .fetch_image
286
- fetch_video = global_media_connector .fetch_video
287
-
288
-
289
289
def encode_audio_base64 (
290
290
audio : np .ndarray ,
291
291
sampling_rate : float ,
@@ -441,3 +441,51 @@ def run_dp_sharded_vision_model(image_input: torch.Tensor,
441
441
dim = 0 )
442
442
vision_embeddings = vision_embeddings [:num_chunks , ...]
443
443
return vision_embeddings
444
+
445
+
446
def fetch_audio(
    audio_url: str,
    audio_io_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[np.ndarray, Union[int, float]]:
    """Fetch audio from a URL through a freshly built `MediaConnector`.

    Args:
        audio_url: URL of the audio file to fetch.
        audio_io_kwargs: Additional kwargs passed to handle audio IO.
            If `None` or empty, the connector is created with
            `media_io_kwargs=None`.

    Returns:
        The result of `MediaConnector.fetch_audio` for `audio_url`.
    """
    # The connector expects kwargs keyed by modality, so wrap the
    # audio-specific options under the "audio" key when any were given.
    if audio_io_kwargs:
        per_modality_kwargs = {"audio": audio_io_kwargs}
    else:
        per_modality_kwargs = None

    connector = MediaConnector(media_io_kwargs=per_modality_kwargs)
    return connector.fetch_audio(audio_url)
460
+
461
+
462
def fetch_image(
    image_url: str,
    image_io_kwargs: Optional[dict[str, Any]] = None,
) -> Image.Image:
    """Fetch an image from a URL through a freshly built `MediaConnector`.

    Args:
        image_url: URL of the image file to fetch.
        image_io_kwargs: Additional kwargs passed to handle image IO.
            If `None` or empty, the connector is created with
            `media_io_kwargs=None`.

    Returns:
        The result of `MediaConnector.fetch_image` for `image_url`.
    """
    # The connector expects kwargs keyed by modality, so wrap the
    # image-specific options under the "image" key when any were given.
    if image_io_kwargs:
        per_modality_kwargs = {"image": image_io_kwargs}
    else:
        per_modality_kwargs = None

    connector = MediaConnector(media_io_kwargs=per_modality_kwargs)
    return connector.fetch_image(image_url)
476
+
477
+
478
def fetch_video(
    video_url: str,
    video_io_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[npt.NDArray, dict[str, Any]]:
    """Fetch a video from a URL through a freshly built `MediaConnector`.

    Args:
        video_url: URL of the video file to fetch.
        video_io_kwargs: Additional kwargs passed to handle video IO.
            If `None` or empty, the connector is created with
            `media_io_kwargs=None`.

    Returns:
        The result of `MediaConnector.fetch_video` for `video_url`.
    """
    # The connector expects kwargs keyed by modality, so wrap the
    # video-specific options under the "video" key when any were given.
    if video_io_kwargs:
        per_modality_kwargs = {"video": video_io_kwargs}
    else:
        per_modality_kwargs = None

    connector = MediaConnector(media_io_kwargs=per_modality_kwargs)
    return connector.fetch_video(video_url)
0 commit comments