Commit 9ff2af6

[Benchmark] Parameterization of streaming loading of multimodal datasets (#20528)
Signed-off-by: wangli <wangli858794774@gmail.com>
Parent: 70ca548

4 files changed: 24 additions, 2 deletions
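
The change replaces the hardcoded streaming=True in the HuggingFace-backed benchmark datasets with a --no-stream flag, threaded through benchmark_serving.py, benchmark_throughput.py, and the shared dataset helpers. The flag follows the usual argparse store_true pattern; below is a minimal standalone sketch of its semantics (an illustration, not the actual vLLM parser):

import argparse

# Standalone sketch of the new flag's semantics, not the actual vLLM parser.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--no-stream",
    action="store_true",
    help="Do not load the dataset in streaming mode.",
)

# Flag absent: no_stream is False, so streaming stays enabled.
args = parser.parse_args([])
assert args.no_stream is False
assert (not args.no_stream) is True   # streaming=True, the previous default

# Flag present: no_stream is True, so streaming is disabled.
args = parser.parse_args(["--no-stream"])
assert (not args.no_stream) is False  # streaming=False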

benchmarks/benchmark_dataset.py

Lines changed: 3 additions & 1 deletion
@@ -701,13 +701,15 @@ def __init__(
         self,
         dataset_path: str,
         dataset_split: str,
+        no_stream: bool = False,
         dataset_subset: Optional[str] = None,
         **kwargs,
     ) -> None:
         super().__init__(dataset_path=dataset_path, **kwargs)

         self.dataset_split = dataset_split
         self.dataset_subset = dataset_subset
+        self.load_stream = not no_stream
         self.load_data()

     def load_data(self) -> None:
@@ -716,7 +718,7 @@ def load_data(self) -> None:
             self.dataset_path,
             name=self.dataset_subset,
             split=self.dataset_split,
-            streaming=True,
+            streaming=self.load_stream,
         )
         self.data = self.data.shuffle(seed=self.random_seed)
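
The constructor now accepts no_stream and stores the inverted value as self.load_stream, which load_data forwards to datasets.load_dataset. A simplified, self-contained version of that pattern (an illustrative sketch mirroring the diff, not the actual vLLM class):

from typing import Optional

from datasets import load_dataset


class HFStreamingDatasetSketch:
    """Illustrative sketch of the pattern in the diff, not the vLLM class."""

    def __init__(
        self,
        dataset_path: str,
        dataset_split: str,
        no_stream: bool = False,
        dataset_subset: Optional[str] = None,
        random_seed: int = 0,
    ) -> None:
        self.dataset_path = dataset_path
        self.dataset_split = dataset_split
        self.dataset_subset = dataset_subset
        self.random_seed = random_seed
        # The CLI flag is negative ("--no-stream"), so invert it once here.
        self.load_stream = not no_stream
        self.load_data()

    def load_data(self) -> None:
        # streaming=True returns a lazy IterableDataset; streaming=False
        # downloads the split and returns a fully materialized Dataset.
        self.data = load_dataset(
            self.dataset_path,
            name=self.dataset_subset,
            split=self.dataset_split,
            streaming=self.load_stream,
        )
        # Both dataset types support seeded shuffling.
        self.data = self.data.shuffle(seed=self.random_seed)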

benchmarks/benchmark_serving.py

Lines changed: 6 additions & 0 deletions
@@ -825,6 +825,7 @@ def main(args: argparse.Namespace):
             dataset_subset=args.hf_subset,
             dataset_split=args.hf_split,
             random_seed=args.seed,
+            no_stream=args.no_stream,
         ).sample(
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
@@ -1033,6 +1034,11 @@ def create_argument_parser():
         help="Path to the sharegpt/sonnet dataset. "
         "Or the huggingface dataset ID if using HF dataset.",
     )
+    parser.add_argument(
+        "--no-stream",
+        action="store_true",
+        help="Do not load the dataset in streaming mode.",
+    )
     parser.add_argument(
         "--max-concurrency",
         type=int,
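
Why one might pass --no-stream: with streaming enabled, datasets.load_dataset returns an IterableDataset that pulls examples lazily over the network and shuffles through a bounded buffer, while with streaming disabled the split is downloaded to the local cache first and a regular, indexable Dataset is returned. A short hedged check of that difference (the dataset id below is a placeholder, not a real dataset):

from datasets import Dataset, IterableDataset, load_dataset

# Placeholder id for illustration only; substitute a real multimodal dataset.
DATASET_ID = "some-org/some-multimodal-dataset"

streamed = load_dataset(DATASET_ID, split="train", streaming=True)
materialized = load_dataset(DATASET_ID, split="train", streaming=False)

# Streaming yields a lazy IterableDataset; --no-stream materializes a Dataset.
assert isinstance(streamed, IterableDataset)
assert isinstance(materialized, Dataset)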

benchmarks/benchmark_throughput.py

Lines changed: 6 additions & 0 deletions
@@ -356,6 +356,7 @@ def get_requests(args, tokenizer):
     elif args.dataset_name == "burstgpt":
         dataset_cls = BurstGPTDataset
     elif args.dataset_name == "hf":
+        common_kwargs["no_stream"] = args.no_stream
         if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
             dataset_cls = VisionArenaDataset
             common_kwargs["dataset_subset"] = None
@@ -610,6 +611,11 @@ def create_argument_parser():
         help="Name of the dataset to benchmark on.",
         default="sharegpt",
     )
+    parser.add_argument(
+        "--no-stream",
+        action="store_true",
+        help="Do not load the dataset in streaming mode.",
+    )
     parser.add_argument(
         "--dataset",
         type=str,
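
In get_requests the flag is only added to common_kwargs on the "hf" branch, so the ShareGPT, sonnet, and BurstGPT paths are unaffected. A simplified sketch of that dispatch (the function and dictionary contents here are illustrative, not vLLM's actual get_requests):

def build_dataset_kwargs(dataset_name: str, no_stream: bool, seed: int) -> dict:
    """Illustrative sketch of the common_kwargs pattern, not vLLM's get_requests."""
    common_kwargs = {"random_seed": seed}
    if dataset_name == "hf":
        # Only the HuggingFace-backed datasets understand the streaming
        # toggle, so the flag is injected on this branch alone.
        common_kwargs["no_stream"] = no_stream
    return common_kwargs


print(build_dataset_kwargs("hf", no_stream=True, seed=0))
# {'random_seed': 0, 'no_stream': True}
print(build_dataset_kwargs("sharegpt", no_stream=True, seed=0))
# {'random_seed': 0}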

vllm/benchmarks/datasets.py

Lines changed: 9 additions & 1 deletion
@@ -481,6 +481,11 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
         help="Name of the dataset to benchmark on.",
     )
+    parser.add_argument(
+        "--no-stream",
+        action="store_true",
+        help="Do not load the dataset in streaming mode.",
+    )
     parser.add_argument(
         "--dataset-path",
         type=str,
@@ -674,6 +679,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             dataset_subset=args.hf_subset,
             dataset_split=args.hf_split,
             random_seed=args.seed,
+            no_stream=args.no_stream,
         ).sample(
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
@@ -971,13 +977,15 @@ def __init__(
         self,
         dataset_path: str,
         dataset_split: str,
+        no_stream: bool = False,
         dataset_subset: Optional[str] = None,
         **kwargs,
     ) -> None:
         super().__init__(dataset_path=dataset_path, **kwargs)

         self.dataset_split = dataset_split
         self.dataset_subset = dataset_subset
+        self.load_stream = not no_stream
         self.load_data()

     def load_data(self) -> None:
@@ -986,7 +994,7 @@ def load_data(self) -> None:
             self.dataset_path,
             name=self.dataset_subset,
             split=self.dataset_split,
-            streaming=True,
+            streaming=self.load_stream,
         )
         self.data = self.data.shuffle(seed=self.random_seed)
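
The parameter is added once, in the shared HuggingFace dataset constructor, so dataset classes built through the common keyword arguments receive it without changes of their own. A hedged sketch of that propagation (the class names are illustrative, not the actual vLLM hierarchy):

from typing import Optional


class HFDatasetBaseSketch:
    """Illustrative base class that accepts the new no_stream parameter."""

    def __init__(
        self,
        dataset_path: str,
        dataset_split: str,
        no_stream: bool = False,
        dataset_subset: Optional[str] = None,
        **kwargs,
    ) -> None:
        self.dataset_path = dataset_path
        self.dataset_split = dataset_split
        self.dataset_subset = dataset_subset
        self.load_stream = not no_stream


class VisionArenaSketch(HFDatasetBaseSketch):
    """Illustrative subclass: it adds nothing of its own, so the inherited
    __init__ already understands no_stream."""


ds = VisionArenaSketch(
    dataset_path="some-org/some-dataset",  # placeholder id
    dataset_split="train",
    no_stream=True,
)
assert ds.load_stream is False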
