Skip to content

Commit 3ee2ff0

Browse files
[Release Tests] Adding configuration to test aggregations with different shuffle strategies (#51318)
## Why are these changes needed?

Added configuration to test aggregations with different shuffle strategies.

---------

Signed-off-by: Alexey Kudinkin <ak@anyscale.com>
1 parent f6347c0 commit 3ee2ff0

File tree

2 files changed

+29
-12
lines changed

2 files changed

+29
-12
lines changed

release/nightly_tests/dataset/groupby_benchmark.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import ray
77

88
from benchmark import Benchmark
9+
from python.ray.data import DataContext
10+
from python.ray.data.context import ShuffleStrategy
911

1012

1113
def parse_args() -> argparse.Namespace:
@@ -24,6 +26,14 @@ def parse_args() -> argparse.Namespace:
2426
type=str,
2527
help="Which columns to group by",
2628
)
29+
parser.add_argument(
30+
"--shuffle-strategy",
31+
required=False,
32+
default=ShuffleStrategy.SORT_SHUFFLE_PULL_BASED,
33+
nargs="?",
34+
type=str,
35+
help="Strategy to use when shuffling data (see ShuffleStrategy for accepted values)",
36+
)
2737

2838
consume_group = parser.add_mutually_exclusive_group()
2939
consume_group.add_argument("--aggregate", action="store_true")
@@ -39,6 +49,11 @@ def main(args):
3949
def benchmark_fn():
4050
path = f"s3://ray-benchmark-data/tpch/parquet/sf{args.sf}/lineitem"
4151

52+
# Configure appropriate shuffle-strategy
53+
DataContext.get_current().shuffle_strategy = ShuffleStrategy(
54+
args.shuffle_strategy
55+
)
56+
4257
grouped_ds = ray.data.read_parquet(path).groupby(args.group_by)
4358
consume_fn(grouped_ds)
4459

release/release_data_tests.yaml

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -135,16 +135,17 @@
135135
timeout: 3600
136136

137137
variations:
138-
- __suffix__: few_groups
138+
- __suffix__: few_groups (sort_shuffle_pull_based)
139139
run:
140140
script: >
141-
python groupby_benchmark.py --sf 10 --aggregate
142-
--group-by column08 column13 column14
143-
- __suffix__: many_groups
141+
python groupby_benchmark.py --sf 10 --aggregate --group-by column08 column13 column14
142+
--shuffle-strategy sort_shuffle_pull_based
143+
144+
- __suffix__: many_groups (sort_shuffle_pull_based)
144145
run:
145146
script: >
146-
python groupby_benchmark.py --sf 10 --aggregate
147-
--group-by column02 column14
147+
python groupby_benchmark.py --sf 10 --aggregate --group-by column02 column14
148+
--shuffle-strategy sort_shuffle_pull_based
148149
149150
- name: map_groups
150151

@@ -155,16 +156,17 @@
155156
timeout: 3600
156157

157158
variations:
158-
- __suffix__: few_groups
159+
- __suffix__: few_groups (sort_shuffle_pull_based)
159160
run:
160161
script: >
161-
python groupby_benchmark.py --sf 10 --map-groups
162-
--group-by column08 column13 column14
163-
- __suffix__: many_groups
162+
python groupby_benchmark.py --sf 10 --map-groups --group-by column08 column13 column14
163+
--shuffle-strategy sort_shuffle_pull_based
164+
165+
- __suffix__: many_groups (sort_shuffle_pull_based)
164166
run:
165167
script: >
166-
python groupby_benchmark.py --sf 10 --map-groups
167-
--group-by column02 column14
168+
python groupby_benchmark.py --sf 10 --map-groups --group-by column02 column14
169+
--shuffle-strategy sort_shuffle_pull_based
168170
169171
#######################
170172
# Streaming split tests

0 commit comments

Comments
 (0)