Skip to content

Support query filter on all benchmarks #16477

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 24, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 20 additions & 17 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ usage() {
Orchestrates running benchmarks against DataFusion checkouts

Usage:
$0 data [benchmark] [query]
$0 run [benchmark]
$0 data [benchmark]
$0 run [benchmark] [query]
$0 compare <branch1> <branch2>
$0 compare_detail <branch1> <branch2>
$0 venv
Expand Down Expand Up @@ -268,10 +268,15 @@ main() {
RESULTS_NAME=${RESULTS_NAME:-"${BRANCH_NAME}"}
RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$RESULTS_NAME"}

# Optional query filter to run specific query
QUERY=${ARG3}
QUERY_ARG=$([ -n "$QUERY" ] && echo "--query ${QUERY}" || echo "")

echo "***************************"
echo "DataFusion Benchmark Script"
echo "COMMAND: ${COMMAND}"
echo "BENCHMARK: ${BENCHMARK}"
echo "QUERY: ${QUERY:-All}"
echo "DATAFUSION_DIR: ${DATAFUSION_DIR}"
echo "BRANCH_NAME: ${BRANCH_NAME}"
echo "DATA_DIR: ${DATA_DIR}"
Expand Down Expand Up @@ -486,11 +491,9 @@ run_tpch() {
RESULTS_FILE="${RESULTS_DIR}/tpch_sf${SCALE_FACTOR}.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch benchmark..."
# Optional query filter to run specific query
QUERY=$([ -n "$ARG3" ] && echo "--query $ARG3" || echo "")

FORMAT=$2
debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" $QUERY
debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG}
}

# Runs the tpch in memory
Expand All @@ -505,10 +508,8 @@ run_tpch_mem() {
RESULTS_FILE="${RESULTS_DIR}/tpch_mem_sf${SCALE_FACTOR}.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch_mem benchmark..."
# Optional query filter to run specific query
QUERY=$([ -n "$ARG3" ] && echo "--query $ARG3" || echo "")
# -m means in memory
debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" $QUERY
debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
}

# Runs the cancellation benchmark
Expand Down Expand Up @@ -586,23 +587,23 @@ run_clickbench_1() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (1 file) benchmark..."
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}"
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG}
}

# Runs the clickbench benchmark with the partitioned parquet files
run_clickbench_partitioned() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (partitioned, 100 files) benchmark..."
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}"
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG}
}

# Runs the clickbench "extended" benchmark with a single large parquet file
run_clickbench_extended() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (1 file) extended benchmark..."
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended" -o "${RESULTS_FILE}"
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended" -o "${RESULTS_FILE}" ${QUERY_ARG}
}

# Downloads the csv.gz files IMDB datasets from Peter Boncz's homepage(one of the JOB paper authors)
Expand Down Expand Up @@ -717,7 +718,7 @@ run_imdb() {
RESULTS_FILE="${RESULTS_DIR}/imdb.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running imdb benchmark..."
debug_run $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}"
debug_run $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
}

data_h2o() {
Expand Down Expand Up @@ -912,7 +913,8 @@ run_h2o() {
--iterations 3 \
--path "${H2O_DIR}/${FILE_NAME}" \
--queries-path "${QUERY_FILE}" \
-o "${RESULTS_FILE}"
-o "${RESULTS_FILE}" \
${QUERY_ARG}
}

# Utility function to run h2o join/window benchmark
Expand Down Expand Up @@ -963,7 +965,8 @@ h2o_runner() {
--iterations 3 \
--join-paths "${H2O_DIR}/${X_TABLE_FILE_NAME},${H2O_DIR}/${SMALL_TABLE_FILE_NAME},${H2O_DIR}/${MEDIUM_TABLE_FILE_NAME},${H2O_DIR}/${LARGE_TABLE_FILE_NAME}" \
--queries-path "${QUERY_FILE}" \
-o "${RESULTS_FILE}"
-o "${RESULTS_FILE}" \
${QUERY_ARG}
}

# Runners for h2o join benchmark
Expand All @@ -989,7 +992,7 @@ run_external_aggr() {
# number-of-partitions), and by default `--partitions` is set to number of
# CPU cores, we set a constant number of partitions to prevent this
# benchmark to fail on some machines.
debug_run $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
debug_run $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG}
}

# Runs the sort integration benchmark
Expand All @@ -999,7 +1002,7 @@ run_sort_tpch() {
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running sort tpch benchmark..."

debug_run $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
debug_run $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG}
}

# Runs the sort tpch integration benchmark with limit 100 (topk)
Expand All @@ -1009,7 +1012,7 @@ run_topk_tpch() {
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running topk tpch benchmark..."

$CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" --limit 100
$CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" --limit 100 ${QUERY_ARG}
}


Expand Down