Skip to content

Split clickbench query set into one file per query #16476

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -586,23 +586,23 @@ run_clickbench_1() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (1 file) benchmark..."
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}"
}

# Runs the clickbench benchmark with the partitioned parquet files
#
# Globals:
#   RESULTS_DIR   (read)    prefix of the results path handed to dfbench via -o
#   DATA_DIR      (read)    prefix of the hits_partitioned path handed via --path
#   SCRIPT_DIR    (read)    prefix of the queries/clickbench/queries path
#   CARGO_COMMAND (read)    expanded unquoted, exactly as before, so it can
#                           word-split (presumably a multi-word command such as
#                           "cargo run ..." -- confirm where it is defined)
#   RESULTS_FILE  (written) not declared local, so it stays set for the caller
# Outputs: two progress lines on stdout, plus whatever debug_run emits
#          (debug_run is a helper defined outside this view).
run_clickbench_partitioned() {
    RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json"
    echo "RESULTS_FILE: ${RESULTS_FILE}"
    echo "Running clickbench (partitioned, 100 files) benchmark..."
    # The query set is a directory with one file per query (queries/q0.sql,
    # q1.sql, ...). The former single-file queries.sql no longer exists, so the
    # benchmark is invoked exactly once, against the directory.
    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}"
}

# Runs the clickbench "extended" benchmark with a single large parquet file
#
# Globals:
#   RESULTS_DIR   (read)    prefix of the results path handed to dfbench via -o
#   DATA_DIR      (read)    prefix of the hits.parquet path handed via --path
#   SCRIPT_DIR    (read)    prefix of the queries/clickbench/extended path
#   CARGO_COMMAND (read)    expanded unquoted, exactly as before, so it can
#                           word-split (presumably a multi-word command such as
#                           "cargo run ..." -- confirm where it is defined)
#   RESULTS_FILE  (written) not declared local, so it stays set for the caller
# Outputs: two progress lines on stdout, plus whatever debug_run emits
#          (debug_run is a helper defined outside this view).
run_clickbench_extended() {
    RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
    echo "RESULTS_FILE: ${RESULTS_FILE}"
    echo "Running clickbench (1 file) extended benchmark..."
    # The extended query set is a directory with one file per query
    # (extended/q0.sql ... q7.sql). The former single-file extended.sql no
    # longer exists, so the benchmark is invoked exactly once, against the
    # directory.
    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended" -o "${RESULTS_FILE}"
}

# Downloads the IMDB dataset csv.gz files from Peter Boncz's homepage (one of the JOB paper authors)
Expand Down
8 changes: 4 additions & 4 deletions benchmarks/queries/clickbench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,17 @@ ClickBench is focused on aggregation and filtering performance (though it has no

## Files:

- `queries.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository]
- `extended.sql` - "Extended" DataFusion specific queries.
- `queries/*.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository](https://raw.githubusercontent.com/ClickHouse/ClickBench/main/datafusion/queries.sql) and split by the `update_queries.sh` script.
- `extended/*.sql` - "Extended" DataFusion specific queries.

[clickbench repository]: https://github.com/ClickHouse/ClickBench/blob/main/datafusion/queries.sql

## "Extended" Queries

The "extended" queries are not part of the official ClickBench benchmark.
Instead they are used to test other DataFusion features that are not covered by
the standard benchmark. Each description below is for the corresponding line in
`extended.sql` (line 1 is `Q0`, line 2 is `Q1`, etc.)
the standard benchmark. Each description below is for the corresponding file in
the `extended` directory (`q0.sql` is `Q0`, `q1.sql` is `Q1`, etc.).

### Q0: Data Exploration

Expand Down
9 changes: 0 additions & 9 deletions benchmarks/queries/clickbench/extended.sql

This file was deleted.

1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/extended/q0.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/extended/q1.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage") FROM hits;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/extended/q2.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/extended/q3.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "SocialSourceNetworkID", "RegionID", COUNT(*), AVG("Age"), AVG("ParamPrice"), STDDEV("ParamPrice") as s, VAR("ParamPrice") FROM hits GROUP BY "SocialSourceNetworkID", "RegionID" HAVING s IS NOT NULL ORDER BY s DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/extended/q4.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tmed DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/extended/q5.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "ResponseStartTiming") tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/extended/q6.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT COUNT(*) AS ShareCount FROM hits WHERE "IsMobile" = 1 AND "MobilePhoneModel" LIKE 'iPhone%' AND "SocialAction" = 'share' AND "SocialSourceNetworkID" IN (5, 12) AND "ClientTimeZone" BETWEEN -5 AND 5 AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL AND CASE WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT ELSE 0 END > 1920 AND levenshtein(CAST("UTMSource" AS STRING), CAST("UTMCampaign" AS STRING)) < 3;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/extended/q7.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "WatchID", MIN("ResolutionWidth") as wmin, MAX("ResolutionWidth") as wmax, SUM("IsRefresh") as srefresh FROM hits GROUP BY "WatchID" ORDER BY "WatchID" DESC LIMIT 10;
43 changes: 0 additions & 43 deletions benchmarks/queries/clickbench/queries.sql

This file was deleted.

1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q0.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT COUNT(*) FROM hits;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q1.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q10.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q11.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q12.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q13.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q14.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q15.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q16.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q17.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q18.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q19.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q2.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q20.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%';
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q21.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q22.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q23.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q24.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q25.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q26.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q27.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/queries/q28.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
Loading