diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index 9ad12d1f630e..ed6225b591b0 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -586,7 +586,7 @@ run_clickbench_1() { RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json" echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running clickbench (1 file) benchmark..." - debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}" + debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" } # Runs the clickbench benchmark with the partitioned parquet files @@ -594,7 +594,7 @@ run_clickbench_partitioned() { RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json" echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running clickbench (partitioned, 100 files) benchmark..." - debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}" + debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" } # Runs the clickbench "extended" benchmark with a single large parquet file @@ -602,7 +602,7 @@ run_clickbench_extended() { RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json" echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running clickbench (1 file) extended benchmark..." - debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o "${RESULTS_FILE}" + debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended" -o "${RESULTS_FILE}" } # Downloads the csv.gz files IMDB datasets from Peter Boncz's homepage(one of the JOB paper authors) diff --git a/benchmarks/queries/clickbench/README.md b/benchmarks/queries/clickbench/README.md index e5acd8f348a4..877ea0e0c319 100644 --- a/benchmarks/queries/clickbench/README.md +++ b/benchmarks/queries/clickbench/README.md @@ -6,8 +6,8 @@ ClickBench is focused on aggregation and filtering performance (though it has no ## Files: -- `queries.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository] -- `extended.sql` - "Extended" DataFusion specific queries. +- `queries/*.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository](https://raw.githubusercontent.com/ClickHouse/ClickBench/main/datafusion/queries.sql) and split by the `update_queries.sh` script. +- `extended/*.sql` - "Extended" DataFusion specific queries. [clickbench repository]: https://github.com/ClickHouse/ClickBench/blob/main/datafusion/queries.sql @@ -15,8 +15,8 @@ ClickBench is focused on aggregation and filtering performance (though it has no The "extended" queries are not part of the official ClickBench benchmark. Instead they are used to test other DataFusion features that are not covered by -the standard benchmark. Each description below is for the corresponding line in -`extended.sql` (line 1 is `Q0`, line 2 is `Q1`, etc.) +the standard benchmark. Each description below is for the corresponding file in +`extended` ### Q0: Data Exploration diff --git a/benchmarks/queries/clickbench/extended.sql b/benchmarks/queries/clickbench/extended.sql deleted file mode 100644 index 93c39efe4f8e..000000000000 --- a/benchmarks/queries/clickbench/extended.sql +++ /dev/null @@ -1,9 +0,0 @@ -SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits; -SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage") FROM hits; -SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10; -SELECT "SocialSourceNetworkID", "RegionID", COUNT(*), AVG("Age"), AVG("ParamPrice"), STDDEV("ParamPrice") as s, VAR("ParamPrice") FROM hits GROUP BY "SocialSourceNetworkID", "RegionID" HAVING s IS NOT NULL ORDER BY s DESC LIMIT 10; -SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tmed DESC LIMIT 10; -SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "ResponseStartTiming") tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10; -SELECT COUNT(*) AS ShareCount FROM hits WHERE "IsMobile" = 1 AND "MobilePhoneModel" LIKE 'iPhone%' AND "SocialAction" = 'share' AND "SocialSourceNetworkID" IN (5, 12) AND "ClientTimeZone" BETWEEN -5 AND 5 AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL AND CASE WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT ELSE 0 END > 1920 AND levenshtein(CAST("UTMSource" AS STRING), CAST("UTMCampaign" AS STRING)) < 3; -SELECT "WatchID", MIN("ResolutionWidth") as wmin, MAX("ResolutionWidth") as wmax, SUM("IsRefresh") as srefresh FROM hits GROUP BY "WatchID" ORDER BY "WatchID" DESC LIMIT 10; -SELECT "RegionID", "UserAgent", "OS", AVG(to_timestamp("ResponseEndTiming")-to_timestamp("ResponseStartTiming")) as avg_response_time, AVG(to_timestamp("ResponseEndTiming")-to_timestamp("ConnectTiming")) as avg_latency FROM hits GROUP BY "RegionID", "UserAgent", "OS" ORDER BY avg_latency DESC limit 10; \ No newline at end of file diff --git a/benchmarks/queries/clickbench/extended/q0.sql b/benchmarks/queries/clickbench/extended/q0.sql new file mode 100644 index 000000000000..a1e55b5b25ac --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q0.sql @@ -0,0 +1 @@ +SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits; diff --git a/benchmarks/queries/clickbench/extended/q1.sql b/benchmarks/queries/clickbench/extended/q1.sql new file mode 100644 index 000000000000..84fac921c8cb --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q1.sql @@ -0,0 +1 @@ +SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage") FROM hits; diff --git a/benchmarks/queries/clickbench/extended/q2.sql b/benchmarks/queries/clickbench/extended/q2.sql new file mode 100644 index 000000000000..9832ce44d4cb --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q2.sql @@ -0,0 +1 @@ +SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/extended/q3.sql b/benchmarks/queries/clickbench/extended/q3.sql new file mode 100644 index 000000000000..d1661bc216e5 --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q3.sql @@ -0,0 +1 @@ +SELECT "SocialSourceNetworkID", "RegionID", COUNT(*), AVG("Age"), AVG("ParamPrice"), STDDEV("ParamPrice") as s, VAR("ParamPrice") FROM hits GROUP BY "SocialSourceNetworkID", "RegionID" HAVING s IS NOT NULL ORDER BY s DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/extended/q4.sql b/benchmarks/queries/clickbench/extended/q4.sql new file mode 100644 index 000000000000..bd54956a2bcd --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q4.sql @@ -0,0 +1 @@ +SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tmed DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/extended/q5.sql b/benchmarks/queries/clickbench/extended/q5.sql new file mode 100644 index 000000000000..9de2f517d09b --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q5.sql @@ -0,0 +1 @@ +SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "ResponseStartTiming") tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/extended/q6.sql b/benchmarks/queries/clickbench/extended/q6.sql new file mode 100644 index 000000000000..091e8867c7ef --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q6.sql @@ -0,0 +1 @@ +SELECT COUNT(*) AS ShareCount FROM hits WHERE "IsMobile" = 1 AND "MobilePhoneModel" LIKE 'iPhone%' AND "SocialAction" = 'share' AND "SocialSourceNetworkID" IN (5, 12) AND "ClientTimeZone" BETWEEN -5 AND 5 AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL AND CASE WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT ELSE 0 END > 1920 AND levenshtein(CAST("UTMSource" AS STRING), CAST("UTMCampaign" AS STRING)) < 3; diff --git a/benchmarks/queries/clickbench/extended/q7.sql b/benchmarks/queries/clickbench/extended/q7.sql new file mode 100644 index 000000000000..ddaff7f8804f --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q7.sql @@ -0,0 +1 @@ +SELECT "WatchID", MIN("ResolutionWidth") as wmin, MAX("ResolutionWidth") as wmax, SUM("IsRefresh") as srefresh FROM hits GROUP BY "WatchID" ORDER BY "WatchID" DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries.sql b/benchmarks/queries/clickbench/queries.sql deleted file mode 100644 index 9a183cd6e259..000000000000 --- a/benchmarks/queries/clickbench/queries.sql +++ /dev/null @@ -1,43 +0,0 @@ -SELECT COUNT(*) FROM hits; -SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0; -SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits; -SELECT AVG("UserID") FROM hits; -SELECT COUNT(DISTINCT "UserID") FROM hits; -SELECT COUNT(DISTINCT "SearchPhrase") FROM hits; -SELECT MIN("EventDate"), MAX("EventDate") FROM hits; -SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC; -SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10; -SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10; -SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10; -SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10; -SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; -SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10; -SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10; -SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10; -SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; -SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10; -SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; -SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449; -SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%'; -SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; -SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; -SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10; -SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10; -SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10; -SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10; -SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits; -SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10; -SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; -SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; -SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10; -SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10; -SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10; -SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10; -SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10; -SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; -SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; -SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100; -SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; -SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000; diff --git a/benchmarks/queries/clickbench/queries/q0.sql b/benchmarks/queries/clickbench/queries/q0.sql new file mode 100644 index 000000000000..c70aa7a844d7 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q0.sql @@ -0,0 +1 @@ +SELECT COUNT(*) FROM hits; diff --git a/benchmarks/queries/clickbench/queries/q1.sql b/benchmarks/queries/clickbench/queries/q1.sql new file mode 100644 index 000000000000..283a5c3cc82b --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q1.sql @@ -0,0 +1 @@ +SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0; diff --git a/benchmarks/queries/clickbench/queries/q10.sql b/benchmarks/queries/clickbench/queries/q10.sql new file mode 100644 index 000000000000..dd44e5c49368 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q10.sql @@ -0,0 +1 @@ +SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q11.sql b/benchmarks/queries/clickbench/queries/q11.sql new file mode 100644 index 000000000000..9349d450699c --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q11.sql @@ -0,0 +1 @@ +SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q12.sql b/benchmarks/queries/clickbench/queries/q12.sql new file mode 100644 index 000000000000..908af6314988 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q12.sql @@ -0,0 +1 @@ +SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q13.sql b/benchmarks/queries/clickbench/queries/q13.sql new file mode 100644 index 000000000000..46e1e6b4a74d --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q13.sql @@ -0,0 +1 @@ +SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q14.sql b/benchmarks/queries/clickbench/queries/q14.sql new file mode 100644 index 000000000000..d6c5118168f0 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q14.sql @@ -0,0 +1 @@ +SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q15.sql b/benchmarks/queries/clickbench/queries/q15.sql new file mode 100644 index 000000000000..f5b4e511a886 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q15.sql @@ -0,0 +1 @@ +SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q16.sql b/benchmarks/queries/clickbench/queries/q16.sql new file mode 100644 index 000000000000..38e44b684941 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q16.sql @@ -0,0 +1 @@ +SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q17.sql b/benchmarks/queries/clickbench/queries/q17.sql new file mode 100644 index 000000000000..1a97cdd36a24 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q17.sql @@ -0,0 +1 @@ +SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q18.sql b/benchmarks/queries/clickbench/queries/q18.sql new file mode 100644 index 000000000000..5aeeedf78ee0 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q18.sql @@ -0,0 +1 @@ +SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q19.sql b/benchmarks/queries/clickbench/queries/q19.sql new file mode 100644 index 000000000000..e388497dd1ec --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q19.sql @@ -0,0 +1 @@ +SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449; diff --git a/benchmarks/queries/clickbench/queries/q2.sql b/benchmarks/queries/clickbench/queries/q2.sql new file mode 100644 index 000000000000..9938e3081dd2 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q2.sql @@ -0,0 +1 @@ +SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits; diff --git a/benchmarks/queries/clickbench/queries/q20.sql b/benchmarks/queries/clickbench/queries/q20.sql new file mode 100644 index 000000000000..a7e6995c1f1b --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q20.sql @@ -0,0 +1 @@ +SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%'; diff --git a/benchmarks/queries/clickbench/queries/q21.sql b/benchmarks/queries/clickbench/queries/q21.sql new file mode 100644 index 000000000000..d857899d136c --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q21.sql @@ -0,0 +1 @@ +SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q22.sql b/benchmarks/queries/clickbench/queries/q22.sql new file mode 100644 index 000000000000..8ac4f099c484 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q22.sql @@ -0,0 +1 @@ +SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q23.sql b/benchmarks/queries/clickbench/queries/q23.sql new file mode 100644 index 000000000000..3623b0fed806 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q23.sql @@ -0,0 +1 @@ +SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q24.sql b/benchmarks/queries/clickbench/queries/q24.sql new file mode 100644 index 000000000000..cee774aafe53 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q24.sql @@ -0,0 +1 @@ +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q25.sql b/benchmarks/queries/clickbench/queries/q25.sql new file mode 100644 index 000000000000..048b4cd9d3e2 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q25.sql @@ -0,0 +1 @@ +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q26.sql b/benchmarks/queries/clickbench/queries/q26.sql new file mode 100644 index 000000000000..104e8d50ecb0 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q26.sql @@ -0,0 +1 @@ +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q27.sql b/benchmarks/queries/clickbench/queries/q27.sql new file mode 100644 index 000000000000..c84cad9296e0 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q27.sql @@ -0,0 +1 @@ +SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; diff --git a/benchmarks/queries/clickbench/queries/q28.sql b/benchmarks/queries/clickbench/queries/q28.sql new file mode 100644 index 000000000000..8c5a51877f32 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q28.sql @@ -0,0 +1 @@ +SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; diff --git a/benchmarks/queries/clickbench/queries/q29.sql b/benchmarks/queries/clickbench/queries/q29.sql new file mode 100644 index 000000000000..bfff2509062d --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q29.sql @@ -0,0 +1 @@ +SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits; diff --git a/benchmarks/queries/clickbench/queries/q3.sql b/benchmarks/queries/clickbench/queries/q3.sql new file mode 100644 index 000000000000..db818fa013ef --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q3.sql @@ -0,0 +1 @@ +SELECT AVG("UserID") FROM hits; diff --git a/benchmarks/queries/clickbench/queries/q30.sql b/benchmarks/queries/clickbench/queries/q30.sql new file mode 100644 index 000000000000..8b4bf19b7f9c --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q30.sql @@ -0,0 +1 @@ +SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q31.sql b/benchmarks/queries/clickbench/queries/q31.sql new file mode 100644 index 000000000000..5ab49a38b804 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q31.sql @@ -0,0 +1 @@ +SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q32.sql b/benchmarks/queries/clickbench/queries/q32.sql new file mode 100644 index 000000000000..d00bc12405ed --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q32.sql @@ -0,0 +1 @@ +SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q33.sql b/benchmarks/queries/clickbench/queries/q33.sql new file mode 100644 index 000000000000..45d491d1c30b --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q33.sql @@ -0,0 +1 @@ +SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q34.sql b/benchmarks/queries/clickbench/queries/q34.sql new file mode 100644 index 000000000000..7e878804de06 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q34.sql @@ -0,0 +1 @@ +SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q35.sql b/benchmarks/queries/clickbench/queries/q35.sql new file mode 100644 index 000000000000..c03da84fb19e --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q35.sql @@ -0,0 +1 @@ +SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q36.sql b/benchmarks/queries/clickbench/queries/q36.sql new file mode 100644 index 000000000000..b76dce5cab9e --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q36.sql @@ -0,0 +1 @@ +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q37.sql b/benchmarks/queries/clickbench/queries/q37.sql new file mode 100644 index 000000000000..49017e3a5f1d --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q37.sql @@ -0,0 +1 @@ +SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q38.sql b/benchmarks/queries/clickbench/queries/q38.sql new file mode 100644 index 000000000000..b0cb6814bd85 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q38.sql @@ -0,0 +1 @@ +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; diff --git a/benchmarks/queries/clickbench/queries/q39.sql b/benchmarks/queries/clickbench/queries/q39.sql new file mode 100644 index 000000000000..8327eb9bd572 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q39.sql @@ -0,0 +1 @@ +SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; diff --git a/benchmarks/queries/clickbench/queries/q4.sql b/benchmarks/queries/clickbench/queries/q4.sql new file mode 100644 index 000000000000..027310ad7526 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q4.sql @@ -0,0 +1 @@ +SELECT COUNT(DISTINCT "UserID") FROM hits; diff --git a/benchmarks/queries/clickbench/queries/q40.sql b/benchmarks/queries/clickbench/queries/q40.sql new file mode 100644 index 000000000000..d30d7c414271 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q40.sql @@ -0,0 +1 @@ +SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100; diff --git a/benchmarks/queries/clickbench/queries/q41.sql b/benchmarks/queries/clickbench/queries/q41.sql new file mode 100644 index 000000000000..0e9a51a7f54c --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q41.sql @@ -0,0 +1 @@ +SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; diff --git a/benchmarks/queries/clickbench/queries/q42.sql b/benchmarks/queries/clickbench/queries/q42.sql new file mode 100644 index 000000000000..dcad5daa1b67 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q42.sql @@ -0,0 +1 @@ +SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000; diff --git a/benchmarks/queries/clickbench/queries/q5.sql b/benchmarks/queries/clickbench/queries/q5.sql new file mode 100644 index 000000000000..35b17097d87c --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q5.sql @@ -0,0 +1 @@ +SELECT COUNT(DISTINCT "SearchPhrase") FROM hits; diff --git a/benchmarks/queries/clickbench/queries/q6.sql b/benchmarks/queries/clickbench/queries/q6.sql new file mode 100644 index 000000000000..684103643652 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q6.sql @@ -0,0 +1 @@ +SELECT MIN("EventDate"), MAX("EventDate") FROM hits; diff --git a/benchmarks/queries/clickbench/queries/q7.sql b/benchmarks/queries/clickbench/queries/q7.sql new file mode 100644 index 000000000000..ab8528c1b141 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q7.sql @@ -0,0 +1 @@ +SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC; diff --git a/benchmarks/queries/clickbench/queries/q8.sql b/benchmarks/queries/clickbench/queries/q8.sql new file mode 100644 index 000000000000..e5691bb66f81 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q8.sql @@ -0,0 +1 @@ +SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/queries/q9.sql b/benchmarks/queries/clickbench/queries/q9.sql new file mode 100644 index 000000000000..42c22d96852d --- /dev/null +++ b/benchmarks/queries/clickbench/queries/q9.sql @@ -0,0 +1 @@ +SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10; diff --git a/benchmarks/queries/clickbench/update_queries.sh b/benchmarks/queries/clickbench/update_queries.sh new file mode 100755 index 000000000000..d7db7359aa39 --- /dev/null +++ b/benchmarks/queries/clickbench/update_queries.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script is meant for developers of DataFusion -- it is runnable +# from the standard DataFusion development environment and uses cargo, +# etc and orchestrates gathering data and run the benchmark binary in +# different configurations. + +# Script to download ClickBench queries and split them into individual files + +set -e # Exit on any error + +# URL for the raw file (not the GitHub page) +URL="https://raw.githubusercontent.com/ClickHouse/ClickBench/main/datafusion/queries.sql" + +# Temporary file to store downloaded content +TEMP_FILE="queries.sql" + +TARGET_DIR="queries" + +# Download the file +echo "Downloading queries from $URL..." +if command -v curl &> /dev/null; then + curl -s -o "$TEMP_FILE" "$URL" +elif command -v wget &> /dev/null; then + wget -q -O "$TEMP_FILE" "$URL" +else + echo "Error: Neither curl nor wget is available. Please install one of them." + exit 1 +fi + +# Check if download was successful +if [ ! -f "$TEMP_FILE" ] || [ ! -s "$TEMP_FILE" ]; then + echo "Error: Failed to download or file is empty" + exit 1 +fi + +# Initialize counter +counter=0 + +# Ensure the target directory exists +if [ ! -d ${TARGET_DIR} ]; then + mkdir -p ${TARGET_DIR} +fi + +# Read the file line by line and create individual query files +mapfile -t lines < $TEMP_FILE +for line in "${lines[@]}"; do + # Skip empty lines + if [ -n "$line" ]; then + # Create filename with zero-padded counter + filename="q${counter}.sql" + + # Write the line to the individual file + echo "$line" > "${TARGET_DIR}/$filename" + + echo "Created ${TARGET_DIR}/$filename" + + # Increment counter + (( counter += 1 )) + fi +done + +# Clean up temporary file +rm "$TEMP_FILE" \ No newline at end of file diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs index 57726fd181f4..8d1847b1b874 100644 --- a/benchmarks/src/clickbench.rs +++ b/benchmarks/src/clickbench.rs @@ -15,8 +15,9 @@ // specific language governing permissions and limitations // under the License. -use std::path::Path; -use std::path::PathBuf; +use std::fs; +use std::io::ErrorKind; +use std::path::{Path, PathBuf}; use crate::util::{BenchmarkRun, CommonOpt, QueryResult}; use datafusion::{ @@ -56,12 +57,12 @@ pub struct RunOpt { )] path: PathBuf, - /// Path to queries.sql (single file) + /// Path to queries directory #[structopt( parse(from_os_str), short = "r", long = "queries-path", - default_value = "benchmarks/queries/clickbench/queries.sql" + default_value = "benchmarks/queries/clickbench/queries" )] queries_path: PathBuf, @@ -70,49 +71,47 @@ pub struct RunOpt { output_path: Option, } -struct AllQueries { - queries: Vec, +/// Get the SQL file path +pub fn get_query_path(query_dir: &Path, query: usize) -> PathBuf { + let mut query_path = query_dir.to_path_buf(); + query_path.push(format!("q{query}.sql")); + query_path } -impl AllQueries { - fn try_new(path: &Path) -> Result { - // ClickBench has all queries in a single file identified by line number - let all_queries = std::fs::read_to_string(path) - .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?; - Ok(Self { - queries: all_queries.lines().map(|s| s.to_string()).collect(), - }) +/// Get the SQL statement from the specified query file +pub fn get_query_sql(query_path: &Path) -> Result> { + if fs::exists(query_path)? { + Ok(Some(fs::read_to_string(query_path)?)) + } else { + Ok(None) } +} + +impl RunOpt { + pub async fn run(self) -> Result<()> { + println!("Running benchmarks with the following options: {self:?}"); - /// Returns the text of query `query_id` - fn get_query(&self, query_id: usize) -> Result<&str> { - self.queries - .get(query_id) - .ok_or_else(|| { - let min_id = self.min_query_id(); - let max_id = self.max_query_id(); + let query_dir_metadata = fs::metadata(&self.queries_path).map_err(|e| { + if e.kind() == ErrorKind::NotFound { exec_datafusion_err!( - "Invalid query id {query_id}. Must be between {min_id} and {max_id}" + "Query path '{}' does not exist.", + &self.queries_path.to_str().unwrap() ) - }) - .map(|s| s.as_str()) - } + } else { + DataFusionError::External(Box::new(e)) + } + })?; - fn min_query_id(&self) -> usize { - 0 - } + if !query_dir_metadata.is_dir() { + return Err(exec_datafusion_err!( + "Query path '{}' is not a directory.", + &self.queries_path.to_str().unwrap() + )); + } - fn max_query_id(&self) -> usize { - self.queries.len() - 1 - } -} -impl RunOpt { - pub async fn run(self) -> Result<()> { - println!("Running benchmarks with the following options: {self:?}"); - let queries = AllQueries::try_new(self.queries_path.as_path())?; let query_range = match self.query { Some(query_id) => query_id..=query_id, - None => queries.min_query_id()..=queries.max_query_id(), + None => 0..=usize::MAX, }; // configure parquet options @@ -130,8 +129,18 @@ impl RunOpt { let mut benchmark_run = BenchmarkRun::new(); for query_id in query_range { + let query_path = get_query_path(&self.queries_path, query_id); + let Some(sql) = get_query_sql(&query_path)? else { + if self.query.is_some() { + return Err(exec_datafusion_err!( + "Could not load query file '{}'.", + &query_path.to_str().unwrap() + )); + } + break; + }; benchmark_run.start_new_case(&format!("Query {query_id}")); - let query_run = self.benchmark_query(&queries, query_id, &ctx).await; + let query_run = self.benchmark_query(&sql, query_id, &ctx).await; match query_run { Ok(query_results) => { for iter in query_results { @@ -151,11 +160,10 @@ impl RunOpt { async fn benchmark_query( &self, - queries: &AllQueries, + sql: &str, query_id: usize, ctx: &SessionContext, ) -> Result> { - let sql = queries.get_query(query_id)?; println!("Q{query_id}: {sql}"); let mut millis = Vec::with_capacity(self.iterations()); diff --git a/datafusion/core/benches/sql_planner.rs b/datafusion/core/benches/sql_planner.rs index 6dc953f56b43..d02478d2b479 100644 --- a/datafusion/core/benches/sql_planner.rs +++ b/datafusion/core/benches/sql_planner.rs @@ -30,9 +30,6 @@ use datafusion::datasource::MemTable; use datafusion::execution::context::SessionContext; use datafusion_common::ScalarValue; use datafusion_expr::col; -use itertools::Itertools; -use std::fs::File; -use std::io::{BufRead, BufReader}; use std::path::PathBuf; use std::sync::Arc; use test_utils::tpcds::tpcds_schemas; @@ -466,17 +463,20 @@ fn criterion_benchmark(c: &mut Criterion) { // }); // -- clickbench -- - - let queries_file = - File::open(format!("{benchmarks_path}queries/clickbench/queries.sql")).unwrap(); - let extended_file = - File::open(format!("{benchmarks_path}queries/clickbench/extended.sql")).unwrap(); - - let clickbench_queries: Vec = BufReader::new(queries_file) - .lines() - .chain(BufReader::new(extended_file).lines()) - .map(|l| l.expect("Could not parse line")) - .collect_vec(); + let clickbench_queries = (0..=42) + .map(|q| { + std::fs::read_to_string(format!( + "{benchmarks_path}queries/clickbench/queries/q{q}.sql" + )) + .unwrap() + }) + .chain((0..=7).map(|q| { + std::fs::read_to_string(format!( + "{benchmarks_path}queries/clickbench/extended/q{q}.sql" + )) + .unwrap() + })) + .collect::>(); let clickbench_ctx = register_clickbench_hits_table(&rt);