apache
diff --git a/‎README.md
Lines changed: 1 addition & 1 deletion b/‎README.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/benchmarks/.gitignore
Lines changed: 2 additions & 0 deletions b/‎dev/benchmarks/.gitignore
Lines changed: 2 additions & 0 deletions
diff --git a/‎dev/benchmarks/README.md
Lines changed: 71 additions & 0 deletions b/‎dev/benchmarks/README.md
Lines changed: 71 additions & 0 deletions
diff --git a/‎dev/benchmarks/comet-tpch.sh
Lines changed: 51 additions & 0 deletions b/‎dev/benchmarks/comet-tpch.sh
Lines changed: 51 additions & 0 deletions
diff --git a/‎dev/benchmarks/drop-caches.sh
Lines changed: 21 additions & 0 deletions b/‎dev/benchmarks/drop-caches.sh
Lines changed: 21 additions & 0 deletions
diff --git a/‎dev/benchmarks/generate-comparison.py
Lines changed: 229 additions & 0 deletions b/‎dev/benchmarks/generate-comparison.py
Lines changed: 229 additions & 0 deletions
@@ -48,7 +48,7 @@ The following chart shows the time it takes to run the 22 TPC-H queries against
 using a single executor with 8 cores. See the [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html)
 for details of the environment used for these benchmarks.
 
-When using Comet, the overall run time is reduced from 616 seconds to 275 seconds, a 2.2x speedup.
+When using Comet, the overall run time is reduced from 652 seconds to 268 seconds, a 2.4x speedup.
 
 ![](docs/source/_static/images/benchmark-results/0.9.0/tpch_allqueries.png)
 
 
@@ -0,0 +1,2 @@
+*.json
+*.png
@@ -0,0 +1,71 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Comet Benchmarking Scripts
+
+This directory contains scripts used for generating benchmark results that are published in this repository and in 
+the Comet documentation.
+
+## Example usage
+
+Set Spark environment variables:
+
+```shell
+export SPARK_HOME=/opt/spark-3.5.3-bin-hadoop3/
+export SPARK_MASTER=spark://yourhostname:7077
+```
+
+Set the paths to the queries and data:
+
+```shell
+export TPCH_QUERIES=/mnt/bigdata/tpch/queries/
+export TPCH_DATA=/mnt/bigdata/tpch/sf100/
+```
+
+Run Spark benchmark:
+
+```shell
+export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
+sudo ./drop-caches.sh
+./spark-tpch.sh
+```
+
+Run Comet benchmark:
+
+```shell
+export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
+export COMET_JAR=/opt/comet/comet-spark-spark3.5_2.12-0.9.0.jar
+sudo ./drop-caches.sh
+./comet-tpch.sh
+```
+
+Run Gluten benchmark:
+
+```shell
+export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+export GLUTEN_JAR=/opt/gluten/gluten-velox-bundle-spark3.5_2.12-linux_amd64-1.4.0.jar
+sudo ./drop-caches.sh
+./gluten-tpch.sh
+```
+
+Generate charts:
+
+```shell
+python3 generate-comparison.py --benchmark tpch --labels "Spark 3.5.3" "Comet 0.9.0" "Gluten 1.4.0" --title "TPC-H @ 100 GB (single executor, 8 cores, local Parquet files)" spark-tpch-1752338506381.json comet-tpch-1752337818039.json gluten-tpch-1752337474344.json
+```
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Fail early with a clear message if a required environment variable is
+# unset or empty, instead of running commands built from empty paths.
+: "${SPARK_HOME:?SPARK_HOME must be set to the Spark installation directory}"
+: "${SPARK_MASTER:?SPARK_MASTER must be set to the Spark master URL}"
+: "${COMET_JAR:?COMET_JAR must be set to the path of the Comet jar}"
+: "${TPCH_DATA:?TPCH_DATA must be set to the TPC-H data directory}"
+: "${TPCH_QUERIES:?TPCH_QUERIES must be set to the TPC-H queries directory}"
+
+# Restart the standalone master and worker before submitting the benchmark.
+"$SPARK_HOME/sbin/stop-master.sh"
+"$SPARK_HOME/sbin/stop-worker.sh"
+
+"$SPARK_HOME/sbin/start-master.sh"
+"$SPARK_HOME/sbin/start-worker.sh" "$SPARK_MASTER"
+
+# All expansions are quoted so paths containing spaces are passed intact.
+"$SPARK_HOME/bin/spark-submit" \
+    --master "$SPARK_MASTER" \
+    --jars "$COMET_JAR" \
+    --driver-class-path "$COMET_JAR" \
+    --conf spark.driver.memory=8G \
+    --conf spark.executor.instances=1 \
+    --conf spark.executor.cores=8 \
+    --conf spark.cores.max=8 \
+    --conf spark.executor.memory=16g \
+    --conf spark.memory.offHeap.enabled=true \
+    --conf spark.memory.offHeap.size=16g \
+    --conf spark.eventLog.enabled=true \
+    --conf spark.driver.extraClassPath="$COMET_JAR" \
+    --conf spark.executor.extraClassPath="$COMET_JAR" \
+    --conf spark.plugins=org.apache.spark.CometPlugin \
+    --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
+    --conf spark.comet.exec.replaceSortMergeJoin=true \
+    --conf spark.comet.cast.allowIncompatible=true \
+    tpcbench.py \
+    --name comet \
+    --benchmark tpch \
+    --data "$TPCH_DATA" \
+    --queries "$TPCH_QUERIES" \
+    --output . \
+    --iterations 1
@@ -0,0 +1,21 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Flush dirty pages to disk first: drop_caches only frees clean cache
+# entries, so without a sync part of the page cache would survive.
+sync
+
+# Writing 1 frees the page cache. Must be run as root.
+echo 1 > /proc/sys/vm/drop_caches
@@ -0,0 +1,229 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import json
+import matplotlib.pyplot as plt
+import numpy as np
+
+def geomean(data):
+    return np.prod(data) ** (1 / len(data))
+
+def generate_query_rel_speedup_chart(baseline, comparison, label1: str, label2: str, benchmark: str, title: str):
+    results = []
+    for query in range(1, query_count(benchmark)+1):
+        if query == 999:
+            continue
+        a = np.median(np.array(baseline[str(query)]))
+        b = np.median(np.array(comparison[str(query)]))
+        if a > b:
+            speedup = a/b-1
+        else:
+            speedup = -(1/(a/b)-1)
+        results.append(("q" + str(query), round(speedup*100, 0)))
+
+    results = sorted(results, key=lambda x: -x[1])
+
+    queries, speedups = zip(*results)
+
+    # Create figure and axis
+    if benchmark == "tpch":
+        fig, ax = plt.subplots(figsize=(10, 6))
+    else:
+        fig, ax = plt.subplots(figsize=(35, 10))
+
+    # Create bar chart
+    bars = ax.bar(queries, speedups, color='skyblue')
+
+    # Add text annotations
+    for bar, speedup in zip(bars, speedups):
+        yval = bar.get_height()
+        if yval >= 0:
+            ax.text(bar.get_x() + bar.get_width() / 2.0, min(800, yval+5), f'{yval:.0f}%', va='bottom', ha='center', fontsize=8,
+                    color='blue', rotation=90)
+        else:
+            ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval:.0f}%', va='top', ha='center', fontsize=8,
+                    color='blue', rotation=90)
+
+    # Add title and labels
+    ax.set_title(label2 + " speedup over " + label1 + " (" + title + ")")
+    ax.set_ylabel('Speedup Percentage (100% speedup = 2x faster)')
+    ax.set_xlabel('Query')
+
+    # Customize the y-axis to handle both positive and negative values better
+    ax.axhline(0, color='black', linewidth=0.8)
+    min_value = (min(speedups) // 100) * 100
+    max_value = ((max(speedups) // 100) + 1) * 100 + 50
+    if benchmark == "tpch":
+        ax.set_ylim(min_value, max_value)
+    else:
+        # TODO improve this
+        ax.set_ylim(-250, 300)
+
+    # Show grid for better readability
+    ax.yaxis.grid(True)
+
+    # Save the plot as an image file
+    plt.savefig(f'{benchmark}_queries_speedup_rel.png', format='png')
+
+def generate_query_abs_speedup_chart(baseline, comparison, label1: str, label2: str, benchmark: str, title: str):
+    results = []
+    for query in range(1, query_count(benchmark)+1):
+        if query == 999:
+            continue
+        a = np.median(np.array(baseline[str(query)]))
+        b = np.median(np.array(comparison[str(query)]))
+        speedup = a-b
+        results.append(("q" + str(query), round(speedup, 1)))
+
+    results = sorted(results, key=lambda x: -x[1])
+
+    queries, speedups = zip(*results)
+
+    # Create figure and axis
+    if benchmark == "tpch":
+        fig, ax = plt.subplots(figsize=(10, 6))
+    else:
+        fig, ax = plt.subplots(figsize=(35, 10))
+
+    # Create bar chart
+    bars = ax.bar(queries, speedups, color='skyblue')
+
+    # Add text annotations
+    for bar, speedup in zip(bars, speedups):
+        yval = bar.get_height()
+        if yval >= 0:
+            ax.text(bar.get_x() + bar.get_width() / 2.0, min(800, yval+5), f'{yval:.1f}', va='bottom', ha='center', fontsize=8,
+                    color='blue', rotation=90)
+        else:
+            ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval:.1f}', va='top', ha='center', fontsize=8,
+                    color='blue', rotation=90)
+
+    # Add title and labels
+    ax.set_title(label2 + " speedup over " + label1 + " (" + title + ")")
+    ax.set_ylabel('Speedup (in seconds)')
+    ax.set_xlabel('Query')
+
+    # Customize the y-axis to handle both positive and negative values better
+    ax.axhline(0, color='black', linewidth=0.8)
+    min_value = min(speedups) * 2 - 20
+    max_value = max(speedups) * 1.5
+    ax.set_ylim(min_value, max_value)
+
+    # Show grid for better readability
+    ax.yaxis.grid(True)
+
+    # Save the plot as an image file
+    plt.savefig(f'{benchmark}_queries_speedup_abs.png', format='png')
+
+def generate_query_comparison_chart(results, labels, benchmark: str, title: str):
+    queries = []
+    benches = []
+    for _ in results:
+        benches.append([])
+    for query in range(1, query_count(benchmark)+1):
+        if query == 999:
+            continue
+        queries.append("q" + str(query))
+        for i in range(0, len(results)):
+            benches[i].append(np.median(np.array(results[i][str(query)])))
+
+    # Define the width of the bars
+    bar_width = 0.3
+
+    # Define the positions of the bars on the x-axis
+    index = np.arange(len(queries)) * 1.5
+
+    # Create a bar chart
+    if benchmark == "tpch":
+        fig, ax = plt.subplots(figsize=(15, 6))
+    else:
+        fig, ax = plt.subplots(figsize=(35, 6))
+
+    for i in range(0, len(results)):
+        bar = ax.bar(index + i * bar_width, benches[i], bar_width, label=labels[i])
+
+    # Add labels, title, and legend
+    ax.set_title(title)
+    ax.set_xlabel('Queries')
+    ax.set_ylabel('Query Time (seconds)')
+    ax.set_xticks(index + bar_width / 2)
+    ax.set_xticklabels(queries)
+    ax.legend()
+
+    # Save the plot as an image file
+    plt.savefig(f'{benchmark}_queries_compare.png', format='png')
+
+def generate_summary(results, labels, benchmark: str, title: str):
+    timings = []
+    for _ in results:
+        timings.append(0)
+
+    num_queries = query_count(benchmark)
+    for query in range(1, num_queries + 1):
+        if query == 999:
+            continue
+        for i in range(0, len(results)):
+            timings[i] += np.median(np.array(results[i][str(query)]))
+
+    # Create figure and axis
+    fig, ax = plt.subplots()
+    fig.set_size_inches(10, 6)
+
+    # Add title and labels
+    ax.set_title(title)
+    ax.set_ylabel(f'Time in seconds to run all {num_queries} {benchmark} queries (lower is better)')
+
+    times = [round(x,0) for x in timings]
+
+    # Create bar chart
+    bars = ax.bar(labels, times, color='skyblue', width=0.8)
+
+    # Add text annotations
+    for bar in bars:
+        yval = bar.get_height()
+        ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom')  # va: vertical alignment
+
+    plt.savefig(f'{benchmark}_allqueries.png', format='png')
+
+def query_count(benchmark: str):
+    if benchmark == "tpch":
+        return 22
+    elif benchmark == "tpcds":
+        return 99
+    else:
+        raise "invalid benchmark name"
+
+def main(files, labels, benchmark: str, title: str):
+    results = []
+    for filename in files:
+        with open(filename) as f:
+            results.append(json.load(f))
+    generate_summary(results, labels, benchmark, title)
+    generate_query_comparison_chart(results, labels, benchmark, title)
+    if len(files) == 2:
+        generate_query_abs_speedup_chart(results[0], results[1], labels[0], labels[1], benchmark, title)
+        generate_query_rel_speedup_chart(results[0], results[1], labels[0], labels[1], benchmark, title)
+
+if __name__ == '__main__':
+    argparse = argparse.ArgumentParser(description='Generate comparison')
+    argparse.add_argument('filenames', nargs='+', type=str, help='JSON result files')
+    argparse.add_argument('--labels', nargs='+', type=str, help='Labels')
+    argparse.add_argument('--benchmark', type=str, help='Benchmark name (tpch or tpcds)')
+    argparse.add_argument('--title', type=str, help='Chart title')
+    args = argparse.parse_args()
+    main(args.filenames, args.labels, args.benchmark, args.title)