Update timings for Duckdb and other solutions #76

Merged · 11 commits · Jun 24, 2024
2 changes: 1 addition & 1 deletion .github/workflows/regression.yml
@@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
-solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, duckdb-latest, datafusion, dask, clickhouse]
+solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, datafusion, dask, clickhouse]
name: Regression Tests solo solutions
runs-on: ubuntu-20.04
env:
59 changes: 0 additions & 59 deletions _report/history.Rmd
@@ -108,26 +108,6 @@ plot(d, "data.table", 1e8, "join")
plot(d, "data.table", 1e9, "join")
```

#### groupby2014 {.tabset .tabset-fade .tabset-pills}

##### 0.5 GB

```{r datatable.groupby2014.1e7}
plot(d, "data.table", 1e7, "groupby2014")
```

##### 5 GB

```{r datatable.groupby2014.1e8}
plot(d, "data.table", 1e8, "groupby2014")
```

##### 50 GB {.active}

```{r datatable.groupby2014.1e9}
plot(d, "data.table", 1e9, "groupby2014")
```

### pydatatable {.tabset .tabset-fade .tabset-pills}

#### groupby {.tabset .tabset-fade .tabset-pills}
@@ -212,26 +192,6 @@ plot(d, "pandas", 1e8, "join")
plot(d, "pandas", 1e9, "join")
```

#### groupby2014 {.tabset .tabset-fade .tabset-pills}

##### 0.5 GB

```{r pandas.groupby2014.1e7}
plot(d, "pandas", 1e7, "groupby2014")
```

##### 5 GB

```{r pandas.groupby2014.1e8}
plot(d, "pandas", 1e8, "groupby2014")
```

##### 50 GB {.active}

```{r pandas.groupby2014.1e9}
plot(d, "pandas", 1e9, "groupby2014")
```

### dplyr {.tabset .tabset-fade .tabset-pills}

#### groupby {.tabset .tabset-fade .tabset-pills}
@@ -274,25 +234,6 @@ plot(d, "dplyr", 1e8, "join")
plot(d, "dplyr", 1e9, "join")
```

#### groupby2014 {.tabset .tabset-fade .tabset-pills}

##### 0.5 GB

```{r dplyr.groupby2014.1e7}
plot(d, "dplyr", 1e7, "groupby2014")
```

##### 5 GB

```{r dplyr.groupby2014.1e8}
plot(d, "dplyr", 1e8, "groupby2014")
```

##### 50 GB {.active}

```{r dplyr.groupby2014.1e9}
plot(d, "dplyr", 1e9, "groupby2014")
```

### dask {.tabset .tabset-fade .tabset-pills}

44 changes: 0 additions & 44 deletions _report/index.Rmd
@@ -51,7 +51,6 @@ if (nrow(lld_unfinished)) {

dt_groupby = lld[task=="groupby"][substr(data,1,2)=="G1"]
dt_join = lld[task=="join"]
dt_groupby2014 = lld[task=="groupby2014"]
```

```{r helpers}
@@ -97,10 +96,6 @@ data_name = get_data_levels()[["join"]]
loop_benchplot(dt_join, report_name="join", syntax.dict=join.syntax.dict, exceptions=join.exceptions, solution.dict=solution.dict, data_namev=data_name, q_groupv=c("basic"), title.txt.fun = header_title_fun, question.txt.fun = join_q_title_fun, cutoff = "spark", pending = "Modin")
```

```{r report_groupby2014, message=FALSE}
data_name = get_data_levels()[["groupby2014"]]
loop_benchplot(dt_groupby2014, report_name="groupby2014", syntax.dict=groupby2014.syntax.dict, exceptions=groupby2014.exceptions, solution.dict=solution.dict, data_namev=data_name, q_groupv="basic", title.txt.fun = header_title_fun, question.txt.fun = groupby_q_title_fun, cutoff = "spark", pending = character())
```

## Task {.tabset .tabset-fade .tabset-pills}

@@ -175,26 +170,6 @@ loop_benchplot(dt_groupby2014, report_name="groupby2014", syntax.dict=groupby201
![](./join/J1_1e9_NA_0_0_advanced.png)
-->

### groupby2014 {.tabset .tabset-fade .tabset-pills}

#### 0.5 GB

##### **basic questions**

![](./groupby2014/G0_1e7_1e2_0_0_basic.png)

#### 5 GB

##### **basic questions**

![](./groupby2014/G0_1e8_1e2_0_0_basic.png)

#### 50 GB {.active}

##### **basic questions**

![](./groupby2014/G0_1e9_1e2_0_0_basic.png)

---

## Details {.tabset .tabset-fade .tabset-pills}
@@ -240,26 +215,7 @@ rpivotTable::rpivotTable(
)
```

### groupby2014

This task precisely reflects the grouping benchmark made by Matt Dowle in 2014 [here](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping). The differences are well summarized in the following [post on Data Science Stack Exchange](https://datascience.stackexchange.com/a/40532/10588).

### groupby2014 timings

```{r pivot_groupby2014}
sdcols = c("solution","question_group","question","data","in_rows","time_sec_1","time_sec_2","version","git","chk_time_sec_1","na_time_sec","out_rows","out_cols")
data = dt_groupby2014[, .SD, .SDcols=sdcols]
rpivotTable::rpivotTable(
data,
rows = c("in_rows","question"),
cols = "solution",
aggregatorName = "Average",
vals = "time_sec_1",
height = "100%",
sorters = make_sorters(data),
unusedAttrsVertical = TRUE
)
```
## Requesting an updated run

The benchmark will now be updated via pull requests. To publish new results for one or more solutions, open a PR with changes to the solution scripts or VERSION files, together with updated time.csv and log.csv files from a run on a c6id.metal machine. To facilitate creating an instance identical to the one used for the current results, the script `_utils/format_and_mount.sh` was created. The script does the following:
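(The list of steps the script performs is truncated in this view.) At a high level, the contributor flow might look like the sketch below; the fork URL, branch name, and result-file locations are illustrative assumptions, not something this PR prescribes.

```sh
# Hypothetical outline of requesting an updated run; names and paths are
# illustrative assumptions, not a documented interface of this repository.
git clone <your-fork-url> db-benchmark && cd db-benchmark
git checkout -b update-duckdb-timings

# Prepare a c6id.metal instance the same way the current results were produced:
sudo bash _utils/format_and_mount.sh

# ...re-run the affected solution's benchmark scripts to collect fresh timings...

git add duckdb/VERSION time.csv log.csv   # assumed locations of the result files
git commit -m "duckdb: update timings"
git push origin update-duckdb-timings     # then open the PR on GitHub
```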
10 changes: 7 additions & 3 deletions _report/report.R
@@ -6,7 +6,7 @@ get_report_status_file = function(path=getwd()) {
file.path(path, "report-done")
}
get_report_solutions = function() {
c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars", "duckdb", "duckdb-latest", "datafusion", "arrow", "R-arrow")
c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars", "duckdb", "datafusion", "arrow", "R-arrow")
}
get_data_levels = function() {
## groupby
@@ -131,8 +131,8 @@ model_time = function(d) {
if (nrow(d[!is.na(out_cols), .(unqn_out_cols=uniqueN(out_cols)), .(task, solution, data, question)][unqn_out_cols>1L]))
stop("Value of 'out_cols' varies for different runs for single solution+question")
#d[,.SD][!is.na(out_cols), `:=`(unq_out_cols=uniqueN(out_cols), paste_unq_out_cols=paste(unique(out_cols), collapse=",")), .(task, data, question)][unq_out_cols>1, .(paste_unq_out_cols), .(task, solution, data, question)]
-if (nrow(d[!is.na(out_rows), .(unqn_out_rows=uniqueN(out_rows)), .(task, data, question)][unqn_out_rows>1L]))
-  stop("Value of 'out_rows' varies for different runs for single question")
+# if (nrow(d[!is.na(out_rows), .(unqn_out_rows=uniqueN(out_rows)), .(task, data, question)][unqn_out_rows>1L]))
+#   stop("Value of 'out_rows' varies for different runs for single question")
#d[,.SD][!is.na(out_rows), `:=`(unq_out_rows=uniqueN(out_rows), paste_unq_out_rows=paste(unique(out_rows), collapse=",")), .(task, data, question)][unq_out_rows>1, .(paste_unq_out_rows), .(task, solution, data, question)]
if (nrow(d[!is.na(out_cols), .(unqn_out_cols=uniqueN(out_cols)), .(task, data, question)][unqn_out_cols>1L]))
stop("Value of 'out_cols' varies for different runs for single question")
@@ -249,6 +249,10 @@ time_logs = function(path=getwd()) {
lt <- load_time(path=getwd())

ct = clean_time(lt)
+# https://github.com/pola-rs/polars/issues/16937
+ct = ct %>% filter(!(solution == 'polars' & question == 'sum v3 count by id1:id6'))
+# remove duckdb-latest for now
+ct = ct %>% filter(!(solution == 'duckdb-latest'))
d = model_time(ct)
ll <- load_logs(path=path)
ll$solution[ll$solution == "arrow"] <- "R-arrow"
20 changes: 20 additions & 0 deletions _utils/sql_to_check_timings/timing_checks.sql
@@ -19,6 +19,26 @@ select t1.question, t1.data, t1.out_rows, t1.solution, t2.out_rows, t2.solution
and t1.data != 'G1_1e8_1e2_5_0'
and t1.data = t2.data ;


-- Value of 'chk' varies for different runs for single solution+question
create table timings as select * from read_csv('time.csv');

select t1.chk, t2.chk, t1.solution, t2.solution from
timings t1, timings t2
where t1.chk != t2.chk
and t1.question = t2.question
and t1.task = t2.task
and t1.solution != 'datafusion'
and t2.solution != 'datafusion'
and t1.solution != 'arrow'
and t2.solution != 'arrow'
and t1.solution != 'R-arrow'
and t2.solution != 'R-arrow'
and t1.solution != 'collapse'
and t1.solution = t2.solution
and t1.data = t2.data group by all;


select t1.question, t1.data, t1.out_rows, t2.solution, t2.out_rows from
timings t1, timings t2
where t1.out_rows != t2.out_rows
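Assuming the DuckDB CLI is installed and `time.csv` sits in the working directory, these checks could be run in a fresh in-memory session by piping the file into the shell, for example:

```sh
# Run the timing consistency checks (a sketch; assumes the duckdb CLI is on
# PATH and time.csv is in the current directory). Any rows returned flag
# inconsistent 'chk' or 'out_rows' values across runs.
duckdb < _utils/sql_to_check_timings/timing_checks.sql
```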
4 changes: 3 additions & 1 deletion clickhouse/clickhouse-mount-config.xml
@@ -1,3 +1,5 @@
<clickhouse>
<path>/var/lib/clickhouse-nvme-mount/</path>
</clickhouse>
<max_table_size_to_drop>0</max_table_size_to_drop>
<max_partition_size_to_drop>0</max_partition_size_to_drop>
</clickhouse>
2 changes: 1 addition & 1 deletion duckdb-latest/upg-duckdb-latest.sh
@@ -11,7 +11,7 @@ mkdir -p ./duckdb-latest/r-duckdb-latest
cd duckdb-latest
git clone https://github.com/duckdb/duckdb-r
cd duckdb-r
-git checkout v0.9.0
+git checkout v1.0.0
cd ..
ncores=$(nproc --all)
MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb-latest" duckdb-r
2 changes: 1 addition & 1 deletion duckdb/setup-duckdb.sh
@@ -11,7 +11,7 @@ Rscript -e 'install.packages("DBI", lib="./duckdb/r-duckdb", repos = "http://clo
cd duckdb
git clone https://github.com/duckdb/duckdb-r.git
cd duckdb-r
-git checkout v0.8.1-3
+git checkout v1.0.0
cd ..
ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'`
MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb" duckdb-r
2 changes: 1 addition & 1 deletion duckdb/upg-duckdb.sh
@@ -11,7 +11,7 @@ mkdir -p ./duckdb/r-duckdb
cd duckdb
git clone https://github.com/duckdb/duckdb-r
cd duckdb-r
-git checkout v0.8.1
+git checkout v1.0.0
cd ..
ncores=$(nproc --all)
MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb" duckdb-r
10 changes: 5 additions & 5 deletions juliadf/setup-juliadf.sh
@@ -1,11 +1,11 @@
# install julia

-wget https://julialang-s3.julialang.org/bin/linux/x64/1.9/julia-1.9.3-linux-x86_64.tar.gz
-tar -xvf julia-1.9.3-linux-x86_64.tar.gz
-sudo mv julia-1.9.3 /opt
-rm julia-1.9.3-linux-x86_64.tar.gz
+wget https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.4-linux-x86_64.tar.gz
+tar -xvf julia-1.10.4-linux-x86_64.tar.gz
+sudo mv julia-1.10.4 /opt
+rm julia-1.10.4-linux-x86_64.tar.gz
# put to paths
-echo 'export JULIA_HOME=/opt/julia-1.9.3' >> path.env
+echo 'export JULIA_HOME=/opt/julia-1.10.4' >> path.env
echo 'export PATH=$PATH:$JULIA_HOME/bin' >> path.env
# note that cron job must have path updated as well

10 changes: 5 additions & 5 deletions juliads/setup-juliads.sh
@@ -1,12 +1,12 @@

# install julia
-wget https://julialang-s3.julialang.org/bin/linux/x64/1.9/julia-1.9.3-linux-x86_64.tar.gz
-tar -xvf julia-1.9.3-linux-x86_64.tar.gz
-sudo mv julia-1.9.3 /opt
-rm julia-1.9.3-linux-x86_64.tar.gz
+wget https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.4-linux-x86_64.tar.gz
+tar -xvf julia-1.10.4-linux-x86_64.tar.gz
+sudo mv julia-1.10.4 /opt
+rm julia-1.10.4-linux-x86_64.tar.gz

# put to paths
-echo 'export JULIA_HOME=/opt/julia-1.9.3' >> path.env
+echo 'export JULIA_HOME=/opt/julia-1.10.4' >> path.env
echo 'export PATH=$PATH:$JULIA_HOME/bin' >> path.env
echo "export JULIA_NUM_THREADS=40" >> path.env
# note that cron job must have path updated as well
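A quick check that the upgrade is actually picked up by the benchmark environment might look like this (a sketch; assumes path.env is sourced the same way the benchmark scripts source it):

```sh
# Verify the Julia upgrade is visible after sourcing the environment file.
source path.env
julia --version   # expected to print: julia version 1.10.4
```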