Skip to content

get_sundered_data() does not work on DuckDB tables with > 1 per-row tests #636

@petrbouchal

Description

@petrbouchal

Prework

Description

get_sundered_data() returns a SQL error on DuckDB based tables when there is more than one row-level test. The error message suggests that it has something to do with DuckDB not knowing how to add Booleans.

It works fine for local data frames and SQLite tables.

It also works fine in DuckDB if there is only one test as no two Booleans are added in the operations.

Reproducible example

library(pointblank)
library(duckdb)
#> Loading required package: DBI
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

con <- dbConnect(duckdb::duckdb(), "test.duckdb")
tbl_db <- copy_to(con, airquality |> tibble::rowid_to_column("idcol"), 
  "airquality_cp", temporary = TRUE, overwrite = TRUE)

ag <- create_agent(tbl_db) |> 
  col_vals_between(Wind, 1, 2) |>
  col_vals_gt(Ozone, 20) |> 
  interrogate()

ag |> get_sundered_data(id_cols = "idcol")
#> Error in `collect()`:
#> ! Failed to collect lazy table.
#> Caused by error in `dbSendQuery()`:
#> ! rapi_prepare: Failed to prepare query SELECT idcol, Ozone, "Solar.R", Wind, "Temp", "Month", "Day"
#> FROM (
#>   SELECT
#>     idcol,
#>     Ozone,
#>     "Solar.R",
#>     Wind,
#>     "Temp",
#>     "Month",
#>     "Day",
#>     CASE WHEN (pb_is_good_ = 2) THEN TRUE ELSE FALSE END AS pb_is_good_
#>   FROM (
#>     SELECT
#>       idcol,
#>       Ozone,
#>       "Solar.R",
#>       Wind,
#>       "Temp",
#>       "Month",
#>       "Day",
#>       pb_is_good_1 + pb_is_good_2 AS pb_is_good_
#>     FROM (
#>       SELECT
#>         "...1".*,
#>         pb_is_good_2,
#>         Ozone,
#>         "Solar.R",
#>         Wind,
#>         "Temp",
#>         "Month",
#>         "Day"
#>       FROM (
#>         SELECT
#>           idcol,
#>           CASE WHEN ((Wind IS NULL)) THEN FALSE ELSE pb_is_good_ END AS pb_is_good_1
#>         FROM (
#>           SELECT
#>             airquality_cp.*,
#>             CASE
#> WHEN (Wind >= 1.0 AND Wind <= 2.0) THEN TRUE
#> WHEN (Wind < 1.0 OR Wind > 2.0) THEN FALSE
#> END AS pb_is_good_
#>           FROM airquality_cp
#>         ) q01
#>       ) "...1"
#>       LEFT JOIN (
#>         SELECT
#>           idcol,
#>           CASE WHEN ((pb_is_good_ IS NULL)) THEN FALSE ELSE pb_is_good_ END AS pb_is_good_2
#>         FROM (
#>           SELECT airquality_cp.*, Ozone > 20.0 AS pb_is_good_
#>           FROM airquality_cp
#>         ) q01
#>       ) "...2"
#>         ON ("...1".idcol = "...2".idcol)
#>       LEFT JOIN (
#>         SELECT
#>           airquality_cp.*,
#>           CASE
#> WHEN (Wind >= 1.0 AND Wind <= 2.0) THEN TRUE
#> WHEN (Wind < 1.0 OR Wind > 2.0) THEN FALSE
#> END AS pb_is_good_
#>         FROM airquality_cp
#>       ) "...3"
#>         ON ("...1".idcol = "...3".idcol)
#>     ) q01
#>   ) q01
#> ) q01
#> WHERE (pb_is_good_ = 1.0)
#> LIMIT 21
#> Error: Binder Error: No function matches the given name and argument types '+(BOOLEAN, BOOLEAN)'. You might need to add explicit type casts.
#>  Candidate functions:
#>  +(TINYINT) -> TINYINT
#>  +(TINYINT, TINYINT) -> TINYINT
#>  +(SMALLINT) -> SMALLINT
#>  +(SMALLINT, SMALLINT) -> SMALLINT
#>  +(INTEGER) -> INTEGER
#>  +(INTEGER, INTEGER) -> INTEGER
#>  +(BIGINT) -> BIGINT
#>  +(BIGINT, BIGINT) -> BIGINT
#>  +(HUGEINT) -> HUGEINT
#>  +(HUGEINT, HUGEINT) -> HUGEINT
#>  +(FLOAT) -> FLOAT
#>  +(FLOAT, FLOAT) -> FLOAT
#>  +(DOUBLE) -> DOUBLE
#>  +(DOUBLE, DOUBLE) -> DOUBLE
#>  +(DECIMAL) -> DECIMAL
#>  +(DECIMAL, DECIMAL) -> DECIMAL
#>  +(UTINYINT) -> UTINYINT
#>  +(UTINYINT, UTINYINT) -> UTINYINT
#>  +(USMALLINT) -> USMALLINT
#>  +(USMALLINT, USMALLINT) -> USMALLINT
#>  +(UINTEGER) -> UINTEGER
#>  +(UINTEGER, UINTEGER) -> UINTEGER
#>  +(UBIGINT) -> UBIGINT
#>  +(UBIGINT, UBIGINT) -> UBIGINT
#>  +(UHUGEINT) -> UHUGEINT
#>  +(UHUGEINT, UHUGEINT) -> UHUGEINT
#>  +(DATE, INTEGER) -> DATE
#>  +(INTEGER, DATE) -> DATE
#>  +(INTERVAL, INTERVAL) -> INTERVAL
#>  +(DATE, INTERVAL) -> TIMESTAMP
#>  +(INTERVAL, DATE) -> TIMESTAMP
#>  +(TIME, INTERVAL) -> TIME
#>  +(INTERVAL, TIME) -> TIME
#>  +(TIMESTAMP, INTERVAL) -> TIMESTAMP
#>  +(INTERVAL, TIMESTAMP) -> TIMESTAMP
#>  +(TIME WITH TIME ZONE, INTERVAL) -> TIME WITH TIME ZONE
#>  +(INTERVAL, TIME WITH TIME ZONE) -> TIME WITH TIME ZONE
#>  +(TIME, DATE) -> TIMESTAMP
#>  +(DATE, TIME) -> TIMESTAMP
#>  +(TIME WITH TIME ZONE, DATE) -> TIMESTAMP WITH TIME ZONE
#>  +(DATE, TIME WITH TIME ZONE) -> TIMESTAMP WITH TIME ZONE
#>  +([ANY[]...]) -> ANY[]
#> 
#> 
#> LINE 21:       pb_is_good_1 + pb_is_good_2 AS pb_is_good_
#>                             ^

dbDisconnect(con)

Created on 2025-07-12 with reprex v2.1.1

Session info

sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.4.1 (2024-06-14)
#>  os       macOS 15.5
#>  system   aarch64, darwin20
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    UTF-8
#>  tz       Europe/Prague
#>  date     2025-07-12
#>  pandoc   3.6.3 @ /Applications/Positron.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version     date (UTC) lib source
#>  blastula      0.3.6       2025-04-03 [1] CRAN (R 4.4.1)
#>  blob          1.2.4       2023-03-17 [1] CRAN (R 4.4.0)
#>  cli           3.6.5.9000  2025-07-12 [1] Github (r-lib/cli@cbb3424)
#>  DBI         * 1.2.3       2024-06-02 [1] CRAN (R 4.4.0)
#>  dbplyr        2.5.0       2024-03-19 [1] CRAN (R 4.4.0)
#>  digest        0.6.37      2024-08-19 [1] CRAN (R 4.4.1)
#>  dplyr       * 1.1.4       2023-11-17 [1] CRAN (R 4.4.0)
#>  duckdb      * 1.3.0       2025-06-02 [1] CRAN (R 4.4.1)
#>  evaluate      1.0.4       2025-06-18 [1] CRAN (R 4.4.1)
#>  fastmap       1.2.0       2024-05-15 [1] CRAN (R 4.4.0)
#>  fs            1.6.6       2025-04-12 [1] CRAN (R 4.4.1)
#>  generics      0.1.4       2025-05-09 [1] CRAN (R 4.4.1)
#>  glue          1.8.0       2024-09-30 [1] CRAN (R 4.4.1)
#>  htmltools     0.5.8.1     2024-04-04 [1] CRAN (R 4.4.0)
#>  knitr         1.50        2025-03-16 [1] CRAN (R 4.4.1)
#>  lifecycle     1.0.4       2023-11-07 [1] CRAN (R 4.4.0)
#>  magrittr      2.0.3       2022-03-30 [1] CRAN (R 4.4.0)
#>  pillar        1.11.0      2025-07-04 [1] CRAN (R 4.4.1)
#>  pkgconfig     2.0.3       2019-09-22 [1] CRAN (R 4.4.0)
#>  pointblank  * 0.12.2.9000 2025-07-12 [1] Github (rstudio/pointblank@8371ca0)
#>  purrr         1.1.0       2025-07-10 [1] RSPM (R 4.4.0)
#>  R6            2.6.1       2025-02-15 [1] CRAN (R 4.4.1)
#>  reprex        2.1.1       2024-07-06 [1] CRAN (R 4.4.0)
#>  rlang         1.1.6       2025-04-11 [1] CRAN (R 4.4.1)
#>  rmarkdown     2.29        2024-11-04 [1] CRAN (R 4.4.1)
#>  sessioninfo   1.2.2       2021-12-06 [1] CRAN (R 4.4.0)
#>  tibble        3.3.0       2025-06-08 [1] CRAN (R 4.4.1)
#>  tidyselect    1.2.1       2024-03-11 [1] CRAN (R 4.4.0)
#>  vctrs         0.6.5       2023-12-01 [1] CRAN (R 4.4.0)
#>  withr         3.0.2       2024-10-28 [1] CRAN (R 4.4.1)
#>  xfun          0.52        2025-04-02 [1] CRAN (R 4.4.1)
#>  yaml          2.3.10      2024-07-26 [1] CRAN (R 4.4.1)
#> 
#>  [1] /Users/petr/Library/R/4.4/library
#>  [2] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────

Expected result

This should work the same as for local data frames.

Session info

see reprex

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions