Skip to content

Commit d24d88d

Browse files
authored
Merge pull request #371 from xiaodaigh/development
Development for v0.6
2 parents b522999 + 4f0f581 commit d24d88d

File tree

198 files changed

+19227
-13544
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

198 files changed

+19227
-13544
lines changed

.Rbuildignore

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
^renv$
2+
^renv\.lock$
13
^.*\.Rproj$
24
^\.github$
35
^manuscript$
@@ -65,4 +67,6 @@ vignettes.Rnw.template
6567
^codecov\.yml$
6668
new-nse-dev.r
6769
test-poorman.R
68-
*.parquet
70+
.parquet$
71+
maditr-devs.r
72+
^CRAN-SUBMISSION$

CRAN-RELEASE

Lines changed: 0 additions & 2 deletions
This file was deleted.

DESCRIPTION

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
Type: Package
22
Package: disk.frame
33
Title: Larger-than-RAM Disk-Based Data Manipulation Framework
4-
Version: 0.5.0
5-
Date: 2021-05-09
4+
Version: 0.6.0
5+
Date: 2022-01-31
66
Authors@R: c(
77
person("Dai", "ZJ", email = "zhuojia.dai@gmail.com", role = c("aut", "cre")),
88
person("Jacky", "Poon", role = c("ctb"))
@@ -17,27 +17,24 @@ License: MIT + file LICENSE
1717
Imports:
1818
Rcpp (>= 0.12.13),
1919
glue (>= 1.3.1),
20-
rlang (>= 0.4.0),
2120
future.apply (>= 1.3.0),
2221
fs (>= 1.3.1),
2322
jsonlite (>= 1.6),
2423
pryr (>= 0.1.4),
2524
stringr (>= 1.4.0),
2625
fst (>= 0.8.0),
27-
globals (>= 0.12.4),
2826
future (>= 1.14.0),
2927
data.table (>= 1.12.2),
3028
crayon (>= 1.3.4),
3129
bigreadr (>= 0.2.0),
32-
furrr (>= 0.2.2),
3330
bit64,
34-
benchmarkme
31+
benchmarkme,
32+
purrr (>= 0.3.2),
33+
rlang
3534
Depends:
3635
R (>= 3.4),
37-
dplyr (>= 1.0.0),
38-
purrr (>= 0.3.2)
36+
dplyr (>= 1.0.0)
3937
Suggests:
40-
testthat (>= 2.1.0),
4138
nycflights13,
4239
magrittr,
4340
shiny,
@@ -49,10 +46,11 @@ Suggests:
4946
speedglm,
5047
broom,
5148
ggplot2,
52-
covr
49+
rmarkdown
5350
LinkingTo:
5451
Rcpp
55-
RoxygenNote: 7.1.1
52+
RoxygenNote: 7.1.2
53+
VignetteBuilder: rmarkdown
5654
Encoding: UTF-8
5755
URL: https://diskframe.com
5856
BugReports: https://github.com/xiaodaigh/disk.frame/issues

NAMESPACE

Lines changed: 3 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -17,31 +17,17 @@ S3method(colnames,disk.frame)
1717
S3method(compute,disk.frame)
1818
S3method(delayed,disk.frame)
1919
S3method(distinct,disk.frame)
20-
S3method(do,disk.frame)
2120
S3method(filter,disk.frame)
2221
S3method(full_join,disk.frame)
2322
S3method(get_chunk,disk.frame)
2423
S3method(glimpse,disk.frame)
2524
S3method(group_by,disk.frame)
2625
S3method(group_vars,disk.frame)
2726
S3method(groups,disk.frame)
28-
S3method(hard_arrange,data.frame)
29-
S3method(hard_arrange,disk.frame)
30-
S3method(hard_group_by,data.frame)
31-
S3method(hard_group_by,disk.frame)
3227
S3method(head,disk.frame)
33-
S3method(imap,default)
34-
S3method(imap_dfr,default)
35-
S3method(imap_dfr,disk.frame)
3628
S3method(inner_join,disk.frame)
3729
S3method(lazy,disk.frame)
3830
S3method(left_join,disk.frame)
39-
S3method(map,default)
40-
S3method(map,disk.frame)
41-
S3method(map2,default)
42-
S3method(map2,disk.frame)
43-
S3method(map_dfr,default)
44-
S3method(map_dfr,disk.frame)
4531
S3method(merge,disk.frame)
4632
S3method(mutate,disk.frame)
4733
S3method(names,disk.frame)
@@ -67,22 +53,22 @@ S3method(transmute,disk.frame)
6753
export(IQR_df.chunk_agg.disk.frame)
6854
export(IQR_df.collected_agg.disk.frame)
6955
export(add_chunk)
70-
export(add_tally.disk.frame)
7156
export(all_df.chunk_agg.disk.frame)
7257
export(all_df.collected_agg.disk.frame)
7358
export(any_df.chunk_agg.disk.frame)
7459
export(any_df.collected_agg.disk.frame)
7560
export(as.disk.frame)
61+
export(bind_rows.disk.frame)
7662
export(ceremony_text)
7763
export(chunk_arrange)
7864
export(chunk_distinct)
7965
export(chunk_group_by)
80-
export(chunk_lapply)
8166
export(chunk_summarise)
8267
export(chunk_summarize)
8368
export(chunk_ungroup)
8469
export(cimap)
8570
export(cimap_dfr)
71+
export(clapply)
8672
export(cmap)
8773
export(cmap2)
8874
export(cmap_dfr)
@@ -102,18 +88,12 @@ export(foverlaps.disk.frame)
10288
export(gen_datatable_synthetic)
10389
export(get_chunk)
10490
export(get_chunk_ids)
105-
export(hard_arrange)
106-
export(hard_group_by)
107-
export(imap)
108-
export(imap_dfr)
10991
export(insert_ceremony)
11092
export(is_disk.frame)
11193
export(lazy)
11294
export(length_df.chunk_agg.disk.frame)
11395
export(length_df.collected_agg.disk.frame)
11496
export(make_glm_streaming_fn)
115-
export(map)
116-
export(map2)
11797
export(map_by_chunk_id)
11898
export(max_df.chunk_agg.disk.frame)
11999
export(max_df.collected_agg.disk.frame)
@@ -148,7 +128,6 @@ export(shardkey_equal)
148128
export(show_boilerplate)
149129
export(show_ceremony)
150130
export(srckeep)
151-
export(srckeepchunks)
152131
export(sum_df.chunk_agg.disk.frame)
153132
export(sum_df.collected_agg.disk.frame)
154133
export(var_df.chunk_agg.disk.frame)
@@ -172,10 +151,8 @@ importFrom(data.table,foverlaps)
172151
importFrom(data.table,fread)
173152
importFrom(data.table,rbindlist)
174153
importFrom(data.table,setDT)
175-
importFrom(data.table,setkey)
176154
importFrom(data.table,setkeyv)
177155
importFrom(data.table,timetaken)
178-
importFrom(dplyr,add_tally)
179156
importFrom(dplyr,anti_join)
180157
importFrom(dplyr,arrange)
181158
importFrom(dplyr,bind_rows)
@@ -218,7 +195,6 @@ importFrom(future,nbrOfWorkers)
218195
importFrom(future,plan)
219196
importFrom(future,sequential)
220197
importFrom(future.apply,future_lapply)
221-
importFrom(globals,findGlobals)
222198
importFrom(glue,glue)
223199
importFrom(jsonlite,fromJSON)
224200
importFrom(jsonlite,toJSON)
@@ -230,19 +206,14 @@ importFrom(purrr,map2)
230206
importFrom(purrr,map_chr)
231207
importFrom(purrr,map_dfr)
232208
importFrom(purrr,map_lgl)
233-
importFrom(rlang,enquos)
234-
importFrom(rlang,eval_tidy)
235-
importFrom(rlang,quo)
209+
importFrom(rlang,enexpr)
236210
importFrom(stats,median)
237211
importFrom(stats,quantile)
238212
importFrom(stats,runif)
239213
importFrom(stringr,fixed)
240214
importFrom(utils,capture.output)
241215
importFrom(utils,head)
242216
importFrom(utils,memory.limit)
243-
importFrom(utils,methods)
244-
importFrom(utils,setTxtProgressBar)
245217
importFrom(utils,tail)
246-
importFrom(utils,txtProgressBar)
247218
importFrom(utils,unzip)
248219
useDynLib(disk.frame)

NEWS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
# disk.frame 0.6
2+
* Much better NSE support in disk.frame!
3+
* removed `hard_arrange` and `hard_group_by`
4+
* various API updates
5+
16
# disk.frame 0.5
27
* removed `add_count` method
38

R/add_chunk.r

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,9 +116,10 @@ add_chunk <- function(df, chunk, chunk_id = NULL, full.names = FALSE, ...) {
116116

117117
data.table::setDT(check_vars)
118118
if(nrow(check_vars[is.na(new_chunk)]) > 0) {
119+
vars_strings = paste0(check_vars[is.na(new_chunk), colnames], collapse=',\n ')
119120
warning(
120-
glue::glue(
121-
"these variables are in the disk.frame but not in the new chunk: \n {paste0(check_vars[is.na(new_chunk), colnames], collapse=',\n ')}"))
121+
sprintf(
122+
"these variables are in the disk.frame but not in the new chunk: \n %s", vars_strings))
122123
}
123124
if(nrow(check_vars[is.na(existing_df)]) > 0){
124125
warning(glue::glue("these variables are in the new chunk but not in the existing disk.frame: {paste0(check_vars[is.na(existing_df), colnames], collapse=', ')}"))

R/anti_join.r

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
#' @param merge_by_chunk_id the merge is performed by chunk id
44
#' @param overwrite overwrite output directory
55
#' @param .progress Show progress or not. Defaults to FALSE
6+
#' @param suffix see the `suffix` argument of dplyr's `*_join` functions (e.g. `dplyr::left_join`)
7+
#' @param keep see the `keep` argument of dplyr's `*_join` functions (e.g. `dplyr::left_join`)
68
#' @param ... same as dplyr's joins
79
#' @rdname join
8-
#' @importFrom rlang quo enquos
910
#' @importFrom dplyr anti_join left_join full_join semi_join inner_join
1011
#' @return disk.frame or data.frame/data.table
1112
#' @export
@@ -29,11 +30,11 @@ anti_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
2930
overwrite_check(outdir, overwrite)
3031

3132
if("data.frame" %in% class(y)) {
32-
quo_dotdotdot = enquos(...)
33-
cmap_dfr.disk.frame(x, ~{
34-
code = quo(anti_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
35-
rlang::eval_tidy(code)
33+
tmp = cmap.disk.frame(x, ~{
34+
anti_join(.x, y, by = by, copy = copy, ...)
3635
}, .progress = .progress)
36+
37+
return(tmp)
3738
} else if("disk.frame" %in% class(y)) {
3839
if(is.null(merge_by_chunk_id)) {
3940
stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly")
@@ -47,12 +48,12 @@ anti_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
4748
if (merge_by_chunk_id == FALSE) {
4849
warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.")
4950

50-
x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE)
51-
y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE)
51+
ncxy = max(ncy,ncx)
52+
x = rechunk(x, shardby=by, nchunks = ncxy, outdir=tempfile(fileext = ".jdf"), overwrite = FALSE)
53+
y = rechunk(y, shardby=by, nchunks =ncxy, outdir=tempfile(fileext = ".jdf"), overwrite = FALSE)
5254
return(anti_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite))
5355
} else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
5456
res = cmap2.disk.frame(x, y, ~{
55-
#res = cmap2(x, y, ~{
5657
if(is.null(.y)) {
5758
return(.x)
5859
} else if (is.null(.x)) {

R/as.disk.frame.r

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
#' delete(cars_new_location.df)
2626
#' delete(cars_chunks.df)
2727
as.disk.frame <- function(df, outdir = tempfile(fileext = ".df"), nchunks = recommend_nchunks(df), overwrite = FALSE, shardby = NULL, compress = 50,...) {
28-
2928
stopifnot("data.frame" %in% class(df))
3029
overwrite_check(outdir, overwrite)
3130
data.table::setDT(df)

R/bind_rows.r

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#' Bind rows
2+
#' @param ... disk.frames to be bound together by row
3+
#' @export
4+
bind_rows.disk.frame <- function(...) {
5+
rbindlist.disk.frame(list(...))
6+
}

R/chunk_mapper.r

Lines changed: 37 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -31,59 +31,49 @@
3131
#' @param chunk_fn The dplyr function to create a mapper for
3232
#' @param warning_msg The warning message to display when invoking the mapper
3333
#' @param as.data.frame Deprecated: formerly forced each input chunk to be a data.frame (needed for dtplyr); passing TRUE now only emits a deprecation warning
34-
#' @importFrom rlang enquos quo
3534
#' @export
36-
create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = TRUE) {
37-
return_func <- function(.data, ...) {
38-
if (!is.null(warning_msg)) {
35+
create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = FALSE) {
36+
if(as.data.frame) {
37+
warning("`as.data.frame` is deprecated in create_chunk_mapper")
38+
}
39+
40+
return(function(.data, ...) {
41+
if(!is.null(warning_msg)) {
3942
warning(warning_msg)
4043
}
4144

45+
# you need to use list otherwise the names will be gone
46+
code = substitute(chunk_fn(.disk.frame.chunk, ...))
4247

43-
quo_dotdotdot = rlang::enquos(...)
48+
if (paste0(deparse(code), collapse="") == "chunk_fn(NULL)") {
49+
globals_and_pkgs = future::getGlobalsAndPackages(expression(chunk_fn()))
50+
} else {
51+
globals_and_pkgs = future::getGlobalsAndPackages(code)
52+
}
4453

45-
# this is designed to capture any global stuff
46-
vars_and_pkgs = future::getGlobalsAndPackages(quo_dotdotdot)
47-
data_for_eval_tidy = force(vars_and_pkgs$globals)
4854

49-
res = cmap(.data, ~{
50-
51-
this_env = environment()
52-
53-
if(length(data_for_eval_tidy) > 0) {
54-
for(i in 1:length(data_for_eval_tidy)) {
55-
assign(names(data_for_eval_tidy)[i], data_for_eval_tidy[[i]], pos = this_env)
56-
}
57-
}
58-
59-
lapply(quo_dotdotdot, function(x) {
60-
attr(x, ".Environment") = this_env
61-
})
62-
63-
if(as.data.frame) {
64-
if("grouped_df" %in% class(.x)) {
65-
code = rlang::quo(chunk_fn(.x, !!!quo_dotdotdot))
66-
} else {
67-
code = rlang::quo(chunk_fn(as.data.frame(.x), !!!quo_dotdotdot))
68-
}
69-
} else {
70-
code = rlang::quo(chunk_fn(.x, !!!quo_dotdotdot))
55+
global_vars = globals_and_pkgs$globals
56+
57+
env = parent.frame()
58+
59+
done = identical(env, emptyenv()) || identical(env, globalenv())
60+
61+
# keep adding global variables by moving up the environment chain
62+
while(!done) {
63+
tmp_globals_and_pkgs = future::getGlobalsAndPackages(code, envir = env)
64+
new_global_vars = tmp_globals_and_pkgs$globals
65+
for (name in setdiff(names(new_global_vars), names(global_vars))) {
66+
global_vars[[name]] <- new_global_vars[[name]]
7167
}
7268

73-
# ZJ: we need both approaches. TRUST ME
74-
# TODO better NSE at some point need dist
75-
tryCatch({
76-
return(rlang::eval_tidy(code))
77-
}, error = function(e) {
78-
as_label_code = rlang::as_label(code)
79-
if(as_label_code == "chunk_fn(...)") {
80-
stop(glue::glue("disk.frame has detected a syntax error in \n\n`{code}`\n\n. If you believe your syntax is correct, raise an issue at https://github.com/xiaodaigh/disk.frame with a MWE"))
81-
} else {
82-
# likely to be dealing with data.tables
83-
return(eval(parse(text=as_label_code), envir = this_env))
84-
}
85-
})
86-
}, lazy = TRUE)
87-
}
88-
return_func
89-
}
69+
done = identical(env, emptyenv()) || identical(env, globalenv())
70+
env = parent.env(env)
71+
}
72+
73+
globals_and_pkgs$globals = global_vars
74+
75+
attr(.data, "recordings") = c(attr(.data, "recordings"), list(globals_and_pkgs))
76+
77+
.data
78+
})
79+
}

R/clapply.r

Whitespace-only changes.

0 commit comments

Comments
 (0)