Add files via upload

hope-data-science · web-flow · commit c864c38c538a · 2020-05-28T17:11:05.000+08:00
diff --git a/CRAN-RELEASE b/CRAN-RELEASE
@@ -1,2 +1,2 @@
-This package was submitted to CRAN on 2020-05-02.
+This package was submitted to CRAN on 2020-05-28.
 Once it is accepted, delete this file and tag the release (commit fbccc9581e).
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tidyfst
 Title: Tidy Verbs for Fast Data Manipulation
-Version: 0.9.6
+Version: 0.9.7
 Depends: R (>= 3.3.0)
 Authors@R: 
     person(given = "Tian-Yuan",
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,7 +3,6 @@
 export("%>%")
 export(CJ)
 export(add_count_dt)
-export(ajoin)
 export(anti_join_dt)
 export(arrange_dt)
 export(as.data.table)
@@ -28,7 +27,6 @@ export(fill_na_dt)
 export(filter_dt)
 export(filter_fst)
 export(fintersect)
-export(fjoin)
 export(fread)
 export(frollapply)
 export(fsetdiff)
@@ -39,7 +37,6 @@ export(fwrite)
 export(group_by_dt)
 export(group_dt)
 export(group_exe_dt)
-export(ijoin)
 export(import_fst)
 export(impute_dt)
 export(in_dt)
@@ -50,7 +47,6 @@ export(lag_dt)
 export(lead_dt)
 export(left_join_dt)
 export(like)
-export(ljoin)
 export(longer_dt)
 export(mat_df)
 export(mutate_dt)
@@ -67,7 +63,6 @@ export(rename_dt)
 export(replace_dt)
 export(replace_na_dt)
 export(right_join_dt)
-export(rjoin)
 export(rn_col)
 export(rowwise_dt)
 export(sample_dt)
@@ -81,9 +76,13 @@ export(separate_dt)
 export(setdiff_dt)
 export(setequal_dt)
 export(shift_fill)
-export(sjoin)
 export(slice_dt)
 export(slice_fst)
+export(slice_head_dt)
+export(slice_max_dt)
+export(slice_min_dt)
+export(slice_sample_dt)
+export(slice_tail_dt)
 export(squeeze_dt)
 export(summarise_dt)
 export(summarise_vars)
@@ -113,4 +112,6 @@ import(stringr)
 importFrom(stats,median)
 importFrom(stats,na.omit)
 importFrom(stats,setNames)
+importFrom(utils,head)
 importFrom(utils,object.size)
+importFrom(utils,tail)
diff --git a/R/_global_setting.R b/R/_global_setting.R
@@ -3,7 +3,7 @@
 #' @import fst
 #' @import stringr
 #' @importFrom stats na.omit setNames median
-#' @importFrom utils object.size
+#' @importFrom utils object.size head tail
 
 #' @export
 stringr::`%>%`
diff --git a/R/complete.R b/R/complete.R
@@ -41,6 +41,7 @@ complete_dt = function(.data,...,fill = NA){
   if(
     substitute(list(...)) %>%
     deparse() %>%
+    .[1] %>%
     str_detect("=")
   ) {
     list(...) %>%
diff --git a/R/join.R b/R/join.R
@@ -0,0 +1,198 @@
+
+#' @title Join tables
+#' @name join
+#' @description  The mutating joins add columns from `y` to `x`,
+#' matching rows based on the keys:
+#'
+#' * `inner_join_dt()`: includes only rows with matching keys in both `x` and `y`.
+#' * `left_join_dt()`: includes all rows in `x`.
+#' * `right_join_dt()`: includes all rows in `y`.
+#' * `full_join_dt()`: includes all rows in `x` or `y`.
+#' @description
+#' Filtering joins filter rows from `x` based on the presence or absence
+#' of matches in `y`:
+#'
+#' * `semi_join_dt()` returns all rows from `x` with a match in `y`.
+#' * `anti_join_dt()` returns all rows from `x` without a match in `y`.
+#' @param x A data.table
+#' @param y A data.table
+#' @param by (Optional) A character vector of variables to join by.
+#'
+#'   If `NULL`, the default, `*_join_dt()` will perform a natural join, using all
+#'   variables in common across `x` and `y`. A message lists the variables so that you
+#'   can check they're correct; suppress the message by supplying `by` explicitly.
+#'
+#'   To join by different variables on `x` and `y`, use a named vector.
+#'   For example, `by = c("a" = "b")` will match `x$a` to `y$b`.
+#'
+#'   To join by multiple variables, use a vector with length > 1.
+#'   For example, `by = c("a", "b")` will match `x$a` to `y$a` and `x$b` to
+#'   `y$b`. Use a named vector to match different variables in `x` and `y`.
+#'   For example, `by = c("a" = "b", "c" = "d")` will match `x$a` to `y$b` and
+#'   `x$c` to `y$d`.
+#' @param on  (Optional)
+#' Indicate which columns in x should be joined with which columns in y.
+#' Examples included:
+#'   1.\code{on = c("a","b")} (this is a must for \code{full_join_dt});
+#'   2.\code{on = c(x1="y1", x2="y2")};
+#'   3.\code{on = c("x1==y1", "x2==y2")};
+#'   4.\code{on = c("a", V2="b")};
+#'   5.\code{on = .(a, b)};
+#'   6.\code{on = c("x>=a", "y<=b")} or \code{on = .(x>=a, y<=b)}.
+#' @param suffix If there are non-joined duplicate variables in x and y, these
+#'   suffixes will be added to the output to disambiguate them. Should be a
+#'   character vector of length 2.
+#' @return A data.table
+#' @examples
+#'
+#' workers = fread("
+#'     name company
+#'     Nick Acme
+#'     John Ajax
+#'     Daniela Ajax
+#' ")
+#'
+#' positions = fread("
+#'     name position
+#'     John designer
+#'     Daniela engineer
+#'     Cathie manager
+#' ")
+#'
+#' workers %>% inner_join_dt(positions)
+#' workers %>% left_join_dt(positions)
+#' workers %>% right_join_dt(positions)
+#' workers %>% full_join_dt(positions)
+#'
+#' # filtering joins
+#' workers %>% anti_join_dt(positions)
+#' workers %>% semi_join_dt(positions)
+#'
+#' # To suppress the message, supply 'by' argument
+#' workers %>% left_join_dt(positions, by = "name")
+#'
+#' # Use a named 'by' if the join variables have different names
+#' positions2 = setNames(positions, c("worker", "position")) # rename first column in 'positions'
+#' workers %>% inner_join_dt(positions2, by = c("name" = "worker"))
+#'
+#' # the syntax of 'on' could be a bit different
+#' workers %>% inner_join_dt(positions2,on = "name==worker")
+#'
+#'
+
+#' @rdname join
+#' @export
+inner_join_dt = function(x,y,by = NULL, on = NULL,suffix = c(".x",".y")){
+  # Inner join: keep only the rows of `x` and `y` whose keys match.
+  # `on` takes precedence over `by`; with neither, join on all shared columns.
+  x = as_dt(x)
+  y = as_dt(y)
+  # Test the captured (unevaluated) arguments with is.null() instead of
+  # comparing deparse() output to "NULL": deparse() returns several strings
+  # for a long expression, which makes the `if` condition invalid.
+  on_given = !is.null(substitute(on))
+  by_given = !is.null(substitute(by))
+  if(on_given) x[y, nomatch = 0L, on = on]
+  else if(!by_given){
+    # Natural join: use every column name shared by both tables and say so.
+    by = intersect(names(x), names(y))
+    by_name = str_c(by, collapse = ",")
+    message(str_glue("Joining by: {by_name}\n\n"))
+    merge.data.table(x,y,by = by,suffixes = suffix)
+  }else if(is.null(names(by))) merge.data.table(x,y,by = by,suffixes = suffix)
+  else merge.data.table(x,y,by.x = names(by),by.y = by,suffixes = suffix) # named `by`: names are x columns, values are y columns
+}
+
+#' @rdname join
+#' @export
+left_join_dt = function(x,y,by = NULL, on = NULL,suffix = c(".x",".y")){
+  # Left join: keep every row of `x`, adding the columns of `y` where keys match.
+  # `on` takes precedence over `by`; with neither, join on all shared columns.
+  x = as_dt(x)
+  y = as_dt(y)
+  # Test the captured (unevaluated) arguments with is.null() instead of
+  # comparing deparse() output to "NULL": deparse() returns several strings
+  # for a long expression, which makes the `if` condition invalid.
+  on_given = !is.null(substitute(on))
+  by_given = !is.null(substitute(by))
+  if(on_given) y[x, on = on]
+  else if(!by_given){
+    # Natural join: use every column name shared by both tables and say so.
+    by = intersect(names(x), names(y))
+    by_name = str_c(by, collapse = ",")
+    message(str_glue("Joining by: {by_name}\n\n"))
+    merge.data.table(x,y,by = by,all.x = TRUE,suffixes = suffix)
+  }else if(is.null(names(by))) merge.data.table(x,y,by = by,all.x = TRUE,suffixes = suffix)
+  else merge.data.table(x,y,by.x = names(by),by.y = by,all.x = TRUE,suffixes = suffix) # named `by`: names are x columns, values are y columns
+}
+
+#' @rdname join
+#' @export
+right_join_dt = function(x,y,by = NULL, on = NULL,suffix = c(".x",".y")){
+  # Right join: keep every row of `y`, adding the columns of `x` where keys match.
+  # `on` takes precedence over `by`; with neither, join on all shared columns.
+  x = as_dt(x)
+  y = as_dt(y)
+  # Test the captured (unevaluated) arguments with is.null() instead of
+  # comparing deparse() output to "NULL": deparse() returns several strings
+  # for a long expression, which makes the `if` condition invalid.
+  on_given = !is.null(substitute(on))
+  by_given = !is.null(substitute(by))
+  if(on_given) x[y, on = on]
+  else if(!by_given){
+    # Natural join: use every column name shared by both tables and say so.
+    by = intersect(names(x), names(y))
+    by_name = str_c(by, collapse = ",")
+    message(str_glue("Joining by: {by_name}\n\n"))
+    merge.data.table(x,y,by = by,all.y = TRUE,suffixes = suffix)
+  }else if(is.null(names(by))) merge.data.table(x,y,by = by,all.y = TRUE,suffixes = suffix)
+  else merge.data.table(x,y,by.x = names(by),by.y = by,all.y = TRUE,suffixes = suffix) # named `by`: names are x columns, values are y columns
+}
+
+#' @rdname join
+#' @export
+full_join_dt = function(x,y,by = NULL, on = NULL,suffix = c(".x",".y")){
+  # Full join: keep every row of `x` and of `y`.
+  # `on` takes precedence over `by`; with neither, join on all shared columns.
+  x = as_dt(x)
+  y = as_dt(y)
+  # Test the captured (unevaluated) arguments with is.null() instead of
+  # comparing deparse() output to "NULL": deparse() returns several strings
+  # for a long expression, which makes the `if` condition invalid.
+  on_given = !is.null(substitute(on))
+  by_given = !is.null(substitute(by))
+  if(on_given) {
+    # Key columns come from `by` when it is supplied, otherwise from `on`.
+    # They are passed straight to .SDcols, so they have to name columns.
+    # (The previous test compared against lowercase "null", which never
+    # matched, so the `on`-only case was unreachable.)
+    key_cols = if(by_given) by else on
+    # Gather every distinct key combination present in either table, then
+    # look both tables up against that combined key set.
+    unique_keys = unique(rbind(x[, .SD, .SDcols = key_cols],
+                               y[, .SD, .SDcols = key_cols]))
+    y[x[.(unique_keys), on = on], on = on]
+  } else if(!by_given){
+    # Natural join: use every column name shared by both tables and say so.
+    by = intersect(names(x), names(y))
+    by_name = str_c(by, collapse = ",")
+    message(str_glue("Joining by: {by_name}\n\n"))
+    merge.data.table(x,y,by = by,all = TRUE,suffixes = suffix)
+  }else if(is.null(names(by))) merge.data.table(x,y,by = by,all = TRUE,suffixes = suffix)
+  else merge.data.table(x,y,by.x = names(by),by.y = by,all = TRUE,suffixes = suffix) # named `by`: names are x columns, values are y columns
+}
+
+#' @rdname join
+#' @export
+anti_join_dt = function(x,y,by = NULL, on = NULL){
+  # Anti join: return the rows of `x` that have no match in `y`.
+  # `on` takes precedence over `by`; with neither, match on all shared columns.
+  x = as_dt(x)
+  y = as_dt(y)
+  # Test the captured (unevaluated) arguments with is.null() instead of
+  # comparing deparse() output to "NULL": deparse() returns several strings
+  # for a long expression, which makes the `if` condition invalid.
+  on_given = !is.null(substitute(on))
+  by_given = !is.null(substitute(by))
+  if(on_given) x[!y, on = on]
+  else{
+    if(!by_given){
+      # Natural join: use every column name shared by both tables and say so.
+      by = intersect(names(x), names(y))
+      by_name = str_c(by, collapse = ",")
+      message(str_glue("Joining by: {by_name}\n\n"))
+    }
+    x[!y, on = by]
+  }
+}
+
+#' @rdname join
+#' @export
+semi_join_dt = function(x,y,by = NULL, on = NULL){
+  # Semi join: return the rows of `x` that have a match in `y`, each at most once.
+  # `on` takes precedence over `by`; with neither, match on all shared columns.
+  x = as_dt(x)
+  y = as_dt(y)
+  # Test the captured (unevaluated) arguments with is.null() instead of
+  # comparing deparse() output to "NULL": deparse() returns several strings
+  # for a long expression, which makes the `if` condition invalid.
+  on_given = !is.null(substitute(on))
+  by_given = !is.null(substitute(by))
+  if(on_given) {
+    # which = TRUE yields the matching row numbers of `x`; unique() removes
+    # the repeats produced when one row of `x` matches several rows of `y`.
+    w = unique(x[y, on = on, nomatch = 0L, which = TRUE, allow.cartesian = TRUE])
+    x[w]
+  }
+  else{
+    if(!by_given){
+      # Natural join: use every column name shared by both tables and say so.
+      by = intersect(names(x), names(y))
+      by_name = str_c(by, collapse = ",")
+      message(str_glue("Joining by: {by_name}\n\n"))
+    }
+    w = unique(x[y, on = by, nomatch = 0L, which = TRUE, allow.cartesian = TRUE])
+    x[w]
+  }
+}
diff --git a/R/nest_dt.R b/R/nest_dt.R
@@ -145,7 +145,7 @@ unnest_dt = function(.data,...){
   if(length(col_names) == 1) unnest_col(dt,...)
   else
     lapply(col_names,function(x) unnest_col(dt,cols = x)) %>%
-    Reduce(x = ., f = function(x,y) merge(x,y))
+    Reduce(x = ., f = function(x,y) merge(x,y,all = TRUE))
 }
 
 unnest_col = function(.data,...){
diff --git a/R/separate.R b/R/separate.R
@@ -15,40 +15,57 @@
 #' df %>% separate_dt(x, c("A", "B"))
 #' # equals to
 #' df %>% separate_dt("x", c("A", "B"))
+#'
+#' # If you just want the second variable:
+#' df %>% separate_dt(x,into = c(NA,"B"))
 
 #' @export
 separate_dt = function(.data,separated_colname,into,
-                    sep = "[^[:alnum:]]+",
-                    remove = TRUE){
+                       sep = "[^[:alnum:]]+",
+                       remove = TRUE){ # split column `separated_colname` on regex `sep` into the columns named by `into`
   dt = as.data.table(.data)
   substitute(separated_colname) %>% deparse() -> parse_name
   if(!str_detect(parse_name,"^\"")) separated_colname = parse_name
 
+  if(anyNA(into)) into[is.na(into)] = "NA_COL_" # NA entries in `into` are given a placeholder name
+
   dt[[separated_colname]] %>%
     tstrsplit(split = sep) %>%
     setDT() %>%
-    setnames(names(.),into) -> split_columns
+    setnames(names(.),into) %>%
+    select_dt(-"NA_COL_")-> split_columns # presumably drops the placeholder column; NOTE(review): this runs even when `into` has no NA -- confirm select_dt tolerates the column being absent
   if(remove)
     dt[,(separated_colname):=NULL][,names(split_columns):=split_columns][]
   else dt[,names(split_columns):=split_columns][]
 
 }
 
-#' separate_dt = function(.data,separated_colname,into,
-#'                        sep = "[^[:alnum:]]+",
-#'                        remove = TRUE){
-#'   dt = as_dt(.data)
-#'   substitute(separated_colname) %>% deparse() -> parse_name
-#'   if(!str_detect(parse_name,"^\"")) separated_colname = parse_name
-#'
-#'   dt[[separated_colname]] %>%
-#'     tstrsplit(split = sep) %>%
-#'     setNames(into) %>%
-#'     as.data.table() -> split_columns
-#'   if(remove) cbind(dt[,.SD,.SDcols = -separated_colname],split_columns)
-#'   else cbind(dt,split_columns)
-#'
-#' }
+
+# separate_dt = function(.data,separated_colname,into,
+#                     sep = "[^[:alnum:]]+",
+#                     remove = TRUE){
+#   dt = as.data.table(.data)
+#   substitute(separated_colname) %>% deparse() -> parse_name
+#   if(!str_detect(parse_name,"^\"")) separated_colname = parse_name
+#
+#   dt[[separated_colname]] %>%
+#     tstrsplit(split = sep) %>%
+#     setDT() %>%
+#     setnames(names(.),into) -> split_columns
+#   if(remove)
+#     dt[,(separated_colname):=NULL][,names(split_columns):=split_columns][]
+#   else dt[,names(split_columns):=split_columns][]
+#
+# }
+
+
+
+
+
+
+
+
+
 
 
 
diff --git a/R/slice.R b/R/slice.R
diff --git a/README.md b/README.md
diff --git a/cran-comments.md b/cran-comments.md

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`		`-This package was submitted to CRAN on 2020-05-02.`
	`1`	`+This package was submitted to CRAN on 2020-05-28.`
`2`	`2`	`Once it is accepted, delete this file and tag the release (commit fbccc9581e).`
Original file line number	Diff line number	Diff line change
`@@ -145,7 +145,7 @@ unnest_dt = function(.data,...){`
`145`	`145`	`if(length(col_names) == 1) unnest_col(dt,...)`
`146`	`146`	`else`
`147`	`147`	`lapply(col_names,function(x) unnest_col(dt,cols = x)) %>%`
`148`		`- Reduce(x = ., f = function(x,y) merge(x,y))`
	`148`	`+ Reduce(x = ., f = function(x,y) merge(x,y,all = TRUE))`
`149`	`149`	`}`
`150`	`150`
`151`	`151`	`unnest_col = function(.data,...){`