|
| 1 | + |
| 2 | +#' @title Join tables |
| 3 | +#' @name join |
| 4 | +#' @description The mutating joins add columns from `y` to `x`, |
| 5 | +#' matching rows based on the keys: |
| 6 | +#' |
| 7 | +#' * `inner_join_dt()`: includes all rows in `x` and `y`. |
| 8 | +#' * `left_join_dt()`: includes all rows in `x`. |
| 9 | +#' * `right_join_dt()`: includes all rows in `y`. |
| 10 | +#' * `full_join_dt()`: includes all rows in `x` or `y`. |
| 11 | +#' @description |
| 12 | +#' Filtering joins filter rows from `x` based on the presence or absence |
| 13 | +#' of matches in `y`: |
| 14 | +#' |
| 15 | +#' * `semi_join_dt()` return all rows from `x` with a match in `y`. |
| 16 | +#' * `anti_join_dt()` return all rows from `x` without a match in `y`. |
| 17 | +#' @param x A data.table |
| 18 | +#' @param y A data.table |
| 19 | +#' @param by (Optional) A character vector of variables to join by. |
| 20 | +#' |
| 21 | +#' If `NULL`, the default, `*_join_dt()` will perform a natural join, using all |
| 22 | +#' variables in common across `x` and `y`. A message lists the variables so that you |
| 23 | +#' can check they're correct; suppress the message by supplying `by` explicitly. |
| 24 | +#' |
| 25 | +#' To join by different variables on `x` and `y`, use a named vector. |
| 26 | +#' For example, `by = c("a" = "b")` will match `x$a` to `y$b`. |
| 27 | +#' |
| 28 | +#' To join by multiple variables, use a vector with length > 1. |
| 29 | +#' For example, `by = c("a", "b")` will match `x$a` to `y$a` and `x$b` to |
| 30 | +#' `y$b`. Use a named vector to match different variables in `x` and `y`. |
| 31 | +#' For example, `by = c("a" = "b", "c" = "d")` will match `x$a` to `y$b` and |
| 32 | +#' `x$c` to `y$d`. |
| 33 | +#' @param on (Optional) |
| 34 | +#' Indicate which columns in x should be joined with which columns in y. |
| 35 | +#' Examples included: |
| 36 | +#' 1.\code{.by = c("a","b")} (this is a must for \code{set_full_join_dt}); |
| 37 | +#' 2.\code{.by = c(x1="y1", x2="y2")}; |
| 38 | +#' 3.\code{.by = c("x1==y1", "x2==y2")}; |
| 39 | +#' 4.\code{.by = c("a", V2="b")}; |
| 40 | +#' 5.\code{.by = .(a, b)}; |
| 41 | +#' 6.\code{.by = c("x>=a", "y<=b")} or \code{.by = .(x>=a, y<=b)}. |
| 42 | +#' @param suffix If there are non-joined duplicate variables in x and y, these |
| 43 | +#' suffixes will be added to the output to disambiguate them. Should be a |
| 44 | +#' character vector of length 2. |
| 45 | +#' @return A data.table |
| 46 | +#' @examples |
| 47 | +#' |
| 48 | +#' workers = fread(" |
| 49 | +#' name company |
| 50 | +#' Nick Acme |
| 51 | +#' John Ajax |
| 52 | +#' Daniela Ajax |
| 53 | +#' ") |
| 54 | +#' |
| 55 | +#' positions = fread(" |
| 56 | +#' name position |
| 57 | +#' John designer |
| 58 | +#' Daniela engineer |
| 59 | +#' Cathie manager |
| 60 | +#' ") |
| 61 | +#' |
| 62 | +#' workers %>% inner_join_dt(positions) |
| 63 | +#' workers %>% left_join_dt(positions) |
| 64 | +#' workers %>% right_join_dt(positions) |
| 65 | +#' workers %>% full_join_dt(positions) |
| 66 | +#' |
| 67 | +#' # filtering joins |
| 68 | +#' workers %>% anti_join_dt(positions) |
| 69 | +#' workers %>% semi_join_dt(positions) |
| 70 | +#' |
| 71 | +#' # To suppress the message, supply 'by' argument |
| 72 | +#' workers %>% left_join_dt(positions, by = "name") |
| 73 | +#' |
| 74 | +#' # Use a named 'by' if the join variables have different names |
| 75 | +#' positions2 = setNames(positions, c("worker", "position")) # rename first column in 'positions' |
| 76 | +#' workers %>% inner_join_dt(positions2, by = c("name" = "worker")) |
| 77 | +#' |
| 78 | +#' # the syntax of 'on' could be a bit different |
| 79 | +#' workers %>% inner_join_dt(positions2,on = "name==worker") |
| 80 | +#' |
| 81 | +#' |
| 82 | + |
| 83 | +#' @rdname join |
| 84 | +#' @export |
| 85 | +inner_join_dt = function(x,y,by = NULL, on = NULL,suffix = c(".x",".y")){ |
| 86 | + x = as_dt(x) |
| 87 | + y = as_dt(y) |
| 88 | + on_ = substitute(on) %>% deparse() |
| 89 | + by_ = substitute(by) %>% deparse() |
| 90 | + if(on_ != "NULL") x[y, nomatch = 0L, on = on] |
| 91 | + else if(by_ == "NULL"){ |
| 92 | + by = intersect(names(x), names(y)) |
| 93 | + by_name = str_c(by, collapse = ",") |
| 94 | + message(str_glue("Joining by: {by_name}\n\n")) |
| 95 | + merge.data.table(x,y,by = by,suffixes = suffix) |
| 96 | + }else if(is.null(names(by))) merge.data.table(x,y,by = by,suffixes = suffix) |
| 97 | + else merge.data.table(x,y,by.x = names(by),by.y = by,suffixes = suffix) |
| 98 | +} |
| 99 | + |
| 100 | +#' @rdname join |
| 101 | +#' @export |
| 102 | +left_join_dt = function(x,y,by = NULL, on = NULL,suffix = c(".x",".y")){ |
| 103 | + x = as_dt(x) |
| 104 | + y = as_dt(y) |
| 105 | + on_ = substitute(on) %>% deparse() |
| 106 | + by_ = substitute(by) %>% deparse() |
| 107 | + if(on_ != "NULL") y[x, on = on] |
| 108 | + else if(by_ == "NULL"){ |
| 109 | + by = intersect(names(x), names(y)) |
| 110 | + by_name = str_c(by, collapse = ",") |
| 111 | + message(str_glue("Joining by: {by_name}\n\n")) |
| 112 | + merge.data.table(x,y,by = by,all.x = TRUE,suffixes = suffix) |
| 113 | + }else if(is.null(names(by))) merge.data.table(x,y,by = by,all.x = TRUE,suffixes = suffix) |
| 114 | + else merge.data.table(x,y,by.x = names(by),by.y = by,all.x = TRUE,suffixes = suffix) |
| 115 | +} |
| 116 | + |
| 117 | +#' @rdname join |
| 118 | +#' @export |
| 119 | +right_join_dt = function(x,y,by = NULL, on = NULL,suffix = c(".x",".y")){ |
| 120 | + x = as_dt(x) |
| 121 | + y = as_dt(y) |
| 122 | + on_ = substitute(on) %>% deparse() |
| 123 | + by_ = substitute(by) %>% deparse() |
| 124 | + if(on_ != "NULL") x[y, on = on] |
| 125 | + else if(by_ == "NULL"){ |
| 126 | + by = intersect(names(x), names(y)) |
| 127 | + by_name = str_c(by, collapse = ",") |
| 128 | + message(str_glue("Joining by: {by_name}\n\n")) |
| 129 | + merge.data.table(x,y,by = by,all.y = TRUE,suffixes = suffix) |
| 130 | + }else if(is.null(names(by))) merge.data.table(x,y,by = by,all.y = TRUE,suffixes = suffix) |
| 131 | + else merge.data.table(x,y,by.x = names(by),by.y = by,all.y = TRUE,suffixes = suffix) |
| 132 | +} |
| 133 | + |
| 134 | +#' @rdname join |
| 135 | +#' @export |
| 136 | +full_join_dt = function(x,y,by = NULL, on = NULL,suffix = c(".x",".y")){ |
| 137 | + x = as_dt(x) |
| 138 | + y = as_dt(y) |
| 139 | + on_ = substitute(on) %>% deparse() |
| 140 | + by_ = substitute(by) %>% deparse() |
| 141 | + if(on_ != "NULL") { |
| 142 | + if(by_!="null"){ |
| 143 | + rbind(x[, .SD, .SDcols = by], |
| 144 | + y[, .SD, .SDcols = by]) %>% |
| 145 | + unique()-> unique_keys |
| 146 | + y[x[.(unique_keys), on = on], on = on] |
| 147 | + }else{ |
| 148 | + rbind(x[, .SD, .SDcols = on], |
| 149 | + y[, .SD, .SDcols = on]) %>% |
| 150 | + unique()-> unique_keys |
| 151 | + y[x[.(unique_keys), on = on], on = on] |
| 152 | + } |
| 153 | + } else if(by_ == "NULL"){ |
| 154 | + by = intersect(names(x), names(y)) |
| 155 | + by_name = str_c(by, collapse = ",") |
| 156 | + message(str_glue("Joining by: {by_name}\n\n")) |
| 157 | + merge.data.table(x,y,by = by,all = TRUE,suffixes = suffix) |
| 158 | + }else if(is.null(names(by))) merge.data.table(x,y,by = by,all = TRUE,suffixes = suffix) |
| 159 | + else merge.data.table(x,y,by.x = names(by),by.y = by,all = TRUE,suffixes = suffix) |
| 160 | +} |
| 161 | + |
| 162 | +#' @rdname join |
| 163 | +#' @export |
| 164 | +anti_join_dt = function(x,y,by = NULL, on = NULL){ |
| 165 | + x = as_dt(x) |
| 166 | + y = as_dt(y) |
| 167 | + on_ = substitute(on) %>% deparse() |
| 168 | + by_ = substitute(by) %>% deparse() |
| 169 | + if(on_ != "NULL") x[!y, on = on] |
| 170 | + else if(by_ == "NULL"){ |
| 171 | + by = intersect(names(x), names(y)) |
| 172 | + by_name = str_c(by, collapse = ",") |
| 173 | + message(str_glue("Joining by: {by_name}\n\n")) |
| 174 | + x[!y, on = by] |
| 175 | + }else x[!y, on = by] |
| 176 | +} |
| 177 | + |
| 178 | +#' @rdname join |
| 179 | +#' @export |
| 180 | +semi_join_dt = function(x,y,by = NULL, on = NULL){ |
| 181 | + x = as_dt(x) |
| 182 | + y = as_dt(y) |
| 183 | + on_ = substitute(on) %>% deparse() |
| 184 | + by_ = substitute(by) %>% deparse() |
| 185 | + if(on_ != "NULL") { |
| 186 | + w = unique(x[y, on = on, nomatch = 0L, which = TRUE, allow.cartesian = TRUE]) |
| 187 | + x[w] |
| 188 | + } |
| 189 | + else{ |
| 190 | + if(by_ == "NULL"){ |
| 191 | + by = intersect(names(x), names(y)) |
| 192 | + by_name = str_c(by, collapse = ",") |
| 193 | + message(str_glue("Joining by: {by_name}\n\n")) |
| 194 | + } |
| 195 | + w = unique(x[y, on = by, nomatch = 0L, which = TRUE, allow.cartesian = TRUE]) |
| 196 | + x[w] |
| 197 | + } |
| 198 | +} |
0 commit comments