[read] Document changes to reading date/datetime (#1335)

JanMarvin · web-flow · commit ff584f479597 · 2025-04-24T00:18:37.000+02:00
* [write] allow writing date vectors horizontally

* [read] convert dates in column names and row names

* [doc] document various types of date conversion

* [misc] cleanup
diff --git a/R/helper-functions.R b/R/helper-functions.R
@@ -1315,10 +1315,15 @@ fits_in_dims <- function(x, dims, startCol, startRow) {
   dims
 }
 
-# transpose single column or row data frames to wide/long. keeps attributes and class
+# transpose single column or row data frames to wide/long. keeps attributes and
+# class.
+# Why unclass? t() converts the data frame to a matrix first. A Date is a
+# numeric with a class attached and would become a "yyyy-mm-dd" string.
+# Therefore unclass first and apply the class afterwards.
 transpose_df <- function(x) {
   attribs <- attr(x, "c_cm")
   classes <- class(x[[1]])
+  x[] <- lapply(x[], unclass)
   x <- as.data.frame(t(x), stringsAsFactors = FALSE)
   for (i in seq_along(x)) {
     class(x[[i]]) <- classes
diff --git a/R/read.R b/R/read.R
@@ -1,3 +1,46 @@
+# Internal function: convert the columns of data frame `z` from character to the class coded in `types` (see codes below)
+convert_df <- function(z, types, date_conv, datetime_conv, hms_conv, as_character = FALSE) {
+  sel <- !is.na(names(types)) # only types that carry a column name can be matched to z
+
+  if (any(sel)) {
+    nums <- names(which(types[sel] == 1)) # 1: converted with as.numeric()
+    dtes <- names(which(types[sel] == 2)) # 2: converted with date_conv()
+    poxs <- names(which(types[sel] == 3)) # 3: converted with datetime_conv()
+    logs <- names(which(types[sel] == 4)) # 4: converted with as.logical()
+    difs <- names(which(types[sel] == 5)) # 5: converted with hms_conv()
+    fmls <- names(which(types[sel] == 6)) # 6: gets the extra class "formula"
+    # "#NUM!" is replaced by the string "NaN" first, so that as.numeric() returns NaN
+    # TODO: maybe consider an option to return NA instead?
+
+    if (as_character) {
+      date_conv_c     <- function(...) as.character(date_conv(...))
+      datetime_conv_c <- function(...) as.character(datetime_conv(...))
+      hms_conv_c      <- function(...) as.character(hms_conv(...))
+
+      if (length(nums)) z[nums] <- lapply(z[nums], function(i) as.character(as.numeric(replace(i, i == "#NUM!", "NaN"))))
+      if (length(dtes)) z[dtes] <- lapply(z[dtes], date_conv_c)
+      if (length(poxs)) z[poxs] <- lapply(z[poxs], datetime_conv_c)
+      if (length(logs)) z[logs] <- lapply(z[logs], function(i) as.character(as.logical(i)))
+      if (isNamespaceLoaded("hms")) z[difs] <- lapply(z[difs], hms_conv_c) # skipped unless hms is loaded
+    } else {
+      if (length(nums)) z[nums] <- lapply(z[nums], function(i) as.numeric(replace(i, i == "#NUM!", "NaN")))
+      if (length(dtes)) z[dtes] <- lapply(z[dtes], date_conv)
+      if (length(poxs)) z[poxs] <- lapply(z[poxs], datetime_conv)
+      if (length(logs)) z[logs] <- lapply(z[logs], as.logical)
+      if (isNamespaceLoaded("hms")) z[difs] <- lapply(z[difs], hms_conv) # skipped unless hms is loaded
+    }
+
+    for (i in seq_along(z)) { # append class "formula" to every formula column
+      if (names(z)[i] %in% fmls) class(z[[i]]) <- c(class(z[[i]]), "formula")
+    }
+
+  } else {
+    warning("could not convert. All missing in row used for variable names")
+  }
+
+  z
+}
+
 # `wb_to_df()` ----------------------------------------
 #' Create a data frame from a Workbook
 #'
@@ -34,9 +77,17 @@
 #' Opening, saving and closing the file in a spreadsheet software will resolve
 #' this.
 #'
-#' Prior to release `1.15`, datetime variables (yyyy-mm-dd hh:mm:ss) were
-#' imported in the users current timezone (`Sys.timezone()`). This was
-#' changed. All datetime variables are now imported with timezone `"UTC"`.
+#' Before release 1.15, datetime variables (in 'yyyy-mm-dd hh:mm:ss' format)
+#' were imported using the user's local system timezone (`Sys.timezone()`).
+#' This behavior has been updated. Now, all datetime variables are imported
+#' with the timezone set to "UTC".
+#' If automatic date detection and conversion are enabled but the conversion
+#' is unsuccessful (for instance, in a column containing a mix of data types
+#' like strings, numbers, and dates), the dates will be displayed as a Unix
+#' timestamp. These can be transformed using the [as.POSIXct()] function.
+#' If date detection is disabled, dates will show up as spreadsheet serial
+#' numbers. To convert these, you can use the functions [convert_date()],
+#' [convert_datetime()], or [convert_hms()].
 #'
 #' @seealso [wb_get_named_regions()]
 #'
@@ -596,39 +647,34 @@ wb_to_df <- function(
   xlsx_cols_names <- colnames(z)
   names(xlsx_cols_names) <- xlsx_cols_names
 
-  # if colNames, then change tt too
+
+  date_conv     <- function(x) as.Date(.POSIXct(as.double(x), "UTC"), tz = "UTC", origin = "1970-01-01")
+  datetime_conv <- function(x) .POSIXct(as.double(x), "UTC")
+  hms_conv      <- convert_hms
+
+  # if col_names, then change tt too. row names will be converted later. If the
+  # column name row is still in z/tt, the column type guessing will fail below
   if (col_names) {
     # select first row as colnames, but do not yet assign. it might contain
     # missing values and if assigned, convert below might break with unambiguous
     # names.
+
     nams <- names(xlsx_cols_names)
-    xlsx_cols_names  <- z[1, ]
+    if (convert)
+      xlsx_cols_names <- convert_df(z[1, , drop = FALSE], guess_col_type(tt[1, , drop = FALSE]), date_conv, datetime_conv, hms_conv, as_character = TRUE)
+    else
+      xlsx_cols_names <- z[1, , drop = FALSE]
     names(xlsx_cols_names) <- nams
 
     z  <- z[-1, , drop = FALSE]
     tt <- tt[-1, , drop = FALSE]
   }
 
-  if (row_names) {
-    rownames(z)  <- z[, 1]
-    rownames(tt) <- z[, 1]
-    xlsx_cols_names <- xlsx_cols_names[-1]
-
-    z  <- z[, -1, drop = FALSE]
-    tt <- tt[, -1, drop = FALSE]
-  }
-
   # # faster guess_col_type alternative? to avoid tt
   # types <- ftable(cc$row_r ~ cc$c_r ~ cc$typ)
 
-  date_conv     <- NULL
-  datetime_conv <- NULL
-  hms_conv      <- convert_hms
-
   if (missing(types)) {
     types <- guess_col_type(tt)
-    date_conv     <- function(x) as.Date(.POSIXct(as.double(x), "UTC"), tz = "UTC", origin = "1970-01-01")
-    datetime_conv <- function(x) .POSIXct(as.double(x), "UTC")
   } else {
     # TODO check if guessing only if !all() is possible
     if (any(xlsx_cols_names %in% names(types))) {
@@ -659,36 +705,24 @@ wb_to_df <- function(
       stop("no variable from `types` found in data")
     }
 
+    # avoid multiple conversion
     date_conv     <- function(x) convert_date(x, origin = origin)
     datetime_conv <- function(x) convert_datetime(x, origin = origin)
   }
 
   # could make it optional or explicit
   if (convert) {
-    sel <- !is.na(names(types))
-
-    if (any(sel)) {
-      nums <- names(which(types[sel] == 1))
-      dtes <- names(which(types[sel] == 2))
-      poxs <- names(which(types[sel] == 3))
-      logs <- names(which(types[sel] == 4))
-      difs <- names(which(types[sel] == 5))
-      fmls <- names(which(types[sel] == 6))
-      # convert "#NUM!" to "NaN" -- then converts to NaN
-      # maybe consider this an option to instead return NA?
-      if (length(nums)) z[nums] <- lapply(z[nums], function(i) as.numeric(replace(i, i == "#NUM!", "NaN")))
-      if (length(dtes)) z[dtes] <- lapply(z[dtes], date_conv)
-      if (length(poxs)) z[poxs] <- lapply(z[poxs], datetime_conv)
-      if (length(logs)) z[logs] <- lapply(z[logs], as.logical)
-      if (isNamespaceLoaded("hms")) z[difs] <- lapply(z[difs], hms_conv)
+    z <- convert_df(z, types, date_conv, datetime_conv, hms_conv)
+  }
 
-      for (i in seq_along(z)) { # convert df to class formula
-        if (names(z)[i] %in% fmls) class(z[[i]]) <- c(class(z[[i]]), "formula")
-      }
+  # column names were picked earlier
+  if (row_names) {
+    rownames(z)  <- z[, 1]
+    rownames(tt) <- z[, 1]
+    xlsx_cols_names <- xlsx_cols_names[-1]
 
-    } else {
-      warning("could not convert. All missing in row used for variable names")
-    }
+    z  <- z[, -1, drop = FALSE]
+    tt <- tt[, -1, drop = FALSE]
   }
 
   if (col_names) {
diff --git a/man/wb_to_df.Rd b/man/wb_to_df.Rd
diff --git a/tests/testthat/test-date_time_conversion.R b/tests/testthat/test-date_time_conversion.R
@@ -159,3 +159,38 @@ test_that("date 1904 works as expected", {
   # ignore rounding differences
   expect_equal(as.Date(df$pos), as.Date(got[["pos"]], tz = "UTC"))
 })
+
+test_that("date conversion works", {
+
+  wb <- wb_workbook()$add_worksheet()
+  wb$add_data(dims = "A1:D1", x = as.Date(paste0("2025-0", 1:4, "-01")), col_names = FALSE)
+  wb$add_data(dims = "A2:D4", x = matrix(1:12, 3, 4), col_names = FALSE)
+
+  # defaults: column name is the converted date, column is numeric
+  df <- wb$to_df()
+  expect_true(is.numeric(df$`2025-01-01`))
+
+  # convert = FALSE: column name is still the converted date, column is character
+  df <- wb$to_df(convert = FALSE)
+  expect_true(is.character(df$`2025-01-01`))
+
+  # detect_dates = FALSE: column name is the spreadsheet date, column is numeric
+  df <- wb$to_df(convert = TRUE, detect_dates = FALSE)
+  expect_true(is.numeric(df$`45658`))
+
+  # neither conversion nor detection: column name is the spreadsheet date, column is character
+  df <- wb$to_df(convert = FALSE, detect_dates = FALSE)
+  expect_true(is.character(df$`45658`))
+
+  # dates in the first column are converted when used as row names
+  df <- data.frame(
+    x = as.Date(paste0("2025-0", 1:4, "-01")),
+    y = 1:4
+  )
+  wb <- wb_workbook()$add_worksheet()
+  wb$add_data(x = df)
+  df <- wb$to_df(row_names = TRUE)
+  exp <- c("2025-01-01", "2025-02-01", "2025-03-01", "2025-04-01")
+  got <- rownames(df)
+  expect_equal(exp, got)
+})
diff --git a/tests/testthat/test-named_regions.R b/tests/testthat/test-named_regions.R
@@ -48,7 +48,7 @@ test_that("Maintaining Named Regions on Load", {
 
   # nonsense
   # df1 is a single value and this single value is now used as rowName
-  expect_warning(df1 <- read_xlsx(wb, named_region = "region1", row_names = TRUE))
+  df1 <- read_xlsx(wb, named_region = "region1", row_names = TRUE)
   expect_s3_class(df1, "data.frame")
   expect_equal(nrow(df1), 0)
   expect_equal(ncol(df1), 0)