docs: add documentation about missing value handling

mllg · mllg · commit ed6394bea160 · 2023-01-16T10:07:24.000+01:00
diff --git a/.lintr b/.lintr
@@ -5,7 +5,7 @@ linters: with_defaults(
     object_name_linter = object_name_linter(c("snake_case", "CamelCase")), # only allow snake case and camel case object names
     cyclocomp_linter = NULL, # do not check function complexity
     commented_code_linter = NULL, # allow code in comments
-    line_length_linter = line_length_linter(100)
+    line_length_linter = line_length_linter(120)
     )
 exclusions: list("R/Filter.R" = 159,
                  "R/mlr_filters.R" = 30,
diff --git a/R/FilterCorrelation.R b/R/FilterCorrelation.R
@@ -6,6 +6,14 @@
 #' Simple correlation filter calling [stats::cor()].
 #' The filter score is the absolute value of the correlation.
 #'
+#' @note
+#' This filter, in its default settings, can handle missing values in the features.
+#' However, the resulting filter scores may be misleading or at least difficult to compare
+#' if some features have a large proportion of missing values.
+#'
+#' If a feature has no non-missing value, the resulting score will be `NA`.
+#' Missing scores appear in a random, non-deterministic order at the end of the vector of scores.
+#'
 #' @references
 #' For a benchmark of filter methods:
 #'
@@ -45,7 +53,8 @@ FilterCorrelation = R6Class("FilterCorrelation",
     #' @description Create a FilterCorrelation object.
     initialize = function() {
       param_set = ps(
-        use    = p_fct(c("everything", "all.obs", "complete.obs", "na.or.complete", "pairwise.complete.obs"), default = "everything"),
+        use    = p_fct(c("everything", "all.obs", "complete.obs", "na.or.complete", "pairwise.complete.obs"),
+          default = "everything"),
         method = p_fct(c("pearson", "kendall", "spearman"), default = "pearson")
       )
 
diff --git a/R/FilterKruskalTest.R b/R/FilterKruskalTest.R
@@ -7,6 +7,15 @@
 #' The filter value is `-log10(p)` where `p` is the \eqn{p}-value. This
 #' transformation is necessary to ensure numerical stability for very small
 #' \eqn{p}-values.
+#'
+#' @note
+#' This filter, in its default settings, can handle missing values in the features.
+#' However, the resulting filter scores may be misleading or at least difficult to compare
+#' if some features have a large proportion of missing values.
+#'
+#' If a feature does not have at least one non-missing observation per label, the resulting score will be `NA`.
+#' Missing scores appear in a random, non-deterministic order at the end of the vector of scores.
+#'
 #'
 #' @references
 #' For a benchmark of filter methods:
@@ -66,8 +75,15 @@ FilterKruskalTest = R6Class("FilterKruskalTest",
 
       data = task$data(cols = task$feature_names)
       g = task$truth()
+
       -log10(map_dbl(data, function(x) {
-        kruskal.test(x = x, g = g, na.action = na_action)$p.value
+        tab = table(g[!is.na(x)])
+
+        if (any(tab == 0L)) {
+          NA_real_
+        } else {
+          kruskal.test(x = x, g = g, na.action = na_action)$p.value
+        }
       }))
     },
 
diff --git a/R/FilterRelief.R b/R/FilterRelief.R
@@ -5,6 +5,13 @@
 #' @description RELIEF filter calling
 #'   [FSelectorRcpp::relief()] in package \CRANpkg{FSelectorRcpp}.
 #'
+#' @note
+#' This filter can handle missing values in the features.
+#' However, the resulting filter scores may be misleading or at least difficult to compare
+#' if some features have a large proportion of missing values.
+#'
+#' If a feature has no non-missing observation, the resulting score will be (close to) 0.
+#'
 #' @family Filter
 #' @template seealso_filter
 #' @export
diff --git a/man/mlr_filters_correlation.Rd b/man/mlr_filters_correlation.Rd
diff --git a/man/mlr_filters_kruskal_test.Rd b/man/mlr_filters_kruskal_test.Rd
diff --git a/man/mlr_filters_relief.Rd b/man/mlr_filters_relief.Rd
diff --git a/tests/testthat/test_FilterCorreltation.R b/tests/testthat/test_FilterCorreltation.R
@@ -0,0 +1,11 @@
+test_that("FilterCorrelation handles features with only missings gracefully", {
+  data = as.data.table(mtcars)
+  data[, disp := NA]
+  task = as_task_regr(data, target = "mpg")
+
+  scores = flt("correlation")$calculate(task)$scores
+
+  expect_numeric(scores)
+  expect_true(is.na(scores["disp"]))
+  expect_true(all(!is.na(scores[setdiff(names(scores), "disp")])))
+})
diff --git a/tests/testthat/test_FilterInformationGain.R b/tests/testthat/test_FilterInformationGain.R
@@ -0,0 +1,21 @@
+test_that("FilterInformationGain handles features with only missings gracefully", {
+  data = tsk("mtcars")$data()
+  data[, wt := NA]
+  task = as_task_regr(data, target = "mpg")
+
+  scores = flt("information_gain")$calculate(task)$scores
+
+  expect_numeric(scores, any.missing = FALSE)
+  expect_lte(scores["wt"], 1e-8)
+})
+
+test_that("FilterInformationGain handles features with only missings gracefully (classification)", {
+  # Classification counterpart of the mtcars test: blank out one iris feature completely.
+  data = tsk("iris")$data()
+  data[, Sepal.Length := NA]
+  task = as_task_classif(data, target = "Species")
+
+  scores = flt("information_gain")$calculate(task)$scores
+
+  # No score may be missing, and the all-missing feature must score (close to) zero.
+  expect_numeric(scores, any.missing = FALSE)
+  expect_lte(scores["Sepal.Length"], 1e-8)
+})
diff --git a/tests/testthat/test_FilterKruskalTest.R b/tests/testthat/test_FilterKruskalTest.R
@@ -0,0 +1,12 @@
+test_that("FilterKruskalTest handles features with only missings gracefully", {
+  # Keep a single non-missing observation in `report`, so that at least one class
+  # has no non-missing value for this feature.
+  data = tsk("spam")$data()
+  data[, report := NA]
+  data[1, report := 1]
+  task = as_task_classif(data, target = "type")
+
+  scores = flt("kruskal_test")$calculate(task)$scores
+
+  expect_numeric(scores)
+  # Check the manipulated feature itself: indexing by a name that is not in
+  # `scores` also yields NA, so a wrong name would pass vacuously.
+  expect_true(is.na(scores["report"]))
+  expect_true(all(!is.na(scores[setdiff(names(scores), "report")])))
+})
diff --git a/tests/testthat/test_FilterRelief.R b/tests/testthat/test_FilterRelief.R
@@ -0,0 +1,21 @@
+test_that("FilterRelief handles features with only missings gracefully", {
+  data = tsk("mtcars")$data()
+  data[, wt := NA]
+  task = as_task_regr(data, target = "mpg")
+
+  scores = flt("relief")$calculate(task)$scores
+
+  expect_numeric(scores, any.missing = FALSE)
+  expect_lte(scores["wt"], 1e-8)
+})
+
+test_that("FilterRelief handles features with only missings gracefully (classification)", {
+  # Classification counterpart of the mtcars test: blank out one iris feature completely.
+  data = tsk("iris")$data()
+  data[, Sepal.Length := NA]
+  task = as_task_classif(data, target = "Species")
+
+  scores = flt("relief")$calculate(task)$scores
+
+  # No score may be missing, and the all-missing feature must score (close to) zero.
+  expect_numeric(scores, any.missing = FALSE)
+  expect_lte(scores["Sepal.Length"], 1e-8)
+})

Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,7 @@ linters: with_defaults(`
`5`	`5`	`object_name_linter = object_name_linter(c("snake_case", "CamelCase")), # only allow snake case and camel case object names`
`6`	`6`	`cyclocomp_linter = NULL, # do not check function complexity`
`7`	`7`	`commented_code_linter = NULL, # allow code in comments`
`8`		`- line_length_linter = line_length_linter(100)`
	`8`	`+ line_length_linter = line_length_linter(120)`
`9`	`9`	`)`
`10`	`10`	`exclusions: list("R/Filter.R" = 159,`
`11`	`11`	`"R/mlr_filters.R" = 30,`