Skip to content

Commit ed6394b

Browse files
committed
docs: add documentation about missing value handling
1 parent eab7c5a commit ed6394b

11 files changed

+123
-3
lines changed

.lintr

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ linters: with_defaults(
55
object_name_linter = object_name_linter(c("snake_case", "CamelCase")), # only allow snake case and camel case object names
66
cyclocomp_linter = NULL, # do not check function complexity
77
commented_code_linter = NULL, # allow code in comments
8-
line_length_linter = line_length_linter(100)
8+
line_length_linter = line_length_linter(120)
99
)
1010
exclusions: list("R/Filter.R" = 159,
1111
"R/mlr_filters.R" = 30,

R/FilterCorrelation.R

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,14 @@
66
#' Simple correlation filter calling [stats::cor()].
77
#' The filter score is the absolute value of the correlation.
88
#'
9+
#' @note
10+
#' This filter, in its default settings, can handle missing values in the features.
11+
#' However, the resulting filter scores may be misleading or at least difficult to compare
12+
#' if some features have a large proportion of missing values.
13+
#'
14+
#' If a feature has no non-missing value, the resulting score will be `NA`.
15+
#' Missing scores appear in a random, non-deterministic order at the end of the vector of scores.
16+
#'
917
#' @references
1018
#' For a benchmark of filter methods:
1119
#'
@@ -45,7 +53,8 @@ FilterCorrelation = R6Class("FilterCorrelation",
4553
#' @description Create a FilterCorrelation object.
4654
initialize = function() {
4755
param_set = ps(
48-
use = p_fct(c("everything", "all.obs", "complete.obs", "na.or.complete", "pairwise.complete.obs"), default = "everything"),
56+
use = p_fct(c("everything", "all.obs", "complete.obs", "na.or.complete", "pairwise.complete.obs"),
57+
default = "everything"),
4958
method = p_fct(c("pearson", "kendall", "spearman"), default = "pearson")
5059
)
5160

R/FilterKruskalTest.R

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,15 @@
77
#' The filter value is `-log10(p)` where `p` is the \eqn{p}-value. This
88
#' transformation is necessary to ensure numerical stability for very small
99
#' \eqn{p}-values.
10+
11+
#' @note
12+
#' This filter, in its default settings, can handle missing values in the features.
13+
#' However, the resulting filter scores may be misleading or at least difficult to compare
14+
#' if some features have a large proportion of missing values.
15+
#'
16+
#' If a feature does not have at least one non-missing observation per label, the resulting score will be `NA`.
17+
#' Missing scores appear in a random, non-deterministic order at the end of the vector of scores.
18+
#'
1019
#'
1120
#' @references
1221
#' For a benchmark of filter methods:
@@ -66,8 +75,15 @@ FilterKruskalTest = R6Class("FilterKruskalTest",
6675

6776
data = task$data(cols = task$feature_names)
6877
g = task$truth()
78+
6979
-log10(map_dbl(data, function(x) {
70-
kruskal.test(x = x, g = g, na.action = na_action)$p.value
80+
tab = table(g[!is.na(x)])
81+
82+
if (any(tab == 0L)) {
83+
NA_real_
84+
} else {
85+
kruskal.test(x = x, g = g, na.action = na_action)$p.value
86+
}
7187
}))
7288
},
7389

R/FilterRelief.R

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@
55
#' @description RELIEF filter calling
66
#' [FSelectorRcpp::relief()] in package \CRANpkg{FSelectorRcpp}.
77
#'
8+
#' @note
9+
#' This filter can handle missing values in the features.
10+
#' However, the resulting filter scores may be misleading or at least difficult to compare
11+
#' if some features have a large proportion of missing values.
12+
#'
13+
#' If a feature has no non-missing observation, the resulting score will be (close to) 0.
14+
#'
815
#' @family Filter
916
#' @template seealso_filter
1017
#' @export

man/mlr_filters_correlation.Rd

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/mlr_filters_kruskal_test.Rd

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/mlr_filters_relief.Rd

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
test_that("FilterCorrelation handles features with only missings gracefully", {
2+
data = as.data.table(mtcars)
3+
data[, disp := NA]
4+
task = as_task_regr(data, target = "mpg")
5+
6+
scores = flt("correlation")$calculate(task)$scores
7+
8+
expect_numeric(scores)
9+
expect_true(is.na(scores["disp"]))
10+
expect_true(all(!is.na(scores[setdiff(names(scores), "disp")])))
11+
})
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
test_that("FilterInformationGain handles features with only missings gracefully", {
2+
data = tsk("mtcars")$data()
3+
data[, wt := NA]
4+
task = as_task_regr(data, target = "mpg")
5+
6+
scores = flt("information_gain")$calculate(task)$scores
7+
8+
expect_numeric(scores, any.missing = FALSE)
9+
expect_lte(scores["wt"], 1e-8)
10+
})
11+
12+
test_that("FilterInformationGain handles features with only missings gracefully", {
13+
data = tsk("iris")$data()
14+
data[, Sepal.Length := NA]
15+
task = as_task_classif(data, target = "Species")
16+
17+
scores = flt("information_gain")$calculate(task)$scores
18+
19+
expect_numeric(scores, any.missing = FALSE)
20+
expect_lte(scores["Sepal.Length"], 1e-8)
21+
})
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
test_that("FilterKruskalTest handles features with only missings gracefully", {
2+
data = tsk("spam")$data()
3+
data[, report := NA]
4+
data[1, report := 1]
5+
task = as_task_classif(data, target = "type")
6+
7+
scores = flt("kruskal_test")$calculate(task)$scores
8+
9+
expect_numeric(scores)
10+
expect_true(is.na(scores["report"])) # "report" is the feature set to NA above; "disp" is not in this task
11+
expect_true(all(!is.na(scores[setdiff(names(scores), "report")])))
12+
})

0 commit comments

Comments
 (0)