diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py
index b5ee64b6e708c..c5627c60a38ca 100644
--- a/sklearn/ensemble/_forest.py
+++ b/sklearn/ensemble/_forest.py
@@ -320,6 +320,7 @@ def __init__(
         max_samples=None,
         max_bins=None,
         store_leaf_values=False,
+        missing_car=False,
     ):
         super().__init__(
             estimator=estimator,
@@ -337,6 +338,7 @@ def __init__(
         self.max_samples = max_samples
         self.max_bins = max_bins
         self.store_leaf_values = store_leaf_values
+        self.missing_car = missing_car

     def apply(self, X):
         """
@@ -1085,6 +1087,7 @@ def __init__(
         max_samples=None,
         max_bins=None,
         store_leaf_values=False,
+        missing_car=False,
     ):
         super().__init__(
             estimator=estimator,
@@ -1100,6 +1103,7 @@ def __init__(
             max_samples=max_samples,
             max_bins=max_bins,
             store_leaf_values=store_leaf_values,
+            missing_car=missing_car,
         )

     @staticmethod
@@ -1970,6 +1974,9 @@ class RandomForestClassifier(ForestClassifier):

         .. versionadded:: 1.4

+    missing_car : bool, default=False
+        Whether missing values are assumed to be missing completely at random (CAR).
+
     Attributes
     ----------
     estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier`
@@ -2111,6 +2118,7 @@ def __init__(
         max_bins=None,
         store_leaf_values=False,
         monotonic_cst=None,
+        missing_car=False,
     ):
         super().__init__(
             estimator=DecisionTreeClassifier(),
@@ -2128,6 +2136,7 @@ def __init__(
                 "ccp_alpha",
                 "store_leaf_values",
                 "monotonic_cst",
+                "missing_car",
             ),
             bootstrap=bootstrap,
             oob_score=oob_score,
@@ -2139,6 +2148,7 @@ def __init__(
             max_samples=max_samples,
             max_bins=max_bins,
             store_leaf_values=store_leaf_values,
+            missing_car=missing_car,
         )

         self.criterion = criterion
@@ -2742,6 +2752,9 @@ class ExtraTreesClassifier(ForestClassifier):

         .. versionadded:: 1.4

+    missing_car : bool, default=False
+        Whether missing values are assumed to be missing completely at random (CAR).
+
     Attributes
     ----------
     estimator_ : :class:`~sklearn.tree.ExtraTreeClassifier`
@@ -2872,6 +2885,7 @@ def __init__(
         max_bins=None,
         store_leaf_values=False,
         monotonic_cst=None,
+        missing_car=False,
     ):
         super().__init__(
             estimator=ExtraTreeClassifier(),
@@ -2889,6 +2903,7 @@ def __init__(
                 "ccp_alpha",
                 "store_leaf_values",
                 "monotonic_cst",
+                "missing_car",
             ),
             bootstrap=bootstrap,
             oob_score=oob_score,
@@ -2900,6 +2915,7 @@ def __init__(
             max_samples=max_samples,
             max_bins=max_bins,
             store_leaf_values=store_leaf_values,
+            missing_car=missing_car,
         )

         self.criterion = criterion
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index 2124cd76c69c8..ea3683025cf6f 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -129,6 +129,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):
         "ccp_alpha": [Interval(Real, 0.0, None, closed="left")],
         "store_leaf_values": ["boolean"],
         "monotonic_cst": ["array-like", None],
+        "missing_car": ["boolean"],
     }

     @abstractmethod
@@ -149,6 +150,7 @@ def __init__(
         ccp_alpha=0.0,
         store_leaf_values=False,
         monotonic_cst=None,
+        missing_car=False,
     ):
         self.criterion = criterion
         self.splitter = splitter
@@ -164,6 +166,7 @@ def __init__(
         self.ccp_alpha = ccp_alpha
         self.store_leaf_values = store_leaf_values
         self.monotonic_cst = monotonic_cst
+        self.missing_car = missing_car

     def get_depth(self):
         """Return the depth of the decision tree.
@@ -532,6 +535,7 @@ def _build_tree(
                 min_weight_leaf,
                 random_state,
                 monotonic_cst,
+                self.missing_car,
             )

         if is_classifier(self):
@@ -614,6 +618,7 @@ def _update_tree(self, X, y, sample_weight):
                 min_weight_leaf,
                 random_state,
                 monotonic_cst,
+                self.missing_car,
             )

         # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
@@ -1152,6 +1157,9 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree):

         .. versionadded:: 1.4

+    missing_car : bool, default=False
+        Whether missing values are assumed to be missing completely at random (CAR).
+
     Attributes
     ----------
     classes_ : ndarray of shape (n_classes,) or list of ndarray
@@ -1280,6 +1288,7 @@ def __init__(
         ccp_alpha=0.0,
         store_leaf_values=False,
         monotonic_cst=None,
+        missing_car=False,
     ):
         super().__init__(
             criterion=criterion,
@@ -1296,6 +1305,7 @@ def __init__(
             monotonic_cst=monotonic_cst,
             ccp_alpha=ccp_alpha,
             store_leaf_values=store_leaf_values,
+            missing_car=missing_car,
         )

     @_fit_context(prefer_skip_nested_validation=True)
@@ -1784,6 +1794,7 @@ def __init__(
         ccp_alpha=0.0,
         store_leaf_values=False,
         monotonic_cst=None,
+        missing_car=False,
     ):
         super().__init__(
             criterion=criterion,
@@ -1799,6 +1810,7 @@ def __init__(
             ccp_alpha=ccp_alpha,
             store_leaf_values=store_leaf_values,
             monotonic_cst=monotonic_cst,
+            missing_car=missing_car,
         )

     @_fit_context(prefer_skip_nested_validation=True)
@@ -2054,6 +2066,9 @@ class ExtraTreeClassifier(DecisionTreeClassifier):

         .. versionadded:: 1.4

+    missing_car : bool, default=False
+        Whether missing values are assumed to be missing completely at random (CAR).
+
     Attributes
     ----------
     classes_ : ndarray of shape (n_classes,) or list of ndarray
@@ -2168,6 +2183,7 @@ def __init__(
         ccp_alpha=0.0,
         store_leaf_values=False,
         monotonic_cst=None,
+        missing_car=False,
     ):
         super().__init__(
             criterion=criterion,
@@ -2184,6 +2200,7 @@ def __init__(
             ccp_alpha=ccp_alpha,
             store_leaf_values=store_leaf_values,
             monotonic_cst=monotonic_cst,
+            missing_car=missing_car,
         )


diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 9a8ae9da81b52..94dce58254663 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -96,6 +96,7 @@ cdef class Splitter(BaseSplitter):
     cdef public Criterion criterion  # Impurity criterion

     cdef const float64_t[:, ::1] y
+    cdef bint missing_car

     # Monotonicity constraints for each feature.
     # The encoding is as follows:
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index d3c8fa1f98e83..5469845e8fe80 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -148,6 +148,7 @@ cdef class Splitter(BaseSplitter):
         float64_t min_weight_leaf,
         object random_state,
         const int8_t[:] monotonic_cst,
+        bint missing_car,
         *argv
     ):
         """
@@ -173,8 +174,17 @@ cdef class Splitter(BaseSplitter):
             The user inputted random state to be used for pseudo-randomness

         monotonic_cst : const int8_t[:]
-            Monotonicity constraints
+            Indicates the monotonicity constraint to enforce on each feature.
+              - 1: monotonic increase
+              - 0: no constraint
+              - -1: monotonic decrease
+            If monotonic_cst is None, no constraints are applied.
+
+        missing_car : bool
+            Whether missing values should be assumed to be missing completely
+            at random (CAR). If so, they are randomly assigned to the left or
+            right child of each split.
""" self.criterion = criterion @@ -187,14 +197,18 @@ cdef class Splitter(BaseSplitter): self.random_state = random_state self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None + self.missing_car = missing_car def __reduce__(self): - return (type(self), (self.criterion, - self.max_features, - self.min_samples_leaf, - self.min_weight_leaf, - self.random_state, - self.monotonic_cst.base if self.monotonic_cst is not None else None), self.__getstate__()) + return (type(self), ( + self.criterion, + self.max_features, + self.min_samples_leaf, + self.min_weight_leaf, + self.random_state, + self.monotonic_cst.base if self.monotonic_cst is not None else None, + self.missing_car, + ), self.__getstate__()) cdef int init( self, @@ -562,10 +576,13 @@ cdef inline intp_t node_split_best( # The second search will have all the missing values going to the left node. # If there are no missing values, then we search only once for the most # optimal split. - n_searches = 2 if has_missing else 1 + n_searches = 2 if has_missing and not splitter.missing_car else 1 for i in range(n_searches): - missing_go_to_left = i == 1 + if not splitter.missing_car: + missing_go_to_left = i == 1 + else: + missing_go_to_left = rand_int(0, 2, random_state) criterion.missing_go_to_left = missing_go_to_left criterion.reset() @@ -645,7 +662,7 @@ cdef inline intp_t node_split_best( # Evaluate when there are missing values and all missing values goes # to the right node and non-missing values goes to the left node. - if has_missing: + if has_missing and not splitter.missing_car: n_left, n_right = end - start - n_missing, n_missing p = end - n_missing missing_go_to_left = 0 diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index ef26ec1be0b1d..0c722be827b36 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2349,7 +2349,9 @@ def test_splitter_serializable(Splitter): n_outputs, n_classes = 2, np.array([3, 2], dtype=np.intp) criterion = CRITERIA_CLF["gini"](n_outputs, n_classes) - splitter = Splitter(criterion, max_features, 5, 0.5, rng, monotonic_cst=None) + splitter = Splitter( + criterion, max_features, 5, 0.5, rng, monotonic_cst=None, missing_car=False + ) splitter_serialize = pickle.dumps(splitter) splitter_back = pickle.loads(splitter_serialize) @@ -2600,6 +2602,37 @@ def test_missing_value_is_predictive(): assert tree.score(X_test, y_test) >= 0.85 +def test_missing_value_is_not_predictive_with_mcar(): + """Check the tree doesnt learns when the missing value is forced to be + unpredictive. + """ + rng = np.random.RandomState(0) + n_samples = 1000 + + X = rng.standard_normal(size=(n_samples, 10)) + y = rng.randint(0, high=2, size=n_samples) + + # Create a predictive feature using `y` and with some noise + X_random_mask = rng.choice([False, True], size=n_samples, p=[0.9, 0.1]) + y_mask = y.copy().astype(bool) + y_mask[X_random_mask] = ~y_mask[X_random_mask] + + X_predictive = rng.standard_normal(size=n_samples) + X_predictive[y_mask] = np.nan + + X[:, 5] = X_predictive + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) + tree = DecisionTreeClassifier(random_state=rng, missing_car=True).fit( + X_train, y_train + ) + non_mcar_tree = DecisionTreeClassifier(random_state=rng, missing_car=False).fit( + X_train, y_train + ) + + non_mcar_tree.score(X_test, y_test) > tree.score(X_test, y_test) + 0.2 + + @pytest.mark.parametrize( "make_data, Tree", [