@@ -155,8 +155,10 @@ cdef class BaseCriterion:
 
         This method computes the improvement in impurity when a split occurs.
         The weighted impurity improvement equation is the following:
+
             N_t / N * (impurity - N_t_R / N_t * right_impurity
                                 - N_t_L / N_t * left_impurity)
+
         where N is the total number of samples, N_t is the number of samples
         at the current node, N_t_L is the number of samples in the left child,
         and N_t_R is the number of samples in the right child,
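As a hypothetical illustration (plain Python, not the Cython code in this diff), the weighted impurity improvement formula quoted in the hunk above can be sketched with made-up numbers:

```python
# Sketch of the docstring's formula; all argument values below are illustrative.
def impurity_improvement(n, n_t, n_t_l, n_t_r,
                         impurity_parent, impurity_left, impurity_right):
    """N_t / N * (impurity - N_t_R / N_t * right_impurity
                           - N_t_L / N_t * left_impurity)"""
    return (n_t / n) * (impurity_parent
                        - (n_t_r / n_t) * impurity_right
                        - (n_t_l / n_t) * impurity_left)

# A node holding half the samples, split into two pure children:
gain = impurity_improvement(n=100, n_t=50, n_t_l=30, n_t_r=20,
                            impurity_parent=0.48,
                            impurity_left=0.0, impurity_right=0.0)
print(gain)  # 0.5 * 0.48 = 0.24
```

The `N_t / N` weighting makes improvements at small nodes count proportionally less than the same impurity drop at large nodes.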
@@ -165,8 +167,10 @@ cdef class BaseCriterion:
         ----------
         impurity_parent : double
             The initial impurity of the parent node before the split
+
         impurity_left : double
             The impurity of the left child
+
         impurity_right : double
             The impurity of the right child
 
@@ -611,10 +615,13 @@ cdef class Entropy(ClassificationCriterion):
     This handles cases where the target is a classification taking values
     0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations,
     then let
+
         count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k)
+
     be the proportion of class k observations in node m.
 
     The cross-entropy is then defined as
+
         cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k)
     """
 
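A hypothetical plain-Python sketch of the cross-entropy formula in the docstring above, using the usual convention that `0 * log(0)` contributes zero:

```python
from math import log

def cross_entropy(counts):
    """cross-entropy = -\\sum_{k=0}^{K-1} count_k log(count_k)."""
    n_m = sum(counts)
    proportions = [c / n_m for c in counts]
    # Terms with count_k == 0 are skipped (0 * log(0) := 0).
    return -sum(p * log(p) for p in proportions if p > 0.0)

print(cross_entropy([10, 0]))  # pure node: 0.0
print(cross_entropy([5, 5]))   # 50/50 node: log(2) ~= 0.693
```

A pure node scores zero, and the maximally mixed two-class node scores `log(2)`, matching the criterion's role as a splitting objective to minimize.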
@@ -1058,10 +1065,14 @@ cdef class MSE(RegressionCriterion):
 
         The absolute impurity improvement is only computed by the
         impurity_improvement method once the best split has been found.
+
         The MSE proxy is derived from
+
             sum_{i left}(y_i - y_pred_L)^2 + sum_{i right}(y_i - y_pred_R)^2
             = sum(y_i^2) - n_L * mean_{i left}(y_i)^2 - n_R * mean_{i right}(y_i)^2
+
         Neglecting constant terms, this gives:
+
             - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2
         """
         cdef SIZE_t k
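The derivation in the hunk above says the proxy differs from the full within-node squared error only by the constant `sum(y_i^2)`, so both rank candidate splits identically. A hypothetical plain-Python check (the data here is made up):

```python
def mse_proxy(y_left, y_right):
    # - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2
    return (- sum(y_left) ** 2 / len(y_left)
            - sum(y_right) ** 2 / len(y_right))

def within_node_sse(y_left, y_right):
    # Full criterion: sum of squared deviations from each child's mean.
    sse = 0.0
    for part in (y_left, y_right):
        mean = sum(part) / len(part)
        sse += sum((y - mean) ** 2 for y in part)
    return sse

y = [1.0, 2.0, 3.0, 10.0, 11.0, 12.0]
const = sum(v * v for v in y)  # the neglected constant sum(y_i^2)
for pos in (2, 3, 4):
    left, right = y[:pos], y[pos:]
    # SSE = sum(y_i^2) + proxy, for every split position.
    print(pos, within_node_sse(left, right) - mse_proxy(left, right))
```

Since the two objectives differ by a split-independent constant, minimizing one minimizes the other, which is why the cheaper proxy suffices during the split search.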
@@ -1139,6 +1150,7 @@ cdef class MAE(RegressionCriterion):
         ----------
         n_outputs : SIZE_t
             The number of targets to be predicted
+
         n_samples : SIZE_t
             The total number of samples to fit on
         """
@@ -1429,6 +1441,7 @@ cdef class FriedmanMSE(MSE):
     """Mean squared error impurity criterion with improvement score by Friedman.
 
     Uses the formula (35) in Friedman's original Gradient Boosting paper:
+
         diff = mean_left - mean_right
         improvement = n_left * n_right * diff^2 / (n_left + n_right)
     """
@@ -1483,6 +1496,7 @@ cdef class Poisson(RegressionCriterion):
     """Half Poisson deviance as impurity criterion.
 
     Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true)
+
     Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)`
     at the leaves, one always has `sum(y_pred - y_true) = 0`. It remains the
     implemented impurity (factor 2 is skipped):
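A hypothetical plain-Python sketch of the deviance formula quoted above (with the skipped factor 2 and the convention that `y * log(y/mu)` is zero when `y == 0`); the sample values are made up:

```python
from math import log

def half_poisson_deviance(y_true, y_pred):
    """1/n * sum(y * log(y / mu) + mu - y), i.e. the deviance without the 2."""
    total = 0.0
    for y, mu in zip(y_true, y_pred):
        total += (y * log(y / mu) if y > 0 else 0.0) + mu - y
    return total / len(y_true)

y = [0.0, 1.0, 2.0, 5.0]
mu = sum(y) / len(y)  # leaf prediction: y_pred = mean(y_true)
# With this prediction, the (mu - y) terms cancel: sum(y_pred - y_true) = 0.
print(half_poisson_deviance(y, [mu] * len(y)))
```

When every prediction equals its target, every term vanishes and the deviance is exactly zero, consistent with it being a nonnegative impurity.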
@@ -1519,12 +1533,16 @@ cdef class Poisson(RegressionCriterion):
 
         The absolute impurity improvement is only computed by the
         impurity_improvement method once the best split has been found.
+
         The Poisson proxy is derived from:
+
               sum_{i left }(y_i * log(y_i / y_pred_L))
             + sum_{i right}(y_i * log(y_i / y_pred_R))
             = sum(y_i * log(y_i) - n_L * mean_{i left}(y_i) * log(mean_{i left}(y_i))
                                  - n_R * mean_{i right}(y_i) * log(mean_{i right}(y_i))
+
         Neglecting constant terms, this gives
+
             - sum{i left }(y_i) * log(mean{i left}(y_i))
             - sum{i right}(y_i) * log(mean{i right}(y_i))
         """