
Commit 4bc651d

Remove some diff
Signed-off-by: Adam Li <adam2392@gmail.com>
1 parent 1994f15 commit 4bc651d

5 files changed: +25, -7 lines

sklearn/tree/_classes.py

Lines changed: 0 additions & 1 deletion
@@ -511,7 +511,6 @@ def _build_tree(
             self.min_impurity_decrease,
             self.store_leaf_values,
         )
-
         builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask)

         if self.n_outputs_ == 1 and is_classifier(self):

sklearn/tree/_criterion.pyx

Lines changed: 18 additions & 0 deletions
@@ -155,8 +155,10 @@ cdef class BaseCriterion:

         This method computes the improvement in impurity when a split occurs.
         The weighted impurity improvement equation is the following:
+
             N_t / N * (impurity - N_t_R / N_t * right_impurity
                 - N_t_L / N_t * left_impurity)
+
         where N is the total number of samples, N_t is the number of samples
         at the current node, N_t_L is the number of samples in the left child,
         and N_t_R is the number of samples in the right child,
@@ -165,8 +167,10 @@ cdef class BaseCriterion:
         ----------
         impurity_parent : double
             The initial impurity of the parent node before the split
+
         impurity_left : double
             The impurity of the left child
+
         impurity_right : double
             The impurity of the right child

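For readers skimming the docstring changes above, the weighted impurity improvement formula can be restated in plain Python. The sketch below only illustrates the quoted equation, assuming unweighted samples; it is not the Cython implementation in this file.

    def impurity_improvement(n, n_t, n_t_l, n_t_r,
                             impurity_parent, impurity_left, impurity_right):
        # N_t / N * (impurity - N_t_R / N_t * right_impurity
        #                     - N_t_L / N_t * left_impurity)
        return (n_t / n) * (
            impurity_parent
            - (n_t_r / n_t) * impurity_right
            - (n_t_l / n_t) * impurity_left
        )

    # A node holding 40 of 100 samples, split 25 left / 15 right, going from
    # impurity 0.5 to 0.2 (left) and 0.3 (right).
    print(impurity_improvement(100, 40, 25, 15, 0.5, 0.2, 0.3))  # 0.105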
@@ -611,10 +615,13 @@ cdef class Entropy(ClassificationCriterion):
     This handles cases where the target is a classification taking values
     0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations,
     then let
+
         count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k)
+
     be the proportion of class k observations in node m.

     The cross-entropy is then defined as
+
         cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k)
     """

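As a reference for the cross-entropy formula quoted above, here is a small NumPy sketch that computes the class proportions count_k and the node entropy. It is illustrative only (natural log is used here; the docstring does not fix the log base) and is independent of the Cython Entropy class.

    import numpy as np

    def node_entropy(y_node):
        # count_k = proportion of class k among the Nm samples in the node
        _, counts = np.unique(y_node, return_counts=True)
        count_k = counts / counts.sum()
        # cross-entropy = -sum_k count_k * log(count_k)
        return -np.sum(count_k * np.log(count_k))

    print(node_entropy(np.array([0, 0, 1, 1])))  # ln(2) ~= 0.693, maximally impure
    print(node_entropy(np.array([1, 1, 1, 1])))  # 0.0, a pure node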

@@ -1058,10 +1065,14 @@ cdef class MSE(RegressionCriterion):

         The absolute impurity improvement is only computed by the
         impurity_improvement method once the best split has been found.
+
         The MSE proxy is derived from
+
             sum_{i left}(y_i - y_pred_L)^2 + sum_{i right}(y_i - y_pred_R)^2
             = sum(y_i^2) - n_L * mean_{i left}(y_i)^2 - n_R * mean_{i right}(y_i)^2
+
         Neglecting constant terms, this gives:
+
             - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2
         """
         cdef SIZE_t k
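The identity behind the MSE proxy in the hunk above is easy to verify numerically. A hedged NumPy sketch follows; it demonstrates the algebra only and is not the Cython proxy code.

    import numpy as np

    rng = np.random.default_rng(0)
    y = rng.normal(size=12)
    y_left, y_right = y[:5], y[5:]
    n_l, n_r = len(y_left), len(y_right)

    # Full split criterion: squared error around each child's mean.
    sse = ((y_left - y_left.mean()) ** 2).sum() + ((y_right - y_right.mean()) ** 2).sum()

    # Proxy from the docstring, after dropping the constant sum(y_i^2):
    #     - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2
    proxy = -(y_left.sum() ** 2) / n_l - (y_right.sum() ** 2) / n_r

    # The two differ exactly by the split-independent constant sum(y_i^2),
    # so ranking candidate splits by the proxy ranks them by SSE as well.
    assert np.isclose(sse, (y ** 2).sum() + proxy)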
@@ -1139,6 +1150,7 @@ cdef class MAE(RegressionCriterion):
     ----------
     n_outputs : SIZE_t
         The number of targets to be predicted
+
     n_samples : SIZE_t
         The total number of samples to fit on
     """
@@ -1429,6 +1441,7 @@ cdef class FriedmanMSE(MSE):
     """Mean squared error impurity criterion with improvement score by Friedman.

     Uses the formula (35) in Friedman's original Gradient Boosting paper:
+
         diff = mean_left - mean_right
         improvement = n_left * n_right * diff^2 / (n_left + n_right)
     """
@@ -1483,6 +1496,7 @@ cdef class Poisson(RegressionCriterion):
     """Half Poisson deviance as impurity criterion.

     Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true)
+
     Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)`
     at the leaves, one always has `sum(y_pred - y_true) = 0`. It remains the
     implemented impurity (factor 2 is skipped):
@@ -1519,12 +1533,16 @@ cdef class Poisson(RegressionCriterion):

         The absolute impurity improvement is only computed by the
         impurity_improvement method once the best split has been found.
+
         The Poisson proxy is derived from:
+
               sum_{i left }(y_i * log(y_i / y_pred_L))
             + sum_{i right}(y_i * log(y_i / y_pred_R))
             = sum(y_i * log(y_i) - n_L * mean_{i left}(y_i) * log(mean_{i left}(y_i))
             - n_R * mean_{i right}(y_i) * log(mean_{i right}(y_i))
+
         Neglecting constant terms, this gives
+
             - sum{i left }(y_i) * log(mean{i left}(y_i))
             - sum{i right}(y_i) * log(mean{i right}(y_i))
         """

sklearn/tree/_tree.pxd

Lines changed: 2 additions & 1 deletion
@@ -141,7 +141,8 @@ cdef class TreeBuilder:
     # This class controls the various stopping criteria and the node splitting
     # evaluation order, e.g. depth-first or best-first.

-    cdef Splitter splitter
+    cdef Splitter splitter              # Splitting algorithm
+
     cdef SIZE_t min_samples_split       # Minimum number of samples in an internal node
     cdef SIZE_t min_samples_leaf        # Minimum number of samples in a leaf
     cdef double min_weight_leaf         # Minimum weight in a leaf
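The attributes documented in this hunk correspond to stopping criteria exposed as constructor parameters on the Python estimators. A minimal usage sketch is below; the exact mapping (for example, min_weight_fraction_leaf being converted to an absolute min_weight_leaf at fit time) is an assumption based on the attribute names and is not shown in this diff.

    from sklearn.tree import DecisionTreeClassifier

    clf = DecisionTreeClassifier(
        min_samples_split=4,            # smallest internal node that may be split
        min_samples_leaf=2,             # smallest leaf allowed
        min_weight_fraction_leaf=0.0,   # weight-based counterpart of min_samples_leaf
        max_depth=3,
    )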

sklearn/tree/_tree.pyx

Lines changed: 0 additions & 2 deletions
@@ -61,7 +61,6 @@ cdef extern from "<stack>" namespace "std" nogil:
 from numpy import float32 as DTYPE
 from numpy import float64 as DOUBLE

-
 cdef double INFINITY = np.inf
 cdef double EPSILON = np.finfo('double').eps

@@ -87,7 +86,6 @@ NODE_DTYPE = np.asarray(<Node[:1]>(&dummy)).dtype
 # TreeBuilder
 # =============================================================================

-
 cdef class TreeBuilder:
     """Interface for different tree building strategies."""


sklearn/tree/tests/test_tree.py

Lines changed: 5 additions & 3 deletions
@@ -33,13 +33,15 @@
     DENSE_SPLITTERS,
     SPARSE_SPLITTERS,
 )
-from sklearn.tree._tree import NODE_DTYPE, TREE_LEAF, TREE_UNDEFINED
-from sklearn.tree._tree import Tree as CythonTree
 from sklearn.tree._tree import (
+    NODE_DTYPE,
+    TREE_LEAF,
+    TREE_UNDEFINED,
     _check_n_classes,
     _check_node_ndarray,
     _check_value_ndarray,
 )
+from sklearn.tree._tree import Tree as CythonTree
 from sklearn.utils import _IS_32BIT, compute_sample_weight
 from sklearn.utils._testing import (
     assert_almost_equal,

@@ -2424,7 +2426,7 @@ def test_missing_values_on_equal_nodes_no_missing(criterion):
     X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T
     y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6])

-    dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion)
+    dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True)
     dtc.fit(X, y)

     # Goes to right node because it has the most data points
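For context, the updated test passes the fork-specific store_leaf_values flag through DecisionTreeRegressor. A minimal sketch mirroring the test's setup is below; store_leaf_values is assumed to make the fitted tree retain the training targets per leaf, as suggested by its name, since this diff does not show its semantics.

    import numpy as np
    from sklearn.tree import DecisionTreeRegressor

    X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T
    y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6])

    # Depth-1 regressor, as in the test; store_leaf_values is specific to this fork.
    reg = DecisionTreeRegressor(random_state=42, max_depth=1, store_leaf_values=True)
    reg.fit(X, y)
    print(reg.predict([[10.0]]))  # prediction from the leaf that x=10 falls into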
