Add leaf storage ability

adam2392 · adam2392 · commit 9b07f2ab2b1b · 2023-06-13T15:42:37.000-04:00
Signed-off-by: Adam Li <adam2392@gmail.com>
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
@@ -713,73 +713,6 @@ def feature_importances_(self):
 
         return self.tree_.compute_feature_importances()
 
-    def _get_y_for_leaves(self, X, sample_weight=None):
-        n_samples = X.shape[0]
-
-        # get the predictions
-        X_leaves = self.apply(X)
-
-        bootstrap_indices = np.empty(shape, dtype=np.int64)
-        for i, estimator in enumerate(self.estimators_):
-            # Get bootstrap indices.
-            if self.bootstrap:
-                n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples)
-                bootstrap_indices[:, i] = _generate_sample_indices(
-                    estimator.random_state, n_samples, n_samples_bootstrap
-                )
-            else:
-                bootstrap_indices[:, i] = np.arange(n_samples)
-
-            # Get predictions on bootstrap indices.
-            X_leaves[:, i] = X_leaves[bootstrap_indices[:, i], i]
-
-        if sorter is not None:
-            # Reassign bootstrap indices to account for target sorting.
-            bootstrap_indices = np.argsort(sorter)[bootstrap_indices]
-
-        bootstrap_indices += 1  # for sparse matrix (0s as empty)
-
-        # Get the maximum number of nodes (internal + leaves) across trees.
-        # Get the maximum number of samples per leaf across trees (if needed).
-        max_node_count = 0
-        max_samples_leaf = 0 if not leaf_subsample else max_samples_leaf
-        for i, estimator in enumerate(self.estimators_):
-            node_count = estimator.tree_.node_count
-            if node_count > max_node_count:
-                max_node_count = node_count
-            if not leaf_subsample:
-                sample_count = np.max(np.bincount(X_leaves[:, i]))
-                if sample_count > max_samples_leaf:
-                    max_samples_leaf = sample_count
-
-        # Initialize NumPy array (more efficient serialization than dict/list).
-        shape = (self.n_estimators, max_node_count, max_samples_leaf)
-        y_train_leaves = np.zeros(shape, dtype=np.int64)
-
-        for i, estimator in enumerate(self.estimators_):
-            # Group training indices by leaf node.
-            leaf_indices, leaf_values_list = _group_by_value(X_leaves[:, i])
-
-            if leaf_subsample:
-                random.seed(estimator.random_state)
-
-            # Map each leaf node to its list of training indices.
-            for leaf_idx, leaf_values in zip(leaf_indices, leaf_values_list):
-                y_indices = bootstrap_indices[:, i][leaf_values]
-
-                if sample_weight is not None:
-                    y_indices = y_indices[sample_weight[y_indices - 1] > 0]
-
-                # Subsample leaf training indices (without replacement).
-                if leaf_subsample and max_samples_leaf < len(y_indices):
-                    if not isinstance(y_indices, list):
-                        y_indices = list(y_indices)
-                    y_indices = random.sample(y_indices, max_samples_leaf)
-
-                y_train_leaves[i, leaf_idx, : len(y_indices)] = y_indices
-
-        return y_train_leaves
-
 
 # =============================================================================
 # Public estimators
diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd
@@ -72,10 +72,6 @@ cdef class BaseCriterion:
         SIZE_t end
     ) noexcept nogil
 
-    # cdef void node_samples(
-    #     self,
-    #     vector[vector[DOUBLE_t]]* dest
-    # ) noexcept nogil
 
 cdef class Criterion(BaseCriterion):
     """Abstract interface for supervised impurity criteria."""
@@ -94,6 +90,11 @@ cdef class Criterion(BaseCriterion):
     cdef void init_sum_missing(self)
     cdef void init_missing(self, SIZE_t n_missing) noexcept nogil
 
+    cdef void node_samples(
+        self,
+        vector[vector[DOUBLE_t]]* dest
+    ) noexcept nogil
+
 cdef class ClassificationCriterion(Criterion):
     """Abstract criterion for classification."""
 
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
@@ -46,7 +46,7 @@ cdef class BaseCriterion:
     in current node and in children nodes.
 
     This object stores methods on how to calculate how good a split is using
-    a set API. 
+    a set API.
 
     Samples in the "current" node are stored in `samples[start:end]` which is
     partitioned around `pos` (an index in `start:end`) so that:
@@ -186,9 +186,9 @@ cdef class BaseCriterion:
     ) noexcept nogil:
         """Abstract method which will set sample pointers in the criterion.
 
-        The dataset array that we compute criteria on is assumed to consist of 'N' 
-        ordered samples or rows (i.e. sorted). Since we pass this by reference, we 
-        use sample pointers to move the start and end around to consider only a subset of data. 
+        The dataset array that we compute criteria on is assumed to consist of 'N'
+        ordered samples or rows (i.e. sorted). Since we pass this by reference, we
+        use sample pointers to move the start and end around to consider only a subset of data.
         This function should also update relevant statistics that the class uses to compute the final criterion.
 
         Parameters
@@ -252,10 +252,28 @@ cdef class Criterion(BaseCriterion):
             Number of missing values for specific feature.
         """
         pass
-      
+
     cdef void init_sum_missing(self):
         """Init sum_missing to hold sums for missing values."""
 
+    cdef void node_samples(
+        self,
+        vector[vector[DOUBLE_t]]* dest
+    ) noexcept nogil:
+        """Append the y-values of each sample in the current node to dest."""
+        cdef SIZE_t i, j, k
+
+        # One inner vector (row) per sample in the node.
+        dest.resize(self.n_node_samples)
+
+        for i in range(self.n_node_samples):
+            # Index of the i-th sample of the current node.
+            j = self.sample_indices[self.start + i]
+            # `dest` is a pointer: dest[0] dereferences it, [i] picks the row.
+            # (dest[i] would be pointer arithmetic past the pointed-to vector.)
+            for k in range(self.n_outputs):
+                dest[0][i].push_back(self.y[j, k])
+
 cdef inline void _move_sums_classification(
     ClassificationCriterion criterion,
     double[:, ::1] sum_1,
diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
@@ -10,6 +10,7 @@
 # License: BSD 3 clause
 
 # See _splitter.pyx for details.
+from libcpp.vector cimport vector
 
 from ._criterion cimport BaseCriterion, Criterion
 
@@ -106,6 +107,8 @@ cdef class Splitter(BaseSplitter):
         const unsigned char[::1] feature_has_missing,
     ) except -1
 
+    cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil
+
     # Methods that allow modifications to stopping conditions
     cdef bint check_presplit_conditions(
         self,
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
@@ -53,12 +53,12 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil
     self.n_missing = 0
 
 cdef class BaseSplitter:
-    """This is an abstract interface for splitters. 
+    """This is an abstract interface for splitters.
 
     For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of
     covariates, labels, or both. Although scikit-learn currently only contains
     supervised tree methods, this class enables 3rd party packages to leverage
-    scikit-learn's Cython code for splitting. 
+    scikit-learn's Cython code for splitting.
 
     A splitter is usually used in conjunction with a criterion class, which explicitly handles
     computing the criteria, which we split on. The setting of that criterion class is handled
@@ -112,7 +112,7 @@ cdef class BaseSplitter:
 
     cdef int pointer_size(self) noexcept nogil:
         """Size of the pointer for split records.
-        
+
         Overriding this function allows one to use different subclasses of
         `SplitRecord`.
         """
@@ -156,7 +156,6 @@ cdef class Splitter(BaseSplitter):
         self.min_weight_leaf = min_weight_leaf
         self.random_state = random_state
 
-
     def __reduce__(self):
         return (type(self), (self.criterion,
                              self.max_features,
@@ -281,6 +280,10 @@ cdef class Splitter(BaseSplitter):
 
         self.criterion.node_value(dest)
 
+    cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil:
+        """Fill dest with the current node's sample values via the criterion."""
+        self.criterion.node_samples(dest)
+
     cdef double node_impurity(self) noexcept nogil:
         """Return the impurity of the current node."""
 
@@ -293,15 +296,15 @@ cdef class Splitter(BaseSplitter):
         bint missing_go_to_left,
     ) noexcept nogil:
         """Check stopping conditions pre-split.
-        
+
         This is typically a metric that is cheaply computed given the
         current proposed split, which is stored as a the `current_split`
         argument.
         """
         cdef SIZE_t min_samples_leaf = self.min_samples_leaf
         cdef SIZE_t end_non_missing = self.end - n_missing
         cdef SIZE_t n_left, n_right
-        
+
         if missing_go_to_left:
             n_left = current_split.pos - self.start + n_missing
             n_right = end_non_missing - current_split.pos
@@ -312,14 +315,14 @@ cdef class Splitter(BaseSplitter):
         # Reject if min_samples_leaf is not guaranteed
         if n_left < min_samples_leaf or n_right < min_samples_leaf:
             return 1
-        
+
         return 0
 
     cdef bint check_postsplit_conditions(
         self
     ) noexcept nogil:
         """Check stopping conditions after evaluating the split.
-        
+
         This takes some metric that is stored in the Criterion
         object and checks against internal stop metrics.
         """
@@ -329,10 +332,10 @@ cdef class Splitter(BaseSplitter):
         if ((self.criterion.weighted_n_left < min_weight_leaf) or
                 (self.criterion.weighted_n_right < min_weight_leaf)):
             return 1
-        
+
         return 0
 
-      
+
 cdef inline void shift_missing_values_to_left_if_required(
     SplitRecord* best,
     SIZE_t[::1] samples,
@@ -360,7 +363,7 @@ cdef inline void shift_missing_values_to_left_if_required(
 ctypedef fused Partitioner:
     DensePartitioner
     SparsePartitioner
-    
+
 cdef inline int node_split_best(
     Splitter splitter,
     Partitioner partitioner,
@@ -504,9 +507,9 @@ cdef inline int node_split_best(
 
                 if p >= end_non_missing:
                     continue
-                    
+
                 current_split.pos = p
-                
+
                 # Reject if min_samples_leaf is not guaranteed
                 if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1:
                     continue
@@ -740,8 +743,6 @@ cdef inline int node_split_random(
     cdef SIZE_t n_features = splitter.n_features
 
     cdef SIZE_t max_features = splitter.max_features
-    cdef SIZE_t min_samples_leaf = splitter.min_samples_leaf
-    cdef double min_weight_leaf = splitter.min_weight_leaf
     cdef UINT32_t* random_state = &splitter.rand_r_state
 
     cdef SplitRecord best_split, current_split
diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd
@@ -14,6 +14,7 @@ import numpy as np
 cimport numpy as cnp
 
 from libcpp.vector cimport vector
+from libcpp.unordered_map cimport unordered_map
 
 ctypedef cnp.npy_float32 DTYPE_t          # Type of X
 ctypedef cnp.npy_float64 DOUBLE_t         # Type of y, sample_weight
@@ -36,6 +37,7 @@ cdef struct Node:
     DOUBLE_t weighted_n_node_samples     # Weighted number of samples at the node
     unsigned char missing_go_to_left     # Whether features have missing values
 
+
 cdef class BaseTree:
     # Inner structures: values are stored separately from node structure,
     # since size is determined at runtime.
@@ -45,7 +47,14 @@ cdef class BaseTree:
     cdef Node* nodes                     # Array of nodes
 
     cdef SIZE_t value_stride             # The dimensionality of a vectorized output per sample
-    cdef double* value                   # Array of values prediction values for each node        
+    cdef double* value                   # Array of values prediction values for each node
+
+    # Enables the tree to store the distribution of the output at each leaf,
+    # allowing arbitrary downstream use of the leaves. This is used in the
+    # quantile estimators, for example.
+    # Maps each leaf's node ID (the key) to the sample values stored for that
+    # leaf (the value).
+    cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples
 
     # Generic Methods: These are generic methods used by any tree.
     cdef int _resize(self, SIZE_t capacity) except -1 nogil
@@ -61,7 +70,7 @@ cdef class BaseTree:
         double weighted_n_node_samples,
         unsigned char missing_go_to_left
     ) except -1 nogil
-    
+
     # Python API methods: These are methods exposed to Python
     cpdef cnp.ndarray apply(self, object X)
     cdef cnp.ndarray _apply_dense(self, object X)
@@ -101,10 +110,10 @@ cdef class Tree(BaseTree):
     # The Supervised Tree object is a binary tree structure constructed by the
     # TreeBuilder. The tree structure is used for predictions and
     # feature importances.
-    # 
+    #
     # Value of upstream properties:
     # - value_stride = n_outputs * max_n_classes
-    # - value = (capacity, n_outputs, max_n_classes) array of values          
+    # - value = (capacity, n_outputs, max_n_classes) array of values
 
     # Input/Output layout for supervised tree
     cdef public SIZE_t n_features        # Number of features in X
@@ -137,6 +146,8 @@ cdef class TreeBuilder:
     cdef SIZE_t max_depth               # Maximal tree depth
     cdef double min_impurity_decrease   # Impurity threshold for early stopping
 
+    cdef unsigned char store_leaf_values # Whether to store leaf values
+
     cpdef build(
         self,
         Tree tree,
diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx