Fix classes and criterion

adam2392 · adam2392 · commit 49526f026c46 · 2023-06-13T15:19:07.000-04:00
Signed-off-by: Adam Li <adam2392@gmail.com>
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
@@ -713,6 +713,73 @@ def feature_importances_(self):
 
         return self.tree_.compute_feature_importances()
 
+    def _get_y_for_leaves(self, X, sample_weight=None):
+        n_samples = X.shape[0]
+        # Builds a (tree, node, slot) array of 1-based sample indices; 0 = empty.
+        # Node id per sample; indexed below as X_leaves[:, i], one column per tree.
+        X_leaves = self.apply(X)
+        # NOTE(review): `shape` is only assigned later in this method -- unbound here?
+        bootstrap_indices = np.empty(shape, dtype=np.int64)
+        for i, estimator in enumerate(self.estimators_):
+            # Get bootstrap indices (all rows, in order, when bootstrap is off).
+            if self.bootstrap:
+                n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples)
+                bootstrap_indices[:, i] = _generate_sample_indices(
+                    estimator.random_state, n_samples, n_samples_bootstrap
+                )
+            else:
+                bootstrap_indices[:, i] = np.arange(n_samples)
+
+            # Reorder this tree's column so row k holds the node of bootstrap draw k.
+            X_leaves[:, i] = X_leaves[bootstrap_indices[:, i], i]
+        # NOTE(review): `sorter` is neither a parameter nor a local here -- confirm.
+        if sorter is not None:
+            # Reassign bootstrap indices to account for target sorting.
+            bootstrap_indices = np.argsort(sorter)[bootstrap_indices]
+
+        bootstrap_indices += 1  # shift to 1-based so that 0 can mark an empty slot
+        # NOTE(review): `leaf_subsample` / `max_samples_leaf` are not assigned above.
+        # Get the maximum number of nodes (internal + leaves) across trees.
+        # Get the maximum number of samples per leaf across trees (if needed).
+        max_node_count = 0
+        max_samples_leaf = 0 if not leaf_subsample else max_samples_leaf
+        for i, estimator in enumerate(self.estimators_):
+            node_count = estimator.tree_.node_count
+            if node_count > max_node_count:
+                max_node_count = node_count
+            if not leaf_subsample:
+                sample_count = np.max(np.bincount(X_leaves[:, i]))
+                if sample_count > max_samples_leaf:
+                    max_samples_leaf = sample_count
+
+        # Initialize NumPy array (more efficient serialization than dict/list).
+        shape = (self.n_estimators, max_node_count, max_samples_leaf)
+        y_train_leaves = np.zeros(shape, dtype=np.int64)
+
+        for i, estimator in enumerate(self.estimators_):
+            # Group training indices by leaf node.
+            leaf_indices, leaf_values_list = _group_by_value(X_leaves[:, i])
+            # Seed the stdlib `random` module from this tree's random_state.
+            if leaf_subsample:
+                random.seed(estimator.random_state)
+
+            # Map each leaf node to its list of training indices.
+            for leaf_idx, leaf_values in zip(leaf_indices, leaf_values_list):
+                y_indices = bootstrap_indices[:, i][leaf_values]
+                # Keep only positively weighted samples (-1 undoes the shift).
+                if sample_weight is not None:
+                    y_indices = y_indices[sample_weight[y_indices - 1] > 0]
+
+                # Subsample leaf training indices (without replacement).
+                if leaf_subsample and max_samples_leaf < len(y_indices):
+                    if not isinstance(y_indices, list):
+                        y_indices = list(y_indices)
+                    y_indices = random.sample(y_indices, max_samples_leaf)
+                # Left-aligned write; slots past len(y_indices) keep their 0.
+                y_train_leaves[i, leaf_idx, : len(y_indices)] = y_indices
+
+        return y_train_leaves
+
 
 # =============================================================================
 # Public estimators
diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd
@@ -11,6 +11,8 @@
 
 # See _criterion.pyx for implementation details.
 
+# from libcpp.vector cimport vector
+
 from ._tree cimport DTYPE_t          # Type of X
 from ._tree cimport DOUBLE_t         # Type of y, sample_weight
 from ._tree cimport SIZE_t           # Type for indices and counters
@@ -19,7 +21,7 @@ from ._tree cimport UINT32_t         # Unsigned 32 bit integer
 
 
 cdef class BaseCriterion:
-    """Abstract interface for criterion."""    
+    """Abstract interface for criterion."""
 
     # Internal structures
     cdef const DOUBLE_t[:] sample_weight  # Sample weights
@@ -70,13 +72,18 @@ cdef class BaseCriterion:
         SIZE_t end
     ) noexcept nogil
 
+    # cdef void node_samples(
+    #     self,
+    #     vector[vector[DOUBLE_t]]* dest
+    # ) noexcept nogil
+
 cdef class Criterion(BaseCriterion):
     """Abstract interface for supervised impurity criteria."""
 
     cdef const DOUBLE_t[:, ::1] y         # Values of y
     cdef SIZE_t n_missing                # Number of missing values for the feature being evaluated
     cdef bint missing_go_to_left         # Whether missing values go to the left node
-    
+
     cdef int init(
         self,
         const DOUBLE_t[:, ::1] y,
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx