Skip to content

Missing implementation of sklearn's "Estimator" Interface #2

@Dieschdel

Description

@Dieschdel

Not really a complaint, mainly an idea for a future feature:

Some (really useful) features of sklearn require sklearn's Estimator Interface. One example I am currently facing is the usage of GridSearchCV.

I currently get this to work by wrapping S_BDT in its own class; however, it would be really handy if this interface were supported by default.

Below you will find an (admittedly quite long) minimal working example:

import numpy as np
from dataclasses import dataclass
from sbdt import S_BDT
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV, train_test_split


@dataclass
class SBDTEstimator(BaseEstimator):
    """Adapter exposing ``S_BDT`` through sklearn's estimator interface.

    Every dataclass field is a constructor argument and is forwarded
    unchanged as a keyword argument to ``S_BDT`` when :meth:`fit` is called.
    Because the generated ``__init__`` has an explicit signature,
    ``BaseEstimator.get_params`` / ``set_params`` work, which is what
    utilities such as ``GridSearchCV`` rely on.

    After :meth:`fit`, the trained model is available as ``sbdt_regressor``.
    """

    # Dataset
    task: str
    feature_val_border: tuple
    continuous_learning: bool
    scale_y: bool
    cat_idx: np.ndarray
    num_idx: np.ndarray
    cat_values: np.ndarray

    # Base Parameters
    nb_trees: int = 150
    reg_delta: float = 2.0
    subsampling_ratio: float = 0.1
    learning_rate: float = 0.1
    max_depth: int = 2
    newton_boosting: bool = False
    balance_partition: bool = True
    lambda_reg_mode: int = 1  # ADD
    l2_lambda: int = 15

    # Split Configuration
    min_samples_split: int = 0
    ignore_split_constraints: bool = True
    max_features: int = 0  # RAND
    max_feature_values: int = 0  # RAND
    criterion: int = 0  # XGD_MSE
    reuse_attr: bool = True

    # Privacy Configuration
    use_dp: bool = True
    privacy_budget: float = 0.1
    privacy_budget_init_score_ratio: float = 0.1
    privacy_budget_gain_ratio: float = 0.5
    leaf_denom_noise_weight: float = 0.2
    l2_threshold: float = 0.1
    hess_l2_threshold: float = 1.0
    init_score_threshold: float = 1.0
    numeric_feature_weight: float = 1.0
    leaf_noise: int = 1  # GAUSS

    # Individual Privacy Filter Configuration
    use_privacy_filter: bool = False
    approximate_privacy_filter: bool = False
    pf_additional_nb_trees: int = 0
    pf_l2_threshold: float = 0.1
    pf_hess_l2_threshold: float = 1.0
    pf_subsampling_ratio_factor: float = 1.0

    # Stream Baseline Parameters
    additional_nb_trees: int = 0

    # other config Parameters
    refine_splits_rounds: int = 0
    num_split_candidates: int = 32
    gradient_filtering: bool = False
    leaf_clipping: bool = False
    cyclical_feature_interactions: bool = True
    refine_splits: bool = False
    random_splits_from_candidates: bool = True
    refine_splits_subsample: float = 1.0
    cut_off_leaf_denom: bool = True

    # Debugging Parameters
    custom_noise_scale: float = -1.0
    verbosity: int = 4  # 1: debug; 2: info; 4: err

    def fit(self, X_train, y_train):
        """Build a fresh ``S_BDT`` from the current parameters and train it.

        Args:
            X_train: Training features, passed straight to ``S_BDT.train``.
            y_train: Training targets, passed straight to ``S_BDT.train``.

        Returns:
            This estimator, following the sklearn ``fit`` convention.
        """
        # Use get_params() rather than self.__dict__: after a first fit the
        # instance dict also holds ``sbdt_regressor``, which would be passed
        # to S_BDT as an unexpected keyword and break refitting.
        self.sbdt_regressor = S_BDT(**self.get_params(deep=False))
        # NOTE(review): the third argument is hard-coded; presumably a dataset
        # name expected by S_BDT.train -- confirm against the sbdt API.
        self.sbdt_regressor.train(X_train, y_train, "abalone")
        return self

    def predict(self, X_test):
        """Return the predictions of the trained ``S_BDT`` for ``X_test``.

        :meth:`fit` must have been called first; otherwise the
        ``sbdt_regressor`` attribute does not exist yet.
        """
        return self.sbdt_regressor.predict(X_test)


# Synthetic regression problem: random targets plus two random numeric
# features per sample (targets are drawn first, then the features).
n_samples = 100
y = np.random.rand(n_samples).astype(float)
X = np.random.rand(n_samples, 2).astype(float)

# Shuffled train/test split using train_test_split's default proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

# Only the dataset description and the privacy budget are set explicitly;
# every other SBDTEstimator parameter keeps its default.
dataset_settings = {
    "task": "Regression",
    "feature_val_border": (0.0, 0.5),
    "continuous_learning": False,
    "privacy_budget": 0.1,
    "scale_y": True,
    "cat_idx": [],
    "num_idx": [0, 1],
    "cat_values": [[], []],
}
estimator = SBDTEstimator(**dataset_settings)

# Candidate values explored by the grid search.
param_grid = dict(nb_trees=[150, 180, 200])

grid_search = GridSearchCV(estimator, param_grid, scoring="r2")
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best parameters found: {best_params}")
print(f"Best R² found: {best_score}")

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions