Not really a complaint, mainly an idea for future features:
Some (really useful) features of sklearn require sklearn's estimator interface. One example I am currently facing is the use of GridSearchCV.
I currently get this to work by wrapping S_BDT in its own class; however, it would be really handy if this interface were supported by default.
Below you find an (admittedly quite long) minimal working example:
```python
import numpy as np
from dataclasses import dataclass

from sbdt import S_BDT
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV, train_test_split


@dataclass
class SBDTEstimator(BaseEstimator):
    # Dataset
    task: str
    feature_val_border: tuple
    continuous_learning: bool
    scale_y: bool
    cat_idx: np.ndarray
    num_idx: np.ndarray
    cat_values: np.ndarray
    # Base Parameters
    nb_trees: int = 150
    reg_delta: float = 2.0
    subsampling_ratio: float = 0.1
    learning_rate: float = 0.1
    max_depth: int = 2
    newton_boosting: bool = False
    balance_partition: bool = True
    lambda_reg_mode: int = 1  # ADD
    l2_lambda: int = 15
    # Split Configuration
    min_samples_split: int = 0
    ignore_split_constraints: bool = True
    max_features: int = 0  # RAND
    max_feature_values: int = 0  # RAND
    criterion: int = 0  # XGD_MSE
    reuse_attr: bool = True
    # Privacy Configuration
    use_dp: bool = True
    privacy_budget: float = 0.1
    privacy_budget_init_score_ratio: float = 0.1
    privacy_budget_gain_ratio: float = 0.5
    leaf_denom_noise_weight: float = 0.2
    l2_threshold: float = 0.1
    hess_l2_threshold: float = 1.0
    init_score_threshold: float = 1.0
    numeric_feature_weight: float = 1.0
    leaf_noise: int = 1  # GAUSS
    # Individual Privacy Filter Configuration
    use_privacy_filter: bool = False
    approximate_privacy_filter: bool = False
    pf_additional_nb_trees: int = 0
    pf_l2_threshold: float = 0.1
    pf_hess_l2_threshold: float = 1.0
    pf_subsampling_ratio_factor: float = 1.0
    # Stream Baseline Parameters
    additional_nb_trees: int = 0
    # Other config Parameters
    refine_splits_rounds: int = 0
    num_split_candidates: int = 32
    gradient_filtering: bool = False
    leaf_clipping: bool = False
    cyclical_feature_interactions: bool = True
    refine_splits: bool = False
    random_splits_from_candidates: bool = True
    refine_splits_subsample: float = 1.0
    cut_off_leaf_denom: bool = True
    # Debugging Parameters
    custom_noise_scale: float = -1.0
    verbosity: int = 4  # 1: debug; 2: info; 4: err

    def fit(self, X_train, y_train):
        # get_params() returns exactly the dataclass fields (and, unlike
        # self.__dict__, stays clean after fitting), so refits are safe
        self.sbdt_regressor = S_BDT(**self.get_params())
        self.sbdt_regressor.train(X_train, y_train, "abalone")
        return self  # sklearn convention: fit returns the estimator itself

    def predict(self, X_test):
        return self.sbdt_regressor.predict(X_test)
```
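One more detail that makes such a wrapper play nicely with sklearn: mixing in `RegressorMixin` gives the estimator a default `score()` method (R²), so the explicit `scoring="r2"` below could even be dropped. A minimal sketch of the changed line, purely as a suggestion; all fields and methods stay exactly as above:

```python
from sklearn.base import BaseEstimator, RegressorMixin

# Same dataclass as above, with RegressorMixin mixed in: this adds a default
# score() method (R^2 for regressors) on top of get_params()/set_params().
@dataclass
class SBDTEstimator(RegressorMixin, BaseEstimator):
    ...  # fields, fit() and predict() unchanged from the version above
```

The example then continues: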
```python
y = np.random.rand(100).astype(float)
X = np.random.rand(len(y), 2).astype(float)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

estimator = SBDTEstimator(
    task="Regression",
    feature_val_border=(0.0, 0.5),
    continuous_learning=False,
    privacy_budget=0.1,
    scale_y=True,
    cat_idx=[],
    num_idx=[0, 1],
    cat_values=[[], []],
)

param_grid = {
    "nb_trees": [150, 180, 200],
}
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring="r2",
)
grid_search.fit(X_train, y_train)

print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best R² found: {grid_search.best_score_}")
```