From 2cecab4ef0ae1a5ee56f21f64ebb85c0de4079cf Mon Sep 17 00:00:00 2001 From: "rodrigo.arenas" <31422766+rodrigo-arenas@users.noreply.github.com> Date: Wed, 23 Jul 2025 10:48:24 -0500 Subject: [PATCH 1/3] fix tests --- .../tests/test_feature_selection.py | 2 +- sklearn_genetic/tests/test_mlflow.py | 73 ++++++++++--------- 2 files changed, 39 insertions(+), 36 deletions(-) diff --git a/sklearn_genetic/tests/test_feature_selection.py b/sklearn_genetic/tests/test_feature_selection.py index c6473ea..07ba94a 100644 --- a/sklearn_genetic/tests/test_feature_selection.py +++ b/sklearn_genetic/tests/test_feature_selection.py @@ -194,7 +194,7 @@ def test_negative_criteria(): evolved_estimator = GAFeatureSelectionCV( clf, cv=3, - scoring="neg_max_error", + scoring="neg_mean_squared_error", population_size=5, generations=generations, tournament_size=3, diff --git a/sklearn_genetic/tests/test_mlflow.py b/sklearn_genetic/tests/test_mlflow.py index 0b397af..e64dff0 100644 --- a/sklearn_genetic/tests/test_mlflow.py +++ b/sklearn_genetic/tests/test_mlflow.py @@ -21,6 +21,38 @@ def mlflow_resources(): uri = mlflow.get_tracking_uri() client = MlflowClient(uri) + return uri, client + + +@pytest.fixture +def mlflow_run(mlflow_resources): + _, client = mlflow_resources + exp_id = client.get_experiment_by_name(EXPERIMENT_NAME).experiment_id + active_run = client.search_runs(exp_id, run_view_type=ViewType.ACTIVE_ONLY) + runs = [run.info.run_id for run in active_run] + return runs + + +def test_mlflow_config(mlflow_resources): + """ + Check MLflow config creation. + """ + uri, _ = mlflow_resources + mlflow_config = MLflowConfig( + tracking_uri=uri, + experiment=EXPERIMENT_NAME, + run_name="Decision Tree", + save_models=True, + tags={"team": "sklearn-genetic-opt", "version": "0.5.0"}, + ) + assert isinstance(mlflow_config, MLflowConfig) + + +def test_runs(mlflow_resources, mlflow_run): + """ + Check if runs are captured and parameters are true. + """ + uri, client = mlflow_resources mlflow_config = MLflowConfig( tracking_uri=uri, experiment=EXPERIMENT_NAME, @@ -65,44 +97,15 @@ def mlflow_resources(): ) evolved_estimator.fit(X_train, y_train) - - return uri, client, mlflow_config - - -@pytest.fixture -def mlflow_run(mlflow_resources): - _, client, _ = mlflow_resources - exp = client.get_experiment_by_name(EXPERIMENT_NAME) - if exp is None: - exp_id = client.create_experiment(EXPERIMENT_NAME) - else: - exp_id = exp.experiment_id - active_run = client.search_runs(exp_id, run_view_type=ViewType.ALL) - runs = [run.info.run_id for run in active_run] - return runs - - -@pytest.mark.order(1) -def test_mlflow_config(mlflow_resources): - """ - Check MLflow config creation. - """ - uri, _, mlflow_config = mlflow_resources - assert isinstance(mlflow_config, MLflowConfig) - - -@pytest.mark.order(2) -def test_runs(mlflow_resources, mlflow_run): - """ - Check if runs are captured and parameters are true. - """ + y_predict_ga = evolved_estimator.predict(X_test) runs = mlflow_run - assert len(runs) >= 1 + assert len(runs) >= 1 and evolved_estimator.best_params_["min_weight_fraction_leaf"] -@pytest.mark.order(3) def test_mlflow_artifacts(mlflow_resources, mlflow_run): + import os + import mlflow _, client = mlflow_resources run_id = mlflow_run[0] @@ -128,11 +131,12 @@ def test_mlflow_artifacts(mlflow_resources, mlflow_run): + def test_mlflow_params(mlflow_resources, mlflow_run): """ Test parameters are all in the run and within range. """ - _, client, _ = mlflow_resources + _, client = mlflow_resources run_id = mlflow_run[0] run = client.get_run(run_id) params = run.data.params @@ -148,7 +152,6 @@ def test_mlflow_after_run(mlflow_resources, mlflow_run): Check that the run has logged expected artifacts, metrics, and hyperparameters to the MLflow server. """ run_id = mlflow_run[0] - _, client = mlflow_resources run = client.get_run(run_id) From 64011c9af496aa233fd95190d0e3f17ca909d25c Mon Sep 17 00:00:00 2001 From: "rodrigo.arenas" <31422766+rodrigo-arenas@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:02:38 -0500 Subject: [PATCH 2/3] outliers docs --- docs/tutorials/outliers.rst | 120 ++++++++++++++++++ ...detection.py => test_outlier_detection.py} | 2 +- 2 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 docs/tutorials/outliers.rst rename sklearn_genetic/tests/{test_outliner_detection.py => test_outlier_detection.py} (99%) diff --git a/docs/tutorials/outliers.rst b/docs/tutorials/outliers.rst new file mode 100644 index 0000000..5c130ed --- /dev/null +++ b/docs/tutorials/outliers.rst @@ -0,0 +1,120 @@ +.. _outlier-detection: + +Outlier Detection Support +========================= + +Overview +-------- + +`sklearn-genetic` now includes native support for tuning outlier detection models such as +`IsolationForest`, `OneClassSVM`, and `LocalOutlierFactor` using `GASearchCV` and `GAFeatureSelectionCV`. +These models are recognized automatically, and a default scoring function is applied when +`scoring=None` is passed. + +This feature simplifies hyperparameter optimization for unsupervised anomaly detection problems, +where `y` labels are not available. + +Default Scoring Logic +---------------------- + +When `scoring=None` and an estimator is recognized as an outlier detector, a default scorer is used. +This scorer attempts the following, in order: + +1. If the estimator has `score_samples`, the mean of the scores is used. +2. If `score_samples` is unavailable but `decision_function` exists, its mean value is used. +3. As a fallback, the estimator is used with `fit_predict`, and the mean of `(predictions == 1)` is returned. + +This scoring system is designed to maximize flexibility and compatibility with a wide range of outlier models. + +.. code-block:: python + + def default_outlier_scorer(estimator, X, y=None): + if hasattr(estimator, 'score_samples'): + return np.mean(estimator.score_samples(X)) + elif hasattr(estimator, 'decision_function'): + return np.mean(estimator.decision_function(X)) + else: + predictions = estimator.fit_predict(X) + return np.mean(predictions == 1) + +Examples +-------- + +Using `GASearchCV` with `IsolationForest`: + +.. code-block:: python + + from sklearn.ensemble import IsolationForest + from sklearn_genetic import GASearchCV + from sklearn_genetic.space import Integer, Continuous + from sklearn.datasets import make_blobs + import numpy as np + + # Create synthetic data with outliers + X_normal, _ = make_blobs(n_samples=200, centers=1, n_features=4, random_state=42) + X_outliers = np.random.uniform(low=-6, high=6, size=(20, 4)) + X = np.vstack([X_normal, X_outliers]) + + estimator = IsolationForest(random_state=42) + + param_grid = { + 'contamination': Continuous(0.05, 0.3), + 'n_estimators': Integer(50, 150) + } + + search = GASearchCV(estimator=estimator, + param_grid=param_grid, + scoring=None, # triggers default_outlier_scorer + cv=3, + generations=4, + population_size=6, + n_jobs=-1) + + search.fit(X) + +Using `GAFeatureSelectionCV` with outlier detection: + +.. code-block:: python + + from sklearn_genetic import GAFeatureSelectionCV + from sklearn.ensemble import IsolationForest + + selector = GAFeatureSelectionCV( + estimator=IsolationForest(random_state=42), + scoring=None, # default_outlier_scorer used + cv=3, + generations=4, + population_size=6, + n_jobs=-1 + ) + + selector.fit(X) + +Custom Scoring +-------------- + +You may override the default logic by passing your own custom scoring function: + +.. code-block:: python + + def custom_score(estimator, X, y=None): + return np.std(estimator.score_samples(X)) + + search = GASearchCV( + estimator=IsolationForest(), + param_grid=param_grid, + scoring=custom_score, + cv=3, + generations=4, + population_size=6, + n_jobs=1 + ) + + search.fit(X) + +Limitations +----------- + +- Only estimators with `fit_predict`, `decision_function`, or `score_samples` are supported by default. +- Models not recognized as outlier detectors must be scored explicitly or will raise a `ValueError`. + diff --git a/sklearn_genetic/tests/test_outliner_detection.py b/sklearn_genetic/tests/test_outlier_detection.py similarity index 99% rename from sklearn_genetic/tests/test_outliner_detection.py rename to sklearn_genetic/tests/test_outlier_detection.py index 3e4e5b1..e2c8802 100644 --- a/sklearn_genetic/tests/test_outliner_detection.py +++ b/sklearn_genetic/tests/test_outlier_detection.py @@ -198,7 +198,7 @@ def mock_is_outlier_detector(est): sg_module.is_outlier_detector = original_is_outlier_detector -def test_importerror_fallback(): +def test_import_error_fallback(): """Test the ImportError fallback for is_outlier_detector""" import sklearn_genetic.genetic_search as sg_module From b9169be4630951d50ad8ec8ed4d769a58ca6622b Mon Sep 17 00:00:00 2001 From: "rodrigo.arenas" <31422766+rodrigo-arenas@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:11:38 -0500 Subject: [PATCH 3/3] release 0.12.0 --- docs/index.rst | 1 + docs/release_notes.rst | 9 +++++++++ sklearn_genetic/_version.py | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index a8fd0b0..794d5dc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -68,6 +68,7 @@ This command will install all the extra requirements:: tutorials/adapters tutorials/understand_cv tutorials/mlflow + tutorials/outliers tutorials/reproducibility .. toctree:: diff --git a/docs/release_notes.rst b/docs/release_notes.rst index e046e00..bd30ba1 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -3,6 +3,15 @@ Release Notes Some notes on new features in various releases +What's new in 0.12.0 +-------------------- + +^^^^^^^^^ +Features: +^^^^^^^^^ + +* Added compatibility for outlier detection algorithms + What's new in 0.11.1 -------------------- diff --git a/sklearn_genetic/_version.py b/sklearn_genetic/_version.py index 9ed6bcc..ea370a8 100644 --- a/sklearn_genetic/_version.py +++ b/sklearn_genetic/_version.py @@ -1 +1 @@ -__version__ = "0.12.0dev" +__version__ = "0.12.0"