Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ This command will install all the extra requirements::
tutorials/adapters
tutorials/understand_cv
tutorials/mlflow
tutorials/outliers
tutorials/reproducibility

.. toctree::
Expand Down
9 changes: 9 additions & 0 deletions docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@ Release Notes

Some notes on new features in various releases

What's new in 0.12.0
--------------------

^^^^^^^^^
Features:
^^^^^^^^^

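* Added support for tuning outlier detection estimators with ``GASearchCV`` and ``GAFeatureSelectionCV``, including a default scorer that is used when ``scoring=None`` (see :ref:`outlier-detection`)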

What's new in 0.11.1
--------------------

Expand Down
120 changes: 120 additions & 0 deletions docs/tutorials/outliers.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
.. _outlier-detection:

Outlier Detection Support
=========================

Overview
--------

`sklearn-genetic-opt` now includes native support for tuning outlier detection models such as
`IsolationForest`, `OneClassSVM`, and `LocalOutlierFactor` using `GASearchCV` and `GAFeatureSelectionCV`.
These models are recognized automatically, and a default scoring function is applied when
`scoring=None` is passed.

This feature simplifies hyperparameter optimization for unsupervised anomaly detection problems,
where `y` labels are not available.

Default Scoring Logic
----------------------

When `scoring=None` and an estimator is recognized as an outlier detector, a default scorer is used.
This scorer attempts the following, in order:

1. If the estimator has `score_samples`, the mean of the scores is used.
2. If `score_samples` is unavailable but `decision_function` exists, its mean value is used.
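3. As a fallback, the estimator is used with `fit_predict`, and the mean of `(predictions == 1)` is returned, i.e. the fraction of samples labelled as inliers (scikit-learn outlier detectors predict `1` for inliers and `-1` for outliers). Note that `fit_predict` refits the estimator on the data being scored.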

This scoring system is designed to maximize flexibility and compatibility with a wide range of outlier models.

.. code-block:: python

def default_outlier_scorer(estimator, X, y=None):
if hasattr(estimator, 'score_samples'):
return np.mean(estimator.score_samples(X))
elif hasattr(estimator, 'decision_function'):
return np.mean(estimator.decision_function(X))
else:
predictions = estimator.fit_predict(X)
return np.mean(predictions == 1)

Examples
--------

Using `GASearchCV` with `IsolationForest`:

.. code-block:: python

from sklearn.ensemble import IsolationForest
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Integer, Continuous
from sklearn.datasets import make_blobs
import numpy as np

# Create synthetic data with outliers
X_normal, _ = make_blobs(n_samples=200, centers=1, n_features=4, random_state=42)
X_outliers = np.random.uniform(low=-6, high=6, size=(20, 4))
X = np.vstack([X_normal, X_outliers])

estimator = IsolationForest(random_state=42)

param_grid = {
'contamination': Continuous(0.05, 0.3),
'n_estimators': Integer(50, 150)
}

search = GASearchCV(estimator=estimator,
param_grid=param_grid,
scoring=None, # triggers default_outlier_scorer
cv=3,
generations=4,
population_size=6,
n_jobs=-1)

search.fit(X)

Using `GAFeatureSelectionCV` with outlier detection (reusing `X` from the previous example):

.. code-block:: python

from sklearn_genetic import GAFeatureSelectionCV
from sklearn.ensemble import IsolationForest

selector = GAFeatureSelectionCV(
estimator=IsolationForest(random_state=42),
scoring=None, # default_outlier_scorer used
cv=3,
generations=4,
population_size=6,
n_jobs=-1
)

selector.fit(X)

Custom Scoring
--------------

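You may override the default logic by passing your own scoring callable with the signature `scorer(estimator, X, y=None)`; the snippet below reuses `X`, `param_grid`, and the imports from the first example: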

.. code-block:: python

def custom_score(estimator, X, y=None):
return np.std(estimator.score_samples(X))

search = GASearchCV(
estimator=IsolationForest(),
param_grid=param_grid,
scoring=custom_score,
cv=3,
generations=4,
population_size=6,
n_jobs=1
)

search.fit(X)

Limitations
-----------

- Only estimators with `fit_predict`, `decision_function`, or `score_samples` are supported by default.
- For estimators that are not recognized as outlier detectors, the default scorer is not applied: pass an explicit `scoring` argument, otherwise a `ValueError` is raised.

2 changes: 1 addition & 1 deletion sklearn_genetic/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.12.0dev"
__version__ = "0.12.0"
2 changes: 1 addition & 1 deletion sklearn_genetic/tests/test_feature_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def test_negative_criteria():
evolved_estimator = GAFeatureSelectionCV(
clf,
cv=3,
scoring="neg_max_error",
scoring="neg_mean_squared_error",
population_size=5,
generations=generations,
tournament_size=3,
Expand Down
73 changes: 38 additions & 35 deletions sklearn_genetic/tests/test_mlflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,38 @@
def mlflow_resources():
uri = mlflow.get_tracking_uri()
client = MlflowClient(uri)
return uri, client


@pytest.fixture
def mlflow_run(mlflow_resources):
_, client = mlflow_resources
exp_id = client.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
active_run = client.search_runs(exp_id, run_view_type=ViewType.ACTIVE_ONLY)
runs = [run.info.run_id for run in active_run]
return runs


def test_mlflow_config(mlflow_resources):
"""
Check MLflow config creation.
"""
uri, _ = mlflow_resources
mlflow_config = MLflowConfig(
tracking_uri=uri,
experiment=EXPERIMENT_NAME,
run_name="Decision Tree",
save_models=True,
tags={"team": "sklearn-genetic-opt", "version": "0.5.0"},
)
assert isinstance(mlflow_config, MLflowConfig)


def test_runs(mlflow_resources, mlflow_run):
"""
Check if runs are captured and parameters are true.
"""
uri, client = mlflow_resources
mlflow_config = MLflowConfig(
tracking_uri=uri,
experiment=EXPERIMENT_NAME,
Expand Down Expand Up @@ -65,44 +97,15 @@ def mlflow_resources():
)

evolved_estimator.fit(X_train, y_train)

return uri, client, mlflow_config


@pytest.fixture
def mlflow_run(mlflow_resources):
_, client, _ = mlflow_resources
exp = client.get_experiment_by_name(EXPERIMENT_NAME)
if exp is None:
exp_id = client.create_experiment(EXPERIMENT_NAME)
else:
exp_id = exp.experiment_id
active_run = client.search_runs(exp_id, run_view_type=ViewType.ALL)
runs = [run.info.run_id for run in active_run]
return runs


@pytest.mark.order(1)
def test_mlflow_config(mlflow_resources):
"""
Check MLflow config creation.
"""
uri, _, mlflow_config = mlflow_resources
assert isinstance(mlflow_config, MLflowConfig)


@pytest.mark.order(2)
def test_runs(mlflow_resources, mlflow_run):
"""
Check if runs are captured and parameters are true.
"""
y_predict_ga = evolved_estimator.predict(X_test)

runs = mlflow_run
assert len(runs) >= 1
assert len(runs) >= 1 and evolved_estimator.best_params_["min_weight_fraction_leaf"]


@pytest.mark.order(3)
def test_mlflow_artifacts(mlflow_resources, mlflow_run):
import os
import mlflow

_, client = mlflow_resources
run_id = mlflow_run[0]
Expand All @@ -128,11 +131,12 @@ def test_mlflow_artifacts(mlflow_resources, mlflow_run):




def test_mlflow_params(mlflow_resources, mlflow_run):
"""
Test parameters are all in the run and within range.
"""
_, client, _ = mlflow_resources
_, client = mlflow_resources
run_id = mlflow_run[0]
run = client.get_run(run_id)
params = run.data.params
Expand All @@ -148,7 +152,6 @@ def test_mlflow_after_run(mlflow_resources, mlflow_run):
Check that the run has logged expected artifacts, metrics, and hyperparameters to the MLflow server.
"""
run_id = mlflow_run[0]

_, client = mlflow_resources

run = client.get_run(run_id)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def mock_is_outlier_detector(est):
sg_module.is_outlier_detector = original_is_outlier_detector


def test_importerror_fallback():
def test_import_error_fallback():
"""Test the ImportError fallback for is_outlier_detector"""
import sklearn_genetic.genetic_search as sg_module

Expand Down
Loading