From aff5ed57393b418bc0669000dda45f4df42e63b6 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Tue, 14 Jan 2025 10:26:05 +0000 Subject: [PATCH 01/19] first draft --- .../collection/imbalance/__init__.py | 1 + .../collection/imbalance/_smote.py | 30 +++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 aeon/transformations/collection/imbalance/__init__.py create mode 100644 aeon/transformations/collection/imbalance/_smote.py diff --git a/aeon/transformations/collection/imbalance/__init__.py b/aeon/transformations/collection/imbalance/__init__.py new file mode 100644 index 0000000000..eeff2f5d85 --- /dev/null +++ b/aeon/transformations/collection/imbalance/__init__.py @@ -0,0 +1 @@ +"""Supervised transformers to rebalance colelctions of time series.""" diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py new file mode 100644 index 0000000000..24078d180e --- /dev/null +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -0,0 +1,30 @@ +"""Wrapper for imblearn minority class rebalancer SMOTE.""" + +from imblearn.over_sampling import SMOTE as smote + +from aeon.transformations.collection import BaseCollectionTransformer + +__maintainer__ = ["TonyBagnall"] +__all__ = ["SMOTE"] + + +class SMOTE(BaseCollectionTransformer): + """Wrapper for SMOTE transform.""" + + _tags = { + "capability:multivariate": True, + "capability:unequal_length": True, + "requires_y": True, + } + + def __init__(self, sampling_strategy="auto", random_state=None, k_neighbors=5): + self.sampling_strategy = sampling_strategy + self.random_state = random_state + self.k_neighbors = k_neighbors + + def _fit(self, X, y=None): + self.smote_ = smote(self.sampling_strategy, self.random_state, self.k_neighbors) + self.smote_.fit(X, y) + + def _transform(self, X, y=None): + return self.smote_.resample(X, y) From 4bec820c8b9995cacad10261b736f004541d53ce Mon Sep 17 00:00:00 2001 From: Chuanhang Qiu <80885865+LinGinQiu@users.noreply.github.com> Date: Thu, 23 Jan 2025 15:00:27 +0000 Subject: [PATCH 02/19] [ENH] wrapper for smote and adasyn of the imbalance module in collection transformers (#2501) * smote & adasyn in aeon.transformation.imbalance * smote & adasyn in aeon.transformation.imbalance * smote & adasyn in aeon.transformation.imbalance * smote & adasyn in aeon.transformation.imbalance --- .../collection/imbalance/__init__.py | 5 + .../collection/imbalance/_adasyn.py | 140 ++++++++++++ .../collection/imbalance/_smote.py | 216 +++++++++++++++++- .../collection/tests/test_imbalance.py | 60 +++++ 4 files changed, 411 insertions(+), 10 deletions(-) create mode 100644 aeon/transformations/collection/imbalance/_adasyn.py create mode 100644 aeon/transformations/collection/tests/test_imbalance.py diff --git a/aeon/transformations/collection/imbalance/__init__.py b/aeon/transformations/collection/imbalance/__init__.py index eeff2f5d85..280251ad04 100644 --- a/aeon/transformations/collection/imbalance/__init__.py +++ b/aeon/transformations/collection/imbalance/__init__.py @@ -1 +1,6 @@ """Supervised transformers to rebalance colelctions of time series.""" + +__all__ = ["SMOTE", "ADASYN"] + +from aeon.transformations.collection.imbalance._smote import SMOTE +from aeon.transformations.collection.imbalance._adasyn import ADASYN diff --git a/aeon/transformations/collection/imbalance/_adasyn.py b/aeon/transformations/collection/imbalance/_adasyn.py new file mode 100644 index 0000000000..72818b72a8 --- /dev/null +++ b/aeon/transformations/collection/imbalance/_adasyn.py @@ -0,0 +1,140 @@ +""" +implement for imblearn minority class rebalancer ADASYN. +see more in imblearn.over_sampling.ADASYN +original authors: +# Guillaume Lemaitre +# Christos Aridas +# License: MIT +""" +import numpy as np +from aeon.transformations.collection import BaseCollectionTransformer +from sklearn.neighbors import NearestNeighbors +from sklearn.utils import check_random_state +from scipy import sparse +from collections import OrderedDict + +__maintainer__ = ["TonyBagnall, Chris Qiu"] +__all__ = ["ADASYN"] + + +class ADASYN(BaseCollectionTransformer): + """ + Class to perform over-sampling using ADASYN . + + This object is a simplified implementation of ADASYN - Adaptive + Synthetic (ADASYN) algorithm as presented in imblearn.over_sampling.ADASYN + This method is similar to SMOTE, but it generates different number of + samples depending on an estimate of the local distribution of the class + to be oversampled. + Parameters + ---------- + {random_state} + + k_neighbors : int or object, default=5 + The nearest neighbors used to define the neighborhood of samples to use + to generate the synthetic samples. `~sklearn.neighbors.NearestNeighbors` + instance will be fitted in this case. + """ + + _tags = { + "capability:multivariate": True, + "capability:unequal_length": True, + "requires_y": True, + } + + def __init__(self, random_state=None, k_neighbors=5): + self.random_state = random_state + self.k_neighbors = k_neighbors + super().__init__() + + def _fit(self, X, y=None): + # set the additional_neighbor=1 + self.nn_ = NearestNeighbors(n_neighbors=self.k_neighbors + 1) + + # generate sampling target by targeting all classes but not the majority + unique, counts = np.unique(y, return_counts=True) + target_stats = dict(zip(unique, counts)) + n_sample_majority = max(target_stats.values()) + class_majority = max(target_stats, key=target_stats.get) + sampling_strategy = { + key: n_sample_majority - value + for (key, value) in target_stats.items() + if key != class_majority + } + self.sampling_strategy_ = OrderedDict( + sorted(sampling_strategy.items()) + ) + return self + + def _transform(self, X, y=None): + shape_recover = False # use to recover the shape of X + if X.ndim == 3 and X.shape[1] == 1: + X = np.squeeze(X, axis=1) # remove the middle dimension to be compatible with sklearn + shape_recover = True + random_state = check_random_state(self.random_state) + X_resampled = [X.copy()] + y_resampled = [y.copy()] + + # got the minority class label and the number needs to be generated i.e. num_majority - num_minority + for class_sample, n_samples in self.sampling_strategy_.items(): + if n_samples == 0: + continue + target_class_indices = np.flatnonzero(y == class_sample) + X_class = X[target_class_indices] + + self.nn_.fit(X) + nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] + # The ratio is computed using a one-vs-rest manner. Using majority + # in multi-class would lead to slightly different results at the + # cost of introducing a new parameter. + n_neighbors = self.nn_.n_neighbors - 1 + ratio_nn = np.sum(y[nns] != class_sample, axis=1) / n_neighbors + if not np.sum(ratio_nn): + raise RuntimeError( + "Not any neigbours belong to the majority" + " class. This case will induce a NaN case" + " with a division by zero. ADASYN is not" + " suited for this specific dataset." + " Use SMOTE instead." + ) + ratio_nn /= np.sum(ratio_nn) + n_samples_generate = np.rint(ratio_nn * n_samples).astype(int) + # rounding may cause new amount for n_samples + n_samples = np.sum(n_samples_generate) + if not n_samples: + raise ValueError( + "No samples will be generated with the provided ratio settings." + ) + + # the nearest neighbors need to be fitted only on the current class + # to find the class NN to generate new samples + self.nn_.fit(X_class) + nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] + + enumerated_class_indices = np.arange(len(target_class_indices)) + rows = np.repeat(enumerated_class_indices, n_samples_generate) + cols = random_state.choice(n_neighbors, size=n_samples) + diffs = X_class[nns[rows, cols]] - X_class[rows] + steps = random_state.uniform(size=(n_samples, 1)) + + if sparse.issparse(X): + sparse_func = type(X).__name__ + steps = getattr(sparse, sparse_func)(steps) + X_new = X_class[rows] + steps.multiply(diffs) + else: + X_new = X_class[rows] + steps * diffs + + X_new = X_new.astype(X.dtype) + y_new = np.full(n_samples, fill_value=class_sample, dtype=y.dtype) + X_resampled.append(X_new) + y_resampled.append(y_new) + + if sparse.issparse(X): + X_resampled = sparse.vstack(X_resampled, format=X.format) + else: + X_resampled = np.vstack(X_resampled) + y_resampled = np.hstack(y_resampled) + + if shape_recover: + X_resampled = X_resampled[:, np.newaxis, :] + return X_resampled, y_resampled diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index 24078d180e..36aea38b1c 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -1,15 +1,42 @@ -"""Wrapper for imblearn minority class rebalancer SMOTE.""" - -from imblearn.over_sampling import SMOTE as smote +""" +implement for imblearn minority class rebalancer SMOTE. +see more in imblearn.over_sampling.SMOTE +original authors: +# Guillaume Lemaitre +# Fernando Nogueira +# Christos Aridas +# Dzianis Dudnik +# License: MIT +""" +import numpy as np from aeon.transformations.collection import BaseCollectionTransformer +from sklearn.neighbors import NearestNeighbors +from sklearn.utils import check_random_state +from scipy import sparse +from collections import OrderedDict -__maintainer__ = ["TonyBagnall"] +__maintainer__ = ["TonyBagnall, Chris Qiu"] __all__ = ["SMOTE"] class SMOTE(BaseCollectionTransformer): - """Wrapper for SMOTE transform.""" + """ + Class to perform over-sampling using SMOTE. + + This object is a simplified implementation of SMOTE - Synthetic Minority + Over-sampling Technique as presented in imblearn.over_sampling.SMOTE + sampling_strategy is sampling target by targeting all classes but not the + majority, which directly expressed in _fit.sampling_strategy. + Parameters + ---------- + {random_state} + + k_neighbors : int or object, default=5 + The nearest neighbors used to define the neighborhood of samples to use + to generate the synthetic samples. `~sklearn.neighbors.NearestNeighbors` + instance will be fitted in this case. + """ _tags = { "capability:multivariate": True, @@ -17,14 +44,183 @@ class SMOTE(BaseCollectionTransformer): "requires_y": True, } - def __init__(self, sampling_strategy="auto", random_state=None, k_neighbors=5): - self.sampling_strategy = sampling_strategy + def __init__(self, random_state=None, k_neighbors=5): self.random_state = random_state self.k_neighbors = k_neighbors + super().__init__() def _fit(self, X, y=None): - self.smote_ = smote(self.sampling_strategy, self.random_state, self.k_neighbors) - self.smote_.fit(X, y) + # set the additional_neighbor=1 + self.nn_ = NearestNeighbors(n_neighbors=self.k_neighbors + 1) + + # generate sampling target by targeting all classes but not the majority + unique, counts = np.unique(y, return_counts=True) + target_stats = dict(zip(unique, counts)) + n_sample_majority = max(target_stats.values()) + class_majority = max(target_stats, key=target_stats.get) + sampling_strategy = { + key: n_sample_majority - value + for (key, value) in target_stats.items() + if key != class_majority + } + self.sampling_strategy_ = OrderedDict( + sorted(sampling_strategy.items()) + ) + return self def _transform(self, X, y=None): - return self.smote_.resample(X, y) + shape_recover = False # use to recover the shape of X + if X.ndim == 3 and X.shape[1] == 1: + X = np.squeeze(X, axis=1) # remove the middle dimension to be compatible with sklearn + shape_recover = True + X_resampled = [X.copy()] + y_resampled = [y.copy()] + + # got the minority class label and the number needs to be generated i.e. num_majority - num_minority + for class_sample, n_samples in self.sampling_strategy_.items(): + if n_samples == 0: + continue + target_class_indices = np.flatnonzero(y == class_sample) + X_class = X[target_class_indices] + + self.nn_.fit(X_class) + nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] + X_new, y_new = self._make_samples( + X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0 + ) + X_resampled.append(X_new) + y_resampled.append(y_new) + + if sparse.issparse(X): + X_resampled = sparse.vstack(X_resampled, format=X.format) + else: + X_resampled = np.vstack(X_resampled) + y_resampled = np.hstack(y_resampled) + if shape_recover: + X_resampled = X_resampled[:, np.newaxis, :] + return X_resampled, y_resampled + + def _make_samples( + self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0, y=None + ): + """A support function that returns artificial samples constructed along + the line connecting nearest neighbours. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Points from which the points will be created. + + y_dtype : dtype + The data type of the targets. + + y_type : str or int + The minority target value, just so the function can return the + target values for the synthetic variables with correct length in + a clear format. + + nn_data : ndarray of shape (n_samples_all, n_features) + Data set carrying all the neighbours to be used + + nn_num : ndarray of shape (n_samples_all, k_nearest_neighbours) + The nearest neighbours of each sample in `nn_data`. + + n_samples : int + The number of samples to generate. + + step_size : float, default=1.0 + The step size to create samples. + + y : ndarray of shape (n_samples_all,), default=None + The true target associated with `nn_data`. Used by Borderline SMOTE-2 to + weight the distances in the sample generation process. + + Returns + ------- + X_new : {ndarray, sparse matrix} of shape (n_samples_new, n_features) + Synthetically generated samples. + + y_new : ndarray of shape (n_samples_new,) + Target values for synthetic samples. + """ + random_state = check_random_state(self.random_state) + samples_indices = random_state.randint(low=0, high=nn_num.size, size=n_samples) + + # np.newaxis for backwards compatability with random_state + steps = step_size * random_state.uniform(size=n_samples)[:, np.newaxis] + rows = np.floor_divide(samples_indices, nn_num.shape[1]) + cols = np.mod(samples_indices, nn_num.shape[1]) + + X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type, y) + y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype) + return X_new, y_new + + def _generate_samples( + self, X, nn_data, nn_num, rows, cols, steps, y_type=None, y=None + ): + r"""Generate a synthetic sample. + + The rule for the generation is: + + .. math:: + \mathbf{s_{s}} = \mathbf{s_{i}} + \mathcal{u}(0, 1) \times + (\mathbf{s_{i}} - \mathbf{s_{nn}}) \, + + where \mathbf{s_{s}} is the new synthetic samples, \mathbf{s_{i}} is + the current sample, \mathbf{s_{nn}} is a randomly selected neighbors of + \mathbf{s_{i}} and \mathcal{u}(0, 1) is a random number between [0, 1). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Points from which the points will be created. + + nn_data : ndarray of shape (n_samples_all, n_features) + Data set carrying all the neighbours to be used. + + nn_num : ndarray of shape (n_samples_all, k_nearest_neighbours) + The nearest neighbours of each sample in `nn_data`. + + rows : ndarray of shape (n_samples,), dtype=int + Indices pointing at feature vector in X which will be used + as a base for creating new samples. + + cols : ndarray of shape (n_samples,), dtype=int + Indices pointing at which nearest neighbor of base feature vector + will be used when creating new samples. + + steps : ndarray of shape (n_samples,), dtype=float + Step sizes for new samples. + + y_type : str, int or None, default=None + Class label of the current target classes for which we want to generate + samples. + + y : ndarray of shape (n_samples_all,), default=None + The true target associated with `nn_data`. Used by Borderline SMOTE-2 to + weight the distances in the sample generation process. + + Returns + ------- + X_new : {ndarray, sparse matrix} of shape (n_samples, n_features) + Synthetically generated samples. + """ + diffs = nn_data[nn_num[rows, cols]] - X[rows] + if y is not None: # only entering for BorderlineSMOTE-2 + random_state = check_random_state(self.random_state) + mask_pair_samples = y[nn_num[rows, cols]] != y_type + diffs[mask_pair_samples] *= random_state.uniform( + low=0.0, high=0.5, size=(mask_pair_samples.sum(), 1) + ) + + if sparse.issparse(X): + sparse_func = type(X).__name__ + steps = getattr(sparse, sparse_func)(steps) + X_new = X[rows] + steps.multiply(diffs) + else: + X_new = X[rows] + steps * diffs + + return X_new.astype(X.dtype) + + + diff --git a/aeon/transformations/collection/tests/test_imbalance.py b/aeon/transformations/collection/tests/test_imbalance.py new file mode 100644 index 0000000000..f56df6fcfe --- /dev/null +++ b/aeon/transformations/collection/tests/test_imbalance.py @@ -0,0 +1,60 @@ +"""Tests for the rebalancer transformers.""" + +import numpy as np +import pytest + +from aeon.transformations.collection.imbalance import SMOTE, ADASYN + + +def test_smote(): + """Test the SMOTE class. + + This function creates a 3D numpy array, applies + SMOTE using the SMOTE class, and asserts that the + transformed data has a balanced number of samples. + """ + n_samples = 100 # Total number of labels + majority_num = 90 # number of majority class + minority_num = n_samples - majority_num # number of minority class + + X = np.random.rand(n_samples, 1, 10) + y = np.array([0] * majority_num + [1] * minority_num) + + transformer = SMOTE() + transformer.fit(X, y) + res_X, res_y = transformer.transform(X, y) + _, res_count = np.unique(res_y, return_counts=True) + + assert len(res_X) == 2 * majority_num + assert len(res_y) == 2 * majority_num + assert res_count[0] == majority_num + assert res_count[1] == majority_num + + +def test_adasyn(): + """Test the ADASYN class. + + This function creates a 3D numpy array, applies + ADASYN using the ADASYN class, and asserts that the + transformed data has a balanced number of samples. + ADASYN is a variant of SMOTE that generates synthetic samples, + but it focuses on generating samples near the decision boundary. + Therefore, sometimes, it may generate more or less samples than SMOTE, + which is why we only check if the number of samples is nearly balanced. + """ + n_samples = 100 # Total number of labels + majority_num = 90 # number of majority class + minority_num = n_samples - majority_num # number of minority class + + X = np.random.rand(n_samples, 1, 10) + y = np.array([0] * majority_num + [1] * minority_num) + + transformer = ADASYN() + transformer.fit(X, y) + res_X, res_y = transformer.transform(X, y) + _, res_count = np.unique(res_y, return_counts=True) + + assert np.abs(len(res_X) - 2 * majority_num) < minority_num + assert np.abs(len(res_y) - 2 * majority_num) < minority_num + assert res_count[0] == majority_num + assert np.abs(res_count[0] - res_count[1]) < minority_num From 5db24f39201b630cc6e125027774c9fd1c2fca0b Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Thu, 23 Jan 2025 16:50:11 +0000 Subject: [PATCH 03/19] make experimental --- README.md | 1 + .../collection/imbalance/_adasyn.py | 46 +++++++----- .../collection/imbalance/_smote.py | 75 ++++++++++--------- .../collection/imbalance/tests/__init__.py | 1 + docs/developer_guide/deprecation.md | 1 + docs/index.md | 1 + 6 files changed, 71 insertions(+), 54 deletions(-) create mode 100644 aeon/transformations/collection/imbalance/tests/__init__.py diff --git a/README.md b/README.md index e1475d6d85..e267f053fb 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ does not apply: - `segmentation` - `similarity_search` - `visualisation` +- `transformations.collection.imbalance` | Overview | | |-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| diff --git a/aeon/transformations/collection/imbalance/_adasyn.py b/aeon/transformations/collection/imbalance/_adasyn.py index 72818b72a8..2db87c36d1 100644 --- a/aeon/transformations/collection/imbalance/_adasyn.py +++ b/aeon/transformations/collection/imbalance/_adasyn.py @@ -1,17 +1,20 @@ -""" -implement for imblearn minority class rebalancer ADASYN. +"""ADASYN over sampling algorithm. + see more in imblearn.over_sampling.ADASYN original authors: # Guillaume Lemaitre # Christos Aridas # License: MIT """ + +from collections import OrderedDict + import numpy as np -from aeon.transformations.collection import BaseCollectionTransformer +from scipy import sparse from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from scipy import sparse -from collections import OrderedDict + +from aeon.transformations.collection import BaseCollectionTransformer __maintainer__ = ["TonyBagnall, Chris Qiu"] __all__ = ["ADASYN"] @@ -26,20 +29,27 @@ class ADASYN(BaseCollectionTransformer): This method is similar to SMOTE, but it generates different number of samples depending on an estimate of the local distribution of the class to be oversampled. + + Currently only works with two class problems. + Parameters ---------- - {random_state} - k_neighbors : int or object, default=5 The nearest neighbors used to define the neighborhood of samples to use to generate the synthetic samples. `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. + random_state : int, RandomState instance or None, default=None + If `int`, random_state is the seed used by the random number generator; + If `RandomState` instance, random_state is the random number generator; + If `None`, the random number generator is the `RandomState` instance used + by `np.random`. """ _tags = { - "capability:multivariate": True, - "capability:unequal_length": True, + "capability:multivariate": False, + "capability:unequal_length": False, "requires_y": True, + "python_dependencies": "imbalanced-learn", } def __init__(self, random_state=None, k_neighbors=5): @@ -51,9 +61,11 @@ def _fit(self, X, y=None): # set the additional_neighbor=1 self.nn_ = NearestNeighbors(n_neighbors=self.k_neighbors + 1) - # generate sampling target by targeting all classes but not the majority + # resamples all classes except the majority. unique, counts = np.unique(y, return_counts=True) target_stats = dict(zip(unique, counts)) + # If two or more classes are equal largest, the majority is assumed to be the + # one with the largest index. n_sample_majority = max(target_stats.values()) class_majority = max(target_stats, key=target_stats.get) sampling_strategy = { @@ -61,21 +73,16 @@ def _fit(self, X, y=None): for (key, value) in target_stats.items() if key != class_majority } - self.sampling_strategy_ = OrderedDict( - sorted(sampling_strategy.items()) - ) + self.sampling_strategy_ = OrderedDict(sorted(sampling_strategy.items())) return self def _transform(self, X, y=None): - shape_recover = False # use to recover the shape of X - if X.ndim == 3 and X.shape[1] == 1: - X = np.squeeze(X, axis=1) # remove the middle dimension to be compatible with sklearn - shape_recover = True + X = np.squeeze(X, axis=1) random_state = check_random_state(self.random_state) X_resampled = [X.copy()] y_resampled = [y.copy()] - # got the minority class label and the number needs to be generated i.e. num_majority - num_minority + # got the minority class label and the number needs to be generated for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue @@ -135,6 +142,5 @@ def _transform(self, X, y=None): X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) - if shape_recover: - X_resampled = X_resampled[:, np.newaxis, :] + X_resampled = X_resampled[:, np.newaxis, :] return X_resampled, y_resampled diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index 36aea38b1c..604179dded 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -1,6 +1,6 @@ -""" -implement for imblearn minority class rebalancer SMOTE. -see more in imblearn.over_sampling.SMOTE +"""SMOTE over sampling algorithm. + +See more in imblearn.over_sampling.SMOTE original authors: # Guillaume Lemaitre # Fernando Nogueira @@ -9,42 +9,58 @@ # License: MIT """ +from collections import OrderedDict + import numpy as np -from aeon.transformations.collection import BaseCollectionTransformer +from scipy import sparse from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from scipy import sparse -from collections import OrderedDict -__maintainer__ = ["TonyBagnall, Chris Qiu"] +from aeon.transformations.collection import BaseCollectionTransformer + +__maintainer__ = ["TonyBagnall"] __all__ = ["SMOTE"] class SMOTE(BaseCollectionTransformer): """ - Class to perform over-sampling using SMOTE. + Over-sampling using the Synthetic Minority Over-sampling TEchnique (SMOTE)[1]_. + + An adaptation of the imbalance-learn implementation of SMOTE in + imblearn.over_sampling.SMOTE. sampling_strategy is sampling target by + targeting all classes but not the majority, which is directly expressed in + _fit.sampling_strategy. - This object is a simplified implementation of SMOTE - Synthetic Minority - Over-sampling Technique as presented in imblearn.over_sampling.SMOTE - sampling_strategy is sampling target by targeting all classes but not the - majority, which directly expressed in _fit.sampling_strategy. Parameters ---------- - {random_state} - k_neighbors : int or object, default=5 The nearest neighbors used to define the neighborhood of samples to use to generate the synthetic samples. `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. + random_state : int, RandomState instance or None, default=None + If `int`, random_state is the seed used by the random number generator; + If `RandomState` instance, random_state is the random number generator; + If `None`, the random number generator is the `RandomState` instance used + by `np.random`. + + See Also + -------- + ADASYN + + References + ---------- + .. [1] Chawla et al. SMOTE: synthetic minority over-sampling technique, Journal + of Artificial Intelligence Research 16(1): 321–357, 2002. + https://dl.acm.org/doi/10.5555/1622407.1622416 """ _tags = { - "capability:multivariate": True, - "capability:unequal_length": True, + "capability:multivariate": False, + "capability:unequal_length": False, "requires_y": True, } - def __init__(self, random_state=None, k_neighbors=5): + def __init__(self, k_neighbors=5, random_state=None): self.random_state = random_state self.k_neighbors = k_neighbors super().__init__() @@ -63,20 +79,16 @@ def _fit(self, X, y=None): for (key, value) in target_stats.items() if key != class_majority } - self.sampling_strategy_ = OrderedDict( - sorted(sampling_strategy.items()) - ) + self.sampling_strategy_ = OrderedDict(sorted(sampling_strategy.items())) return self def _transform(self, X, y=None): - shape_recover = False # use to recover the shape of X - if X.ndim == 3 and X.shape[1] == 1: - X = np.squeeze(X, axis=1) # remove the middle dimension to be compatible with sklearn - shape_recover = True + # remove the channel dimension to be compatible with sklearn + X = np.squeeze(X, axis=1) X_resampled = [X.copy()] y_resampled = [y.copy()] - # got the minority class label and the number needs to be generated i.e. num_majority - num_minority + # got the minority class label and the number needs to be generated for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue @@ -96,15 +108,13 @@ def _transform(self, X, y=None): else: X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) - if shape_recover: - X_resampled = X_resampled[:, np.newaxis, :] + X_resampled = X_resampled[:, np.newaxis, :] return X_resampled, y_resampled def _make_samples( - self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0, y=None + self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0, y=None ): - """A support function that returns artificial samples constructed along - the line connecting nearest neighbours. + """Make artificial samples constructed based on nearest neighbours. Parameters ---------- @@ -156,7 +166,7 @@ def _make_samples( return X_new, y_new def _generate_samples( - self, X, nn_data, nn_num, rows, cols, steps, y_type=None, y=None + self, X, nn_data, nn_num, rows, cols, steps, y_type=None, y=None ): r"""Generate a synthetic sample. @@ -221,6 +231,3 @@ def _generate_samples( X_new = X[rows] + steps * diffs return X_new.astype(X.dtype) - - - diff --git a/aeon/transformations/collection/imbalance/tests/__init__.py b/aeon/transformations/collection/imbalance/tests/__init__.py new file mode 100644 index 0000000000..55831a6ec8 --- /dev/null +++ b/aeon/transformations/collection/imbalance/tests/__init__.py @@ -0,0 +1 @@ +"""Test resampling transformers.""" diff --git a/docs/developer_guide/deprecation.md b/docs/developer_guide/deprecation.md index 4b10d81cb2..04aadbab3a 100644 --- a/docs/developer_guide/deprecation.md +++ b/docs/developer_guide/deprecation.md @@ -24,6 +24,7 @@ experimental. Currently experimental modules are: - `segmentation` - `similarity_search` - `visualisation` +- `transformations.collection.imbalance` When we introduce a new module, we may classify it as experimental until the API is stable. We will try to not make drastic changes to experimental modules, but we need diff --git a/docs/index.md b/docs/index.md index 11b558839e..76fb04e1ce 100644 --- a/docs/index.md +++ b/docs/index.md @@ -276,6 +276,7 @@ experimental modules are: - `segmentation` - `similarity_search` - `visualisation` +- `transformations.collection.imbalance` ```{toctree} :caption: Using aeon From d9b35b79b5df40010cbe792a283009ddbff293d3 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Thu, 23 Jan 2025 17:01:17 +0000 Subject: [PATCH 04/19] inherit from SMOTE --- .../collection/imbalance/_adasyn.py | 78 ++++--------------- 1 file changed, 16 insertions(+), 62 deletions(-) diff --git a/aeon/transformations/collection/imbalance/_adasyn.py b/aeon/transformations/collection/imbalance/_adasyn.py index 2db87c36d1..6b487529d8 100644 --- a/aeon/transformations/collection/imbalance/_adasyn.py +++ b/aeon/transformations/collection/imbalance/_adasyn.py @@ -1,80 +1,34 @@ -"""ADASYN over sampling algorithm. - -see more in imblearn.over_sampling.ADASYN -original authors: -# Guillaume Lemaitre -# Christos Aridas -# License: MIT -""" - -from collections import OrderedDict +"""ADASYN over sampling algorithm.""" import numpy as np from scipy import sparse -from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from aeon.transformations.collection import BaseCollectionTransformer +from aeon.transformations.collection.imbalance import SMOTE -__maintainer__ = ["TonyBagnall, Chris Qiu"] +__maintainer__ = ["TonyBagnall"] __all__ = ["ADASYN"] -class ADASYN(BaseCollectionTransformer): +class ADASYN(SMOTE): """ - Class to perform over-sampling using ADASYN . + Over-sampling using Adaptive Synthetic Sampling (ADASYN). - This object is a simplified implementation of ADASYN - Adaptive - Synthetic (ADASYN) algorithm as presented in imblearn.over_sampling.ADASYN - This method is similar to SMOTE, but it generates different number of + Adaptation of imblearn.over_sampling.ADASYN + original authors: + # Guillaume Lemaitre + # Christos Aridas + # License: MIT + + This transformer extends SMOTE, but it generates different number of samples depending on an estimate of the local distribution of the class to be oversampled. - - Currently only works with two class problems. - - Parameters - ---------- - k_neighbors : int or object, default=5 - The nearest neighbors used to define the neighborhood of samples to use - to generate the synthetic samples. `~sklearn.neighbors.NearestNeighbors` - instance will be fitted in this case. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. """ - _tags = { - "capability:multivariate": False, - "capability:unequal_length": False, - "requires_y": True, - "python_dependencies": "imbalanced-learn", - } - - def __init__(self, random_state=None, k_neighbors=5): - self.random_state = random_state - self.k_neighbors = k_neighbors - super().__init__() - - def _fit(self, X, y=None): - # set the additional_neighbor=1 - self.nn_ = NearestNeighbors(n_neighbors=self.k_neighbors + 1) - - # resamples all classes except the majority. - unique, counts = np.unique(y, return_counts=True) - target_stats = dict(zip(unique, counts)) - # If two or more classes are equal largest, the majority is assumed to be the - # one with the largest index. - n_sample_majority = max(target_stats.values()) - class_majority = max(target_stats, key=target_stats.get) - sampling_strategy = { - key: n_sample_majority - value - for (key, value) in target_stats.items() - if key != class_majority - } - self.sampling_strategy_ = OrderedDict(sorted(sampling_strategy.items())) - return self + def __init__( + self, + ): + super().__init__(random_state=None, k_neighbors=5) def _transform(self, X, y=None): X = np.squeeze(X, axis=1) From 97c7466d076899b54119c7987fdebeaf65ebafbe Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Fri, 24 Jan 2025 11:12:58 +0000 Subject: [PATCH 05/19] test equivalence to imblearn --- .../collection/imbalance/_adasyn.py | 6 ++-- .../collection/imbalance/_smote.py | 4 +-- .../collection/imbalance/tests/test_adasyn.py | 32 +++++++++++++++++++ .../collection/imbalance/tests/test_smote.py | 32 +++++++++++++++++++ 4 files changed, 68 insertions(+), 6 deletions(-) create mode 100644 aeon/transformations/collection/imbalance/tests/test_adasyn.py create mode 100644 aeon/transformations/collection/imbalance/tests/test_smote.py diff --git a/aeon/transformations/collection/imbalance/_adasyn.py b/aeon/transformations/collection/imbalance/_adasyn.py index 6b487529d8..0d78637f86 100644 --- a/aeon/transformations/collection/imbalance/_adasyn.py +++ b/aeon/transformations/collection/imbalance/_adasyn.py @@ -25,10 +25,8 @@ class ADASYN(SMOTE): to be oversampled. """ - def __init__( - self, - ): - super().__init__(random_state=None, k_neighbors=5) + def __init__(self, random_state=None, k_neighbors=5): + super().__init__(random_state=random_state, k_neighbors=k_neighbors) def _transform(self, X, y=None): X = np.squeeze(X, axis=1) diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index 604179dded..f56e6f7b40 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -66,10 +66,10 @@ def __init__(self, k_neighbors=5, random_state=None): super().__init__() def _fit(self, X, y=None): - # set the additional_neighbor=1 + # set the additional_neighbor required by SMOTE self.nn_ = NearestNeighbors(n_neighbors=self.k_neighbors + 1) - # generate sampling target by targeting all classes but not the majority + # generate sampling target by targeting all classes except the majority unique, counts = np.unique(y, return_counts=True) target_stats = dict(zip(unique, counts)) n_sample_majority = max(target_stats.values()) diff --git a/aeon/transformations/collection/imbalance/tests/test_adasyn.py b/aeon/transformations/collection/imbalance/tests/test_adasyn.py new file mode 100644 index 0000000000..3557f85cb4 --- /dev/null +++ b/aeon/transformations/collection/imbalance/tests/test_adasyn.py @@ -0,0 +1,32 @@ +"""Test ADASYN oversampler ported from imblearn.""" + +import numpy as np +import pytest + +from aeon.testing.data_generation import make_example_3d_numpy +from aeon.transformations.collection.imbalance import ADASYN +from aeon.utils.validation._dependencies import _check_soft_dependencies + + +@pytest.mark.skipif( + not _check_soft_dependencies( + "imbalanced-learn", + package_import_alias={"imbalanced-learn": "imblearn"}, + severity="none", + ), + reason="skip test if required soft dependency imbalanced-learn not available", +) +def test_equivalence_imbalance(): + """Test ported ADASYN code produces the same as imblearn version.""" + from imblearn.over_sampling import ADASYN as imbADASYN + + X, y = make_example_3d_numpy(n_cases=20, n_channels=1) + y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + X = X.squeeze() + s1 = imbADASYN(random_state=49) + X2, y2 = s1.fit_resample(X, y) + s2 = ADASYN(random_state=49) + X3, y3 = s2.fit_transform(X, y) + X3 = X3.squeeze() + assert np.array_equal(y2, y3) + assert np.allclose(X2, X3, atol=1e-4) diff --git a/aeon/transformations/collection/imbalance/tests/test_smote.py b/aeon/transformations/collection/imbalance/tests/test_smote.py new file mode 100644 index 0000000000..53cc95cac7 --- /dev/null +++ b/aeon/transformations/collection/imbalance/tests/test_smote.py @@ -0,0 +1,32 @@ +"""Test function for SMOTE.""" + +import numpy as np +import pytest + +from aeon.testing.data_generation import make_example_3d_numpy +from aeon.transformations.collection.imbalance import SMOTE +from aeon.utils.validation._dependencies import _check_soft_dependencies + + +@pytest.mark.skipif( + not _check_soft_dependencies( + "imbalanced-learn", + package_import_alias={"imbalanced-learn": "imblearn"}, + severity="none", + ), + reason="skip test if required soft dependency imbalanced-learn not available", +) +def test_equivalence_imbalance(): + """Test ported SMOTE code produces the same as imblearn version.""" + from imblearn.over_sampling import SMOTE as imbSMOTE + + X, y = make_example_3d_numpy(n_cases=20, n_channels=1) + y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + X = X.squeeze() + s1 = imbSMOTE(random_state=49) + X2, y2 = s1.fit_resample(X, y) + s2 = SMOTE(random_state=49) + X3, y3 = s2.fit_transform(X, y) + X3 = X3.squeeze() + assert np.array_equal(y2, y3) + assert np.allclose(X2, X3, atol=1e-4) From a440a90a43e88d2149cf4a181875e4aad03360fb Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Fri, 24 Jan 2025 11:18:34 +0000 Subject: [PATCH 06/19] move tests --- .../collection/imbalance/tests/test_adasyn.py | 29 +++++++++ .../collection/imbalance/tests/test_smote.py | 25 ++++++++ .../collection/tests/test_imbalance.py | 60 ------------------- 3 files changed, 54 insertions(+), 60 deletions(-) delete mode 100644 aeon/transformations/collection/tests/test_imbalance.py diff --git a/aeon/transformations/collection/imbalance/tests/test_adasyn.py b/aeon/transformations/collection/imbalance/tests/test_adasyn.py index 3557f85cb4..0bb5c62ea6 100644 --- a/aeon/transformations/collection/imbalance/tests/test_adasyn.py +++ b/aeon/transformations/collection/imbalance/tests/test_adasyn.py @@ -8,6 +8,35 @@ from aeon.utils.validation._dependencies import _check_soft_dependencies +def test_adasyn(): + """Test the ADASYN class. + + This function creates a 3D numpy array, applies + ADASYN using the ADASYN class, and asserts that the + transformed data has a balanced number of samples. + ADASYN is a variant of SMOTE that generates synthetic samples, + but it focuses on generating samples near the decision boundary. + Therefore, sometimes, it may generate more or less samples than SMOTE, + which is why we only check if the number of samples is nearly balanced. + """ + n_samples = 100 # Total number of labels + majority_num = 90 # number of majority class + minority_num = n_samples - majority_num # number of minority class + + X = np.random.rand(n_samples, 1, 10) + y = np.array([0] * majority_num + [1] * minority_num) + + transformer = ADASYN() + transformer.fit(X, y) + res_X, res_y = transformer.transform(X, y) + _, res_count = np.unique(res_y, return_counts=True) + + assert np.abs(len(res_X) - 2 * majority_num) < minority_num + assert np.abs(len(res_y) - 2 * majority_num) < minority_num + assert res_count[0] == majority_num + assert np.abs(res_count[0] - res_count[1]) < minority_num + + @pytest.mark.skipif( not _check_soft_dependencies( "imbalanced-learn", diff --git a/aeon/transformations/collection/imbalance/tests/test_smote.py b/aeon/transformations/collection/imbalance/tests/test_smote.py index 53cc95cac7..70189633d0 100644 --- a/aeon/transformations/collection/imbalance/tests/test_smote.py +++ b/aeon/transformations/collection/imbalance/tests/test_smote.py @@ -8,6 +8,31 @@ from aeon.utils.validation._dependencies import _check_soft_dependencies +def test_smote(): + """Test the SMOTE class. + + This function creates a 3D numpy array, applies + SMOTE using the SMOTE class, and asserts that the + transformed data has a balanced number of samples. + """ + n_samples = 100 # Total number of labels + majority_num = 90 # number of majority class + minority_num = n_samples - majority_num # number of minority class + + X = np.random.rand(n_samples, 1, 10) + y = np.array([0] * majority_num + [1] * minority_num) + + transformer = SMOTE() + transformer.fit(X, y) + res_X, res_y = transformer.transform(X, y) + _, res_count = np.unique(res_y, return_counts=True) + + assert len(res_X) == 2 * majority_num + assert len(res_y) == 2 * majority_num + assert res_count[0] == majority_num + assert res_count[1] == majority_num + + @pytest.mark.skipif( not _check_soft_dependencies( "imbalanced-learn", diff --git a/aeon/transformations/collection/tests/test_imbalance.py b/aeon/transformations/collection/tests/test_imbalance.py deleted file mode 100644 index f56df6fcfe..0000000000 --- a/aeon/transformations/collection/tests/test_imbalance.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Tests for the rebalancer transformers.""" - -import numpy as np -import pytest - -from aeon.transformations.collection.imbalance import SMOTE, ADASYN - - -def test_smote(): - """Test the SMOTE class. - - This function creates a 3D numpy array, applies - SMOTE using the SMOTE class, and asserts that the - transformed data has a balanced number of samples. - """ - n_samples = 100 # Total number of labels - majority_num = 90 # number of majority class - minority_num = n_samples - majority_num # number of minority class - - X = np.random.rand(n_samples, 1, 10) - y = np.array([0] * majority_num + [1] * minority_num) - - transformer = SMOTE() - transformer.fit(X, y) - res_X, res_y = transformer.transform(X, y) - _, res_count = np.unique(res_y, return_counts=True) - - assert len(res_X) == 2 * majority_num - assert len(res_y) == 2 * majority_num - assert res_count[0] == majority_num - assert res_count[1] == majority_num - - -def test_adasyn(): - """Test the ADASYN class. - - This function creates a 3D numpy array, applies - ADASYN using the ADASYN class, and asserts that the - transformed data has a balanced number of samples. - ADASYN is a variant of SMOTE that generates synthetic samples, - but it focuses on generating samples near the decision boundary. - Therefore, sometimes, it may generate more or less samples than SMOTE, - which is why we only check if the number of samples is nearly balanced. - """ - n_samples = 100 # Total number of labels - majority_num = 90 # number of majority class - minority_num = n_samples - majority_num # number of minority class - - X = np.random.rand(n_samples, 1, 10) - y = np.array([0] * majority_num + [1] * minority_num) - - transformer = ADASYN() - transformer.fit(X, y) - res_X, res_y = transformer.transform(X, y) - _, res_count = np.unique(res_y, return_counts=True) - - assert np.abs(len(res_X) - 2 * majority_num) < minority_num - assert np.abs(len(res_y) - 2 * majority_num) < minority_num - assert res_count[0] == majority_num - assert np.abs(res_count[0] - res_count[1]) < minority_num From 6e24ef0c5327eabd284fa988095b92ae979b0ffe Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Fri, 24 Jan 2025 11:29:58 +0000 Subject: [PATCH 07/19] format --- aeon/transformations/collection/imbalance/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aeon/transformations/collection/imbalance/__init__.py b/aeon/transformations/collection/imbalance/__init__.py index 280251ad04..38441e9e9f 100644 --- a/aeon/transformations/collection/imbalance/__init__.py +++ b/aeon/transformations/collection/imbalance/__init__.py @@ -1,6 +1,6 @@ """Supervised transformers to rebalance colelctions of time series.""" -__all__ = ["SMOTE", "ADASYN"] +__all__ = ["ADASYN", "SMOTE"] -from aeon.transformations.collection.imbalance._smote import SMOTE from aeon.transformations.collection.imbalance._adasyn import ADASYN +from aeon.transformations.collection.imbalance._smote import SMOTE From c73111755a727ecdf804010b85a8f6295322f13b Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Fri, 24 Jan 2025 11:50:54 +0000 Subject: [PATCH 08/19] import --- .../collection/imbalance/_adasyn.py | 17 ++--------- .../collection/imbalance/_smote.py | 29 ++++++------------- 2 files changed, 12 insertions(+), 34 deletions(-) diff --git a/aeon/transformations/collection/imbalance/_adasyn.py b/aeon/transformations/collection/imbalance/_adasyn.py index 0d78637f86..412007009d 100644 --- a/aeon/transformations/collection/imbalance/_adasyn.py +++ b/aeon/transformations/collection/imbalance/_adasyn.py @@ -1,10 +1,9 @@ """ADASYN over sampling algorithm.""" import numpy as np -from scipy import sparse from sklearn.utils import check_random_state -from aeon.transformations.collection.imbalance import SMOTE +from aeon.transformations.collection.imbalance._smote import SMOTE __maintainer__ = ["TonyBagnall"] __all__ = ["ADASYN"] @@ -75,23 +74,13 @@ def _transform(self, X, y=None): cols = random_state.choice(n_neighbors, size=n_samples) diffs = X_class[nns[rows, cols]] - X_class[rows] steps = random_state.uniform(size=(n_samples, 1)) - - if sparse.issparse(X): - sparse_func = type(X).__name__ - steps = getattr(sparse, sparse_func)(steps) - X_new = X_class[rows] + steps.multiply(diffs) - else: - X_new = X_class[rows] + steps * diffs + X_new = X_class[rows] + steps * diffs X_new = X_new.astype(X.dtype) y_new = np.full(n_samples, fill_value=class_sample, dtype=y.dtype) X_resampled.append(X_new) y_resampled.append(y_new) - - if sparse.issparse(X): - X_resampled = sparse.vstack(X_resampled, format=X.format) - else: - X_resampled = np.vstack(X_resampled) + X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) X_resampled = X_resampled[:, np.newaxis, :] diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index f56e6f7b40..f8b7084e5e 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -12,7 +12,6 @@ from collections import OrderedDict import numpy as np -from scipy import sparse from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state @@ -33,10 +32,10 @@ class SMOTE(BaseCollectionTransformer): Parameters ---------- - k_neighbors : int or object, default=5 - The nearest neighbors used to define the neighborhood of samples to use - to generate the synthetic samples. `~sklearn.neighbors.NearestNeighbors` - instance will be fitted in this case. + k_neighbors : int, default=5 + The number of nearest neighbors used to define the neighborhood of samples + to use to generate the synthetic time series. + `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. random_state : int, RandomState instance or None, default=None If `int`, random_state is the seed used by the random number generator; If `RandomState` instance, random_state is the random number generator; @@ -102,11 +101,7 @@ def _transform(self, X, y=None): ) X_resampled.append(X_new) y_resampled.append(y_new) - - if sparse.issparse(X): - X_resampled = sparse.vstack(X_resampled, format=X.format) - else: - X_resampled = np.vstack(X_resampled) + X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) X_resampled = X_resampled[:, np.newaxis, :] return X_resampled, y_resampled @@ -118,8 +113,9 @@ def _make_samples( Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Points from which the points will be created. + X : np.ndarray + Shape (n_cases, n_timepoints), time series from which the new series will + be created. y_dtype : dtype The data type of the targets. @@ -222,12 +218,5 @@ def _generate_samples( diffs[mask_pair_samples] *= random_state.uniform( low=0.0, high=0.5, size=(mask_pair_samples.sum(), 1) ) - - if sparse.issparse(X): - sparse_func = type(X).__name__ - steps = getattr(sparse, sparse_func)(steps) - X_new = X[rows] + steps.multiply(diffs) - else: - X_new = X[rows] + steps * diffs - + X_new = X[rows] + steps * diffs return X_new.astype(X.dtype) From 770ea7515b35ac0a7fe104081e69eab6fae9ba50 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Fri, 24 Jan 2025 13:09:14 +0000 Subject: [PATCH 09/19] add test parameters --- .../_yield_estimator_checks.py | 5 ++- .../collection/imbalance/_smote.py | 38 +++++++++++++++---- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/aeon/testing/estimator_checking/_yield_estimator_checks.py b/aeon/testing/estimator_checking/_yield_estimator_checks.py index 70f714d4d9..b90e15df68 100644 --- a/aeon/testing/estimator_checking/_yield_estimator_checks.py +++ b/aeon/testing/estimator_checking/_yield_estimator_checks.py @@ -637,7 +637,10 @@ def check_persistence_via_pickle(estimator, datatype): def check_fit_deterministic(estimator, datatype): """Test that fit is deterministic. - Check that calling fit twice is equivalent to calling it once. + Check that calling fit twice is equivalent to calling it once, in terms of the + output of non-state changing methods such as predict and transform. Calls + fit, then calls all non-state changing methods, then calls fit and non-state + changing methods again, checking the output is the same. """ estimator = _clone_estimator(estimator, random_state=0) _run_estimator_method(estimator, "fit", datatype, "train") diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index f8b7084e5e..ee00c78174 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -54,8 +54,6 @@ class SMOTE(BaseCollectionTransformer): """ _tags = { - "capability:multivariate": False, - "capability:unequal_length": False, "requires_y": True, } @@ -143,11 +141,11 @@ def _make_samples( Returns ------- - X_new : {ndarray, sparse matrix} of shape (n_samples_new, n_features) - Synthetically generated samples. + X_new : ndarray + Synthetically generated samples of shape (n_samples_new, n_timepoints). - y_new : ndarray of shape (n_samples_new,) - Target values for synthetic samples. + y_new : ndarray + Target values for synthetic samples of shape (n_samples_new,). """ random_state = check_random_state(self.random_state) samples_indices = random_state.randint(low=0, high=nn_num.size, size=n_samples) @@ -178,8 +176,9 @@ def _generate_samples( Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Points from which the points will be created. + X : np.ndarray + Series from which the points will be created of shape (n_cases, + n_timepoints). nn_data : ndarray of shape (n_samples_all, n_features) Data set carrying all the neighbours to be used. @@ -220,3 +219,26 @@ def _generate_samples( ) X_new = X[rows] + steps * diffs return X_new.astype(X.dtype) + + @classmethod + def _get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. + ClassifierChannelEnsemble provides the following special sets: + - "results_comparison" - used in some classifiers to compare against + previously generated results where the default set of parameters + cannot produce suitable probability estimates + + Returns + ------- + params : dict or list of dict, default={} + Parameters to create testing instances of the class. + Each dict are parameters to construct an "interesting" test instance, i.e., + `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. + """ + return {"k_neighbors": 1} From 2337f67c04c849834257160f20dc08e70451abb1 Mon Sep 17 00:00:00 2001 From: Chuanhang Qiu <80885865+LinGinQiu@users.noreply.github.com> Date: Thu, 15 May 2025 20:25:13 +0100 Subject: [PATCH 10/19] Ported OHIT (#2573) --- .../collection/imbalance/__init__.py | 3 +- .../collection/imbalance/_ohit.py | 256 ++++++++++++++++++ .../collection/imbalance/tests/test_ohit.py | 31 +++ 3 files changed, 289 insertions(+), 1 deletion(-) create mode 100644 aeon/transformations/collection/imbalance/_ohit.py create mode 100644 aeon/transformations/collection/imbalance/tests/test_ohit.py diff --git a/aeon/transformations/collection/imbalance/__init__.py b/aeon/transformations/collection/imbalance/__init__.py index 38441e9e9f..2431c4c363 100644 --- a/aeon/transformations/collection/imbalance/__init__.py +++ b/aeon/transformations/collection/imbalance/__init__.py @@ -1,6 +1,7 @@ """Supervised transformers to rebalance colelctions of time series.""" -__all__ = ["ADASYN", "SMOTE"] +__all__ = ["ADASYN", "SMOTE", "OHIT"] from aeon.transformations.collection.imbalance._adasyn import ADASYN from aeon.transformations.collection.imbalance._smote import SMOTE +from aeon.transformations.collection.imbalance._ohit import OHIT \ No newline at end of file diff --git a/aeon/transformations/collection/imbalance/_ohit.py b/aeon/transformations/collection/imbalance/_ohit.py new file mode 100644 index 0000000000..94bb86eedf --- /dev/null +++ b/aeon/transformations/collection/imbalance/_ohit.py @@ -0,0 +1,256 @@ +"""OHIT over sampling algorithm. + +An adaptation of the oversampling method based on DRSNN clustering. + +Original authors: +# zhutuanfei +""" + +from collections import OrderedDict + +import numpy as np +from scipy.stats import multivariate_normal +from sklearn.utils import check_random_state +from aeon.transformations.collection import BaseCollectionTransformer +from sklearn.neighbors import NearestNeighbors +from sklearn.covariance import ledoit_wolf + + +__all__ = ["OHIT"] + + +class OHIT(BaseCollectionTransformer): + """ + Over-sampling using the Over-sampling based on (OHIT). + + This method is based on Density-Ratio Shared Nearest Neighbor (DRSNN) clustering to find high-density regions + of minority class samples and generate synthetic samples within these clusters and Shrinkage estimation of + large-dimensional covariance matrix + + DRSNN also contains three parameters(i.e.,drT ,k and kapa),it is capable of selecting the proper value for + drT around 1.In addition,k and kapa can be set in a complementary way to avoid the merging and + dissociation of clusters,that is,a large k with a relatively low kapa. + Parameters + ---------- + k : int, the nearest neighbor parameter in SNN similarity + if None, set k = int(np.ceil(n ** 0.5 * 1.25)) where n is the number of minority samples + kapa : int, the nearest neighbor parameter in defining density ratio + if None, set kapa = int(np.ceil(n ** 0.5)) where n is the number of minority samples + drT : float, default=0.9, the threshold of density ratio. + distance : str or callable, default='euclidean' + Distance metric to use for KNN in SNN similarity. + random_state : int, RandomState instance or None, default=None + If `int`, random_state is the seed used by the random number generator; + If `RandomState` instance, random_state is the random number generator; + If `None`, the random number generator is the `RandomState` instance used + by `np.random`. + """ + + _tags = { + "requires_y": True, + } + + def __init__(self, k=None, kapa=None, drT=0.9, distance='euclidean' ,random_state=None): + self.k = k + self.kapa = kapa + self.drT = drT + self.distance = distance + self.random_state = random_state + super().__init__() + + def _fit(self, X, y=None): + + unique, counts = np.unique(y, return_counts=True) + target_stats = dict(zip(unique, counts)) + n_sample_majority = max(target_stats.values()) + class_majority = max(target_stats, key=target_stats.get) + sampling_strategy = { + key: n_sample_majority - value + for (key, value) in target_stats.items() + if key != class_majority + } + self.sampling_strategy_ = OrderedDict(sorted(sampling_strategy.items())) + + return self + + def _transform(self, X, y=None): + X = np.squeeze(X, axis=1) + X_resampled = [X.copy()] + y_resampled = [y.copy()] + + for class_sample, n_samples in self.sampling_strategy_.items(): + if n_samples == 0: + continue + target_class_indices = np.flatnonzero(y == class_sample) + if len(target_class_indices) == 1: + X_new = np.tile(X[target_class_indices], (n_samples, 1)) + y_new = np.full(n_samples, fill_value=class_sample, dtype=y.dtype) + X_resampled.append(X_new) + y_resampled.append(y_new) + continue + X_class = X[target_class_indices] + n, m = X_class.shape + # set the default value of k and kapa + if self.k is None: + self.k = int(np.ceil(n ** 0.5 * 1.25)) + if self.kapa is None: + self.kapa = int(np.ceil(n ** 0.5)) + + # Initialize NearestNeighbors for SNN similarity + self.NearestNeighbors = NearestNeighbors(metric=self.distance, n_neighbors=self.k + 1) + + clusters, cluster_label = self._cluster_minority(X_class) + Me, eigen_matrices, eigen_values = self._covStruct(X_class, clusters) + + # allocate the number of synthetic samples to be generated for each cluster + random_state = check_random_state(self.random_state) + os_ind = np.tile(np.arange(0, n), int(np.floor(n_samples / n))) + remaining = random_state.choice(np.arange(0, n), n_samples - n * int(np.floor(n_samples / n)), replace=False) + os_ind = np.concatenate([os_ind, remaining]) + R = 1.25 if len(clusters) > 1 else 1.1 + + """generate the structure-preserving synthetic samples for each cluster""" + X_new = np.zeros((n_samples, m)) + count = 0 + # consider the samples in the cluster with label 0 i.e. the samples that are not clustered + X_class_0 = X_class[cluster_label == 0] + if X_class_0.size != 0: + gen_0 = np.sum(np.isin(os_ind, np.where(cluster_label == 0)[0])) + idx_0 = random_state.choice(len(X_class_0), gen_0, replace=True) + X_new[count:count + gen_0, :] = X_class_0[idx_0] + count += gen_0 + for i, cluster in enumerate(clusters): + gen_i = np.sum(np.isin(os_ind, np.where(cluster_label == (i + 1))[0])) + X_new[count:count + gen_i, :] = self._generate_synthetic_samples( + Me[i], eigen_matrices[i], eigen_values[i], gen_i, R) + count += gen_i + + assert count == n_samples + X_resampled.append(X_new) + y_new = np.full(n_samples, fill_value=class_sample, dtype=y.dtype) + y_resampled.append(y_new) + + X_resampled = np.vstack(X_resampled) + y_resampled = np.hstack(y_resampled) + X_resampled = X_resampled[:, np.newaxis, :] + return X_resampled, y_resampled + + def _cluster_minority(self, X): + """Apply DRSNN clustering on minority class samples.""" + n = X.shape[0] + k = self.k + kapa = self.kapa + drT = self.drT + + self.NearestNeighbors.fit(X) + neighbors = self.NearestNeighbors.kneighbors(X, return_distance=False)[:,1:] + """ construct the shared nearest neighbor similarity """ + strength = np.zeros((n, n)) + for i in range(n): + for j in range(i + 1, n): + shared_nn = np.intersect1d(neighbors[i, :k], neighbors[j, :k]) + strength[i, j] = strength[j, i] = np.sum((k + 1 - np.searchsorted(neighbors[i, :k], shared_nn)) * + (k + 1 - np.searchsorted(neighbors[j, :k], shared_nn))) + + """ construct the shared nearest neighbor graph """ + strength_nn = np.sort(strength, axis=1)[:, ::-1][:, :k] + idx_nn = np.argsort(strength, axis=1)[:, ::-1] + graph = np.zeros((n, k)) + for i in range(n): + for j in range(k): + if np.any(idx_nn[idx_nn[i, j], :k] == i): + graph[i, j] = 1 + + density = np.sum(strength_nn * graph, axis=1) + density_ratio = np.zeros(n) + for i in range(n): + non_noise = np.where(density[idx_nn[i, :kapa]] != 0)[0] + if non_noise.size == 0: + density_ratio[i] = 0 + else: + density_ratio[i] = density[i] / np.mean(density[idx_nn[i, non_noise]]) + + """ identify core points """ + core_idx = np.where(density_ratio > drT)[0] + """ find directly density-reachable samples for each core point""" + neighborhood = {core: set(idx_nn[core, :kapa]) for core in core_idx} + for i in core_idx: + for j in core_idx: + if np.any(idx_nn[j, :kapa] == i): + neighborhood[i].add(j) + neighborhood = {key: list(value) for key, value in neighborhood.items()} + + clusters = [] + cluster_label = np.zeros(len(neighbors), dtype=int) + cluster_id = 0 + + for i in core_idx: + if cluster_label[i] == 0: + cluster_id += 1 + seed = [i] + clusters.append(set(seed)) + while seed: + point = seed.pop(0) + idx = np.where(core_idx == point)[0] + if idx.size > 0 and cluster_label[point] == 0: + seed.extend(neighborhood[point]) + clusters[-1].update(neighborhood[point]) + cluster_label[point] = cluster_id + # no cluster has been found, the whole samples are taken as one cluster + if len(clusters) == 0: + clusters.append(list(range(n))) + cluster_label = np.ones(n, dtype=int) + return clusters, cluster_label + + def _covStruct(self, data, clusters): + """ + Calculate the covariance matrix of the minority samples. + """ + Me, Eigen_matrices, Eigen_values = [], [], [] + for cluster in clusters: + cluster = list(cluster) + cluster_data = data[cluster] + sigma, shrinkage = ledoit_wolf(cluster_data) + me = np.mean(cluster_data, axis=0) + eigenValues, eigenVectors = np.linalg.eigh(sigma) + eigenValues = np.diag((eigenValues)) + Me.append(me) + Eigen_matrices.append(eigenVectors) + Eigen_values.append(eigenValues) + return Me, Eigen_matrices, Eigen_values + + def _generate_synthetic_samples(self, Me, eigenMatrix, eigenValue, eta, R): + """Generate synthetic samples based on clustered minority samples.""" + # Initialize the output sample generator and probability arrays + n_samples = int(np.ceil(eta * R)) + SampGen = np.zeros((n_samples, len(Me))) + Prob = np.zeros(n_samples) + + # Calculate the square root of the absolute eigenvalues + DD = np.sqrt(np.abs(np.diag(eigenValue))) + DD = DD.reshape(1, -1) + + # Initialize mean and covariance for the multivariate normal distribution + Mu = np.zeros(len(Me)) + Sigma = np.eye(len(Me)) + + for cnt in range(n_samples): + # Generate a sample from the multivariate normal distribution + S = np.random.multivariate_normal(Mu, Sigma, 1) + Prob[cnt] = multivariate_normal.pdf(S, Mu, Sigma) + + # Scale the sample with the eigenvalues + S = S * DD + # Generate the final sample by applying the eigenvector matrix + x = S @ eigenMatrix.T + Me + SampGen[cnt, :] = x + + # Sort the samples based on the probability in descending order + sorted_indices = np.argsort(Prob)[::-1] + SampGen = SampGen[sorted_indices[:eta], :] + + return SampGen + + @classmethod + def _get_test_params(cls, parameter_set="default"): + return {"n_clusters": 3} \ No newline at end of file diff --git a/aeon/transformations/collection/imbalance/tests/test_ohit.py b/aeon/transformations/collection/imbalance/tests/test_ohit.py new file mode 100644 index 0000000000..58d0794ba6 --- /dev/null +++ b/aeon/transformations/collection/imbalance/tests/test_ohit.py @@ -0,0 +1,31 @@ +"""Test function for OHIT.""" + +import numpy as np +import pytest +from aeon.transformations.collection.imbalance import OHIT + + +def test_ohit(): + """Test the OHIT class. + + This function creates a 3D numpy array, applies + OHIT using the OHIT class, and asserts that the + transformed data has a balanced number of samples. + """ + n_samples = 100 # Total number of labels + majority_num = 90 # number of majority class + minority_num = n_samples - majority_num # number of minority class + + X = np.random.rand(n_samples, 1, 10) + y = np.array([0] * majority_num + [1] * minority_num) + + transformer = OHIT() + transformer.fit(X, y) + res_X, res_y = transformer.transform(X, y) + _, res_count = np.unique(res_y, return_counts=True) + + assert len(res_X) == 2 * majority_num + assert len(res_y) == 2 * majority_num + assert res_count[0] == majority_num + assert res_count[1] == majority_num + From eed16110aa728f32ca44a3048f6b4220c36314a5 Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst <25731235+MatthewMiddlehurst@users.noreply.github.com> Date: Thu, 15 May 2025 19:26:35 +0000 Subject: [PATCH 11/19] Automatic `pre-commit` fixes --- .../collection/imbalance/__init__.py | 2 +- .../collection/imbalance/_ohit.py | 44 ++++++++++++------- .../collection/imbalance/tests/test_ohit.py | 2 +- 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/aeon/transformations/collection/imbalance/__init__.py b/aeon/transformations/collection/imbalance/__init__.py index 2431c4c363..d6ee723069 100644 --- a/aeon/transformations/collection/imbalance/__init__.py +++ b/aeon/transformations/collection/imbalance/__init__.py @@ -3,5 +3,5 @@ __all__ = ["ADASYN", "SMOTE", "OHIT"] from aeon.transformations.collection.imbalance._adasyn import ADASYN +from aeon.transformations.collection.imbalance._ohit import OHIT from aeon.transformations.collection.imbalance._smote import SMOTE -from aeon.transformations.collection.imbalance._ohit import OHIT \ No newline at end of file diff --git a/aeon/transformations/collection/imbalance/_ohit.py b/aeon/transformations/collection/imbalance/_ohit.py index 94bb86eedf..cde354e275 100644 --- a/aeon/transformations/collection/imbalance/_ohit.py +++ b/aeon/transformations/collection/imbalance/_ohit.py @@ -10,11 +10,11 @@ import numpy as np from scipy.stats import multivariate_normal -from sklearn.utils import check_random_state -from aeon.transformations.collection import BaseCollectionTransformer -from sklearn.neighbors import NearestNeighbors from sklearn.covariance import ledoit_wolf +from sklearn.neighbors import NearestNeighbors +from sklearn.utils import check_random_state +from aeon.transformations.collection import BaseCollectionTransformer __all__ = ["OHIT"] @@ -30,6 +30,7 @@ class OHIT(BaseCollectionTransformer): DRSNN also contains three parameters(i.e.,drT ,k and kapa),it is capable of selecting the proper value for drT around 1.In addition,k and kapa can be set in a complementary way to avoid the merging and dissociation of clusters,that is,a large k with a relatively low kapa. + Parameters ---------- k : int, the nearest neighbor parameter in SNN similarity @@ -50,7 +51,9 @@ class OHIT(BaseCollectionTransformer): "requires_y": True, } - def __init__(self, k=None, kapa=None, drT=0.9, distance='euclidean' ,random_state=None): + def __init__( + self, k=None, kapa=None, drT=0.9, distance="euclidean", random_state=None + ): self.k = k self.kapa = kapa self.drT = drT @@ -92,12 +95,14 @@ def _transform(self, X, y=None): n, m = X_class.shape # set the default value of k and kapa if self.k is None: - self.k = int(np.ceil(n ** 0.5 * 1.25)) + self.k = int(np.ceil(n**0.5 * 1.25)) if self.kapa is None: - self.kapa = int(np.ceil(n ** 0.5)) + self.kapa = int(np.ceil(n**0.5)) # Initialize NearestNeighbors for SNN similarity - self.NearestNeighbors = NearestNeighbors(metric=self.distance, n_neighbors=self.k + 1) + self.NearestNeighbors = NearestNeighbors( + metric=self.distance, n_neighbors=self.k + 1 + ) clusters, cluster_label = self._cluster_minority(X_class) Me, eigen_matrices, eigen_values = self._covStruct(X_class, clusters) @@ -105,7 +110,11 @@ def _transform(self, X, y=None): # allocate the number of synthetic samples to be generated for each cluster random_state = check_random_state(self.random_state) os_ind = np.tile(np.arange(0, n), int(np.floor(n_samples / n))) - remaining = random_state.choice(np.arange(0, n), n_samples - n * int(np.floor(n_samples / n)), replace=False) + remaining = random_state.choice( + np.arange(0, n), + n_samples - n * int(np.floor(n_samples / n)), + replace=False, + ) os_ind = np.concatenate([os_ind, remaining]) R = 1.25 if len(clusters) > 1 else 1.1 @@ -117,12 +126,13 @@ def _transform(self, X, y=None): if X_class_0.size != 0: gen_0 = np.sum(np.isin(os_ind, np.where(cluster_label == 0)[0])) idx_0 = random_state.choice(len(X_class_0), gen_0, replace=True) - X_new[count:count + gen_0, :] = X_class_0[idx_0] + X_new[count : count + gen_0, :] = X_class_0[idx_0] count += gen_0 for i, cluster in enumerate(clusters): gen_i = np.sum(np.isin(os_ind, np.where(cluster_label == (i + 1))[0])) - X_new[count:count + gen_i, :] = self._generate_synthetic_samples( - Me[i], eigen_matrices[i], eigen_values[i], gen_i, R) + X_new[count : count + gen_i, :] = self._generate_synthetic_samples( + Me[i], eigen_matrices[i], eigen_values[i], gen_i, R + ) count += gen_i assert count == n_samples @@ -143,14 +153,16 @@ def _cluster_minority(self, X): drT = self.drT self.NearestNeighbors.fit(X) - neighbors = self.NearestNeighbors.kneighbors(X, return_distance=False)[:,1:] + neighbors = self.NearestNeighbors.kneighbors(X, return_distance=False)[:, 1:] """ construct the shared nearest neighbor similarity """ strength = np.zeros((n, n)) for i in range(n): for j in range(i + 1, n): shared_nn = np.intersect1d(neighbors[i, :k], neighbors[j, :k]) - strength[i, j] = strength[j, i] = np.sum((k + 1 - np.searchsorted(neighbors[i, :k], shared_nn)) * - (k + 1 - np.searchsorted(neighbors[j, :k], shared_nn))) + strength[i, j] = strength[j, i] = np.sum( + (k + 1 - np.searchsorted(neighbors[i, :k], shared_nn)) + * (k + 1 - np.searchsorted(neighbors[j, :k], shared_nn)) + ) """ construct the shared nearest neighbor graph """ strength_nn = np.sort(strength, axis=1)[:, ::-1][:, :k] @@ -213,7 +225,7 @@ def _covStruct(self, data, clusters): sigma, shrinkage = ledoit_wolf(cluster_data) me = np.mean(cluster_data, axis=0) eigenValues, eigenVectors = np.linalg.eigh(sigma) - eigenValues = np.diag((eigenValues)) + eigenValues = np.diag(eigenValues) Me.append(me) Eigen_matrices.append(eigenVectors) Eigen_values.append(eigenValues) @@ -253,4 +265,4 @@ def _generate_synthetic_samples(self, Me, eigenMatrix, eigenValue, eta, R): @classmethod def _get_test_params(cls, parameter_set="default"): - return {"n_clusters": 3} \ No newline at end of file + return {"n_clusters": 3} diff --git a/aeon/transformations/collection/imbalance/tests/test_ohit.py b/aeon/transformations/collection/imbalance/tests/test_ohit.py index 58d0794ba6..b7d3372c2a 100644 --- a/aeon/transformations/collection/imbalance/tests/test_ohit.py +++ b/aeon/transformations/collection/imbalance/tests/test_ohit.py @@ -2,6 +2,7 @@ import numpy as np import pytest + from aeon.transformations.collection.imbalance import OHIT @@ -28,4 +29,3 @@ def test_ohit(): assert len(res_y) == 2 * majority_num assert res_count[0] == majority_num assert res_count[1] == majority_num - From c12a9c27b0c55b4365cbc93d3037376fcc12ad70 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Wed, 28 May 2025 18:24:56 +0100 Subject: [PATCH 12/19] docstrings --- .../collection/imbalance/_adasyn.py | 37 ++++++++--- .../collection/imbalance/_ohit.py | 62 ++++++++++++------- .../collection/imbalance/_smote.py | 45 +++++++++----- 3 files changed, 96 insertions(+), 48 deletions(-) diff --git a/aeon/transformations/collection/imbalance/_adasyn.py b/aeon/transformations/collection/imbalance/_adasyn.py index 412007009d..15106eca69 100644 --- a/aeon/transformations/collection/imbalance/_adasyn.py +++ b/aeon/transformations/collection/imbalance/_adasyn.py @@ -11,17 +11,36 @@ class ADASYN(SMOTE): """ - Over-sampling using Adaptive Synthetic Sampling (ADASYN). + Adaptive Synthetic Sampling (ADASYN) over-sampler. - Adaptation of imblearn.over_sampling.ADASYN - original authors: - # Guillaume Lemaitre - # Christos Aridas - # License: MIT + Generates synthetic samples for the minority class based on local data + distribution. ADASYN extends SMOTE by adapting the number of synthetic samples + according to the density of the minority class: more samples are generated for + minority samples that are harder to learn (i.e., surrounded by more majority + samples). - This transformer extends SMOTE, but it generates different number of - samples depending on an estimate of the local distribution of the class - to be oversampled. + This implementation is adapted from imbalanced-learn's + `imblearn.over_sampling.ADASYN`. + + Parameters + ---------- + random_state : int or None, optional (default=None) + Random seed for reproducibility. + k_neighbors : int, optional (default=5) + Number of nearest neighbours used to construct synthetic samples. + + References + ---------- + .. [1] He, H., Bai, Y., Garcia, E. A., & Li, S. (2008). + ADASYN: Adaptive synthetic sampling approach for imbalanced learning. + In IEEE International Joint Conference on Neural Networks, pp. 1322-1328. + https://doi.org/10.1109/IJCNN.2008.4633969 + + Examples + -------- + >>> from aeon.classification.sampling import ADASYN + >>> sampler = ADASYN(random_state=42) + >>> X_res, y_res = sampler.fit_resample(X, y) """ def __init__(self, random_state=None, k_neighbors=5): diff --git a/aeon/transformations/collection/imbalance/_ohit.py b/aeon/transformations/collection/imbalance/_ohit.py index cde354e275..d5ff0167c7 100644 --- a/aeon/transformations/collection/imbalance/_ohit.py +++ b/aeon/transformations/collection/imbalance/_ohit.py @@ -21,30 +21,51 @@ class OHIT(BaseCollectionTransformer): """ - Over-sampling using the Over-sampling based on (OHIT). + ver-sampling based on High-density region and Iterative Thresholding (OHIT). - This method is based on Density-Ratio Shared Nearest Neighbor (DRSNN) clustering to find high-density regions - of minority class samples and generate synthetic samples within these clusters and Shrinkage estimation of - large-dimensional covariance matrix + OHIT generates synthetic minority class samples based on the Density-Ratio Shared + Nearest Neighbor (DRSNN) clustering algorithm. It identifies high-density regions + amoung the minority class using DRSNN, then produces synthetic samples within + these clusters. Covariance estimation for high-dimensional data is performed using + shrinkage techniques. - DRSNN also contains three parameters(i.e.,drT ,k and kapa),it is capable of selecting the proper value for - drT around 1.In addition,k and kapa can be set in a complementary way to avoid the merging and - dissociation of clusters,that is,a large k with a relatively low kapa. + The DRSNN procedure involves three main parameters: + - `drT`: the density ratio threshold (typically set around 1). + - `k`: the nearest neighbour parameter in shared nearest neighbour similarity. + - `kapa`: the nearest neighbour parameter in defining density ratio. + + `k` and `kapa` should be set in a complementary manner to avoid cluster merging + and dissociation. Typically, a large `k` is paired with a relatively low `kapa`. Parameters ---------- - k : int, the nearest neighbor parameter in SNN similarity - if None, set k = int(np.ceil(n ** 0.5 * 1.25)) where n is the number of minority samples - kapa : int, the nearest neighbor parameter in defining density ratio - if None, set kapa = int(np.ceil(n ** 0.5)) where n is the number of minority samples - drT : float, default=0.9, the threshold of density ratio. + k : int or None, optional + The nearest neighbour parameter for SNN similarity. + If None, set to int(np.ceil(n ** 0.5 * 1.25)), where n is the number of + minority samples. + kapa : int or None, optional + The nearest neighbour parameter for defining the density ratio. + If None, set to int(np.ceil(n ** 0.5)), where n is the number of minority + samples. + drT : float, default=0.9 + Threshold for the density ratio in DRSNN clustering. distance : str or callable, default='euclidean' - Distance metric to use for KNN in SNN similarity. + Distance metric to use for KNN in SNN similarity computation. random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. + Controls random number generation for reproducibility: + - If `int`, sets the random seed. + - If `RandomState` instance, uses it as the generator. + - If `None`, uses `np.random`. + + References + ---------- + .. [1] (Add the relevant reference for OHIT here.) + + Examples + -------- + >>> from aeon.classification.sampling import OHIT + >>> ohit = OHIT(k=10, kapa=5, drT=0.9, random_state=0) + >>> X_resampled, y_resampled = ohit.fit_resample(X, y) """ _tags = { @@ -121,14 +142,13 @@ def _transform(self, X, y=None): """generate the structure-preserving synthetic samples for each cluster""" X_new = np.zeros((n_samples, m)) count = 0 - # consider the samples in the cluster with label 0 i.e. the samples that are not clustered X_class_0 = X_class[cluster_label == 0] if X_class_0.size != 0: gen_0 = np.sum(np.isin(os_ind, np.where(cluster_label == 0)[0])) idx_0 = random_state.choice(len(X_class_0), gen_0, replace=True) X_new[count : count + gen_0, :] = X_class_0[idx_0] count += gen_0 - for i, cluster in enumerate(clusters): + for i, _ in enumerate(clusters): gen_i = np.sum(np.isin(os_ind, np.where(cluster_label == (i + 1))[0])) X_new[count : count + gen_i, :] = self._generate_synthetic_samples( Me[i], eigen_matrices[i], eigen_values[i], gen_i, R @@ -215,9 +235,7 @@ def _cluster_minority(self, X): return clusters, cluster_label def _covStruct(self, data, clusters): - """ - Calculate the covariance matrix of the minority samples. - """ + """Calculate the covariance matrix of the minority samples.""" Me, Eigen_matrices, Eigen_values = [], [], [] for cluster in clusters: cluster = list(cluster) diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index ee00c78174..59a4f6e8c5 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -23,41 +23,52 @@ class SMOTE(BaseCollectionTransformer): """ - Over-sampling using the Synthetic Minority Over-sampling TEchnique (SMOTE)[1]_. + Synthetic Minority Over-sampling TEchnique (SMOTE) for imbalanced datasets. - An adaptation of the imbalance-learn implementation of SMOTE in - imblearn.over_sampling.SMOTE. sampling_strategy is sampling target by - targeting all classes but not the majority, which is directly expressed in - _fit.sampling_strategy. + Generates synthetic samples of the minority class to address class imbalance. + SMOTE constructs new samples by interpolating between existing minority samples + and their nearest neighbours in feature space. + + This implementation adapts the algorithm from `imblearn.over_sampling.SMOTE`. + It targets all classes except the majority, as controlled by the `sampling_strategy` + in the `_fit` method. It uses ``aeon`` distances to find the nearest neighbours. Parameters ---------- k_neighbors : int, default=5 - The number of nearest neighbors used to define the neighborhood of samples - to use to generate the synthetic time series. - `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. + Number of nearest neighbours used to generate synthetic samples. A + `sklearn.neighbors.NearestNeighbors` instance is fitted for this purpose. random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. + Controls the random number generation for reproducibility: + - If `int`, sets the random seed. + - If `RandomState` instance, uses it as the generator. + - If `None`, uses `np.random`. See Also -------- - ADASYN + ADASYN : Adaptive synthetic sampling extension to SMOTE. References ---------- - .. [1] Chawla et al. SMOTE: synthetic minority over-sampling technique, Journal - of Artificial Intelligence Research 16(1): 321–357, 2002. - https://dl.acm.org/doi/10.5555/1622407.1622416 + .. [1] Chawla, N. V., Bowyer, K. W., Hall, L. O., & Kegelmeyer, W. P. (2002). + SMOTE: Synthetic minority over-sampling technique. + Journal of Artificial Intelligence Research, 16, 321–357. + https://dl.acm.org/doi/10.5555/1622407.1622416 + + Examples + -------- + >>> from aeon.classification.sampling import SMOTE + >>> from aeon.datasets import load_unit_test + >>> X, y = load_unit_test() + >>> smote = SMOTE(k_neighbors=3, random_state=0) + >>> X_resampled, y_resampled = smote.fit(X, y) """ _tags = { "requires_y": True, } - def __init__(self, k_neighbors=5, random_state=None): + def __init__(self, k_neighbors: int = 5, random_state=None): self.random_state = random_state self.k_neighbors = k_neighbors super().__init__() From f42d4b0032ec14ddc41ca2f2ae7f7aeb5bbbfa5e Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Wed, 28 May 2025 19:03:33 +0100 Subject: [PATCH 13/19] remove import --- aeon/transformations/collection/imbalance/tests/test_ohit.py | 1 - 1 file changed, 1 deletion(-) diff --git a/aeon/transformations/collection/imbalance/tests/test_ohit.py b/aeon/transformations/collection/imbalance/tests/test_ohit.py index b7d3372c2a..7162aab894 100644 --- a/aeon/transformations/collection/imbalance/tests/test_ohit.py +++ b/aeon/transformations/collection/imbalance/tests/test_ohit.py @@ -1,7 +1,6 @@ """Test function for OHIT.""" import numpy as np -import pytest from aeon.transformations.collection.imbalance import OHIT From 2da6505eec1ed2b7de76c09b4d86c52be4e0cfd2 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Wed, 28 May 2025 19:09:35 +0100 Subject: [PATCH 14/19] remove incorrect test parameters --- aeon/transformations/collection/imbalance/_adasyn.py | 11 ++++++++++- aeon/transformations/collection/imbalance/_ohit.py | 6 +----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/aeon/transformations/collection/imbalance/_adasyn.py b/aeon/transformations/collection/imbalance/_adasyn.py index 15106eca69..6dd81f4bdf 100644 --- a/aeon/transformations/collection/imbalance/_adasyn.py +++ b/aeon/transformations/collection/imbalance/_adasyn.py @@ -1,4 +1,13 @@ -"""ADASYN over sampling algorithm.""" +"""ADASYN over sampling algorithm. + +See more in imblearn.over_sampling.ADASYN +original authors: +# Guillaume Lemaitre +# Fernando Nogueira +# Christos Aridas +# Dzianis Dudnik +# License: MIT +""" import numpy as np from sklearn.utils import check_random_state diff --git a/aeon/transformations/collection/imbalance/_ohit.py b/aeon/transformations/collection/imbalance/_ohit.py index d5ff0167c7..f87691ba65 100644 --- a/aeon/transformations/collection/imbalance/_ohit.py +++ b/aeon/transformations/collection/imbalance/_ohit.py @@ -21,7 +21,7 @@ class OHIT(BaseCollectionTransformer): """ - ver-sampling based on High-density region and Iterative Thresholding (OHIT). + Over-sampling based on High-density region and Iterative Thresholding (OHIT). OHIT generates synthetic minority class samples based on the Density-Ratio Shared Nearest Neighbor (DRSNN) clustering algorithm. It identifies high-density regions @@ -280,7 +280,3 @@ def _generate_synthetic_samples(self, Me, eigenMatrix, eigenValue, eta, R): SampGen = SampGen[sorted_indices[:eta], :] return SampGen - - @classmethod - def _get_test_params(cls, parameter_set="default"): - return {"n_clusters": 3} From 4847a443c75be743bdda3342782068efa7e42732 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Wed, 28 May 2025 19:30:01 +0100 Subject: [PATCH 15/19] docstrings --- .../collection/imbalance/_adasyn.py | 9 ++++++--- .../transformations/collection/imbalance/_ohit.py | 13 ++++++++++--- .../collection/imbalance/_smote.py | 15 ++++++++++----- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/aeon/transformations/collection/imbalance/_adasyn.py b/aeon/transformations/collection/imbalance/_adasyn.py index 6dd81f4bdf..fef69ad467 100644 --- a/aeon/transformations/collection/imbalance/_adasyn.py +++ b/aeon/transformations/collection/imbalance/_adasyn.py @@ -47,9 +47,12 @@ class ADASYN(SMOTE): Examples -------- - >>> from aeon.classification.sampling import ADASYN - >>> sampler = ADASYN(random_state=42) - >>> X_res, y_res = sampler.fit_resample(X, y) + >>> from aeon.transformations.collection.imbalance import ADASYN + >>> import numpy as np + >>> X = np.random.random(size=(100,1,50)) + >>> y = np.array([0] * 90 + [1] * 10) + >>> sampler = ADASYN(random_state=49) + >>> X_res, y_res = sampler.fit_transform(X, y) """ def __init__(self, random_state=None, k_neighbors=5): diff --git a/aeon/transformations/collection/imbalance/_ohit.py b/aeon/transformations/collection/imbalance/_ohit.py index f87691ba65..8c901dfb62 100644 --- a/aeon/transformations/collection/imbalance/_ohit.py +++ b/aeon/transformations/collection/imbalance/_ohit.py @@ -63,9 +63,16 @@ class OHIT(BaseCollectionTransformer): Examples -------- - >>> from aeon.classification.sampling import OHIT - >>> ohit = OHIT(k=10, kapa=5, drT=0.9, random_state=0) - >>> X_resampled, y_resampled = ohit.fit_resample(X, y) + >>> from aeon.transformations.collection.imbalance import OHIT + >>> import numpy as np + >>> X = np.random.random(size=(100,1,50)) + >>> y = np.array([0] * 90 + [1] * 10) + >>> sampler = OHIT(random_state=49) + >>> X_res, y_res = sampler.fit_transform(X, y) + >>> np.sum(y_res == 1) + 90 + >>> np.sum(y_res == 0) + 90 """ _tags = { diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index 59a4f6e8c5..fb5d1896a2 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -57,11 +57,16 @@ class SMOTE(BaseCollectionTransformer): Examples -------- - >>> from aeon.classification.sampling import SMOTE - >>> from aeon.datasets import load_unit_test - >>> X, y = load_unit_test() - >>> smote = SMOTE(k_neighbors=3, random_state=0) - >>> X_resampled, y_resampled = smote.fit(X, y) + >>> from aeon.transformations.collection.imbalance import SMOTE + >>> import numpy as np + >>> X = np.random.random(size=(100,1,50)) + >>> y = np.array([0] * 90 + [1] * 10) + >>> sampler = SMOTE(random_state=49) + >>> X_res, y_res = sampler.fit_transform(X, y) + >>> np.sum(y_res == 1) + 90 + >>> np.sum(y_res == 0) + 90 """ _tags = { From c5719dc26d29313d0e0a43cf37fb85b5a2d21a65 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Wed, 28 May 2025 20:39:01 +0100 Subject: [PATCH 16/19] examples --- aeon/transformations/collection/imbalance/_ohit.py | 3 ++- aeon/transformations/collection/imbalance/_smote.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/aeon/transformations/collection/imbalance/_ohit.py b/aeon/transformations/collection/imbalance/_ohit.py index 8c901dfb62..9d7f4e122b 100644 --- a/aeon/transformations/collection/imbalance/_ohit.py +++ b/aeon/transformations/collection/imbalance/_ohit.py @@ -64,8 +64,9 @@ class OHIT(BaseCollectionTransformer): Examples -------- >>> from aeon.transformations.collection.imbalance import OHIT + >>> from aeon.testing.data_generation import make_example_3d_numpy >>> import numpy as np - >>> X = np.random.random(size=(100,1,50)) + >>> X = make_example_3d_numpy(n_cases=100, return_y=False, random_state=49) >>> y = np.array([0] * 90 + [1] * 10) >>> sampler = OHIT(random_state=49) >>> X_res, y_res = sampler.fit_transform(X, y) diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index fb5d1896a2..f74f596f45 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -58,8 +58,9 @@ class SMOTE(BaseCollectionTransformer): Examples -------- >>> from aeon.transformations.collection.imbalance import SMOTE + >>> from aeon.testing.data_generation import make_example_3d_numpy >>> import numpy as np - >>> X = np.random.random(size=(100,1,50)) + >>> X = make_example_3d_numpy(n_cases=100, return_y=False, random_state=49) >>> y = np.array([0] * 90 + [1] * 10) >>> sampler = SMOTE(random_state=49) >>> X_res, y_res = sampler.fit_transform(X, y) From 230ffdd1e21fc1872af203d8e6d052c870104d49 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Wed, 28 May 2025 20:52:06 +0100 Subject: [PATCH 17/19] examples --- aeon/transformations/collection/imbalance/_ohit.py | 6 ++---- aeon/transformations/collection/imbalance/_smote.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/aeon/transformations/collection/imbalance/_ohit.py b/aeon/transformations/collection/imbalance/_ohit.py index 9d7f4e122b..b300e8687b 100644 --- a/aeon/transformations/collection/imbalance/_ohit.py +++ b/aeon/transformations/collection/imbalance/_ohit.py @@ -70,10 +70,8 @@ class OHIT(BaseCollectionTransformer): >>> y = np.array([0] * 90 + [1] * 10) >>> sampler = OHIT(random_state=49) >>> X_res, y_res = sampler.fit_transform(X, y) - >>> np.sum(y_res == 1) - 90 - >>> np.sum(y_res == 0) - 90 + >>> y_res.shape + (180,) """ _tags = { diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index f74f596f45..611b2bbb0e 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -64,10 +64,8 @@ class SMOTE(BaseCollectionTransformer): >>> y = np.array([0] * 90 + [1] * 10) >>> sampler = SMOTE(random_state=49) >>> X_res, y_res = sampler.fit_transform(X, y) - >>> np.sum(y_res == 1) - 90 - >>> np.sum(y_res == 0) - 90 + >>> y_res.shape + (180,) """ _tags = { From 15cecc20f3cf8da03220a9f6af1cea3f46265a30 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Sun, 1 Jun 2025 20:46:15 +0100 Subject: [PATCH 18/19] refactor variable name --- aeon/transformations/collection/imbalance/_ohit.py | 8 +++----- aeon/transformations/collection/imbalance/_smote.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/aeon/transformations/collection/imbalance/_ohit.py b/aeon/transformations/collection/imbalance/_ohit.py index b300e8687b..a704838179 100644 --- a/aeon/transformations/collection/imbalance/_ohit.py +++ b/aeon/transformations/collection/imbalance/_ohit.py @@ -127,9 +127,7 @@ def _transform(self, X, y=None): self.kapa = int(np.ceil(n**0.5)) # Initialize NearestNeighbors for SNN similarity - self.NearestNeighbors = NearestNeighbors( - metric=self.distance, n_neighbors=self.k + 1 - ) + self.nn_ = NearestNeighbors(metric=self.distance, n_neighbors=self.k + 1) clusters, cluster_label = self._cluster_minority(X_class) Me, eigen_matrices, eigen_values = self._covStruct(X_class, clusters) @@ -178,8 +176,8 @@ def _cluster_minority(self, X): kapa = self.kapa drT = self.drT - self.NearestNeighbors.fit(X) - neighbors = self.NearestNeighbors.kneighbors(X, return_distance=False)[:, 1:] + self.nn_.fit(X) + neighbors = self.nn_.kneighbors(X, return_distance=False)[:, 1:] """ construct the shared nearest neighbor similarity """ strength = np.zeros((n, n)) for i in range(n): diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index 611b2bbb0e..63ada8a23b 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -226,7 +226,7 @@ def _generate_samples( Synthetically generated samples. """ diffs = nn_data[nn_num[rows, cols]] - X[rows] - if y is not None: # only entering for BorderlineSMOTE-2 + if y is not None: random_state = check_random_state(self.random_state) mask_pair_samples = y[nn_num[rows, cols]] != y_type diffs[mask_pair_samples] *= random_state.uniform( From 5b5ce9cee03a93a0ae05ff8b1232c49452a24607 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Mon, 2 Jun 2025 19:42:50 +0100 Subject: [PATCH 19/19] format comments and reference --- aeon/transformations/collection/imbalance/_ohit.py | 14 ++++++++------ .../transformations/collection/imbalance/_smote.py | 7 ------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/aeon/transformations/collection/imbalance/_ohit.py b/aeon/transformations/collection/imbalance/_ohit.py index a704838179..4d154a2d13 100644 --- a/aeon/transformations/collection/imbalance/_ohit.py +++ b/aeon/transformations/collection/imbalance/_ohit.py @@ -59,7 +59,9 @@ class OHIT(BaseCollectionTransformer): References ---------- - .. [1] (Add the relevant reference for OHIT here.) + .. [1] T. Zhu, C. Luo, Z. Zhang, J. Li, S. Ren, and Y. Zeng. Minority + oversampling for imbalanced time series classification. Knowledge-Based Systems, + 247:108764, 2022. Examples -------- @@ -143,7 +145,7 @@ def _transform(self, X, y=None): os_ind = np.concatenate([os_ind, remaining]) R = 1.25 if len(clusters) > 1 else 1.1 - """generate the structure-preserving synthetic samples for each cluster""" + # generate the structure-preserving synthetic samples for each cluster X_new = np.zeros((n_samples, m)) count = 0 X_class_0 = X_class[cluster_label == 0] @@ -178,7 +180,7 @@ def _cluster_minority(self, X): self.nn_.fit(X) neighbors = self.nn_.kneighbors(X, return_distance=False)[:, 1:] - """ construct the shared nearest neighbor similarity """ + # construct the shared nearest neighbor similarity strength = np.zeros((n, n)) for i in range(n): for j in range(i + 1, n): @@ -188,7 +190,7 @@ def _cluster_minority(self, X): * (k + 1 - np.searchsorted(neighbors[j, :k], shared_nn)) ) - """ construct the shared nearest neighbor graph """ + # construct the shared nearest neighbor graph strength_nn = np.sort(strength, axis=1)[:, ::-1][:, :k] idx_nn = np.argsort(strength, axis=1)[:, ::-1] graph = np.zeros((n, k)) @@ -206,9 +208,9 @@ def _cluster_minority(self, X): else: density_ratio[i] = density[i] / np.mean(density[idx_nn[i, non_noise]]) - """ identify core points """ + # identify core points core_idx = np.where(density_ratio > drT)[0] - """ find directly density-reachable samples for each core point""" + # find directly density-reachable samples for each core point neighborhood = {core: set(idx_nn[core, :kapa]) for core in core_idx} for i in core_idx: for j in core_idx: diff --git a/aeon/transformations/collection/imbalance/_smote.py b/aeon/transformations/collection/imbalance/_smote.py index 63ada8a23b..f6e7062f2f 100644 --- a/aeon/transformations/collection/imbalance/_smote.py +++ b/aeon/transformations/collection/imbalance/_smote.py @@ -194,28 +194,21 @@ def _generate_samples( X : np.ndarray Series from which the points will be created of shape (n_cases, n_timepoints). - nn_data : ndarray of shape (n_samples_all, n_features) Data set carrying all the neighbours to be used. - nn_num : ndarray of shape (n_samples_all, k_nearest_neighbours) The nearest neighbours of each sample in `nn_data`. - rows : ndarray of shape (n_samples,), dtype=int Indices pointing at feature vector in X which will be used as a base for creating new samples. - cols : ndarray of shape (n_samples,), dtype=int Indices pointing at which nearest neighbor of base feature vector will be used when creating new samples. - steps : ndarray of shape (n_samples,), dtype=float Step sizes for new samples. - y_type : str, int or None, default=None Class label of the current target classes for which we want to generate samples. - y : ndarray of shape (n_samples_all,), default=None The true target associated with `nn_data`. Used by Borderline SMOTE-2 to weight the distances in the sample generation process.