From 374341dbdcdbc862a1d6f21ec4cafbc5122037e1 Mon Sep 17 00:00:00 2001 From: Aryan Date: Sun, 9 Mar 2025 10:59:01 +0530 Subject: [PATCH 1/4] initialised parameters --- aeon/clustering/density_based/__init__.py | 1 + .../clustering/density_based/_density_peak.py | 47 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 aeon/clustering/density_based/__init__.py create mode 100644 aeon/clustering/density_based/_density_peak.py diff --git a/aeon/clustering/density_based/__init__.py b/aeon/clustering/density_based/__init__.py new file mode 100644 index 0000000000..4bfc3bf42c --- /dev/null +++ b/aeon/clustering/density_based/__init__.py @@ -0,0 +1 @@ +"""Density.""" diff --git a/aeon/clustering/density_based/_density_peak.py b/aeon/clustering/density_based/_density_peak.py new file mode 100644 index 0000000000..693b5ca591 --- /dev/null +++ b/aeon/clustering/density_based/_density_peak.py @@ -0,0 +1,47 @@ +"""Density clustering for time series data.""" + +__maintainer__ = [] +__all__ = ["DensityPeakClusterer"] + +import numpy as np # noqa + +from aeon.distances import get_distance_function, pairwise_distance # noqa + + +class DensityPeakClusterer: + """Density Peak Clusterer. + + Clusters time series data using a density-based approach that estimates local + densities and identifies peaks as cluster centers. + """ + + def __init__( + self, + rho=None, + cutoff_distance=None, + distance="dtw", + n_jobs=1, + ): + self.rho = rho + self.cutoff_distance = cutoff_distance + self.distance = distance + self.n_jobs = n_jobs + + def fit(self, X, y=None): + """Fit time series clusterer to training data. + + Parameters + ---------- + X : array-like + Time series data to cluster. + y : array-like, optional + Labels for the data (unused in clustering). + + Returns + ------- + self : object + The fitted clusterer. + """ + self._fit(X) + self.is_fitted = True + return self From 61853264394fc0e088beb40b2ecb201d6bd6e4c6 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 13 Mar 2025 17:22:59 +0530 Subject: [PATCH 2/4] Uncertain results --- .../clustering/density_based/_density_peak.py | 228 ++++++++++- .../density_based/tests/__init__.py | 1 + .../density_based/tests/test_density_peak.py | 382 ++++++++++++++++++ 3 files changed, 602 insertions(+), 9 deletions(-) create mode 100644 aeon/clustering/density_based/tests/__init__.py create mode 100644 aeon/clustering/density_based/tests/test_density_peak.py diff --git a/aeon/clustering/density_based/_density_peak.py b/aeon/clustering/density_based/_density_peak.py index 693b5ca591..034679ea10 100644 --- a/aeon/clustering/density_based/_density_peak.py +++ b/aeon/clustering/density_based/_density_peak.py @@ -3,31 +3,114 @@ __maintainer__ = [] __all__ = ["DensityPeakClusterer"] -import numpy as np # noqa +import numpy as np -from aeon.distances import get_distance_function, pairwise_distance # noqa +from aeon.distances import get_distance_function, pairwise_distance class DensityPeakClusterer: - """Density Peak Clusterer. + """ + Density Peak Clusterer. Clusters time series data using a density-based approach that estimates local densities and identifies peaks as cluster centers. + + Parameters + ---------- + rho : float, optional + The local density for each data point. + delta : np.ndarray + For each point, the minimum distance to any point with higher density. + gauss_cutoff : bool, default=True + Whether to use Gaussian cutoff for density estimation. + cutoff_distance : float, optional + Distance cutoff for Gaussian kernel. + distance_metric : str, default="euclidean" + Distance metric to use for clustering. + n_jobs : int, default=1 + Number of parallel jobs to run. + density_threshold : float, optional + Density threshold to select cluster centers. If None, will use midpoint. + distance_threshold : float, optional + Distance threshold to select cluster centers. If None, will use midpoint. """ def __init__( self, - rho=None, - cutoff_distance=None, - distance="dtw", - n_jobs=1, + rho: float = None, + gauss_cutoff: bool = True, + cutoff_distance: int = None, + distance_metric: str = "euclidean", + n_jobs: int = 1, + density_threshold: float = None, + distance_threshold: float = None, ): self.rho = rho + self.gauss_cutoff = gauss_cutoff self.cutoff_distance = cutoff_distance - self.distance = distance + self.distance_metric = distance_metric self.n_jobs = n_jobs + self.density_threshold = density_threshold + self.distance_threshold = distance_threshold + + def _build_distance(self): + """ + Compute the pairwise distance matrix using the resolved distance function. + + Returns + ------- + distance_matrix : np.ndarray + A matrix of symmetric pairwise distances. + """ + dist_func = get_distance_function(self.distance_metric) + distance_matrix = pairwise_distance(self.data, method=dist_func) + return distance_matrix + + def _auto_select_dc(self): + """Auto-select cutoff distance (dc) so that the fraction of pairwise distances less than dc is within a target range (e.g: 1-2%).""" # noqa + tri_indices = np.triu_indices( + self.n, k=1 + ) # k=1 to ignore diagonal as it has 0's + distances = self.distance_matrix[tri_indices] + max_distance = np.max(distances) + min_distance = np.min(distances) + dc = (max_distance + min_distance) / 2 + + lower_bound = 0.002 + upper_bound = 0.01 # 1-2% range + + # recursively setting the value of dc + while True: + nneighs = np.sum(distances < dc) / (self.n**2) # nearest neighbors + + if lower_bound < nneighs < upper_bound: # case 1 : dc is in range + break + if nneighs < lower_bound: # case 2 : increase dc (too few neighbors) + min_distance = dc + if nneighs > upper_bound: # case 3 : decrease dc (too many neighbors) + max_distance = dc + + dc = (max_distance + min_distance) / 2 + if max_distance - min_distance < 1e-6: + break + + return dc - def fit(self, X, y=None): + def select_dc(self): + """ + Select the cutoff distance (dc) for density estimation. + + Returns + ------- + dc : float + The cutoff distance. + """ + if self.cutoff_distance == "auto": + return self._auto_select_dc() + else: + return self.cutoff_distance + + def _fit(self, X: np.ndarray, y: np.ndarray = None): """Fit time series clusterer to training data. Parameters @@ -42,6 +125,133 @@ def fit(self, X, y=None): self : object The fitted clusterer. """ + self.data = X + self.n = X.shape[0] # total no. of time points + + self.distance_matrix = self._build_distance() # pairwise distance matrix + + self.dc = self.select_dc() # selecting cutoff distance + + self.rho = np.zeros(self.n) # local density + + if self.gauss_cutoff: + # Gaussian kernel: weight = exp(- (d / self.dc) ** 2) + for i in range(self.n): + self.rho[i] = np.sum( + np.exp(-((self.distance_matrix[i] / self.dc) ** 2)) + ) + else: + # Hard cutoff: weight = 1 if d < dc, 0 otherwise + for i in range(self.n): + self.rho[i] = np.sum(self.distance_matrix[i] < self.dc) + + self.delta = np.full(self.n, np.inf) + self.nneigh = np.zeros(self.n, dtype=int) + sorted_indices = np.argsort( + -self.rho + ) # decending order of local density indices + self.sorted_indices = sorted_indices + + highest_index = sorted_indices[0] + self.delta[highest_index] = np.max( + self.distance_matrix + ) # distance to highest density point + self.nneigh[highest_index] = highest_index + + for i in range(1, self.n): + current_index = sorted_indices[i] + for j in range(i): + higher_index = sorted_indices[j] + d = self.distance_matrix[current_index, higher_index] + if d < self.delta[current_index]: + self.delta[current_index] = d + self.nneigh[current_index] = higher_index + + # If thresholds are not provided, use the midpoint rule + if self.density_threshold is None: + rho_threshold = 0.5 * (self.rho.min() + self.rho.max()) + else: + rho_threshold = self.density_threshold + + if self.distance_threshold is None: + delta_threshold = 0.5 * (self.delta.min() + self.delta.max()) + else: + delta_threshold = self.distance_threshold + + # Initialize cluster labels to -1 and assign centers based on the thresholds + self.labels_ = -np.ones(self.n, dtype=int) + for idx in range(self.n): + if (self.rho[idx] >= rho_threshold) and ( + self.delta[idx] >= delta_threshold + ): + self.labels_[idx] = ( + idx # mark as cluster center (label equals its own index) + ) + + # descending-density assignment for non-center points + for idx in sorted_indices: + if self.labels_[idx] == -1: + self.labels_[idx] = self.labels_[self.nneigh[idx]] + + # store the final center indices for reference. + self.cluster_centers = [i for i in range(self.n) if self.labels_[i] == i] + + def fit(self, X: np.ndarray, y: np.ndarray = None): + """ + Fit time series clusterer to training data. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_timepoints) or + (n_samples, n_channels, n_timepoints) + Time series data to cluster. + y : array-like, optional + Labels for the data (unused in clustering). + + Returns + ------- + self : DensityPeakClusterer + The fitted clusterer. + """ self._fit(X) self.is_fitted = True return self + + def plot(self, mode="all", title="", **kwargs): + """ + Plot clustering results and/or the decision graph. + + Parameters + ---------- + mode : str, default="all" + One of "decision" (plot decision graph), "label" (plot clustered data), + or "all" (plot both). + title : str, optional + Title for the plots. + kwargs : dict + Additional keyword arguments passed to plotting functions. + """ + import matplotlib.pyplot as plt + + if mode in {"decision", "all"}: + plt.figure() + plt.scatter(self.rho, self.delta, c=self.labels_, cmap="viridis") + plt.xlabel("Local Density (rho)") + plt.ylabel("Delta") + plt.title(title + " Decision Graph") + plt.colorbar() + plt.show() + + if mode in {"label", "all"}: + plt.figure() + if self.data.ndim == 2 and self.data.shape[1] >= 2: + plt.scatter( + self.data[:, 0], self.data[:, 1], c=self.labels_, cmap="viridis" + ) + plt.xlabel("Feature 1") + plt.ylabel("Feature 2") + else: + plt.plot(self.data.T) + plt.title(title + " Cluster Labels") + plt.colorbar() + plt.show() diff --git a/aeon/clustering/density_based/tests/__init__.py b/aeon/clustering/density_based/tests/__init__.py new file mode 100644 index 0000000000..44d07e76f6 --- /dev/null +++ b/aeon/clustering/density_based/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for density based clustering algorithms.""" diff --git a/aeon/clustering/density_based/tests/test_density_peak.py b/aeon/clustering/density_based/tests/test_density_peak.py new file mode 100644 index 0000000000..ba6a14a0fe --- /dev/null +++ b/aeon/clustering/density_based/tests/test_density_peak.py @@ -0,0 +1,382 @@ +"""Test for the DensityPeakClusterer module.""" + +import numpy as np +import pytest + +from aeon.clustering.density_based._density_peak import DensityPeakClusterer + +# spiral +DATA_STR = """ +31.95 7.95 3 +31.15 7.3 3 +30.45 6.65 3 +29.7 6.0 3 +28.9 5.55 3 +28.05 5.0 3 +27.2 4.55 3 +26.35 4.15 3 +25.4 3.85 3 +24.6 3.6 3 +23.6 3.3 3 +22.75 3.15 3 +21.85 3.05 3 +20.9 3.0 3 +20.0 2.9 3 +19.1 3.0 3 +18.2 3.2 3 +17.3 3.25 3 +16.55 3.5 3 +15.7 3.7 3 +14.85 4.1 3 +14.15 4.4 3 +13.4 4.75 3 +12.7 5.2 3 +12.05 5.65 3 +11.45 6.15 3 +10.9 6.65 3 +10.3 7.25 3 +9.7 7.85 3 +9.35 8.35 3 +8.9 9.05 3 +8.55 9.65 3 +8.15 10.35 3 +7.95 10.95 3 +7.75 11.7 3 +7.55 12.35 3 +7.45 13.0 3 +7.35 13.75 3 +7.3 14.35 3 +7.35 14.95 3 +7.35 15.75 3 +7.55 16.35 3 +7.7 16.95 3 +7.8 17.55 3 +8.05 18.15 3 +8.3 18.75 3 +8.65 19.3 3 +8.9 19.85 3 +9.3 20.3 3 +9.65 20.8 3 +10.2 21.25 3 +10.6 21.65 3 +11.1 22.15 3 +11.55 22.45 3 +11.95 22.7 3 +12.55 23.0 3 +13.05 23.2 3 +13.45 23.4 3 +14.0 23.55 3 +14.55 23.6 3 +15.1 23.75 3 +15.7 23.75 3 +16.15 23.85 3 +16.7 23.8 3 +17.15 23.75 3 +17.75 23.75 3 +18.2 23.6 3 +18.65 23.5 3 +19.1 23.35 3 +19.6 23.15 3 +20.0 22.95 3 +20.4 22.7 3 +20.7 22.55 3 +21.0 22.15 3 +21.45 21.95 3 +21.75 21.55 3 +22.0 21.25 3 +22.25 21.0 3 +22.5 20.7 3 +22.65 20.35 3 +22.75 20.05 3 +22.9 19.65 3 +23.0 19.35 3 +23.1 19.0 3 +23.15 18.65 3 +23.2 18.25 3 +23.2 18.05 3 +23.2 17.8 3 +23.1 17.45 3 +23.05 17.15 3 +22.9 16.9 3 +22.85 16.6 3 +22.7 16.4 3 +22.6 16.2 3 +22.55 16.05 3 +22.4 15.95 3 +22.35 15.8 3 +22.2 15.65 3 +22.15 15.55 3 +22.0 15.4 3 +21.9 15.3 3 +21.85 15.25 3 +21.75 15.15 3 +21.65 15.05 3 +21.55 15.0 3 +21.5 14.9 3 +19.35 31.65 1 +20.35 31.45 1 +21.35 31.1 1 +22.25 30.9 1 +23.2 30.45 1 +23.95 30.05 1 +24.9 29.65 1 +25.6 29.05 1 +26.35 28.5 1 +27.15 27.9 1 +27.75 27.35 1 +28.3 26.6 1 +28.95 25.85 1 +29.5 25.15 1 +29.95 24.45 1 +30.4 23.7 1 +30.6 22.9 1 +30.9 22.1 1 +31.25 21.3 1 +31.35 20.55 1 +31.5 19.7 1 +31.55 18.9 1 +31.65 18.15 1 +31.6 17.35 1 +31.45 16.55 1 +31.3 15.8 1 +31.15 15.05 1 +30.9 14.35 1 +30.6 13.65 1 +30.3 13.0 1 +29.9 12.3 1 +29.5 11.75 1 +29.0 11.15 1 +28.5 10.6 1 +28.0 10.1 1 +27.55 9.65 1 +26.9 9.1 1 +26.25 8.8 1 +25.7 8.4 1 +25.15 8.05 1 +24.5 7.75 1 +23.9 7.65 1 +23.15 7.4 1 +22.5 7.3 1 +21.9 7.1 1 +21.25 7.05 1 +20.5 7.0 1 +19.9 6.95 1 +19.25 7.05 1 +18.75 7.1 1 +18.05 7.25 1 +17.5 7.35 1 +16.9 7.6 1 +16.35 7.8 1 +15.8 8.05 1 +15.4 8.35 1 +14.9 8.7 1 +14.45 8.9 1 +13.95 9.3 1 +13.6 9.65 1 +13.25 10.1 1 +12.95 10.55 1 +12.65 10.9 1 +12.35 11.4 1 +12.2 11.75 1 +11.95 12.2 1 +11.8 12.65 1 +11.75 13.05 1 +11.55 13.6 1 +11.55 14.0 1 +11.55 14.35 1 +11.55 14.7 1 +11.6 15.25 1 +11.65 15.7 1 +11.8 16.05 1 +11.85 16.5 1 +12.0 16.75 1 +12.15 17.2 1 +12.3 17.6 1 +12.55 17.85 1 +12.8 18.05 1 +13.1 18.4 1 +13.3 18.6 1 +13.55 18.85 1 +13.8 19.05 1 +14.15 19.25 1 +14.45 19.5 1 +14.85 19.55 1 +15.0 19.7 1 +15.25 19.7 1 +15.55 19.85 1 +15.95 19.9 1 +16.2 19.9 1 +16.55 19.9 1 +16.85 19.9 1 +17.2 19.9 1 +17.4 19.8 1 +17.65 19.75 1 +17.8 19.7 1 +18.0 19.6 1 +18.2 19.55 1 +3.9 9.6 2 +3.55 10.65 2 +3.35 11.4 2 +3.1 12.35 2 +3.1 13.25 2 +3.05 14.15 2 +3.0 15.1 2 +3.1 16.0 2 +3.2 16.85 2 +3.45 17.75 2 +3.7 18.7 2 +3.95 19.55 2 +4.35 20.25 2 +4.7 21.1 2 +5.15 21.8 2 +5.6 22.5 2 +6.2 23.3 2 +6.8 23.85 2 +7.35 24.45 2 +8.05 24.95 2 +8.8 25.45 2 +9.5 26.0 2 +10.2 26.35 2 +10.9 26.75 2 +11.7 27.0 2 +12.45 27.25 2 +13.3 27.6 2 +14.05 27.6 2 +14.7 27.75 2 +15.55 27.75 2 +16.4 27.75 2 +17.1 27.75 2 +17.9 27.75 2 +18.55 27.7 2 +19.35 27.6 2 +20.1 27.35 2 +20.7 27.1 2 +21.45 26.8 2 +22.05 26.5 2 +22.7 26.15 2 +23.35 25.65 2 +23.8 25.3 2 +24.3 24.85 2 +24.75 24.35 2 +25.25 23.95 2 +25.65 23.45 2 +26.05 23.0 2 +26.2 22.3 2 +26.6 21.8 2 +26.75 21.25 2 +27.0 20.7 2 +27.15 20.15 2 +27.15 19.6 2 +27.35 19.1 2 +27.35 18.45 2 +27.4 18.0 2 +27.3 17.4 2 +27.15 16.9 2 +27.0 16.4 2 +27.0 15.9 2 +26.75 15.35 2 +26.55 14.85 2 +26.3 14.45 2 +25.95 14.1 2 +25.75 13.7 2 +25.35 13.3 2 +25.05 12.95 2 +24.8 12.7 2 +24.4 12.45 2 +24.05 12.2 2 +23.55 11.85 2 +23.2 11.65 2 +22.75 11.4 2 +22.3 11.3 2 +21.9 11.1 2 +21.45 11.05 2 +21.1 11.0 2 +20.7 10.95 2 +20.35 10.95 2 +19.95 11.0 2 +19.55 11.0 2 +19.15 11.05 2 +18.85 11.1 2 +18.45 11.25 2 +18.15 11.35 2 +17.85 11.5 2 +17.5 11.7 2 +17.2 11.95 2 +17.0 12.05 2 +16.75 12.2 2 +16.65 12.35 2 +16.5 12.5 2 +16.35 12.7 2 +16.2 12.8 2 +16.15 12.95 2 +16.0 13.1 2 +15.95 13.25 2 +15.9 13.4 2 +15.8 13.5 2 +15.8 13.65 2 +15.75 13.85 2 +15.65 14.05 2 +15.65 14.25 2 +15.65 14.5 2 +15.65 14.6 2 +""" + + +def load_dataset(): + """ + Load the provided dataset from the multiline string. + + Each row has three columns (x, y, and an unused label). + Returns the x and y coordinates as (n_samples, 2). + """ + lines = DATA_STR.strip().splitlines() + data_list = [] + for line in lines: + parts = line.split() + if len(parts) >= 2: + x = float(parts[0]) + y = float(parts[1]) + data_list.append([x, y]) + return np.array(data_list) + + +@pytest.fixture +def dataset(): + """Load the dataset.""" + return load_dataset() + + +def test_density_peak_clusterer(dataset): + """Test the DensityPeakClusterer with the provided dataset.""" + clusterer = DensityPeakClusterer( + gauss_cutoff=True, + cutoff_distance="auto", + distance_metric="euclidean", + density_threshold=8, + distance_threshold=5, + ) + + # Fit the clusterer on the dataset + clusterer.fit(dataset) + + # Print cluster labels and cluster centers + print("Cluster Labels:") # noqa + print(clusterer.labels_) # noqa + print("\nCluster Centers:") # noqa + print(clusterer.cluster_centers) # noqa + + # Assertions to verify the clustering results + assert clusterer.labels_ is not None, "Cluster labels should not be None" + assert len(clusterer.labels_) == len( + dataset + ), "Number of labels should match number of data points" + assert clusterer.cluster_centers is not None, "Cluster centers should not be None" + assert ( + len(clusterer.cluster_centers) > 0 + ), "There should be at least one cluster center" + + # Optional: Plot decision graph and clustering results + clusterer.plot(mode="all", title="Density Peak Clustering") + + +# use pytest -s From b00f466edae889500481ea4b2df8804bea812e5a Mon Sep 17 00:00:00 2001 From: Aryan Date: Wed, 19 Mar 2025 15:13:29 +0530 Subject: [PATCH 3/4] 98 cluster center instead of 97 (auto) --- .../clustering/density_based/_density_peak.py | 163 ++++++++++++------ .../density_based/tests/test_density_peak.py | 5 +- 2 files changed, 109 insertions(+), 59 deletions(-) diff --git a/aeon/clustering/density_based/_density_peak.py b/aeon/clustering/density_based/_density_peak.py index 034679ea10..d35a703278 100644 --- a/aeon/clustering/density_based/_density_peak.py +++ b/aeon/clustering/density_based/_density_peak.py @@ -3,6 +3,7 @@ __maintainer__ = [] __all__ = ["DensityPeakClusterer"] +import matplotlib.pyplot as plt import numpy as np from aeon.distances import get_distance_function, pairwise_distance @@ -22,17 +23,23 @@ class DensityPeakClusterer: delta : np.ndarray For each point, the minimum distance to any point with higher density. gauss_cutoff : bool, default=True - Whether to use Gaussian cutoff for density estimation. - cutoff_distance : float, optional - Distance cutoff for Gaussian kernel. + Whether to use a Gaussian kernel for density estimation. + cutoff_distance : float or str, optional + Distance cutoff for the Gaussian kernel. If set to "auto", the cutoff is + automatically selected. distance_metric : str, default="euclidean" - Distance metric to use for clustering. + The distance metric to use for clustering. n_jobs : int, default=1 Number of parallel jobs to run. density_threshold : float, optional - Density threshold to select cluster centers. If None, will use midpoint. + Density threshold for selecting cluster centers. If None, the midpoint of min + and max density is used. distance_threshold : float, optional - Distance threshold to select cluster centers. If None, will use midpoint. + Distance threshold for selecting cluster centers. If None, the midpoint of min + and max delta is used. + anormal : bool, default=False + If True, points in the halo region (border points) are marked with a label + of -1. """ def __init__( @@ -44,6 +51,7 @@ def __init__( n_jobs: int = 1, density_threshold: float = None, distance_threshold: float = None, + anormal: bool = False, ): self.rho = rho self.gauss_cutoff = gauss_cutoff @@ -52,6 +60,7 @@ def __init__( self.n_jobs = n_jobs self.density_threshold = density_threshold self.distance_threshold = distance_threshold + self.anormal = anormal def _build_distance(self): """ @@ -60,36 +69,37 @@ def _build_distance(self): Returns ------- distance_matrix : np.ndarray - A matrix of symmetric pairwise distances. + A symmetric matrix of pairwise distances. """ dist_func = get_distance_function(self.distance_metric) distance_matrix = pairwise_distance(self.data, method=dist_func) return distance_matrix def _auto_select_dc(self): - """Auto-select cutoff distance (dc) so that the fraction of pairwise distances less than dc is within a target range (e.g: 1-2%).""" # noqa - tri_indices = np.triu_indices( - self.n, k=1 - ) # k=1 to ignore diagonal as it has 0's + """ + Auto-select cutoff distance (dc). + + The fraction of pairwise distances less than dc + is within a target range (approximately 0.2% to 1%). + Intended target range: 0.01 <= nneighs <= 0.002. + """ + tri_indices = np.triu_indices(self.n, k=1) # ignore diagonal (self-distances) distances = self.distance_matrix[tri_indices] max_distance = np.max(distances) min_distance = np.min(distances) dc = (max_distance + min_distance) / 2 - lower_bound = 0.002 - upper_bound = 0.01 # 1-2% range + lower_bound = 0.01 # 0.002 + upper_bound = 0.002 # 0.01 # target range: 0.2% to 1% - # recursively setting the value of dc while True: - nneighs = np.sum(distances < dc) / (self.n**2) # nearest neighbors - - if lower_bound < nneighs < upper_bound: # case 1 : dc is in range + nneighs = np.sum(distances < dc) / (self.n**2) + if lower_bound <= nneighs <= upper_bound: break - if nneighs < lower_bound: # case 2 : increase dc (too few neighbors) + if nneighs < lower_bound: # too few neighbors => increase dc min_distance = dc - if nneighs > upper_bound: # case 3 : decrease dc (too many neighbors) + elif nneighs > upper_bound: # too many neighbors => decrease dc max_distance = dc - dc = (max_distance + min_distance) / 2 if max_distance - min_distance < 1e-6: break @@ -103,20 +113,33 @@ def select_dc(self): Returns ------- dc : float - The cutoff distance. + The chosen cutoff distance. """ if self.cutoff_distance == "auto": return self._auto_select_dc() + elif self.cutoff_distance is None: + n = self.data.shape[0] + self.distances = {} + for i in range(n): + for j in range(i + 1, n): + self.distances[(i, j)] = self.distance_matrix[i, j] + self.distances[(j, i)] = self.distance_matrix[i, j] + percent = 2.0 + position = int(self.n * (self.n + 1) / 2 * percent / 100) + sorted_vals = np.sort(list(self.distances.values())) + dc = sorted_vals[position * 2 + self.n] + return dc else: return self.cutoff_distance def _fit(self, X: np.ndarray, y: np.ndarray = None): - """Fit time series clusterer to training data. + """ + Fit the density peak clusterer to the training data. Parameters ---------- X : array-like - Time series data to cluster. + Time series (or 2D) data to cluster. y : array-like, optional Labels for the data (unused in clustering). @@ -126,36 +149,32 @@ def _fit(self, X: np.ndarray, y: np.ndarray = None): The fitted clusterer. """ self.data = X - self.n = X.shape[0] # total no. of time points - - self.distance_matrix = self._build_distance() # pairwise distance matrix + self.n = X.shape[0] # total number of data points - self.dc = self.select_dc() # selecting cutoff distance - - self.rho = np.zeros(self.n) # local density + self.distance_matrix = self._build_distance() + self.dc = self.select_dc() + # Compute local density, excluding self-distance + self.rho = np.zeros(self.n) if self.gauss_cutoff: - # Gaussian kernel: weight = exp(- (d / self.dc) ** 2) for i in range(self.n): - self.rho[i] = np.sum( - np.exp(-((self.distance_matrix[i] / self.dc) ** 2)) - ) + self.rho[i] = ( + np.sum(np.exp(-((self.distance_matrix[i] / self.dc) ** 2))) - 1 + ) # -1 to exclude self (diagonal) else: - # Hard cutoff: weight = 1 if d < dc, 0 otherwise + # Count neighbors using a hard cutoff, subtracting the self-count for i in range(self.n): - self.rho[i] = np.sum(self.distance_matrix[i] < self.dc) + self.rho[i] = np.sum(self.distance_matrix[i] < self.dc) - 1 + # computing delta and nearest higher-density neighbor self.delta = np.full(self.n, np.inf) self.nneigh = np.zeros(self.n, dtype=int) - sorted_indices = np.argsort( - -self.rho - ) # decending order of local density indices + sorted_indices = np.argsort(-self.rho) # indices in descending order of density self.sorted_indices = sorted_indices highest_index = sorted_indices[0] - self.delta[highest_index] = np.max( - self.distance_matrix - ) # distance to highest density point + # For the highest-density point, set delta to the maximum distance in the matrix + self.delta[highest_index] = np.max(self.distance_matrix) self.nneigh[highest_index] = highest_index for i in range(1, self.n): @@ -167,7 +186,7 @@ def _fit(self, X: np.ndarray, y: np.ndarray = None): self.delta[current_index] = d self.nneigh[current_index] = higher_index - # If thresholds are not provided, use the midpoint rule + # midpoint rule if thresholds are not provided if self.density_threshold is None: rho_threshold = 0.5 * (self.rho.min() + self.rho.max()) else: @@ -178,33 +197,69 @@ def _fit(self, X: np.ndarray, y: np.ndarray = None): else: delta_threshold = self.distance_threshold - # Initialize cluster labels to -1 and assign centers based on the thresholds + # Initial cluster assignment: + # marking points as centers,they exceed both density and delta thresholds self.labels_ = -np.ones(self.n, dtype=int) for idx in range(self.n): if (self.rho[idx] >= rho_threshold) and ( self.delta[idx] >= delta_threshold ): - self.labels_[idx] = ( - idx # mark as cluster center (label equals its own index) - ) + self.labels_[idx] = idx - # descending-density assignment for non-center points + # labels for non-center points based on the nearest higher-density neighbor. for idx in sorted_indices: if self.labels_[idx] == -1: self.labels_[idx] = self.labels_[self.nneigh[idx]] - # store the final center indices for reference. + # Create a copy for halo assignment. + halo = self.labels_.copy() + # Identify unique cluster centers. + unique_centers = [i for i in range(self.n) if self.labels_[i] == i] + # Initialize border density for each cluster. + bord_rho = {center: 0.0 for center in unique_centers} + + # For every pair of points in clusters within dc, update border density + for i in range(self.n): + for j in range(i + 1, self.n): + if ( + self.labels_[i] != self.labels_[j] + and self.distance_matrix[i, j] <= self.dc + ): + rho_avg = (self.rho[i] + self.rho[j]) / 2.0 + if ( + self.labels_[i] in bord_rho + and rho_avg > bord_rho[self.labels_[i]] + ): + bord_rho[self.labels_[i]] = rho_avg + if ( + self.labels_[j] in bord_rho + and rho_avg > bord_rho[self.labels_[j]] + ): + bord_rho[self.labels_[j]] = rho_avg + + # points falling in halo region (density lower than cluster's border density) + for i in range(self.n): + if self.labels_[i] in bord_rho and self.rho[i] < bord_rho[self.labels_[i]]: + halo[i] = 0 + + # if anormal flag is True, assign halo points a label of -1 + if self.anormal: + for i in range(self.n): + if halo[i] == 0: + self.labels_[i] = -1 + + # Final cluster centers: points whose label equals their own index. self.cluster_centers = [i for i in range(self.n) if self.labels_[i] == i] def fit(self, X: np.ndarray, y: np.ndarray = None): """ - Fit time series clusterer to training data. + Fit the clusterer to the training data. Parameters ---------- X : array-like, shape=(n_samples, n_timepoints) or (n_samples, n_channels, n_timepoints) - Time series data to cluster. + Data to cluster. y : array-like, optional Labels for the data (unused in clustering). @@ -219,20 +274,18 @@ def fit(self, X: np.ndarray, y: np.ndarray = None): def plot(self, mode="all", title="", **kwargs): """ - Plot clustering results and/or the decision graph. + Plot the clustering results and/or the decision graph. Parameters ---------- mode : str, default="all" - One of "decision" (plot decision graph), "label" (plot clustered data), - or "all" (plot both). + One of "decision" (to plot the decision graph), + "label" (to plot cluster labels), or "all" (to plot both). title : str, optional Title for the plots. kwargs : dict Additional keyword arguments passed to plotting functions. """ - import matplotlib.pyplot as plt - if mode in {"decision", "all"}: plt.figure() plt.scatter(self.rho, self.delta, c=self.labels_, cmap="viridis") diff --git a/aeon/clustering/density_based/tests/test_density_peak.py b/aeon/clustering/density_based/tests/test_density_peak.py index ba6a14a0fe..434108b216 100644 --- a/aeon/clustering/density_based/tests/test_density_peak.py +++ b/aeon/clustering/density_based/tests/test_density_peak.py @@ -354,18 +354,16 @@ def test_density_peak_clusterer(dataset): distance_metric="euclidean", density_threshold=8, distance_threshold=5, + anormal=False, ) - # Fit the clusterer on the dataset clusterer.fit(dataset) - # Print cluster labels and cluster centers print("Cluster Labels:") # noqa print(clusterer.labels_) # noqa print("\nCluster Centers:") # noqa print(clusterer.cluster_centers) # noqa - # Assertions to verify the clustering results assert clusterer.labels_ is not None, "Cluster labels should not be None" assert len(clusterer.labels_) == len( dataset @@ -375,7 +373,6 @@ def test_density_peak_clusterer(dataset): len(clusterer.cluster_centers) > 0 ), "There should be at least one cluster center" - # Optional: Plot decision graph and clustering results clusterer.plot(mode="all", title="Density Peak Clustering") From e9daf14e35fd2b33363dac153c28797137680211 Mon Sep 17 00:00:00 2001 From: Aryan Date: Sun, 25 May 2025 06:31:53 +0530 Subject: [PATCH 4/4] ari=0.915552 --- aeon/clustering/density_based/_density_peak.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aeon/clustering/density_based/_density_peak.py b/aeon/clustering/density_based/_density_peak.py index d35a703278..2950fecdab 100644 --- a/aeon/clustering/density_based/_density_peak.py +++ b/aeon/clustering/density_based/_density_peak.py @@ -89,8 +89,8 @@ def _auto_select_dc(self): min_distance = np.min(distances) dc = (max_distance + min_distance) / 2 - lower_bound = 0.01 # 0.002 - upper_bound = 0.002 # 0.01 # target range: 0.2% to 1% + lower_bound = 0.01 # 0.2 + upper_bound = 0.002 # 1 while True: nneighs = np.sum(distances < dc) / (self.n**2)