diff --git a/aeon/clustering/density_based/__init__.py b/aeon/clustering/density_based/__init__.py new file mode 100644 index 0000000000..4bfc3bf42c --- /dev/null +++ b/aeon/clustering/density_based/__init__.py @@ -0,0 +1 @@ +"""Density.""" diff --git a/aeon/clustering/density_based/_density_peak.py b/aeon/clustering/density_based/_density_peak.py new file mode 100644 index 0000000000..2950fecdab --- /dev/null +++ b/aeon/clustering/density_based/_density_peak.py @@ -0,0 +1,310 @@ +"""Density clustering for time series data.""" + +__maintainer__ = [] +__all__ = ["DensityPeakClusterer"] + +import matplotlib.pyplot as plt +import numpy as np + +from aeon.distances import get_distance_function, pairwise_distance + + +class DensityPeakClusterer: + """ + Density Peak Clusterer. + + Clusters time series data using a density-based approach that estimates local + densities and identifies peaks as cluster centers. + + Parameters + ---------- + rho : float, optional + The local density for each data point. + delta : np.ndarray + For each point, the minimum distance to any point with higher density. + gauss_cutoff : bool, default=True + Whether to use a Gaussian kernel for density estimation. + cutoff_distance : float or str, optional + Distance cutoff for the Gaussian kernel. If set to "auto", the cutoff is + automatically selected. + distance_metric : str, default="euclidean" + The distance metric to use for clustering. + n_jobs : int, default=1 + Number of parallel jobs to run. + density_threshold : float, optional + Density threshold for selecting cluster centers. If None, the midpoint of min + and max density is used. + distance_threshold : float, optional + Distance threshold for selecting cluster centers. If None, the midpoint of min + and max delta is used. + anormal : bool, default=False + If True, points in the halo region (border points) are marked with a label + of -1. + """ + + def __init__( + self, + rho: float = None, + gauss_cutoff: bool = True, + cutoff_distance: int = None, + distance_metric: str = "euclidean", + n_jobs: int = 1, + density_threshold: float = None, + distance_threshold: float = None, + anormal: bool = False, + ): + self.rho = rho + self.gauss_cutoff = gauss_cutoff + self.cutoff_distance = cutoff_distance + self.distance_metric = distance_metric + self.n_jobs = n_jobs + self.density_threshold = density_threshold + self.distance_threshold = distance_threshold + self.anormal = anormal + + def _build_distance(self): + """ + Compute the pairwise distance matrix using the resolved distance function. + + Returns + ------- + distance_matrix : np.ndarray + A symmetric matrix of pairwise distances. + """ + dist_func = get_distance_function(self.distance_metric) + distance_matrix = pairwise_distance(self.data, method=dist_func) + return distance_matrix + + def _auto_select_dc(self): + """ + Auto-select cutoff distance (dc). + + The fraction of pairwise distances less than dc + is within a target range (approximately 0.2% to 1%). + Intended target range: 0.01 <= nneighs <= 0.002. + """ + tri_indices = np.triu_indices(self.n, k=1) # ignore diagonal (self-distances) + distances = self.distance_matrix[tri_indices] + max_distance = np.max(distances) + min_distance = np.min(distances) + dc = (max_distance + min_distance) / 2 + + lower_bound = 0.01 # 0.2 + upper_bound = 0.002 # 1 + + while True: + nneighs = np.sum(distances < dc) / (self.n**2) + if lower_bound <= nneighs <= upper_bound: + break + if nneighs < lower_bound: # too few neighbors => increase dc + min_distance = dc + elif nneighs > upper_bound: # too many neighbors => decrease dc + max_distance = dc + dc = (max_distance + min_distance) / 2 + if max_distance - min_distance < 1e-6: + break + + return dc + + def select_dc(self): + """ + Select the cutoff distance (dc) for density estimation. + + Returns + ------- + dc : float + The chosen cutoff distance. + """ + if self.cutoff_distance == "auto": + return self._auto_select_dc() + elif self.cutoff_distance is None: + n = self.data.shape[0] + self.distances = {} + for i in range(n): + for j in range(i + 1, n): + self.distances[(i, j)] = self.distance_matrix[i, j] + self.distances[(j, i)] = self.distance_matrix[i, j] + percent = 2.0 + position = int(self.n * (self.n + 1) / 2 * percent / 100) + sorted_vals = np.sort(list(self.distances.values())) + dc = sorted_vals[position * 2 + self.n] + return dc + else: + return self.cutoff_distance + + def _fit(self, X: np.ndarray, y: np.ndarray = None): + """ + Fit the density peak clusterer to the training data. + + Parameters + ---------- + X : array-like + Time series (or 2D) data to cluster. + y : array-like, optional + Labels for the data (unused in clustering). + + Returns + ------- + self : object + The fitted clusterer. + """ + self.data = X + self.n = X.shape[0] # total number of data points + + self.distance_matrix = self._build_distance() + self.dc = self.select_dc() + + # Compute local density, excluding self-distance + self.rho = np.zeros(self.n) + if self.gauss_cutoff: + for i in range(self.n): + self.rho[i] = ( + np.sum(np.exp(-((self.distance_matrix[i] / self.dc) ** 2))) - 1 + ) # -1 to exclude self (diagonal) + else: + # Count neighbors using a hard cutoff, subtracting the self-count + for i in range(self.n): + self.rho[i] = np.sum(self.distance_matrix[i] < self.dc) - 1 + + # computing delta and nearest higher-density neighbor + self.delta = np.full(self.n, np.inf) + self.nneigh = np.zeros(self.n, dtype=int) + sorted_indices = np.argsort(-self.rho) # indices in descending order of density + self.sorted_indices = sorted_indices + + highest_index = sorted_indices[0] + # For the highest-density point, set delta to the maximum distance in the matrix + self.delta[highest_index] = np.max(self.distance_matrix) + self.nneigh[highest_index] = highest_index + + for i in range(1, self.n): + current_index = sorted_indices[i] + for j in range(i): + higher_index = sorted_indices[j] + d = self.distance_matrix[current_index, higher_index] + if d < self.delta[current_index]: + self.delta[current_index] = d + self.nneigh[current_index] = higher_index + + # midpoint rule if thresholds are not provided + if self.density_threshold is None: + rho_threshold = 0.5 * (self.rho.min() + self.rho.max()) + else: + rho_threshold = self.density_threshold + + if self.distance_threshold is None: + delta_threshold = 0.5 * (self.delta.min() + self.delta.max()) + else: + delta_threshold = self.distance_threshold + + # Initial cluster assignment: + # marking points as centers,they exceed both density and delta thresholds + self.labels_ = -np.ones(self.n, dtype=int) + for idx in range(self.n): + if (self.rho[idx] >= rho_threshold) and ( + self.delta[idx] >= delta_threshold + ): + self.labels_[idx] = idx + + # labels for non-center points based on the nearest higher-density neighbor. + for idx in sorted_indices: + if self.labels_[idx] == -1: + self.labels_[idx] = self.labels_[self.nneigh[idx]] + + # Create a copy for halo assignment. + halo = self.labels_.copy() + # Identify unique cluster centers. + unique_centers = [i for i in range(self.n) if self.labels_[i] == i] + # Initialize border density for each cluster. + bord_rho = {center: 0.0 for center in unique_centers} + + # For every pair of points in clusters within dc, update border density + for i in range(self.n): + for j in range(i + 1, self.n): + if ( + self.labels_[i] != self.labels_[j] + and self.distance_matrix[i, j] <= self.dc + ): + rho_avg = (self.rho[i] + self.rho[j]) / 2.0 + if ( + self.labels_[i] in bord_rho + and rho_avg > bord_rho[self.labels_[i]] + ): + bord_rho[self.labels_[i]] = rho_avg + if ( + self.labels_[j] in bord_rho + and rho_avg > bord_rho[self.labels_[j]] + ): + bord_rho[self.labels_[j]] = rho_avg + + # points falling in halo region (density lower than cluster's border density) + for i in range(self.n): + if self.labels_[i] in bord_rho and self.rho[i] < bord_rho[self.labels_[i]]: + halo[i] = 0 + + # if anormal flag is True, assign halo points a label of -1 + if self.anormal: + for i in range(self.n): + if halo[i] == 0: + self.labels_[i] = -1 + + # Final cluster centers: points whose label equals their own index. + self.cluster_centers = [i for i in range(self.n) if self.labels_[i] == i] + + def fit(self, X: np.ndarray, y: np.ndarray = None): + """ + Fit the clusterer to the training data. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_timepoints) or + (n_samples, n_channels, n_timepoints) + Data to cluster. + y : array-like, optional + Labels for the data (unused in clustering). + + Returns + ------- + self : DensityPeakClusterer + The fitted clusterer. + """ + self._fit(X) + self.is_fitted = True + return self + + def plot(self, mode="all", title="", **kwargs): + """ + Plot the clustering results and/or the decision graph. + + Parameters + ---------- + mode : str, default="all" + One of "decision" (to plot the decision graph), + "label" (to plot cluster labels), or "all" (to plot both). + title : str, optional + Title for the plots. + kwargs : dict + Additional keyword arguments passed to plotting functions. + """ + if mode in {"decision", "all"}: + plt.figure() + plt.scatter(self.rho, self.delta, c=self.labels_, cmap="viridis") + plt.xlabel("Local Density (rho)") + plt.ylabel("Delta") + plt.title(title + " Decision Graph") + plt.colorbar() + plt.show() + + if mode in {"label", "all"}: + plt.figure() + if self.data.ndim == 2 and self.data.shape[1] >= 2: + plt.scatter( + self.data[:, 0], self.data[:, 1], c=self.labels_, cmap="viridis" + ) + plt.xlabel("Feature 1") + plt.ylabel("Feature 2") + else: + plt.plot(self.data.T) + plt.title(title + " Cluster Labels") + plt.colorbar() + plt.show() diff --git a/aeon/clustering/density_based/tests/__init__.py b/aeon/clustering/density_based/tests/__init__.py new file mode 100644 index 0000000000..44d07e76f6 --- /dev/null +++ b/aeon/clustering/density_based/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for density based clustering algorithms.""" diff --git a/aeon/clustering/density_based/tests/test_density_peak.py b/aeon/clustering/density_based/tests/test_density_peak.py new file mode 100644 index 0000000000..434108b216 --- /dev/null +++ b/aeon/clustering/density_based/tests/test_density_peak.py @@ -0,0 +1,379 @@ +"""Test for the DensityPeakClusterer module.""" + +import numpy as np +import pytest + +from aeon.clustering.density_based._density_peak import DensityPeakClusterer + +# spiral +DATA_STR = """ +31.95 7.95 3 +31.15 7.3 3 +30.45 6.65 3 +29.7 6.0 3 +28.9 5.55 3 +28.05 5.0 3 +27.2 4.55 3 +26.35 4.15 3 +25.4 3.85 3 +24.6 3.6 3 +23.6 3.3 3 +22.75 3.15 3 +21.85 3.05 3 +20.9 3.0 3 +20.0 2.9 3 +19.1 3.0 3 +18.2 3.2 3 +17.3 3.25 3 +16.55 3.5 3 +15.7 3.7 3 +14.85 4.1 3 +14.15 4.4 3 +13.4 4.75 3 +12.7 5.2 3 +12.05 5.65 3 +11.45 6.15 3 +10.9 6.65 3 +10.3 7.25 3 +9.7 7.85 3 +9.35 8.35 3 +8.9 9.05 3 +8.55 9.65 3 +8.15 10.35 3 +7.95 10.95 3 +7.75 11.7 3 +7.55 12.35 3 +7.45 13.0 3 +7.35 13.75 3 +7.3 14.35 3 +7.35 14.95 3 +7.35 15.75 3 +7.55 16.35 3 +7.7 16.95 3 +7.8 17.55 3 +8.05 18.15 3 +8.3 18.75 3 +8.65 19.3 3 +8.9 19.85 3 +9.3 20.3 3 +9.65 20.8 3 +10.2 21.25 3 +10.6 21.65 3 +11.1 22.15 3 +11.55 22.45 3 +11.95 22.7 3 +12.55 23.0 3 +13.05 23.2 3 +13.45 23.4 3 +14.0 23.55 3 +14.55 23.6 3 +15.1 23.75 3 +15.7 23.75 3 +16.15 23.85 3 +16.7 23.8 3 +17.15 23.75 3 +17.75 23.75 3 +18.2 23.6 3 +18.65 23.5 3 +19.1 23.35 3 +19.6 23.15 3 +20.0 22.95 3 +20.4 22.7 3 +20.7 22.55 3 +21.0 22.15 3 +21.45 21.95 3 +21.75 21.55 3 +22.0 21.25 3 +22.25 21.0 3 +22.5 20.7 3 +22.65 20.35 3 +22.75 20.05 3 +22.9 19.65 3 +23.0 19.35 3 +23.1 19.0 3 +23.15 18.65 3 +23.2 18.25 3 +23.2 18.05 3 +23.2 17.8 3 +23.1 17.45 3 +23.05 17.15 3 +22.9 16.9 3 +22.85 16.6 3 +22.7 16.4 3 +22.6 16.2 3 +22.55 16.05 3 +22.4 15.95 3 +22.35 15.8 3 +22.2 15.65 3 +22.15 15.55 3 +22.0 15.4 3 +21.9 15.3 3 +21.85 15.25 3 +21.75 15.15 3 +21.65 15.05 3 +21.55 15.0 3 +21.5 14.9 3 +19.35 31.65 1 +20.35 31.45 1 +21.35 31.1 1 +22.25 30.9 1 +23.2 30.45 1 +23.95 30.05 1 +24.9 29.65 1 +25.6 29.05 1 +26.35 28.5 1 +27.15 27.9 1 +27.75 27.35 1 +28.3 26.6 1 +28.95 25.85 1 +29.5 25.15 1 +29.95 24.45 1 +30.4 23.7 1 +30.6 22.9 1 +30.9 22.1 1 +31.25 21.3 1 +31.35 20.55 1 +31.5 19.7 1 +31.55 18.9 1 +31.65 18.15 1 +31.6 17.35 1 +31.45 16.55 1 +31.3 15.8 1 +31.15 15.05 1 +30.9 14.35 1 +30.6 13.65 1 +30.3 13.0 1 +29.9 12.3 1 +29.5 11.75 1 +29.0 11.15 1 +28.5 10.6 1 +28.0 10.1 1 +27.55 9.65 1 +26.9 9.1 1 +26.25 8.8 1 +25.7 8.4 1 +25.15 8.05 1 +24.5 7.75 1 +23.9 7.65 1 +23.15 7.4 1 +22.5 7.3 1 +21.9 7.1 1 +21.25 7.05 1 +20.5 7.0 1 +19.9 6.95 1 +19.25 7.05 1 +18.75 7.1 1 +18.05 7.25 1 +17.5 7.35 1 +16.9 7.6 1 +16.35 7.8 1 +15.8 8.05 1 +15.4 8.35 1 +14.9 8.7 1 +14.45 8.9 1 +13.95 9.3 1 +13.6 9.65 1 +13.25 10.1 1 +12.95 10.55 1 +12.65 10.9 1 +12.35 11.4 1 +12.2 11.75 1 +11.95 12.2 1 +11.8 12.65 1 +11.75 13.05 1 +11.55 13.6 1 +11.55 14.0 1 +11.55 14.35 1 +11.55 14.7 1 +11.6 15.25 1 +11.65 15.7 1 +11.8 16.05 1 +11.85 16.5 1 +12.0 16.75 1 +12.15 17.2 1 +12.3 17.6 1 +12.55 17.85 1 +12.8 18.05 1 +13.1 18.4 1 +13.3 18.6 1 +13.55 18.85 1 +13.8 19.05 1 +14.15 19.25 1 +14.45 19.5 1 +14.85 19.55 1 +15.0 19.7 1 +15.25 19.7 1 +15.55 19.85 1 +15.95 19.9 1 +16.2 19.9 1 +16.55 19.9 1 +16.85 19.9 1 +17.2 19.9 1 +17.4 19.8 1 +17.65 19.75 1 +17.8 19.7 1 +18.0 19.6 1 +18.2 19.55 1 +3.9 9.6 2 +3.55 10.65 2 +3.35 11.4 2 +3.1 12.35 2 +3.1 13.25 2 +3.05 14.15 2 +3.0 15.1 2 +3.1 16.0 2 +3.2 16.85 2 +3.45 17.75 2 +3.7 18.7 2 +3.95 19.55 2 +4.35 20.25 2 +4.7 21.1 2 +5.15 21.8 2 +5.6 22.5 2 +6.2 23.3 2 +6.8 23.85 2 +7.35 24.45 2 +8.05 24.95 2 +8.8 25.45 2 +9.5 26.0 2 +10.2 26.35 2 +10.9 26.75 2 +11.7 27.0 2 +12.45 27.25 2 +13.3 27.6 2 +14.05 27.6 2 +14.7 27.75 2 +15.55 27.75 2 +16.4 27.75 2 +17.1 27.75 2 +17.9 27.75 2 +18.55 27.7 2 +19.35 27.6 2 +20.1 27.35 2 +20.7 27.1 2 +21.45 26.8 2 +22.05 26.5 2 +22.7 26.15 2 +23.35 25.65 2 +23.8 25.3 2 +24.3 24.85 2 +24.75 24.35 2 +25.25 23.95 2 +25.65 23.45 2 +26.05 23.0 2 +26.2 22.3 2 +26.6 21.8 2 +26.75 21.25 2 +27.0 20.7 2 +27.15 20.15 2 +27.15 19.6 2 +27.35 19.1 2 +27.35 18.45 2 +27.4 18.0 2 +27.3 17.4 2 +27.15 16.9 2 +27.0 16.4 2 +27.0 15.9 2 +26.75 15.35 2 +26.55 14.85 2 +26.3 14.45 2 +25.95 14.1 2 +25.75 13.7 2 +25.35 13.3 2 +25.05 12.95 2 +24.8 12.7 2 +24.4 12.45 2 +24.05 12.2 2 +23.55 11.85 2 +23.2 11.65 2 +22.75 11.4 2 +22.3 11.3 2 +21.9 11.1 2 +21.45 11.05 2 +21.1 11.0 2 +20.7 10.95 2 +20.35 10.95 2 +19.95 11.0 2 +19.55 11.0 2 +19.15 11.05 2 +18.85 11.1 2 +18.45 11.25 2 +18.15 11.35 2 +17.85 11.5 2 +17.5 11.7 2 +17.2 11.95 2 +17.0 12.05 2 +16.75 12.2 2 +16.65 12.35 2 +16.5 12.5 2 +16.35 12.7 2 +16.2 12.8 2 +16.15 12.95 2 +16.0 13.1 2 +15.95 13.25 2 +15.9 13.4 2 +15.8 13.5 2 +15.8 13.65 2 +15.75 13.85 2 +15.65 14.05 2 +15.65 14.25 2 +15.65 14.5 2 +15.65 14.6 2 +""" + + +def load_dataset(): + """ + Load the provided dataset from the multiline string. + + Each row has three columns (x, y, and an unused label). + Returns the x and y coordinates as (n_samples, 2). + """ + lines = DATA_STR.strip().splitlines() + data_list = [] + for line in lines: + parts = line.split() + if len(parts) >= 2: + x = float(parts[0]) + y = float(parts[1]) + data_list.append([x, y]) + return np.array(data_list) + + +@pytest.fixture +def dataset(): + """Load the dataset.""" + return load_dataset() + + +def test_density_peak_clusterer(dataset): + """Test the DensityPeakClusterer with the provided dataset.""" + clusterer = DensityPeakClusterer( + gauss_cutoff=True, + cutoff_distance="auto", + distance_metric="euclidean", + density_threshold=8, + distance_threshold=5, + anormal=False, + ) + + clusterer.fit(dataset) + + print("Cluster Labels:") # noqa + print(clusterer.labels_) # noqa + print("\nCluster Centers:") # noqa + print(clusterer.cluster_centers) # noqa + + assert clusterer.labels_ is not None, "Cluster labels should not be None" + assert len(clusterer.labels_) == len( + dataset + ), "Number of labels should match number of data points" + assert clusterer.cluster_centers is not None, "Cluster centers should not be None" + assert ( + len(clusterer.cluster_centers) > 0 + ), "There should be at least one cluster center" + + clusterer.plot(mode="all", title="Density Peak Clustering") + + +# use pytest -s