diff --git a/src/tdamapper/core.py b/src/tdamapper/core.py
index a8185a1..7806bde 100644
--- a/src/tdamapper/core.py
+++ b/src/tdamapper/core.py
@@ -29,6 +29,7 @@
 """
 
 import logging
+import numpy as np
 import networkx as nx
 from joblib import Parallel, delayed
 
@@ -122,6 +123,11 @@ def mapper_connected_components(X, y, cover, clustering, n_jobs=1):
     label to each point in the dataset, based on the connected component of
     the Mapper graph that it belongs to.
 
+    Points that the clustering algorithm classifies as noise (label -1)
+    retain their noise label unless a noise-handling wrapper modifies it
+    (see :class:`tdamapper.core.NoiseHandlingClustering`). This allows noise
+    points to be handled flexibly in different applications.
+
     This function uses a union-find data structure to efficiently keep track
     of the connected components as it scans the points of the dataset. This
     approach should be faster than computing the Mapper graph by first calling
@@ -135,7 +141,8 @@
     :param cover: A cover algorithm.
     :type cover: A class compatible with :class:`tdamapper.core.Cover`
     :param clustering: The clustering algorithm to apply to each subset of the
-        dataset.
+        dataset. Can be wrapped with :class:`tdamapper.core.NoiseHandlingClustering`
+        to control how noise points are handled.
     :type clustering: An estimator compatible with scikit-learn's clustering
         interface, typically from :mod:`sklearn.cluster`.
     :param n_jobs: The maximum number of parallel clustering jobs. This
@@ -143,7 +150,8 @@
         Defaults to 1.
     :type n_jobs: int
     :return: A list of labels. The label at position i identifies the connected
-        component of the point at position i in the dataset.
+        component of the point at position i in the dataset. Points labeled
+        -1 are noise points.
     :rtype: list[int]
     """
     itm_lbls = mapper_labels(X, y, cover, clustering, n_jobs=n_jobs)
@@ -157,9 +165,10 @@
             uf.union(first, second)
     labels = [-1 for _ in X]
     for i, lbls in enumerate(itm_lbls):
-        # assign -1 to noise points
-        root = uf.find(lbls[0]) if lbls else -1
-        labels[i] = root
+        if lbls:  # the point belongs to at least one cluster
+            root = uf.find(lbls[0])
+            labels[i] = root
+        # else: keep the -1 label (noise point)
     return labels
 
 
@@ -433,6 +442,72 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
 
+class NoiseHandlingClustering(ParamsMixin):
+    """
+    A clustering wrapper that gives control over how noise points are handled.
+
+    This class wraps a clustering algorithm and provides options for handling
+    noise points (points labeled -1). By default, noise points are preserved
+    as singleton clusters, but they can also be dropped or grouped into a
+    single noise cluster.
+
+    Performance implications of each noise-handling mode:
+
+    - 'singleton': creates one cluster per noise point, which may increase
+      memory usage and processing time when there are many noise points.
+    - 'drop': the most memory-efficient mode, since noise points are simply
+      ignored, but all information about them is lost.
+    - 'group': balances memory usage and information preservation by grouping
+      all noise points into a single cluster.
+
+    :param clustering: A clustering algorithm to delegate to.
+    :type clustering: An estimator compatible with scikit-learn's clustering
+        interface, typically from :mod:`sklearn.cluster`.
+    :param noise_handling: How to handle noise points. Options are:
+
+        - 'singleton': each noise point becomes its own cluster (default).
+        - 'drop': noise points keep the label -1 and are dropped downstream.
+        - 'group': all noise points are grouped into a single cluster.
+    :type noise_handling: str, optional
+    """
+
+    def __init__(self, clustering=None, noise_handling='singleton'):
+        self.clustering = clustering
+        if noise_handling not in ['singleton', 'drop', 'group']:
+            raise ValueError(
+                "noise_handling must be one of 'singleton', 'drop', or 'group', "
+                f"got {noise_handling!r} instead"
+            )
+        self.noise_handling = noise_handling
+
+    def fit(self, X, y=None):
+        # Fit the delegate clustering algorithm on the data.
+        clustering = TrivialClustering() if self.clustering is None else clone(self.clustering)
+        clustering.fit(X, y)
+        labels = np.array(clustering.labels_)  # numpy array for masked updates
+
+        # Largest non-noise label, or -1 if every point is noise.
+        non_noise_labels = labels[labels != -1]
+        max_label = non_noise_labels.max() if non_noise_labels.size else -1
+
+        if self.noise_handling == 'drop':
+            # Keep noise points labeled -1.
+            self.labels_ = labels
+        elif self.noise_handling == 'group':
+            # Merge all noise points into a single new cluster.
+            self.labels_ = labels.copy()
+            self.labels_[labels == -1] = max_label + 1
+        else:  # 'singleton' (default)
+            # Give each noise point its own fresh cluster label.
+            self.labels_ = labels.copy()
+            next_label = max_label + 1
+            for idx in np.where(labels == -1)[0]:
+                self.labels_[idx] = next_label
+                next_label += 1
+
+        return self
+
+
 class FailSafeClustering(ParamsMixin):
     """
     A delegating clustering algorithm that prevents failure.
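For reviewers, a minimal usage sketch of the intended behavior, assuming DBSCAN marks the two isolated points below as noise (which holds for eps=0.3 and min_samples=2):

import numpy as np
from sklearn.cluster import DBSCAN
from tdamapper.core import NoiseHandlingClustering

X = np.array([[0, 0], [0.1, 0.1], [5, 5], [1, 1], [1.1, 0.9], [10, 10]])
base = DBSCAN(eps=0.3, min_samples=2)  # raw labels: [0, 0, -1, 1, 1, -1]

for mode in ('drop', 'group', 'singleton'):
    wrapped = NoiseHandlingClustering(clustering=base, noise_handling=mode)
    print(mode, wrapped.fit(X).labels_)
# drop      -> [ 0  0 -1  1  1 -1]  (noise keeps its -1 label)
# group     -> [ 0  0  2  1  1  2]  (both outliers share cluster 2)
# singleton -> [ 0  0  2  1  1  3]  (each outlier gets its own cluster)

The 'group' mode keeps the noise visible as a single "everything else" cluster, while 'singleton' keeps noise points separated from each other in the resulting Mapper graph.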
diff --git a/tests/test_unit_noise.py b/tests/test_unit_noise.py
new file mode 100644
index 0000000..a487d9f
--- /dev/null
+++ b/tests/test_unit_noise.py
@@ -0,0 +1,94 @@
+"""Tests for noise point handling in clustering."""
+
+import numpy as np
+import pytest
+from sklearn.cluster import DBSCAN
+
+from tdamapper.core import (
+    NoiseHandlingClustering,
+    TrivialCover,
+    mapper_connected_components,
+)
+
+
+def test_noise_handling_clustering():
+    # A simple dataset with two tight clusters and two obvious outliers.
+    X = np.array([
+        [0, 0],      # Cluster 1
+        [0.1, 0.1],  # Cluster 1
+        [5, 5],      # Noise point
+        [1, 1],      # Cluster 2
+        [1.1, 0.9],  # Cluster 2
+        [10, 10],    # Noise point
+    ])
+
+    # With eps=0.3 the isolated points become noise; min_samples=2 keeps
+    # the two-point clusters valid.
+    base_clustering = DBSCAN(eps=0.3, min_samples=2)
+    # Sanity-check the raw DBSCAN labels before wrapping.
+    assert list(base_clustering.fit(X).labels_) == [0, 0, -1, 1, 1, -1]
+
+    # An invalid noise_handling value should be rejected.
+    with pytest.raises(ValueError, match="noise_handling must be one of"):
+        NoiseHandlingClustering(clustering=base_clustering, noise_handling='invalid')
+
+    # 'drop' mode: noise points keep the label -1.
+    clustering_drop = NoiseHandlingClustering(
+        clustering=base_clustering,
+        noise_handling='drop',
+    )
+    clustering_drop.fit(X)
+    assert -1 in clustering_drop.labels_, "Noise points should be kept as -1"
+
+    # 'group' mode: all noise points share a single new cluster label.
+    clustering_group = NoiseHandlingClustering(
+        clustering=base_clustering,
+        noise_handling='group',
+    )
+    clustering_group.fit(X)
+    assert -1 not in clustering_group.labels_, "No points should be marked as noise"
+    noise_points = np.where(clustering_group.labels_ == max(clustering_group.labels_))[0]
+    assert len(noise_points) == 2, "Should have 2 points in the noise cluster"
+    assert 2 in noise_points and 5 in noise_points, "Points [5, 5] and [10, 10] should be noise"
+
+    # 'singleton' mode (default): each noise point becomes its own cluster.
+    clustering_singleton = NoiseHandlingClustering(clustering=base_clustering)
+    clustering_singleton.fit(X)
+    assert -1 not in clustering_singleton.labels_, "No points should be marked as noise"
+    noise_labels = clustering_singleton.labels_[[2, 5]]  # labels of [5, 5] and [10, 10]
+    assert len(set(noise_labels)) == 2, "Each noise point should have a unique label"
+    # Exactly 2 original clusters plus 2 singleton noise clusters.
+    assert len(set(clustering_singleton.labels_)) == 4, "Should have exactly 4 clusters"
+
+
+def test_mapper_with_noise_handling():
+    # The same dataset: two clusters and two noise points.
+    X = np.array([
+        [0, 0], [0.1, 0.1],  # Cluster 1
+        [5, 5],              # Noise point
+        [1, 1], [1.1, 0.9],  # Cluster 2
+        [10, 10],            # Noise point
+    ])
+
+    # Without a wrapper, noise points keep their -1 label in the output.
+    base_clustering = DBSCAN(eps=0.3, min_samples=2)
+    labels = mapper_connected_components(
+        X, X,  # use X as both data and lens
+        TrivialCover(),
+        base_clustering,
+    )
+    assert -1 in labels, "Noise points should be kept by default"
+
+    # With 'singleton' handling, every noise point becomes its own component.
+    noise_handler = NoiseHandlingClustering(
+        clustering=base_clustering,
+        noise_handling='singleton',
+    )
+    labels = mapper_connected_components(
+        X, X,
+        TrivialCover(),
+        noise_handler,
+    )
+    assert -1 not in labels, "No points should be marked as noise"
+    # 2 cluster components plus 2 singleton noise components.
+    assert len(set(labels)) == 4, "Should have exactly 4 connected components"
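As a complement to the tests above, a minimal end-to-end sketch (assuming the patch is applied) showing how the 'group' mode changes the output of mapper_connected_components compared to an unwrapped clustering:

import numpy as np
from sklearn.cluster import DBSCAN
from tdamapper.core import (
    NoiseHandlingClustering,
    TrivialCover,
    mapper_connected_components,
)

X = np.array([[0, 0], [0.1, 0.1], [5, 5], [1, 1], [1.1, 0.9], [10, 10]])
dbscan = DBSCAN(eps=0.3, min_samples=2)

# Used directly, the two outliers keep their -1 noise label.
plain = mapper_connected_components(X, X, TrivialCover(), dbscan)
assert -1 in plain

# Wrapped with 'group' handling, the outliers form one shared component:
# two cluster components plus a single noise component, and no -1 labels.
grouped = mapper_connected_components(
    X, X, TrivialCover(),
    NoiseHandlingClustering(clustering=dbscan, noise_handling='group'),
)
assert -1 not in grouped
assert len(set(grouped)) == 3 and grouped[2] == grouped[5]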