diff --git a/src/github_analysis/cluster.py b/src/github_analysis/cluster.py
index 0e77d5b..5a80265 100644
--- a/src/github_analysis/cluster.py
+++ b/src/github_analysis/cluster.py
@@ -1,45 +1,132 @@
 import pandas as pd
-from sklearn.cluster import KMeans
-import pickle
 import logging
+import pickle
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.cluster import DBSCAN, MiniBatchKMeans
+
+
+class Cluster():
+    def __init__(self):
+        """ Initializes the Cluster class
+
+        Attributes
+        ----------
+        raw_data: pd.DataFrame or np.ndarray
+            Data in a 2 dimensional ndarray or a pandas Data Frame, set to None until loaded
+
+        Returns
+        -------
+        None
+        """
+        self.raw_data = None
+        self.data = None
+        self.algorithm = None
+        self.transformed_data = None
+        self.fitted = None
+
+    def open_embeddings(self, input_file):
+        """ Reads an embeddings CSV into self.raw_data, using the first column as the index """
+        self.raw_data = pd.read_csv(input_file, index_col = 0)
 
-logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", filename="log.log", level=logging.INFO)
+    def scale_data(self, min_max = True):
+        """ Scales the data in all columns to the same scale
 
-def get_embedding_clusters(embedding_input_file='./results/embeddings.csv', k_for_clustering=10, random_state=None,
-                           output_file='./results/clusters.pickle'):
-    """ Given a file with embeddings (or other features) cluster similar rows together using kmeans.
-
         Parameters
         ----------
-    embedding_input_file: file where every row is a project and every col a feature.
-    k_for_clustering: how many groups to cluster into.
-    random_state: random state for clustering algo.
-    output_file: string with the filename to output the results to as a pickle. If this param is set to None no file will be outputted.
+        min_max: bool
+            If True uses the MinMaxScaler, if False uses the StandardScaler
 
         Returns
        -------
-    a dictionary where the keys are the cluster labels and the values are lists of GitHub projectIds that fall in that cluster.
-    """
-    embeddings = pd.read_csv(embedding_input_file, index_col=0)
-
-    # Run k-means algo TODO: spend more time on this algo: tune hyperparams, consider algo that better handles high dim, etc.
-    kmeans = KMeans(n_clusters=k_for_clustering, random_state=random_state).fit(embeddings.values)
-
-    # Make dict where key is cluster # and value are projects in that clusters
-    clusters = {}
-    for n, label in enumerate(kmeans.labels_):
-        if label in clusters:
-            clusters[label].append(embeddings.index[n])
+        None
+        """
+        data = self.raw_data
+
+        if min_max:
+            scaled_data = MinMaxScaler().fit_transform(data)
         else:
-            clusters[label] = [embeddings.index[n]]
+            scaled_data = StandardScaler().fit_transform(data)
-
-    if output_file is not None:
-        with open(output_file, 'wb') as output:
-            pickle.dump(clusters, output)
-        logging.info('Cluster file outputted!')
+        self.data = scaled_data
+
+    def set_algorithm(self, name, **kwargs):
+        """ Sets the clustering algorithm to use
+
+        Parameters
+        ----------
+        name: str
+            Name of the algorithm to use: either 'k_means' (MiniBatchKMeans) or 'dbscan' (DBSCAN)
+        **kwargs
+            Named arguments specific to the algorithm to use
+
+        Returns
+        -------
+        None
+        """
+        name = name.lower()
+        if name == 'k_means':
+            self.algorithm = MiniBatchKMeans(**kwargs)
+        elif name == 'dbscan':
+            self.algorithm = DBSCAN(**kwargs)
+
+    def fit_algorithm(self):
+        """ Fits the algorithm to the scaled data
 
-    return clusters
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        self.scale_data()
+        self.algorithm.fit(self.data)
+        self.fitted = True
+
+    def get_labels(self):
+        """ Gets the cluster labels
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        ndarray
+            Array of cluster labels
+        """
+        self.labels = self.algorithm.labels_
+        return self.labels
+
+    def get_inertia(self):
+        """ Gets the inertia of the clusters
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        float
+            Returns the inertia if the algorithm has an inertia attribute
+        """
+        try:
+            self.inertia = self.algorithm.inertia_
+            return self.inertia
+        except AttributeError:
+            print('No inertia attribute in this algorithm')
+
+    def save_file(self, output_file):
+        """ Groups the project ids by cluster label and pickles the resulting dict to output_file """
+        # Use the indexed raw data so the original project ids are kept; the scaled
+        # self.data is a plain ndarray and has no index.
+        embeddings = self.raw_data
+        clusters = {}
+        for n, label in enumerate(self.get_labels()):
+            if label in clusters:
+                clusters[label].append(embeddings.index[n])
+            else:
+                clusters[label] = [embeddings.index[n]]
+
+        with open(output_file, 'wb') as output:
+            pickle.dump(clusters, output)
 
-if __name__ == '__main__':
-    get_embedding_clusters()
+        logging.info('Cluster file outputted.')
diff --git a/src/github_analysis/data_pull.py b/src/github_analysis/data_pull.py
new file mode 100644
index 0000000..bc8fd24
--- /dev/null
+++ b/src/github_analysis/data_pull.py
@@ -0,0 +1,5 @@
+import pandas as pd
+
+csv_commits = pd.read_csv('https://storage.cloud.google.com/rstudio_bucket/2019_seed_commits.csv?_ga=2.112003524.-1920784121.1551992733')
+
+csv_commits.to_feather('../artifacts/commits.feather')
diff --git a/src/github_analysis/dim_reduce.py b/src/github_analysis/dim_reduce.py
new file mode 100644
index 0000000..19a93a7
--- /dev/null
+++ b/src/github_analysis/dim_reduce.py
@@ -0,0 +1,107 @@
+from sklearn.decomposition import PCA
+from MulticoreTSNE import MulticoreTSNE as TSNE
+from sklearn.manifold import Isomap, LocallyLinearEmbedding, MDS, SpectralEmbedding
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+class ReduceDim():
+    def __init__(self, n_dimensions):
+        """ Initializes the ReduceDim class
+
+        Parameters
+        ----------
+        n_dimensions: int
+            Number of dimensions we want to reduce to
+
+        Attributes
+        ----------
+        raw_data: pd.DataFrame or np.ndarray
+            Data in a 2 dimensional ndarray or a pandas Data Frame, set to None until loaded
+
+        Returns
+        -------
+        None
+        """
+        self.dimensions = n_dimensions
+        self.raw_data = None
+        self.data = None
+        self.algorithm = None
+        self.transformed_data = None
+
+    def open_embeddings(self, input_file):
+        """ Reads an embeddings CSV into self.raw_data, using the first column as the index """
+        self.raw_data = pd.read_csv(input_file, index_col = 0)
+
+    def scale_data(self, min_max = True):
+        """ Scales the data in all columns to the same scale
+
+        Parameters
+        ----------
+        min_max: bool
+            If True uses the MinMaxScaler, if False uses the StandardScaler
+
+        Returns
+        -------
+        None
+        """
+        data = self.raw_data
+
+        if min_max:
+            scaled_data = MinMaxScaler().fit_transform(data)
+        else:
+            scaled_data = StandardScaler().fit_transform(data)
+
+        self.data = scaled_data
+
+    def set_algorithm(self, name, **kwargs):
+        """ Sets the dimensionality reduction algorithm to use
+
+        Parameters
+        ----------
+        name: str
+            Name of the algorithm to use: one of 'pca', 't_sne', 'isomap', 'locally_linear', 'mds' or 'spectral'
+        **kwargs
+            Named arguments specific to the algorithm to use
+
+        Returns
+        -------
+        None
+        """
+        name = name.lower()
+
+        if name == 'pca':
+            self.algorithm = PCA(n_components = self.dimensions, **kwargs)
+        elif name == 't_sne':
+            self.algorithm = TSNE(n_components = self.dimensions, **kwargs)
+        elif name == 'isomap':
+            self.algorithm = Isomap(n_components = self.dimensions, **kwargs)
+        elif name == 'locally_linear':
+            self.algorithm = LocallyLinearEmbedding(n_components = self.dimensions, **kwargs)
+        elif name == 'mds':
+            self.algorithm = MDS(n_components = self.dimensions, **kwargs)
+        elif name == 'spectral':
+            self.algorithm = SpectralEmbedding(n_components = self.dimensions, **kwargs)
+
+    def fit_transform(self):
+        """ Fits the algorithm to the scaled data
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        ndarray
+            Dimensionality reduced data
+        """
+        self.scale_data()
+
+        self.transformed_data = self.algorithm.fit_transform(self.data)
+        # The x/y column labels assume a two dimensional projection (n_dimensions == 2).
+        self.transformed_data = pd.DataFrame(self.transformed_data, columns = ['x', 'y'])
+        self.transformed_data.index = self.raw_data.index
+        return self.transformed_data
+
+    def plot_tsne(self, file_name):
+        """ Saves a scatter plot of the two dimensional embedding to file_name """
+        fig, ax = plt.subplots()
+        ax.scatter(self.transformed_data.x, self.transformed_data.y)
+        ax.set_title('Embedding Clusters (t-SNE Transformed)')
+        plt.savefig(file_name)
+
+    def save_reduced_data(self, output_file):
+        """ Writes the dimensionality reduced data to a CSV file """
+        self.transformed_data.to_csv(output_file)
diff --git a/src/github_analysis/feature_extract.py b/src/github_analysis/feature_extract.py
new file mode 100644
index 0000000..a307247
--- /dev/null
+++ b/src/github_analysis/feature_extract.py
@@ -0,0 +1,361 @@
+import pandas as pd
+import networkx as nx
+import numpy as np
+
+class GraphExtracter():
+    def __init__(self, graph):
+        """ Initializes the GraphExtracter class
+
+        Parameters
+        ----------
+        graph: nx.Graph()
+            networkx graph object to be analyzed
+
+        Returns
+        -------
+        None
+        """
+        self.graph = graph
+
+    def get_nodes(self):
+        """Gets the number of nodes of the graph
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        int
+            Number of nodes in the graph
+        """
+        self.num_nodes = self.graph.number_of_nodes()
+        return self.num_nodes
+
+    def get_edges(self):
+        """Gets the number of edges of the graph
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        int
+            Number of edges in the graph
+        """
+        self.num_edges = self.graph.number_of_edges()
+        return self.num_edges
+
+    def get_density(self):
+        """Gets the density of the graph
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        float
+            Density of the graph
+        """
+        self.density = nx.density(self.graph)
+        return self.density
+
+    def get_avg_clustering(self):
+        """Gets the average clustering of the graph
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        float
+            Average Clustering of the graph
+        """
+        self.avg_clustering = nx.algorithms.cluster.average_clustering(self.graph)
+        return self.avg_clustering
+
+    def get_transitivity(self):
+        """Gets the transitivity of the graph
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        float
+            Transitivity of the graph
+        """
+        self.transitivity = nx.algorithms.cluster.transitivity(self.graph)
+        return self.transitivity
+
+    def get_weakly_connected(self):
+        """Determines if graph is weakly connected
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        bool
+            True if graph is weakly connected
+        """
+        self.weakly_connected = nx.algorithms.components.is_weakly_connected(self.graph)
+        return self.weakly_connected
+
+    def get_num_weakly_connected(self):
+        """Gets the number of weakly connected components in the graph
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        int
+            Number of weakly connected components in the graph
+        """
+        self.num_weakly_connected = nx.algorithms.components.number_weakly_connected_components(self.graph)
+        return self.num_weakly_connected
+
+    def get_num_attrac_components(self):
+        """Gets the number of attracting components in the graph
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        int
+            Number of attracting components in the graph
+        """
+        self.num_attrac_components = nx.algorithms.components.number_attracting_components(self.graph)
+        return self.num_attrac_components
+
+    def get_avg_degree(self):
+        """Gets the average degree of the nodes on the graph
+        ### Look to add the median depending on the distribution
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        float
+            Average degree of the nodes on the graph
+        """
+        degree = nx.degree(self.graph)
+        degree = list(degree)
+        degree = [entry[1] for entry in degree]
+        self.avg_degree = np.mean(degree)
+        return self.avg_degree
+
+    def get_avg_degree_centrality(self):
+        """Gets the average degree centrality of the nodes on the graph
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        float
+            Average degree centrality of the nodes on the graph
+        """
+        self.avg_degree_centrality = np.mean(list(nx.algorithms.centrality.degree_centrality(self.graph).values()))
+        return self.avg_degree_centrality
+
+    def get_avg_in_degree(self):
+        """Gets the average in degree of the nodes on the graph
+        ### Adding other stats as parameters, (i.e. median)
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        float
+            Average in degree of the nodes on the graph
+        """
+        in_ = self.graph.in_degree
+        in_ = list(in_)
+        in_ = [entry[1] for entry in in_]
+        self.avg_in_degree = np.mean(in_)
+        return self.avg_in_degree
+
+    def get_avg_in_degree_centrality(self):
+        """Gets the average in degree centrality of the nodes on the graph
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        float
+            Average in degree centrality of the nodes on the graph
+        """
+        self.avg_in_degree_centrality = np.mean(list(nx.algorithms.centrality.in_degree_centrality(self.graph).values()))
+        return self.avg_in_degree_centrality
+
+    def get_avg_out_degree(self):
+        """Gets the average out degree of the nodes on the graph
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        float
+            Average out degree of the nodes on the graph
+        """
+        out_ = self.graph.out_degree
+        out_ = list(out_)
+        out_ = [entry[1] for entry in out_]
+        self.avg_out_degree = np.mean(out_)
+        return self.avg_out_degree
+
+    def get_avg_out_degree_centrality(self):
+        """Gets the average out degree centrality of the nodes on the graph
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        float
+            Average out degree centrality of the nodes on the graph
+        """
+        self.avg_out_degree_centrality = np.mean(list(nx.algorithms.centrality.out_degree_centrality(self.graph).values()))
+        return self.avg_out_degree_centrality
+
+    def get_eigen_centrality(self):
+        """Gets the mean eigenvector centrality of the nodes in the graph
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        float
+            Mean eigenvector centrality of the nodes in the graph
+        """
+        self.eigen_centrality = np.mean(list(nx.algorithms.centrality.eigenvector_centrality(self.graph, max_iter=int(1e6)).values()))
+        return self.eigen_centrality
+
+    def get_katz_centrality(self):
+        """Gets the mean Katz centrality of the nodes in the graph
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        float
+            Mean Katz centrality of the nodes in the graph
+        """
+        self.katz_centrality = np.mean(list(nx.algorithms.centrality.katz_centrality(self.graph, max_iter=int(1e6)).values()))
+        return self.katz_centrality
+
+    def get_num_triangles(self):
+        """Gets the mean clustering coefficient of the nodes in the graph, used as a proxy for triangle density
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        float
+            Mean clustering coefficient of the nodes in the graph
+        """
+        # nx.clustering returns per-node clustering coefficients rather than raw triangle counts.
+        self.num_triangles = np.mean(list(nx.algorithms.cluster.clustering(self.graph).values()))
+        return self.num_triangles
+
+    def set_all_features(self):
+        """Function that extracts all of the graph features
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        self.get_nodes()
+        self.get_edges()
+        self.get_density()
+        self.get_avg_clustering()
+        self.get_transitivity()
+        self.get_weakly_connected()
+        self.get_num_weakly_connected()
+        self.get_num_attrac_components()
+        self.get_avg_degree()
+        self.get_avg_degree_centrality()
+        self.get_avg_in_degree()
+        self.get_avg_in_degree_centrality()
+        self.get_avg_out_degree()
+        self.get_avg_out_degree_centrality()
+        self.get_eigen_centrality()
+        self.get_katz_centrality()
+        self.get_num_triangles()
+        return None
+
+    def get_all_features(self):
+        """Gets all of the features extracted from the graph object
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        dict
+            Dictionary keyed by the features extracted from the graphs
+        """
+        return {
+            'num_nodes': self.num_nodes,
+            'num_edges': self.num_edges,
+            'density': self.density,
+            'avg_clustering': self.avg_clustering,
+            'transitivity': self.transitivity,
+            'weakly_connected': self.weakly_connected,
+            'num_weakly_connected': self.num_weakly_connected,
+            'num_attrac_components': self.num_attrac_components,
+            'avg_degree': self.avg_degree,
+            'avg_degree_centrality': self.avg_degree_centrality,
+            'avg_in_degree': self.avg_in_degree,
+            'avg_in_degree_centrality': self.avg_in_degree_centrality,
+            'avg_out_degree': self.avg_out_degree,
+            'avg_out_degree_centrality': self.avg_out_degree_centrality,
+            'eigen_centrality': self.eigen_centrality,
+            'katz_centrality': self.katz_centrality,
+            'num_triangles': self.num_triangles}
+
+#for idx, project in enumerate(project_names_sample.project_name.values):
+#    if idx%10 == 0:
+#        print(f'Percentage Completed: {idx/5}%')
+#    graph_df = commits_sample.loc[commits_sample['project_name'] == project]
+#    graph_df = graph_df[['commit_id', 'parent_id']]
+#    graph_df.columns = pd.Index(['target', 'source'])
+
+#    if graph_df.shape[0] > 5000:
+#        continue
+
+#    graph = nx.from_pandas_edgelist(graph_df, create_using=nx.DiGraph)
+#    graph.name = project
+#    graph_extracter = GraphExtracter(graph)
+#    graph_extracter.set_all_features()
+#    graphs[project] = graph_extracter.get_all_features()
+
+#graph_features = pd.DataFrame(graphs).T
diff --git a/src/github_analysis/main.py b/src/github_analysis/main.py
index 787d430..a756023 100644
--- a/src/github_analysis/main.py
+++ b/src/github_analysis/main.py
@@ -9,7 +9,7 @@
 import collections
 
 import graph2vec as g2v
-import reduce_embedding_dim as red
+import dim_reduce as dr
 import data_layer as dl
 import cluster as c
 import motif_finder as mf
@@ -31,29 +31,52 @@ def main(args):
     getDataTime = time.time()
     logging.info("Query Complete: " + str(getDataTime - startTime) + " seconds")
 
-    project_ids = dl.getUniqueProjectIdsFromDf(project_data)
-    project_groups = commits_dl.getGroupedCommitsByProjectIds(project_ids)
+    for n_iter in [2, 3, 4, 5, 6, 7, 8, 9, 10]:
+        embeddings_path = None
+        if args.embeddings_file_path is None:  # If embeddings not specified, generate the model and set the path to the output embeddings
+            project_ids = dl.getUniqueProjectIdsFromDf(project_data)
+            project_groups = commits_dl.getGroupedCommitsByProjectIds(project_ids)
+
+            project_graphs = []
+            project_ids_ordered = []
+            for name, group in project_groups:
+                project_graphs.append(nxutils.git_graph(group))
+                project_ids_ordered.append(name)
+
+            # with open("project_graphs.pkl", 'w') as f:
+            #     pickle.dump(project_graphs, f)
+            #
+            # with open("project_ids_ordered.pkl", 'w') as f:
+            #     pickle.dump(project_ids_ordered, f)
+
+            generateGraphsTime = time.time()
+            logging.info("NxGraphs Built: " + str(generateGraphsTime - getDataTime) + " seconds")
+
+            embeddings_path = args.results_path + "embeddings.csv"
+            g2vModel = g2v.Graph2Vec(workers=args.n_workers, size=args.n_neurons, min_count=args.min_count, iter=n_iter, seed=args.random_state)
+            g2vEmbeddings = g2vModel.fit_transform(project_graphs, project_ids_ordered, output_path=embeddings_path)
+            buildModelTime = time.time()
+            logging.info("G2V Model Built: " + str(buildModelTime - generateGraphsTime) + " seconds")
+        else:
+            embeddings_path = args.embeddings_file_path
+            generateGraphsTime = time.time()
+            buildModelTime = time.time()
+
+        reducers = dr.ReduceDim(n_dimensions = 2)
+        reducers.open_embeddings(embeddings_path)
+        reducers.set_algorithm('t_sne', random_state = args.random_state, n_jobs = args.n_workers)
+        reducers.fit_transform()
+        reducers.plot_tsne(f'{args.results_path}{n_iter}/embeddings_tsne')
+        reducers.save_reduced_data(f'{args.results_path}{n_iter}/embeddings_reduced_dim.csv')
+        reduceTime = time.time()
+        logging.info("Dims Reduced: " + str(reduceTime - buildModelTime) + " seconds")
+
+        clusters = c.Cluster()
+        clusters.open_embeddings(embeddings_path)
+        clusters.set_algorithm('k_means', n_clusters = 19, random_state = args.random_state)
+        clusters.fit_algorithm()
+        clusters.save_file(f'{args.results_path}clusters.pickle')
 
-    project_graphs = []
-    project_ids_ordered = []
-    for name, group in project_groups:
-        project_graphs.append(nxutils.git_graph(group))
-        project_ids_ordered.append(name)
-
-    generateGraphsTime = time.time()
-    logging.info("NxGraphs Built: " + str(generateGraphsTime - getDataTime) + " seconds")
-
-    embeddings_path = args.results_path + "embeddings.csv"
-    g2vModel = g2v.Graph2Vec(workers=args.n_workers, size=args.n_neurons, min_count=args.min_count, iter=args.n_iter, seed=args.random_state)
-    g2vEmbeddings = g2vModel.fit_transform(project_graphs, project_ids_ordered, output_path=embeddings_path)
-    buildModelTime = time.time()
-    logging.info("G2V Model Built: " + str(buildModelTime - generateGraphsTime) + " seconds")
-
-    red.reduce_dim(workers=args.n_workers, output_path=args.results_path, input_path=embeddings_path, random_state=args.random_state)
-    reduceTime = time.time()
-    logging.info("Dims Reduced: " + str(reduceTime - buildModelTime) + " seconds")
-
-    clusters = c.get_embedding_clusters(embedding_input_file=embeddings_path, output_file=args.results_path + "clusters.pickle", random_state=args.random_state)
     projectClusterTime = time.time()
     logging.info("Projects Clustered: " + str(projectClusterTime - reduceTime) + " seconds")
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
new file mode 100644
index 0000000..391d6cd
--- /dev/null
+++ b/tests/test_cluster.py
@@ -0,0 +1,62 @@
+import numpy as np
+import cluster as clustering
+from sklearn.cluster import MiniBatchKMeans, DBSCAN
+
+embeddings = np.array([[1, 2, 3], [4, 5, 6], [6, 5, 4], [3, 2, 1]])
+
+# Cluster() no longer takes the data in its constructor; the tests load it via raw_data.
+cluster = clustering.Cluster()
+
+### Look to add explanations about what should happen after the asserts
+
+def test_init_():
+    """Tests that the Cluster class is initializing correctly"""
+    assert cluster.raw_data is None, 'The Cluster() class is not initializing correctly.'
+    assert cluster.data is None, 'The Cluster() class is not initializing correctly.'
+    assert cluster.algorithm is None, 'The Cluster() class is not initializing correctly.'
+
+def test_scale_data():
+    """Tests that both methods of scaling work properly"""
+    # Load the test data directly; the class normally reads it from a CSV via open_embeddings().
+    cluster.raw_data = embeddings
+
+    # Tests Min-Max Scaling
+    cluster.scale_data(min_max = True)
+    assert np.min(cluster.data) == 0, 'It should be 0 as it is Min-Max'
+    assert np.isclose(np.max(cluster.data), 1), 'It should be 1 as it is Min-Max'
+    assert cluster.data is not None, 'The scaling is going correctly but not the assignments to the class attributes'
+
+    # Resets the state of the attributes
+    cluster.data = None
+
+    # Tests StandardScaler
+    cluster.scale_data(min_max = False)
+    assert np.array_equal(np.mean(embeddings, axis = 1), np.array([2., 5., 5., 2.]))  # This is not testing StandardScaler
+    assert np.array_equal(np.mean(embeddings, axis = 0), np.array([3.5, 3.5, 3.5]))  # This is not testing StandardScaler
+    assert cluster.data is not None
+
+def test_set_algorithm():
+    """Tests that the method assigns the correct algorithms."""
+    # Tests K-Means
+    cluster.set_algorithm('k_means')
+    # Look at class names for comparison
+    assert isinstance(cluster.algorithm, MiniBatchKMeans), 'MiniBatchKMeans is not implemented correctly'
+
+    # Tests DBSCAN
+    cluster.set_algorithm('dbscan')
+    assert isinstance(cluster.algorithm, DBSCAN), 'DBSCAN is not implemented correctly'
+
+def test_fit_algorithm():
+    """Tests that the selected algorithm is fitted correctly"""
+    cluster.set_algorithm('k_means')
+    cluster.fit_algorithm()
+    assert cluster.fitted == True, 'Didn\'t fit the algorithm correctly'
+
+def test_get_labels():
+    """Tests that the label assignments are working as expected"""
+    labels = cluster.get_labels()
+    assert labels is not None, 'Labels assignments are not correctly done'
+    assert len(labels) == embeddings.shape[0], 'Labels are not the same length as the input'
+
+def test_get_inertia():
+    """Tests that the inertia we get is an actual number and not empty"""
+    inertia = cluster.get_inertia()
+    assert inertia is not None, 'Not getting the correct inertia values'
+    assert isinstance(inertia, float), 'Not getting the correct inertia values'
diff --git a/tests/test_dim_reduce.py b/tests/test_dim_reduce.py
new file mode 100644
index 0000000..617a019
--- /dev/null
+++ b/tests/test_dim_reduce.py
@@ -0,0 +1,74 @@
+from sklearn.decomposition import PCA
+from MulticoreTSNE import MulticoreTSNE as TSNE
+from sklearn.manifold import Isomap, LocallyLinearEmbedding, MDS, SpectralEmbedding
+import dim_reduce
+import numpy as np
+import pandas as pd
+
+embeddings = np.array([[1, 2, 3], [4, 5, 6], [6, 5, 4], [3, 2, 1]])
+
+# ReduceDim() only takes the target dimensionality; the tests load the data via raw_data.
+reducer = dim_reduce.ReduceDim(n_dimensions = 2)
+
+def test_init_():
+    """Tests that the ReduceDim class is initializing correctly"""
+    assert reducer.dimensions == 2, 'The ReduceDim() class is not initializing correctly.'
+    assert reducer.data is None, 'The ReduceDim() class is not initializing correctly.'
+    assert reducer.raw_data is None, 'The ReduceDim() class is not initializing correctly.'
+    assert reducer.algorithm is None, 'The ReduceDim() class is not initializing correctly.'
+
+def test_scale_data():
+    """Tests that both methods of scaling work properly"""
+    # Load the test data directly; the class normally reads it from a CSV via open_embeddings().
+    reducer.raw_data = pd.DataFrame(embeddings)
+
+    # Tests Min-Max Scaling
+    reducer.scale_data(min_max = True)
+    assert np.min(reducer.data) == 0, 'It should be 0 as it is Min-Max'
+    assert np.isclose(np.max(reducer.data), 1), 'It should be 1 as it is Min-Max'
+    assert reducer.data is not None, 'The scaling is going correctly but not the assignments to the class attributes'
+
+    # Resets the state of the attributes
+    reducer.data = None
+
+    # Tests StandardScaler
+    reducer.scale_data(min_max = False)
+    assert np.array_equal(np.mean(embeddings, axis = 1), np.array([2., 5., 5., 2.]))  # This is not testing StandardScaler
+    assert np.array_equal(np.mean(embeddings, axis = 0), np.array([3.5, 3.5, 3.5]))  # This is not testing StandardScaler
+    assert reducer.data is not None, 'The scaling is going correctly but not the assignments to the class attributes'
+
+def test_set_algorithm():
+    """Tests that the method assigns the correct algorithms."""
+    # Tests PCA
+    reducer.set_algorithm('pca')
+    assert isinstance(reducer.algorithm, PCA), 'PCA is not implemented correctly'
+
+    # Tests t-SNE
+    reducer.set_algorithm('t_sne')
+    assert isinstance(reducer.algorithm, TSNE), 'TSNE is not implemented correctly'
+
+    # Tests Isomap
+    reducer.set_algorithm('isomap')
+    assert isinstance(reducer.algorithm, Isomap), 'Isomap is not implemented correctly'
+
+    # Tests LocallyLinearEmbedding
+    reducer.set_algorithm('locally_linear')
+    assert isinstance(reducer.algorithm, LocallyLinearEmbedding), 'LocallyLinearEmbedding is not implemented correctly'
+
+    # Tests MDS
+    reducer.set_algorithm('mds')
+    assert isinstance(reducer.algorithm, MDS), 'MDS is not implemented correctly'
+
+    # Tests Spectral
+    reducer.set_algorithm('spectral')
+    assert isinstance(reducer.algorithm, SpectralEmbedding), 'SpectralEmbedding is not implemented correctly'
+
+def test_fit_transform():
+    """Tests that the fit_transform methods work correctly"""
+    reducer.set_algorithm('t_sne')
+
+    # Transform test
+    reduced_data = reducer.fit_transform()
+    assert reduced_data is not None
+    assert reduced_data.shape[1] == 2
+
+    # Reset state
+    reducer.transformed_data = None
+
+    # Don't transform test
+    # NOTE: fit_transform() does not currently accept a transform argument; this checks an intended API.
+    assert reducer.fit_transform(transform = False) is None
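
For reference, a minimal usage sketch of the new ReduceDim and Cluster classes, mirroring the calls made in main.py above. The embeddings path, output paths and the random seed are illustrative placeholders, and n_clusters=19 is simply the value main.py currently uses, not a tuned choice.

    import dim_reduce as dr
    import cluster as c

    # Project the graph2vec embeddings down to two dimensions for plotting.
    reducer = dr.ReduceDim(n_dimensions=2)
    reducer.open_embeddings('./results/embeddings.csv')       # illustrative path
    reducer.set_algorithm('t_sne', random_state=1)
    reducer.fit_transform()                                    # scales the data, then fits and projects
    reducer.plot_tsne('./results/embeddings_tsne')
    reducer.save_reduced_data('./results/embeddings_reduced_dim.csv')

    # Cluster the same embeddings and pickle the {label: [project ids]} mapping.
    clusterer = c.Cluster()
    clusterer.open_embeddings('./results/embeddings.csv')      # illustrative path
    clusterer.set_algorithm('k_means', n_clusters=19, random_state=1)
    clusterer.fit_algorithm()                                   # scales the data, then fits MiniBatchKMeans
    labels = clusterer.get_labels()
    clusterer.save_file('./results/clusters.pickle')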