Adding class abstractions and their corresponding tests #66

Open · wants to merge 5 commits into master
Changes from all commits
149 changes: 118 additions & 31 deletions src/github_analysis/cluster.py
@@ -1,45 +1,132 @@
 import pandas as pd
-from sklearn.cluster import KMeans
-import pickle
 import logging
+import pickle
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.cluster import DBSCAN, MiniBatchKMeans

-logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", filename="log.log", level=logging.INFO)
-
-def get_embedding_clusters(embedding_input_file='./results/embeddings.csv', k_for_clustering=10, random_state=None,
-                           output_file='./results/clusters.pickle'):
-    """ Given a file with embeddings (or other features) cluster similar rows together using kmeans.
-
-    Parameters
-    ----------
-    embedding_input_file: file where every row is a project and every col a feature.
-    k_for_clustering: how many groups to cluster into.
-    random_state: random state for clustering algo.
-    output_file: string with the filename to output the results to as a pickle. If this param is set to None no file will be outputted.
-
-    Returns
-    -------
-    a dictionary where the keys are the cluster labels and the values are lists of GitHub projectIds that fall in that cluster.
-    """
-    embeddings = pd.read_csv(embedding_input_file, index_col=0)
-
-    # Run k-means algo TODO: spend more time on this algo: tune hyperparams, consider algo that better handles high dim, etc.
-    kmeans = KMeans(n_clusters=k_for_clustering, random_state=random_state).fit(embeddings.values)
-
-    # Make dict where key is cluster # and value are projects in that clusters
-    clusters = {}
-    for n, label in enumerate(kmeans.labels_):
-        if label in clusters:
-            clusters[label].append(embeddings.index[n])
-        else:
-            clusters[label] = [embeddings.index[n]]
-
-    if output_file is not None:
-        with open(output_file, 'wb') as output:
-            pickle.dump(clusters, output)
-        logging.info('Cluster file outputted!')
-
-    return clusters
-
-if __name__ == '__main__':
-    get_embedding_clusters()
-    logging.info('Cluster file outputted.')
+
+class Cluster():
+    def __init__(self):
+        """ Initializes the Cluster class with empty data, algorithm and fit-state attributes.
+
+        Returns
+        -------
+        None
+        """
+        self.raw_data = None
+        self.data = None
+        self.algorithm = None
+        self.transformed_data = None
+        self.fitted = None
+
+    def open_embeddings(self, input_file):
+        """ Reads an embeddings CSV where every row is a project and every column a feature. """
+        self.raw_data = pd.read_csv(input_file, index_col=0)
+
+    def scale_data(self, min_max=True):
+        """ Scales the data in all columns to the same scale.
+
+        Parameters
+        ----------
+        min_max: bool
+            If True uses the MinMaxScaler, if False uses the StandardScaler
+
+        Returns
+        -------
+        None
+        """
+        data = self.raw_data
+
+        if min_max:
+            scaled_data = MinMaxScaler().fit_transform(data)
+        else:
+            scaled_data = StandardScaler().fit_transform(data)
+
+        self.data = scaled_data
+
+    def set_algorithm(self, name, **kwargs):
+        """ Sets the clustering algorithm to use.
+
+        Parameters
+        ----------
+        name: str
+            Name of the algorithm to use: 'k_means' or 'dbscan'
+        **kwargs
+            Named arguments specific to the algorithm to use
+
+        Returns
+        -------
+        None
+        """
+        name = name.lower()
+        if name == 'k_means':
+            self.algorithm = MiniBatchKMeans(**kwargs)
+        elif name == 'dbscan':
+            self.algorithm = DBSCAN(**kwargs)
+
+    def fit_algorithm(self):
+        """ Scales the raw data and fits the chosen algorithm to it.
+
+        Returns
+        -------
+        None
+        """
+        self.scale_data()
+        self.algorithm.fit(self.data)
+        self.fitted = True
+
+    def get_labels(self):
+        """ Gets the cluster labels.
+
+        Returns
+        -------
+        ndarray
+            Array of cluster labels
+        """
+        self.labels = self.algorithm.labels_
+        return self.labels
+
+    def get_inertia(self):
+        """ Gets the inertia of the clusters.
+
+        Returns
+        -------
+        float
+            The inertia, if the fitted algorithm exposes an inertia_ attribute
+        """
+        try:
+            self.inertia = self.algorithm.inertia_
+            return self.inertia
+        except AttributeError:
+            print('No inertia attribute in this algorithm')
+
+    def save_file(self, output_file):
+        """ Pickles a dict mapping each cluster label to the project ids in that cluster. """
+        # Use the index of the raw embeddings so project ids are preserved.
+        ids = self.raw_data.index
+        clusters = {}
+        for n, label in enumerate(self.get_labels()):
+            if label in clusters:
+                clusters[label].append(ids[n])
+            else:
+                clusters[label] = [ids[n]]
+
+        with open(output_file, 'wb') as output:
+            pickle.dump(clusters, output)
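A minimal usage sketch of the new Cluster class (not part of the diff): the embeddings path and the hyperparameters are illustrative assumptions, and the import path assumes the package layout under src/github_analysis.

from github_analysis.cluster import Cluster

clusterer = Cluster()
clusterer.open_embeddings('./results/embeddings.csv')      # rows are projects, columns are features (illustrative path)
clusterer.set_algorithm('k_means', n_clusters=10, random_state=1)
clusterer.fit_algorithm()                                   # min-max scales, then fits MiniBatchKMeans
labels = clusterer.get_labels()
print(clusterer.get_inertia())                              # only meaningful for the k_means case
clusterer.save_file('./results/clusters.pickle')            # pickled {cluster label: [project ids]}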
5 changes: 5 additions & 0 deletions src/github_analysis/data_pull.py
@@ -0,0 +1,5 @@
import pandas as pd

# Pull the seed commit data from the project's Google Cloud Storage bucket.
csv_commits = pd.read_csv('https://storage.cloud.google.com/rstudio_bucket/2019_seed_commits.csv?_ga=2.112003524.-1920784121.1551992733')

# Cache it locally as a feather file for faster reloads (requires pyarrow or feather-format).
csv_commits.to_feather('../artifacts/commits.feather')
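If the ../artifacts folder is not guaranteed to exist, a small hedged variant of the same script creates it first; the directory and file names simply follow the script above.

import pathlib

import pandas as pd

out_dir = pathlib.Path('../artifacts')
out_dir.mkdir(parents=True, exist_ok=True)   # make sure the output folder exists before writing

csv_commits = pd.read_csv('https://storage.cloud.google.com/rstudio_bucket/2019_seed_commits.csv?_ga=2.112003524.-1920784121.1551992733')
csv_commits.to_feather(str(out_dir / 'commits.feather'))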
107 changes: 107 additions & 0 deletions src/github_analysis/dim_reduce.py
@@ -0,0 +1,107 @@
from sklearn.decomposition import PCA
from MulticoreTSNE import MulticoreTSNE as TSNE
from sklearn.manifold import Isomap, LocallyLinearEmbedding, MDS, SpectralEmbedding
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


class ReduceDim():
    def __init__(self, n_dimensions):
        """ Initializes the ReduceDim class.

        Parameters
        ----------
        n_dimensions: int
            Number of dimensions we want to reduce to

        Returns
        -------
        None
        """
        self.dimensions = n_dimensions
        self.raw_data = None
        self.data = None
        self.algorithm = None
        self.transformed_data = None

    def open_embeddings(self, input_file):
        """ Reads an embeddings CSV where every row is a project and every column a feature. """
        self.raw_data = pd.read_csv(input_file, index_col=0)

    def scale_data(self, min_max=True):
        """ Scales the data in all columns to the same scale.

        Parameters
        ----------
        min_max: bool
            If True uses the MinMaxScaler, if False uses the StandardScaler

        Returns
        -------
        None
        """
        data = self.raw_data

        if min_max:
            scaled_data = MinMaxScaler().fit_transform(data)
        else:
            scaled_data = StandardScaler().fit_transform(data)

        self.data = scaled_data

    def set_algorithm(self, name, **kwargs):
        """ Sets the dimensionality reduction algorithm to use.

        Parameters
        ----------
        name: str
            Name of the algorithm to use: 'pca', 't_sne', 'isomap',
            'locally_linear', 'mds' or 'spectral'
        **kwargs
            Named arguments specific to the algorithm to use

        Returns
        -------
        None
        """
        name = name.lower()

        if name == 'pca':
            self.algorithm = PCA(n_components=self.dimensions, **kwargs)
        elif name == 't_sne':
            self.algorithm = TSNE(n_components=self.dimensions, **kwargs)
        elif name == 'isomap':
            self.algorithm = Isomap(n_components=self.dimensions, **kwargs)
        elif name == 'locally_linear':
            self.algorithm = LocallyLinearEmbedding(n_components=self.dimensions, **kwargs)
        elif name == 'mds':
            self.algorithm = MDS(n_components=self.dimensions, **kwargs)
        elif name == 'spectral':
            self.algorithm = SpectralEmbedding(n_components=self.dimensions, **kwargs)

    def fit_transform(self):
        """ Scales the raw data, then fits the algorithm and transforms the data.

        Returns
        -------
        pd.DataFrame
            Dimensionality-reduced data, indexed like the raw data
        """
        self.scale_data()

        self.transformed_data = self.algorithm.fit_transform(self.data)
        # Note: the column names assume n_dimensions == 2.
        self.transformed_data = pd.DataFrame(self.transformed_data, columns=['x', 'y'])
        self.transformed_data.index = self.raw_data.index
        return self.transformed_data

    def plot_tsne(self, file_name):
        """ Saves a scatter plot of the two reduced dimensions. """
        fig, ax = plt.subplots()
        ax.scatter(self.transformed_data.x, self.transformed_data.y)
        ax.set_title('Embedding Clusters (t-SNE Transformed)')
        plt.savefig(file_name)

    def save_reduced_data(self, output_file):
        self.transformed_data.to_csv(output_file)
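A short usage sketch for ReduceDim (not part of the diff): the file paths, the t-SNE perplexity and the import path are illustrative assumptions.

from github_analysis.dim_reduce import ReduceDim

reducer = ReduceDim(n_dimensions=2)
reducer.open_embeddings('./results/embeddings.csv')
reducer.set_algorithm('t_sne', perplexity=30)         # any of the names handled in set_algorithm
coords = reducer.fit_transform()                      # DataFrame with columns x and y, indexed by project id
reducer.plot_tsne('./results/embedding_clusters.png')
reducer.save_reduced_data('./results/reduced_embeddings.csv')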