From 02da9aecbe98a5b8cb618fa98858d5fb3f200f02 Mon Sep 17 00:00:00 2001
From: Ian Flores Siaca
Date: Wed, 15 May 2019 11:17:37 -0700
Subject: [PATCH 1/2] Cluster.py code update

---
 src/cluster.py | 142 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 109 insertions(+), 33 deletions(-)

diff --git a/src/cluster.py b/src/cluster.py
index eaadbdc..e62f19e 100644
--- a/src/cluster.py
+++ b/src/cluster.py
@@ -1,39 +1,115 @@
 import pandas as pd
-from sklearn.cluster import KMeans
-import pickle
-
-
-def get_embedding_clusters(embedding_input_file='./results/embeddings.csv', k_for_clustering=10, random_state=None,
-                           output_file='./results/clusters.pickle'):
-    """
-    Given a file with embeddings (or other features) cluster similar rows together using kmeans.
-
-    :param embedding_input_file: file where every row is a project and every col a feature
-    :param k_for_clustering: how many groups to cluster
-    :param random_state: random state for clustering algo
-    :param output_file: string with the filename to output the results to as a pickle. If this param is set to None no file will be outputted.
-    :return: a dictionary where the keys are the cluster labels and the values are lists of GitHub projectIds that fall in that cluster.
-    """
-    embeddings = pd.read_csv(embedding_input_file, index_col=0)
-
-    # Run k-means algo TODO: spend more time on this algo: tune hyperparams, consider algo that better handles high dim, etc.
-    kmeans = KMeans(n_clusters=k_for_clustering, random_state=random_state).fit(embeddings.values)
-
-    # Make dict where key is cluster # and value are projects in that clusters
-    clusters = {}
-    for n, label in enumerate(kmeans.labels_):
-        if label in clusters:
-            clusters[label].append(embeddings.index[n])
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.cluster import KMeans, DBSCAN
+
+class Cluster():
+    def __init__(self, raw_data):
+        """ Initializes the Cluster class
+
+        Parameters
+        ----------
+        raw_data: pd.DataFrame or np.ndarray
+            Data in a 2-dimensional ndarray or a pandas DataFrame
+
+        Returns
+        -------
+        None
+        """
+        self.raw_data = raw_data
+        self.data = None
+        self.algorithm = None
+        self.transformed_data = None
+
+    def scale_data(self, min_max = False):
+        """ Scales the data in all columns to the same scale
+
+        Parameters
+        ----------
+        min_max: bool
+            If True, uses MinMaxScaler; if False, uses StandardScaler
+
+        Returns
+        -------
+        None
+        """
+        data = self.raw_data
+
+        if min_max:
+            scaled_data = MinMaxScaler().fit_transform(data)
         else:
-            clusters[label] = [embeddings.index[n]]
+            scaled_data = StandardScaler().fit_transform(data)
+
+        self.data = scaled_data
+
+    def set_algorithm(self, name, **kwargs):
+        """ Sets the clustering algorithm to use
+
+        Parameters
+        ----------
+        name: str
+            Name of the algorithm to use
+        **kwargs
+            Named arguments specific to the algorithm to use
+
+        Returns
+        -------
+        None
+        """
+        if name == 'k_means':
+            self.algorithm = KMeans(**kwargs)
+        elif name == 'dbscan':
+            self.algorithm = DBSCAN(**kwargs)
+
+    def fit_algorithm(self):
+        """ Fits the algorithm to the scaled data
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        self.scale_data()
+        self.algorithm.fit(self.data)
+
+    def get_labels(self):
+        """ Gets the cluster labels
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        ndarray
+            Array of cluster labels
+        """
+        self.labels = self.algorithm.labels_
+        return self.labels
 
-    if output_file is not None:
-        with open(output_file, 'wb') as output:
-            pickle.dump(clusters, output)
-        print('cluster file outputted!')
+    def get_inertia(self):
+        """ Gets the inertia of the clusters
 
-    return clusters
+        Parameters
+        ----------
+        None
 
+        Returns
+        -------
+        float
+            Returns the inertia if the algorithm has an inertia attribute
+        """
+        try:
+            self.inertia = self.algorithm.inertia_
+            return self.inertia
+        except AttributeError:
+            print('No inertia attribute in this algorithm')
 
-if __name__ == '__main__':
-    get_embedding_clusters()
+cluster = Cluster([[1,2,3], [2, 4, 6]])
+cluster.set_algorithm('k_means', n_clusters = 2)
+print('Up to here')
+print(cluster.fit_algorithm())
+print(cluster.get_labels())

From 39050fba71c6f3f312694a22ba87ba7349f0232c Mon Sep 17 00:00:00 2001
From: Ian Flores Siaca
Date: Wed, 22 May 2019 08:36:40 -0700
Subject: [PATCH 2/2] Latest changes cluster.py

---
 src/cluster.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/cluster.py b/src/cluster.py
index e62f19e..49f3312 100644
--- a/src/cluster.py
+++ b/src/cluster.py
@@ -1,7 +1,8 @@
 import pandas as pd
 import numpy as np
 from sklearn.preprocessing import MinMaxScaler, StandardScaler
-from sklearn.cluster import KMeans, DBSCAN
+from sklearn.cluster import DBSCAN, MiniBatchKMeans
+
 
 class Cluster():
     def __init__(self, raw_data):
@@ -20,8 +21,9 @@ def __init__(self, raw_data):
         self.data = None
         self.algorithm = None
         self.transformed_data = None
+        self.fitted = None
 
-    def scale_data(self, min_max = False):
+    def scale_data(self, min_max = True):
         """ Scales the data in all columns to the same scale
 
         Parameters
@@ -56,8 +58,9 @@ def set_algorithm(self, name, **kwargs):
         -------
         None
         """
+        name = name.lower()
         if name == 'k_means':
-            self.algorithm = KMeans(**kwargs)
+            self.algorithm = MiniBatchKMeans(**kwargs)
         elif name == 'dbscan':
             self.algorithm = DBSCAN(**kwargs)
 
@@ -74,6 +77,7 @@ def fit_algorithm(self):
         """
         self.scale_data()
         self.algorithm.fit(self.data)
+        self.fitted = True
 
     def get_labels(self):
         """ Gets the cluster labels
@@ -107,9 +111,3 @@ def get_inertia(self):
             return self.inertia
         except AttributeError:
             print('No inertia attribute in this algorithm')
-
-cluster = Cluster([[1,2,3], [2, 4, 6]])
-cluster.set_algorithm('k_means', n_clusters = 2)
-print('Up to here')
-print(cluster.fit_algorithm())
-print(cluster.get_labels())
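
Note: PATCH 2/2 removes the inline smoke test at the bottom of cluster.py. For reference, here is a minimal usage sketch of the Cluster class as it stands after both patches are applied. The sample data, random seed, and import path are illustrative assumptions, not part of the patches:

    import numpy as np
    from cluster import Cluster  # assumes src/ is on the Python path

    # Two well-separated blobs of 2-D points (made-up data for illustration)
    rng = np.random.default_rng(seed=42)
    points = np.vstack([
        rng.normal(loc=0.0, scale=0.5, size=(50, 2)),
        rng.normal(loc=5.0, scale=0.5, size=(50, 2)),
    ])

    clusterer = Cluster(points)
    # 'k_means' now maps to MiniBatchKMeans; kwargs pass straight through to it
    clusterer.set_algorithm('k_means', n_clusters=2, random_state=42)
    clusterer.fit_algorithm()       # scales the data first (MinMaxScaler by default), then fits

    print(clusterer.get_labels())   # ndarray with one cluster label per row
    print(clusterer.get_inertia())  # defined for k_means; DBSCAN exposes no inertia_

Since fit_algorithm always rescales with the default scaler, callers who want StandardScaler behavior would need to call scale_data(min_max=False) themselves and fit the algorithm directly; that is a limitation of the current design rather than part of this sketch.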