Commit 82005ef

update function documentation

1 parent 66b259a commit 82005ef

File tree

7 files changed: +184 -146 lines changed

src/github_analysis/cluster.py

Lines changed: 12 additions & 8 deletions
@@ -7,14 +7,18 @@
 
 def get_embedding_clusters(embedding_input_file='./results/embeddings.csv', k_for_clustering=10, random_state=None,
                            output_file='./results/clusters.pickle'):
-    """
-    Given a file with embeddings (or other features) cluster similar rows together using kmeans.
-
-    :param embedding_input_file: file where every row is a project and every col a feature
-    :param k_for_clustering: how many groups to cluster
-    :param random_state: random state for clustering algo
-    :param output_file: string with the filename to output the results to as a pickle. If this param is set to None no file will be outputted.
-    :return: a dictionary where the keys are the cluster labels and the values are lists of GitHub projectIds that fall in that cluster.
+    """ Given a file with embeddings (or other features), cluster similar rows together using kmeans.
+
+    Parameters
+    ----------
+    embedding_input_file: file where every row is a project and every column a feature.
+    k_for_clustering: how many groups to cluster into.
+    random_state: random state for the clustering algorithm.
+    output_file: string with the filename to write the results to as a pickle. If this param is set to None, no file is written.
+
+    Returns
+    -------
+    A dictionary where the keys are the cluster labels and the values are lists of GitHub projectIds that fall in that cluster.
     """
     embeddings = pd.read_csv(embedding_input_file, index_col=0)
 
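For orientation, a minimal sketch of the behaviour the new docstring describes, assuming scikit-learn's KMeans; the function body itself is not shown in this diff, so details may differ:

    import collections
    import pickle

    import pandas as pd
    from sklearn.cluster import KMeans

    # Rows are projects, columns are embedding features, as the docstring states.
    embeddings = pd.read_csv('./results/embeddings.csv', index_col=0)
    kmeans = KMeans(n_clusters=10, random_state=None).fit(embeddings)

    # Build {cluster_label: [projectId, ...]}, matching the documented return value.
    clusters = collections.defaultdict(list)
    for project_id, label in zip(embeddings.index, kmeans.labels_):
        clusters[label].append(project_id)

    with open('./results/clusters.pickle', 'wb') as f:
        pickle.dump(dict(clusters), f)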

src/github_analysis/data_layer.py

Lines changed: 3 additions & 0 deletions
@@ -8,6 +8,9 @@ def getUniqueProjectNamesFromDf(df):
     return df.project_name.unique()
 
 class data_layer:
+    """ Read the feather file of commit history data and group commits per project ID.
+    Provides the option of filtering out projects below a minimum number of commits.
+    """
    def __init__(self, data_path, min_number_commits=None):
        self.data_path = data_path
        self.commits_df = pd.read_feather(data_path)
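An illustrative pandas-only sketch of what the class docstring promises; the class's own methods are not part of this diff, so the path and variable names here are hypothetical:

    import pandas as pd

    commits_df = pd.read_feather('./data/commits.feather')  # hypothetical path

    # Optionally drop projects below a minimum commit count (min_number_commits=10, say).
    counts = commits_df.groupby('project_name').size()
    keep = counts[counts >= 10].index
    filtered = commits_df[commits_df.project_name.isin(keep)]

    # Group the remaining commits per project, as the docstring describes.
    commits_by_project = {name: group for name, group in filtered.groupby('project_name')}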

src/github_analysis/freq_graph.py

Lines changed: 24 additions & 57 deletions
@@ -10,11 +10,18 @@
 
 
 def generate_motif_visualisations_by_cluster(input_file_motif_clusters='./results/motifs_by_cluster.pickle', output_file='./results/clustering_output.pdf'):
-    """
-    :param input_file_motif_clusters: string with a filepath to a pickled dictionary where the keys are cluster names and the values
-    are dictionaries where the keys are motifs (nx subgraph) of length k and the values are how many times similar
-    (isomorphic) motifs occur in the graph.
-    :param output_file: string thats a path of a pdf file to output the graphs to
+    """ Visualize the motif clustering result and write the visualization to a pdf file.
+
+    Parameters
+    ----------
+    input_file_motif_clusters: string with a filepath to a pickled dictionary where the keys are cluster names and the values
+                               are dictionaries where the keys are motifs (nx subgraph) of length k and the values are how
+                               many times similar (isomorphic) motifs occur in the graph.
+    output_file: string that is the path of the pdf file to output the graphs to.
+
+    Returns
+    -------
+    Visualization of the motif clustering, by cluster, saved to a pdf file.
     """
     with open(input_file_motif_clusters, 'rb') as pickle_in:
         motif_clusters = pickle.load(pickle_in)
@@ -30,15 +37,18 @@ def generate_motif_visualisations_by_cluster(input_file_motif_clusters='./results/motifs_by_cluster.pickle', output_file='./results/clustering_output.pdf'):
 
 
 def visualize_motif_samples_bar_graph(motifs, plot_title='Motif Frequency in Dataset', motifs_to_show=8):
-    """
-    Given a collection of motifs and their frequency in a graph, output a file with a bar chart showing the motifs and
-    their associated frequencies.
-
-    :param motifs: dictionary where the keys are motifs (nx subgraph) of length k and the values are how many times similar
-    (isomorphic) motifs occur in the graph.
-    :param plot_title: string thats the tile of your plot.
-    :return: fig that is a bar chart of the most common motifs and how often they occurred
-
+    """ Given a collection of motifs and their frequency in a graph, output a file with a bar chart showing the motifs
+    and their associated frequencies.
+
+    Parameters
+    ----------
+    motifs: dictionary where the keys are motifs (nx subgraph) of length k and the values are how many times similar
+            (isomorphic) motifs occur in the graph.
+    plot_title: string that is the title of your plot.
+
+    Returns
+    -------
+    A bar chart figure of the most common motifs and how often they occurred.
     """
     motifs_sorted = sorted(motifs.items(), key=lambda kv: kv[1], reverse=True)
 

@@ -97,48 +107,5 @@ def visualize_motif_samples_bar_graph(motifs, plot_title='Motif Frequency in Dataset', motifs_to_show=8):
                 '{}% of Sampled Motifs are a Single Chain'.format(round(100 * single_chain_occurences / number_of_samples, 3)))
     return fig
 
-    # plt.savefig(output_file, pad_inches=2)
-    # plt.close()
-
-
-# def visualize_motif_samples(motifs, output_file):
-#     """
-#     Given a sample of motifs, output a file with their graphs and how often they occurred.
-#
-#     :param motifs: a dictionary where the keys are motifs (nx subgraph) of length k and the keys are how many times similar
-#     (isomorphic) motifs occur in the graph.
-#     :param output_file: string thats apath of a pdf file to output the graphs to
-#     :return: a pdf file with name output_file with the graphs and how often they occured
-#     """
-#     motif_count = sum(motifs.values())
-#     motifs_sorted = sorted(motifs.items(), key=lambda kv: kv[1], reverse=True)
-#     with PdfPages(output_file) as pdf:
-#         for motif in motifs_sorted:
-#             fig = plt.figure()
-#             nx.draw_kamada_kawai(motif[0], node_size=25, arrowsize=5)
-#             fig.suptitle('{} Occurrences ({}%)'.format(motif[1], round(100 * motif[1] / motif_count, 3)))
-#             pdf.savefig(fig)
-#             plt.close()
-#
-
 if __name__ == '__main__':
     main()
-
-# try:
-#     makedirs('results/clustering_{}'.format(output_folder_suffix)) # make output folder
-# except FileExistsError:
-#     print('About to overwrite existing output folder and files...')
-#     #TODO: Have user have to type 'y' or something continue, then also delete all files in folder so theres not like one cluster left over from before.
-
-
-
-
-# cluster_visual = visualize_motif_samples_bar_graph(motifs, 'Cluster ' + str(cluster), number_of_samples)
-# pdf.savefig(cluster_visual,pad_inches=2)
-# #visualize_motif_samples(motifs, './results/clustering_{}/cluster_{}.pdf'.format(output_folder_suffix,cluster))
-#
-
-
-# # Sort keys in cluster dictionary so they are outputted
-# sorted_cluster_keys = list(clusters.keys())
-# sorted_cluster_keys.sort()
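The core of visualize_motif_samples_bar_graph, per its docstring, is ranking motifs by frequency and charting the top few. A simplified sketch; the helper name and plotting details are illustrative, not the function's actual body:

    import matplotlib.pyplot as plt

    def motif_frequency_bar(motifs, plot_title='Motif Frequency in Dataset', motifs_to_show=8):
        # Rank motifs by how often they occurred, most frequent first.
        motifs_sorted = sorted(motifs.items(), key=lambda kv: kv[1], reverse=True)
        top = motifs_sorted[:motifs_to_show]
        fig, ax = plt.subplots()
        ax.bar(range(len(top)), [count for _, count in top])
        ax.set_xticks(range(len(top)))
        ax.set_xticklabels(['motif {}'.format(i + 1) for i in range(len(top))])
        ax.set_ylabel('Occurrences')
        ax.set_title(plot_title)
        return fig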

src/github_analysis/graph2vec.py

Lines changed: 37 additions & 24 deletions
@@ -57,12 +57,17 @@ def extract_features(self, projectGraphs):
         return document_collections
 
     def feature_extractor(self, graph, rounds, name):
-        """
-        Function to extract WL features from a graph.
-        :param graph: The nx graph.
-        :param rounds: Number of WL iterations.
-        :param name: ProjectId to output
-        :return doc: Document collection object.
+        """ Function to extract WL features from a graph.
+
+        Parameters
+        ----------
+        graph: The nx graph.
+        rounds: Number of WL iterations.
+        name: ProjectId to output.
+
+        Returns
+        -------
+        doc: Document collection object.
         """
         features = nx.degree(graph)
         features = {int(k):v for k,v, in features}
@@ -73,9 +78,11 @@ def feature_extractor(self, graph, rounds, name):
         return doc
 
     def get_embeddings(self, n_graphs):
-        """
-        Function to get embeddings from the model.
-        :param n_graphs: The number of graphs used to train the model.
+        """ Function to get embeddings from the model.
+
+        Parameters
+        ----------
+        n_graphs: The number of graphs used to train the model.
         """
         if not self.fitted:
             print("Model has not been fit, run Graph2Vec.fit() before getting embeddings")
@@ -91,11 +98,13 @@ def get_embeddings(self, n_graphs):
         return out
 
     def save_embeddings(self, n_graphs, output_path='./results/embeddings.csv', projectGraphsIndex=None):
-        """
-        Function to save the embedding.
-        :param output_path: Path to the embedding csv.
-        :param n_graphs: The number of graphs used to train the model.
-        :param dimensions: The embedding dimension parameter.
+        """ Function to save the embedding.
+
+        Parameters
+        ----------
+        output_path: Path to the embedding csv.
+        n_graphs: The number of graphs used to train the model.
+        dimensions: The embedding dimension parameter.
         """
         if not self.fitted:
             print("Model has not been fit, run Graph2Vec.fit() before saving embeddings")
@@ -113,11 +122,13 @@ class WeisfeilerLehmanMachine:
     Weisfeiler Lehman feature extractor class.
     """
     def __init__(self, graph, features, iterations):
-        """
-        Initialization method which executes feature extraction.
-        :param graph: The Nx graph object.
-        :param features: Feature hash table.
-        :param iterations: Number of WL iterations.
+        """ Initialization method which executes feature extraction.
+
+        Parameters
+        ----------
+        graph: The Nx graph object.
+        features: Feature hash table.
+        iterations: Number of WL iterations.
         """
         self.iterations = iterations
         self.graph = graph
@@ -127,9 +138,11 @@ def __init__(self, graph, features, iterations):
         self.do_recursions()
 
     def do_a_recursion(self):
-        """
-        The method does a single WL recursion.
-        :return new_features: The hash table with extracted WL features.
+        """ The method does a single WL recursion.
+
+        Returns
+        -------
+        new_features: The hash table with extracted WL features.
         """
         new_features = {}
         for node in self.nodes:
@@ -144,8 +157,8 @@ def do_a_recursion(self):
         return new_features
 
     def do_recursions(self):
-        """
-        The method does a series of WL recursions.
+        """ The method does a series of WL recursions.
+
         """
         for iteration in range(self.iterations):
             self.features = self.do_a_recursion()
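For readers unfamiliar with Weisfeiler-Lehman relabelling, a self-contained sketch of the single round that do_a_recursion documents; the md5 hashing and the helper name wl_round are assumptions for illustration, not necessarily the class's internals:

    import hashlib
    import networkx as nx

    def wl_round(graph, features):
        # Each node's new feature hashes its own label together with its
        # neighbours' sorted labels, so structurally identical neighbourhoods
        # collapse to the same label.
        new_features = {}
        for node in graph.nodes():
            neighbor_labels = sorted(str(features[n]) for n in graph.neighbors(node))
            combined = str(features[node]) + '_' + '_'.join(neighbor_labels)
            new_features[node] = hashlib.md5(combined.encode()).hexdigest()
        return new_features

    # Initial features are node degrees, as feature_extractor above sets up.
    g = nx.path_graph(4)
    feats = {int(k): v for k, v in nx.degree(g)}
    feats = wl_round(g, feats)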
