
Commit 9cef81d

Merge pull request #89 from UBC-MDS/improve_reproducib
Improve reproducib
2 parents 6a97c48 + fcb32fc commit 9cef81d

File tree

9 files changed: +223 -196 lines changed
Binary file changed (-34 Bytes); binary file not shown.

src/github_analysis/cluster.py

Lines changed: 12 additions & 8 deletions
@@ -7,14 +7,18 @@
 
 def get_embedding_clusters(embedding_input_file='./results/embeddings.csv', k_for_clustering=10, random_state=None,
                            output_file='./results/clusters.pickle'):
-    """
-    Given a file with embeddings (or other features) cluster similar rows together using kmeans.
-
-    :param embedding_input_file: file where every row is a project and every col a feature
-    :param k_for_clustering: how many groups to cluster
-    :param random_state: random state for clustering algo
-    :param output_file: string with the filename to output the results to as a pickle. If this param is set to None no file will be outputted.
-    :return: a dictionary where the keys are the cluster labels and the values are lists of GitHub projectIds that fall in that cluster.
+    """ Given a file with embeddings (or other features) cluster similar rows together using kmeans.
+
+    Parameters
+    ----------
+    embedding_input_file: file where every row is a project and every col a feature.
+    k_for_clustering: how many groups to cluster into.
+    random_state: random state for clustering algo.
+    output_file: string with the filename to output the results to as a pickle. If this param is set to None no file will be outputted.
+
+    Returns
+    -------
+    a dictionary where the keys are the cluster labels and the values are lists of GitHub projectIds that fall in that cluster.
     """
     embeddings = pd.read_csv(embedding_input_file, index_col=0)
 
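For readers of this PR, a minimal usage sketch of the function documented above (the import path and all values are illustrative, not part of the diff; main.py aliases this module as `c`):

    # Hypothetical call, mirroring how main.py drives the clustering step.
    import github_analysis.cluster as c  # import path assumed for illustration

    clusters = c.get_embedding_clusters(
        embedding_input_file='./results/embeddings.csv',   # one row per project, one column per feature
        k_for_clustering=10,                                # number of k-means groups
        random_state=1,                                     # fixed seed keeps the clustering reproducible
        output_file='./results/clusters.pickle')            # set to None to skip writing the pickle
    # clusters maps each cluster label to a list of GitHub projectIds in that cluster.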

src/github_analysis/data_layer.py

Lines changed: 3 additions & 0 deletions
@@ -8,6 +8,9 @@ def getUniqueProjectNamesFromDf(df):
     return df.project_name.unique()
 
 class data_layer:
+    """ Read the feather file of commit history data, and group commits per project ID.
+    It provides the option for filtering the projects with a minimum number of commits.
+    """
     def __init__(self, data_path, min_number_commits=None):
         self.data_path = data_path
         self.commits_df = pd.read_feather(data_path)
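A short sketch of how the class documented above is used downstream (the calls mirror the main.py changes in this same PR; the data path and import path are placeholders):

    import github_analysis.data_layer as dl  # module alias as used in main.py; exact import path assumed

    commits_dl = dl.data_layer('./commits.feather', min_number_commits=10)   # drop projects with too few commits
    project_data = commits_dl.getRandomProjects(1000, 1)                     # sample 1000 projects with a fixed seed
    project_ids = dl.getUniqueProjectIdsFromDf(project_data)
    project_groups = commits_dl.getGroupedCommitsByProjectIds(project_ids)   # commits grouped per project ID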

src/github_analysis/freq_graph.py

Lines changed: 26 additions & 58 deletions
@@ -10,11 +10,18 @@
 
 
 def generate_motif_visualisations_by_cluster(input_file_motif_clusters='./results/motifs_by_cluster.pickle', output_file='./results/clustering_output.pdf'):
-    """
-    :param input_file_motif_clusters: string with a filepath to a pickled dictionary where the keys are cluster names and the values
-        are dictionaries where the keys are motifs (nx subgraph) of length k and the values are how many times similar
-        (isomorphic) motifs occur in the graph.
-    :param output_file: string thats a path of a pdf file to output the graphs to
+    """ Visualize the motif clustering result and output the visualization to a pdf file.
+
+    Parameters
+    ----------
+    input_file_motif_clusters: string with a filepath to a pickled dictionary where the keys are cluster names and the values
+                               are dictionaries where the keys are motifs (nx subgraph) of length k and the values are how
+                               many times similar (isomorphic) motifs occur in the graph.
+    output_file: string that is a path of a pdf file to output the graphs to.
+
+    Returns
+    -------
+    Visualization result of motif clustering, by cluster, saved down to a pdf file.
     """
     with open(input_file_motif_clusters, 'rb') as pickle_in:
         motif_clusters = pickle.load(pickle_in)
@@ -30,21 +37,25 @@ def generate_motif_visualisations_by_cluster(input_file_motif_clusters='./result
 
 
 def visualize_motif_samples_bar_graph(motifs, plot_title='Motif Frequency in Dataset', motifs_to_show=8):
-    """
-    Given a collection of motifs and their frequency in a graph, output a file with a bar chart showing the motifs and
-    their associated frequencies.
-
-    :param motifs: dictionary where the keys are motifs (nx subgraph) of length k and the values are how many times similar
-        (isomorphic) motifs occur in the graph.
-    :param plot_title: string thats the tile of your plot.
-    :return: fig that is a bar chart of the most common motifs and how often they occurred
-
+    """ Given a collection of motifs and their frequency in a graph, output a file with a bar chart showing the motifs and
+    their associated frequencies.
+
+    Parameters
+    ----------
+    motifs: dictionary where the keys are motifs (nx subgraph) of length k and the values are how many times similar
+            (isomorphic) motifs occur in the graph.
+    plot_title: string that is the title of your plot.
+
+    Returns
+    -------
+    A bar chart figure of the most common motifs and how often they occurred.
     """
     motifs_sorted = sorted(motifs.items(), key=lambda kv: kv[1], reverse=True)
-
+    single_chain_occurences = 0
     # output files with individual motif images to be used in bar graph
     occurrences = []
     for n, motif in enumerate(motifs_sorted):
+
         # print(motif[1])
         # nx.draw_spectral(motif[0], node_size=500, arrowsize=40, width=6)
         # plt.show()
@@ -97,48 +108,5 @@ def visualize_motif_samples_bar_graph(motifs, plot_title='Motif Frequency in Dat
                  '{}% of Sampled Motifs are a Single Chain'.format(round(100 * single_chain_occurences / number_of_samples, 3)))
     return fig
 
-    # plt.savefig(output_file, pad_inches=2)
-    # plt.close()
-
-
-# def visualize_motif_samples(motifs, output_file):
-#     """
-#     Given a sample of motifs, output a file with their graphs and how often they occurred.
-#
-#     :param motifs: a dictionary where the keys are motifs (nx subgraph) of length k and the keys are how many times similar
-#     (isomorphic) motifs occur in the graph.
-#     :param output_file: string thats apath of a pdf file to output the graphs to
-#     :return: a pdf file with name output_file with the graphs and how often they occured
-#     """
-#     motif_count = sum(motifs.values())
-#     motifs_sorted = sorted(motifs.items(), key=lambda kv: kv[1], reverse=True)
-#     with PdfPages(output_file) as pdf:
-#         for motif in motifs_sorted:
-#             fig = plt.figure()
-#             nx.draw_kamada_kawai(motif[0], node_size=25, arrowsize=5)
-#             fig.suptitle('{} Occurrences ({}%)'.format(motif[1], round(100 * motif[1] / motif_count, 3)))
-#             pdf.savefig(fig)
-#             plt.close()
-#
-
 if __name__ == '__main__':
     main()
-
-
-# try:
-#     makedirs('results/clustering_{}'.format(output_folder_suffix)) # make output folder
-# except FileExistsError:
-#     print('About to overwrite existing output folder and files...')
-#     #TODO: Have user have to type 'y' or something continue, then also delete all files in folder so theres not like one cluster left over from before.
-
-
-
-
-# cluster_visual = visualize_motif_samples_bar_graph(motifs, 'Cluster ' + str(cluster), number_of_samples)
-# pdf.savefig(cluster_visual,pad_inches=2)
-# #visualize_motif_samples(motifs, './results/clustering_{}/cluster_{}.pdf'.format(output_folder_suffix,cluster))
-#
-
-
-# # Sort keys in cluster dictionary so they are outputted
-# sorted_cluster_keys = list(clusters.keys())
-# sorted_cluster_keys.sort()
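As a reference for reviewers, a hedged sketch of calling the two public functions changed above (file paths are placeholders and the import path is assumed; the dictionaries must already exist on disk or in memory):

    import github_analysis.freq_graph as freq_graph  # import path assumed for illustration

    # Render every cluster's motif-frequency bar chart into one pdf.
    freq_graph.generate_motif_visualisations_by_cluster(
        input_file_motif_clusters='./results/motifs_by_cluster.pickle',
        output_file='./results/clustering_output.pdf')

    # Or build a single bar chart from a {motif_subgraph: occurrence_count} dict already in memory:
    # fig = freq_graph.visualize_motif_samples_bar_graph(motifs, plot_title='Motif Frequency in Dataset')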

src/github_analysis/graph2vec.py

Lines changed: 37 additions & 24 deletions
@@ -57,12 +57,17 @@ def extract_features(self, projectGraphs):
         return document_collections
 
     def feature_extractor(self, graph, rounds, name):
-        """
-        Function to extract WL features from a graph.
-        :param graph: The nx graph.
-        :param rounds: Number of WL iterations.
-        :param name: ProjectId to output
-        :return doc: Document collection object.
+        """ Function to extract WL features from a graph.
+
+        Parameters
+        ----------
+        graph: The nx graph.
+        rounds: Number of WL iterations.
+        name: ProjectId to output.
+
+        Returns
+        -------
+        doc: Document collection object.
         """
         features = nx.degree(graph)
         features = {int(k):v for k,v, in features}
@@ -73,9 +78,11 @@ def feature_extractor(self, graph, rounds, name):
         return doc
 
     def get_embeddings(self, n_graphs):
-        """
-        Function to get embeddings from the model.
-        :param n_graphs: The number of graphs used to train the model.
+        """ Function to get embeddings from the model.
+
+        Parameters
+        ----------
+        n_graphs: The number of graphs used to train the model.
         """
         if not self.fitted:
             print("Model has not been fit, run Graph2Vec.fit() before getting embeddings")
@@ -91,11 +98,13 @@ def get_embeddings(self, n_graphs):
         return out
 
     def save_embeddings(self, n_graphs, output_path='./results/embeddings.csv', projectGraphsIndex=None):
-        """
-        Function to save the embedding.
-        :param output_path: Path to the embedding csv.
-        :param n_graphs: The number of graphs used to train the model.
-        :param dimensions: The embedding dimension parameter.
+        """ Function to save the embedding.
+
+        Parameters
+        ----------
+        output_path: Path to the embedding csv.
+        n_graphs: The number of graphs used to train the model.
+        dimensions: The embedding dimension parameter.
         """
         if not self.fitted:
             print("Model has not been fit, run Graph2Vec.fit() before saving embeddings")
@@ -113,11 +122,13 @@ class WeisfeilerLehmanMachine:
     Weisfeiler Lehman feature extractor class.
     """
     def __init__(self, graph, features, iterations):
-        """
-        Initialization method which executes feature extraction.
-        :param graph: The Nx graph object.
-        :param features: Feature hash table.
-        :param iterations: Number of WL iterations.
+        """ Initialization method which executes feature extraction.
+
+        Parameters
+        ----------
+        graph: The Nx graph object.
+        features: Feature hash table.
+        iterations: Number of WL iterations.
         """
         self.iterations = iterations
         self.graph = graph
@@ -127,9 +138,11 @@ def __init__(self, graph, features, iterations):
         self.do_recursions()
 
     def do_a_recursion(self):
-        """
-        The method does a single WL recursion.
-        :return new_features: The hash table with extracted WL features.
+        """ The method does a single WL recursion.
+
+        Returns
+        -------
+        new_features: The hash table with extracted WL features.
         """
         new_features = {}
         for node in self.nodes:
@@ -144,8 +157,8 @@ def do_a_recursion(self):
         return new_features
 
     def do_recursions(self):
-        """
-        The method does a series of WL recursions.
+        """ The method does a series of WL recursions.
+
         """
         for iteration in range(self.iterations):
            self.features = self.do_a_recursion()
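The docstrings above only name the WL recursion; the toy sketch below shows the general idea of one relabeling step (an illustration of the Weisfeiler-Lehman technique under assumed details, not the repository's exact implementation):

    import hashlib
    import networkx as nx

    def wl_relabel_once(graph, features):
        # Illustrative WL step: each node's new label hashes its own label
        # together with the sorted labels of its neighbors.
        new_features = {}
        for node in graph.nodes():
            neighbor_labels = sorted(str(features[nb]) for nb in graph.neighbors(node))
            combined = str(features[node]) + "_" + "_".join(neighbor_labels)
            new_features[node] = hashlib.md5(combined.encode()).hexdigest()
        return new_features

    g = nx.path_graph(4)                 # toy graph: 0-1-2-3
    labels = dict(nx.degree(g))          # initial features are node degrees, as in feature_extractor
    labels = wl_relabel_once(g, labels)  # one recursion produces refined structural labels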

src/github_analysis/main.py

Lines changed: 32 additions & 47 deletions
@@ -27,46 +27,31 @@ def main(args):
 
     commits_dl = dl.data_layer(args.data_path, min_number_commits=args.min_commits)
 
-    project_data = commits_dl.getRandomProjects(args.n_projects, 1)
+    project_data = commits_dl.getRandomProjects(args.n_projects, args.random_state)
     getDataTime = time.time()
-
     logging.info("Query Complete: " + str(getDataTime - startTime) + " seconds")
 
+    project_ids = dl.getUniqueProjectIdsFromDf(project_data)
+    project_groups = commits_dl.getGroupedCommitsByProjectIds(project_ids)
+
+    project_graphs = []
+    project_ids_ordered = []
+    for name, group in project_groups:
+        project_graphs.append(nxutils.git_graph(group))
+        project_ids_ordered.append(name)
+
+    generateGraphsTime = time.time()
+    logging.info("NxGraphs Built: " + str(generateGraphsTime - getDataTime) + " seconds")
 
-    for iter in [2, 3, 4, 5, 6, 7, 8, 9, 10]:
-        embeddings_path = None
-        if args.embeddings_file_path is None: # If embeddings not specified, generate the model and set the path to the output embeddings
-            project_ids = dl.getUniqueProjectIdsFromDf(project_data)
-            project_groups = commits_dl.getGroupedCommitsByProjectIds(project_ids)
-
-            project_graphs = []
-            project_ids_ordered = []
-            for name, group in project_groups:
-                project_graphs.append(nxutils.git_graph(group))
-                project_ids_ordered.append(name)
-
-            # with open("project_graphs.pkl", 'w') as f:
-            #     pickle.dump(project_graphs, f)
-            #
-            # with open("project_ids_ordered.pkl", 'w') as f:
-            #     pickle.dump(project_ids_ordered, f)
-
-            generateGraphsTime = time.time()
-            logging.info("NxGraphs Built: " + str(generateGraphsTime - getDataTime) + " seconds")
-
-            embeddings_path = args.results_path + "embeddings.csv"
-            g2vModel = g2v.Graph2Vec(workers=args.n_workers, size=args.n_neurons, min_count=args.min_count, iter=iter, seed=args.random_state)
-            g2vEmbeddings = g2vModel.fit_transform(project_graphs, project_ids_ordered, output_path=embeddings_path)
-            buildModelTime = time.time()
-            logging.info("G2V Model Built: " + str(buildModelTime - generateGraphsTime) + " seconds")
-        else:
-            embeddings_path = args.embeddings_file_path
-            generateGraphsTime = time.time()
-            buildModelTime = time.time()
-
-        red.reduce_dim(workers=args.n_workers, output_path=args.results_path + str(iter) + "/", input_path=embeddings_path, random_state=args.random_state)
-        reduceTime = time.time()
-        logging.info("Dims Reduced: " + str(reduceTime - buildModelTime) + " seconds")
+    embeddings_path = args.results_path + "embeddings.csv"
+    g2vModel = g2v.Graph2Vec(workers=args.n_workers, size=args.n_neurons, min_count=args.min_count, iter=args.n_iter, seed=args.random_state)
+    g2vEmbeddings = g2vModel.fit_transform(project_graphs, project_ids_ordered, output_path=embeddings_path)
+    buildModelTime = time.time()
+    logging.info("G2V Model Built: " + str(buildModelTime - generateGraphsTime) + " seconds")
+
+    red.reduce_dim(workers=args.n_workers, output_path=args.results_path, input_path=embeddings_path, random_state=args.random_state)
+    reduceTime = time.time()
+    logging.info("Dims Reduced: " + str(reduceTime - buildModelTime) + " seconds")
 
     clusters = c.get_embedding_clusters(embedding_input_file=embeddings_path, output_file=args.results_path + "clusters.pickle", random_state=args.random_state)
     projectClusterTime = time.time()
@@ -97,16 +82,16 @@ def main(args):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument("-rp", "--results_path", help="The folder to output results of the analysis. e.g. embeddings and plots", default="./results/")
-    parser.add_argument("-nw", "--n_workers", help="The number of workers to use when running the analysis.", default=8, type=int)
-    parser.add_argument("-dp", "--data_path", help="The path to the commits.feather file. e.g. /home/user/RStudio-Data-Repository/clean_data/commits.feather", default="./results/")
-    parser.add_argument("-np", "--n_projects", help="The number of projects to sample from the dataset.", default=1000, type=int)
-    parser.add_argument("-mc", "--min_commits", help="The minimum number of commits for a project to be included in the sample.", default=None, type=int)
-    parser.add_argument("-mcount", "--min_count", help="The min_count parameter for the graph2vec model.", default=5, type=int)
-    parser.add_argument("-nps", "--n_personas", help="The number of personas to extract from each cluster.", default=5, type=int)
-    parser.add_argument("-nn", "--n_neurons", help="The number of neurons to use for Graph2Vec (project level)", default=128, type=int)
-    parser.add_argument("-emb", "--embeddings_file_path", help="The file to read the embeddings from. Supplying this parameter skips training of the model.", default=None)
-    parser.add_argument("-rs", "--random_state", help="The random state to initalize all random states.", default=1, type=int)
+    parser.add_argument("-rp", "--results_path", help="The folder to output results of the analysis. e.g. embeddings and plots", default="./results/")
+    parser.add_argument("-nw", "--n_workers", help="The number of workers to use when running the analysis.", default=8, type=int)
+    parser.add_argument("-dp", "--data_path", help="The path to the commits.feather file. e.g. /home/user/RStudio-Data-Repository/clean_data/commits.feather", default="./results/")
+    parser.add_argument("-np", "--n_projects", help="The number of projects to sample from the dataset.", default=1000, type=int)
+    parser.add_argument("-mc", "--min_commits", help="The minimum number of commits for a project to be included in the sample.", default=None, type=int)
+    parser.add_argument("-mcount", "--min_count", help="The min_count parameter for the graph2vec model.", default=5, type=int)
+    parser.add_argument("-nps", "--n_personas", help="The number of personas to extract from each cluster.", default=5, type=int)
+    parser.add_argument("-nn", "--n_neurons", help="The number of neurons to use for Graph2Vec (project level)", default=128, type=int)
+    parser.add_argument("-ni", "--n_iter", help="The number of iterations to use to run the WeisfeilerLehmanMachine.", default=10, type=int)
+    parser.add_argument("-rs", "--random_state", help="The random state to initialize all random states.", default=1, type=int)
 
     args = parser.parse_args()
 
@@ -149,4 +134,4 @@ def main(args):
 # left join `ghtorrent-bq.ght.commit_parents` cp on (cp.commit_id = c.id)
 # where (p.id = """ + str(projectId) + """)
 # """
-#
+#
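With the hard-coded loop over iteration counts removed, a typical reproducible run of the pipeline would look roughly like this (paths are placeholders; the flags come from the argparse block above, and invoking the script directly rather than as a module is an assumption):

    python src/github_analysis/main.py \
        --data_path /path/to/commits.feather \
        --results_path ./results/ \
        --n_projects 1000 \
        --min_commits 10 \
        --n_iter 10 \
        --random_state 1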
