UBC-MDS · huijuechen · Jun 12, 2019 · Jun 12, 2019 · Jun 12, 2019 · Jun 14, 2019
diff --git a/results/motifs_by_cluster_10.pickle b/results/motifs_by_cluster_10.pickle
diff --git a/results/motifs_by_cluster_20.pickle b/results/motifs_by_cluster_20.pickle
diff --git a/results/motifs_by_cluster_30.pickle b/results/motifs_by_cluster_30.pickle
diff --git a/results/motifs_by_cluster_5.pickle b/results/motifs_by_cluster_5.pickle
diff --git a/results/motifs_by_cluster_6.pickle b/results/motifs_by_cluster_6.pickle
diff --git a/results/motifs_by_cluster_7.pickle b/results/motifs_by_cluster_7.pickle
diff --git a/results/motifs_by_cluster_8.pickle b/results/motifs_by_cluster_8.pickle
diff --git a/results/motifs_by_cluster_9.pickle b/results/motifs_by_cluster_9.pickle
diff --git a/src/github_analysis/main.py b/src/github_analysis/main.py
@@ -15,6 +15,7 @@
 import motif_finder as mf
 import freq_graph as fg
 import persona as p
+import motif_merge as mm
 import nxutils
 
 import pandas as pd
@@ -61,10 +62,15 @@ def main(n_projects, n_workers, data_path, results_path, min_commits, n_personas
     personaGenerationTime = time.time()
     logging.info("Personas Generated: " + str(personaGenerationTime - projectClusterTime) + " seconds")
 
-    motifs_by_cluster = mf.get_motifs_by_cluster(clusters, commits_dl, output_file=results_path + "motifs_by_cluster.pickle")
+    motif_k_list = [5,6,7,8,9,10,20,30]
+    for i in motif_k_list:
+        output_file_name = 'motifs_by_cluster_%s.pickle' %i
+        motifs_by_cluster = mf.get_motifs_by_cluster(clusters, commits_dl, k_for_motifs=i, number_of_samples=1000, output_file= results_path + output_file_name)
     motifTime = time.time()
     logging.info("Motifs Generated: " + str(motifTime - personaGenerationTime) + " seconds")
 
+    clustering_of_motif = mm.(motif_k_list, clusters, n_dimensions=4, epochs=3, workers=2, iter=4, output_file_path = results_path + "clustering_of_motif.pickle", k_for_clustering=10)
+
     fg.generate_motif_visualisations_by_cluster(input_file_motif_clusters=results_path + "motifs_by_cluster.pickle", output_file=results_path + "clustering_output.pdf")
     freqGraphTime = time.time()
     logging.info("Frequency Graphs Built: " + str(freqGraphTime- motifTime) + " seconds")

diff --git a/src/github_analysis/motif_merge.py b/src/github_analysis/motif_merge.py
@@ -3,9 +3,10 @@
 import graph2vec as g2v
 import motif_finder as mf
 import pickle
+import operator
 n_dimensions = 128
 
-def motif_merging(input_file_motif_clusters='motifs_by_cluster.pickle', k_for_clustering=10):
+def motif_merging_per_cluster(motif_k_list, cluster_id, n_dimensions=4, epochs=3, workers=2, iter=4, input_file_path = "", k_for_clustering=10):
     """ Group similar motifs together and add up their frequencies.
 
         Parameters
@@ -16,28 +17,52 @@ def motif_merging(input_file_motif_clusters='motifs_by_cluster.pickle', k_for_cl
 
         Returns
         -------
-        Groups of similar motifs(noted by their index from the embedding file) and frequencies of each group
+        Groups of similar motifs(each group represented by its most frequent motif) and frequencies of each group
     """
 
-    with open(input_file_motif_clusters, 'rb') as pickle_in:
-        motifs_by_cluster = pickle.load(pickle_in)
+    motif_dict = {}
+    for i in motif_k_list:
+        input_file_name = input_file_path + 'motifs_by_cluster_%s.pickle' %i
+        with open(input_file_name, 'rb') as pickle_in:
+            motifs_by_cluster = pickle.load(pickle_in)
+        motif_dict_k = motifs_by_cluster[cluster_id]
+        motif_dict.update(motif_dict_k)
 
-    motif_dict = motifs_by_cluster[cluster_id]
+    motif_list = list(motif_dict.keys())
+    freq_list = list(motif_dict.values())
+    motif_index = range(len(list(motif_dict.keys())))
+
+    motif_index_dict = {}
+    for i in motif_index:
+        motif_index_dict[i]=motif_list[i]
 
     freq_by_motif = {}
-    freq_list = list(motif_dict.values())
     for i in range(0,len(freq_list)):
         freq_by_motif[i] = freq_list[i]
 
-
-    m2vModel = g2v.Graph2Vec(size=n_dimensions)
-    m2vModel = m2vModel.fit_transform(list(motif_dict.keys()), output_path='./results/motif_embeddings.csv')
-    #m2vModel.save_embeddings(len(motif_dict), output_path='./results/motif_embeddings.csv')
-    clusters_of_motif = mf.get_embedding_clusters(embedding_input_file='./results/motif_embeddings.csv', k_for_clustering=k_for_clustering, random_state=None,
-                           output_file='./results/clusters_of_motif.pickle')
+    m2vModel = g2v.Graph2Vec(size=4, epochs=3, workers=2, iter=4)
+    m2vModel = m2vModel.fit_transform(list(motif_dict.keys()),projectGraphsIndex=motif_index, output_path='motif_embeddings_20.csv')
+    clusters_of_motif = mf.get_embedding_clusters(embedding_input_file='motif_embeddings_20.csv', k_for_clustering=k_for_clustering, random_state=None,
+                           output_file='clusters_of_motif_20.pickle')
 
     freq_by_clusters = {}
     for cluster in clusters_of_motif:
-        freq_by_clusters[cluster] = sum(freq_by_motif[i] for i in clusters_of_motif[cluster])
+        cluster_freq = {k: freq_by_motif[k] for k in clusters_of_motif[cluster]}
+        max_motif_index = max(cluster_freq.items(), key=operator.itemgetter(1))[0]
+        max_motif = motif_index_dict[max_motif_index]
+        freq_by_clusters[max_motif] = sum(freq_by_motif[i] for i in clusters_of_motif[cluster])
+
+    return freq_by_clusters
+
+def motif_merging(motif_k_list, clusters, n_dimensions=4, epochs=3, workers=2, iter=4, input_file_path = "", output_file_path = "motifs_by_cluster.pickle", k_for_clustering=10):
+    """ Merge similar motifs for every cluster and optionally pickle the result.
+
+        Parameters
+        ----------
+        motif_k_list: Motif sizes; forwarded to motif_merging_per_cluster, which reads one 'motifs_by_cluster_<k>.pickle' file per entry.
+        clusters: Iterable of cluster ids (iterating a dict yields its keys); each id is forwarded as `cluster_id` and used as a key of the result.
+        n_dimensions, epochs, workers, iter: Embedding settings, forwarded unchanged to motif_merging_per_cluster.
+        input_file_path: Prefix prepended to the input pickle file names by motif_merging_per_cluster.
+        output_file_path: File the merged result is pickled to; pass None to skip writing.
+        k_for_clustering: Number of motif groups per cluster, forwarded unchanged to motif_merging_per_cluster.
+
+        Returns
+        -------
+        Dictionary mapping each cluster id to the value returned by motif_merging_per_cluster for that cluster.
+    """
+    motif_clustering = {}
+    for cluster in clusters:
+        # Store the result straight into the dictionary: binding it to a local
+        # named `motif_merging_per_cluster` would shadow the function itself and
+        # raise UnboundLocalError on the call. The loop variable `cluster` is the
+        # id to merge (the previously passed `cluster_id` was never defined here).
+        motif_clustering[cluster] = motif_merging_per_cluster(
+            motif_k_list,
+            cluster,
+            n_dimensions=n_dimensions,
+            epochs=epochs,
+            workers=workers,
+            iter=iter,
+            input_file_path=input_file_path,
+            k_for_clustering=k_for_clustering,
+        )
+
+    if output_file_path is not None:
+        with open(output_file_path, 'wb') as output:
+            pickle.dump(motif_clustering, output)
+        # NOTE(review): relies on a module-level `import logging` that is not
+        # visible in this hunk -- confirm it exists at the top of the file.
+        logging.info('Cluster file outputted!')
 
-    return clusters_of_motif, freq_by_clusters
+    return motif_clustering