diff --git a/results/motifs_by_cluster_10.pickle b/results/motifs_by_cluster_10.pickle new file mode 100644 index 0000000..8748f03 Binary files /dev/null and b/results/motifs_by_cluster_10.pickle differ diff --git a/results/motifs_by_cluster_20.pickle b/results/motifs_by_cluster_20.pickle new file mode 100644 index 0000000..f962709 Binary files /dev/null and b/results/motifs_by_cluster_20.pickle differ diff --git a/results/motifs_by_cluster_30.pickle b/results/motifs_by_cluster_30.pickle new file mode 100644 index 0000000..54041df Binary files /dev/null and b/results/motifs_by_cluster_30.pickle differ diff --git a/results/motifs_by_cluster_5.pickle b/results/motifs_by_cluster_5.pickle new file mode 100644 index 0000000..64976a7 Binary files /dev/null and b/results/motifs_by_cluster_5.pickle differ diff --git a/results/motifs_by_cluster_6.pickle b/results/motifs_by_cluster_6.pickle new file mode 100644 index 0000000..a846582 Binary files /dev/null and b/results/motifs_by_cluster_6.pickle differ diff --git a/results/motifs_by_cluster_7.pickle b/results/motifs_by_cluster_7.pickle new file mode 100644 index 0000000..8209e53 Binary files /dev/null and b/results/motifs_by_cluster_7.pickle differ diff --git a/results/motifs_by_cluster_8.pickle b/results/motifs_by_cluster_8.pickle new file mode 100644 index 0000000..71bb11b Binary files /dev/null and b/results/motifs_by_cluster_8.pickle differ diff --git a/results/motifs_by_cluster_9.pickle b/results/motifs_by_cluster_9.pickle new file mode 100644 index 0000000..bcbb962 Binary files /dev/null and b/results/motifs_by_cluster_9.pickle differ diff --git a/src/github_analysis/main.py b/src/github_analysis/main.py index b056e8b..f293634 100644 --- a/src/github_analysis/main.py +++ b/src/github_analysis/main.py @@ -15,6 +15,7 @@ import motif_finder as mf import freq_graph as fg import persona as p +import motif_merge as mm import nxutils import pandas as pd @@ -61,10 +62,15 @@ def main(n_projects, n_workers, data_path, results_path, min_commits, n_personas personaGenerationTime = time.time() logging.info("Personas Generated: " + str(personaGenerationTime - projectClusterTime) + " seconds") - motifs_by_cluster = mf.get_motifs_by_cluster(clusters, commits_dl, output_file=results_path + "motifs_by_cluster.pickle") + motif_k_list = [5,6,7,8,9,10,20,30] + for i in motif_k_list: + output_file_name = 'motifs_by_cluster_%s.pickle' %i + motifs_by_cluster = mf.get_motifs_by_cluster(clusters, commits_dl, k_for_motifs=i, number_of_samples=1000, output_file= results_path + output_file_name) motifTime = time.time() logging.info("Motifs Generated: " + str(motifTime - personaGenerationTime) + " seconds") + clustering_of_motif = mm.(motif_k_list, clusters, n_dimensions=4, epochs=3, workers=2, iter=4, output_file_path = results_path + "clustering_of_motif.pickle", k_for_clustering=10) + fg.generate_motif_visualisations_by_cluster(input_file_motif_clusters=results_path + "motifs_by_cluster.pickle", output_file=results_path + "clustering_output.pdf") freqGraphTime = time.time() logging.info("Frequency Graphs Built: " + str(freqGraphTime- motifTime) + " seconds") diff --git a/src/github_analysis/motif_merge.py b/src/github_analysis/motif_merge.py index f896bc4..f886fa8 100644 --- a/src/github_analysis/motif_merge.py +++ b/src/github_analysis/motif_merge.py @@ -3,9 +3,10 @@ import graph2vec as g2v import motif_finder as mf import pickle +import operator n_dimensions = 128 -def motif_merging(input_file_motif_clusters='motifs_by_cluster.pickle', k_for_clustering=10): +def motif_merging_per_cluster(motif_k_list, cluster_id, n_dimensions=4, epochs=3, workers=2, iter=4, input_file_path = "", k_for_clustering=10): """ Group similar motifs together and add up their frequencies. Parameters @@ -16,28 +17,52 @@ def motif_merging(input_file_motif_clusters='motifs_by_cluster.pickle', k_for_cl Returns ------- - Groups of similar motifs(noted by their index from the embedding file) and frequencies of each group + Groups of similar motifs(each group represented by its most frequent motif) and frequencies of each group """ - with open(input_file_motif_clusters, 'rb') as pickle_in: - motifs_by_cluster = pickle.load(pickle_in) + motif_dict = {} + for i in motif_k_list: + input_file_name = input_file_path + 'motifs_by_cluster_%s.pickle' %i + with open(input_file_name, 'rb') as pickle_in: + motifs_by_cluster = pickle.load(pickle_in) + motif_dict_k = motifs_by_cluster[cluster_id] + motif_dict.update(motif_dict_k) - motif_dict = motifs_by_cluster[cluster_id] + motif_list = list(motif_dict.keys()) + freq_list = list(motif_dict.values()) + motif_index = range(len(list(motif_dict.keys()))) + + motif_index_dict = {} + for i in motif_index: + motif_index_dict[i]=motif_list[i] freq_by_motif = {} - freq_list = list(motif_dict.values()) for i in range(0,len(freq_list)): freq_by_motif[i] = freq_list[i] - - m2vModel = g2v.Graph2Vec(size=n_dimensions) - m2vModel = m2vModel.fit_transform(list(motif_dict.keys()), output_path='./results/motif_embeddings.csv') - #m2vModel.save_embeddings(len(motif_dict), output_path='./results/motif_embeddings.csv') - clusters_of_motif = mf.get_embedding_clusters(embedding_input_file='./results/motif_embeddings.csv', k_for_clustering=k_for_clustering, random_state=None, - output_file='./results/clusters_of_motif.pickle') + m2vModel = g2v.Graph2Vec(size=4, epochs=3, workers=2, iter=4) + m2vModel = m2vModel.fit_transform(list(motif_dict.keys()),projectGraphsIndex=motif_index, output_path='motif_embeddings_20.csv') + clusters_of_motif = mf.get_embedding_clusters(embedding_input_file='motif_embeddings_20.csv', k_for_clustering=k_for_clustering, random_state=None, + output_file='clusters_of_motif_20.pickle') freq_by_clusters = {} for cluster in clusters_of_motif: - freq_by_clusters[cluster] = sum(freq_by_motif[i] for i in clusters_of_motif[cluster]) + cluster_freq = {k: freq_by_motif[k] for k in clusters_of_motif[cluster]} + max_motif_index = max(cluster_freq.items(), key=operator.itemgetter(1))[0] + max_motif = motif_index_dict[max_motif_index] + freq_by_clusters[max_motif] = sum(freq_by_motif[i] for i in clusters_of_motif[cluster]) + + return freq_by_clusters + +def motif_merging(motif_k_list, clusters, n_dimensions=4, epochs=3, workers=2, iter=4, input_file_path = "", output_file_path = "motifs_by_cluster.pickle", k_for_clustering=10): + motif_clustering = {} + for cluster in clusters: + motif_merging_per_cluster = motif_merging_per_cluster(motif_k_list, cluster_id, n_dimensions=n_dimensions, epochs=epochs, workers=workers, iter=iter, input_file_path = input_file_path, k_for_clustering=k_for_clustering) + motif_clustering[cluster] = motif_merging_per_cluster + + if output_file_path is not None: + with open(output_file_path, 'wb') as output: + pickle.dump(motif_clustering, output) + logging.info('Cluster file outputted!') - return clusters_of_motif, freq_by_clusters + return motif_clustering