
Commit 9cef81d

Merge pull request #89 from UBC-MDS/improve_reproducib
Improve reproducib
2 parents 6a97c48 + fcb32fc commit 9cef81d

File tree

9 files changed: +223 -196 lines changed
Binary file changed (-34 Bytes); binary file not shown.

src/github_analysis/cluster.py

Lines changed: 12 additions & 8 deletions
@@ -7,14 +7,18 @@
 
 def get_embedding_clusters(embedding_input_file='./results/embeddings.csv', k_for_clustering=10, random_state=None,
                            output_file='./results/clusters.pickle'):
-    """
-    Given a file with embeddings (or other features) cluster similar rows together using kmeans.
-
-    :param embedding_input_file: file where every row is a project and every col a feature
-    :param k_for_clustering: how many groups to cluster
-    :param random_state: random state for clustering algo
-    :param output_file: string with the filename to output the results to as a pickle. If this param is set to None no file will be outputted.
-    :return: a dictionary where the keys are the cluster labels and the values are lists of GitHub projectIds that fall in that cluster.
+    """ Given a file with embeddings (or other features) cluster similar rows together using kmeans.
+
+    Parameters
+    ----------
+    embedding_input_file: file where every row is a project and every col a feature.
+    k_for_clustering: how many groups to cluster into.
+    random_state: random state for clustering algo.
+    output_file: string with the filename to output the results to as a pickle. If this param is set to None no file will be outputted.
+
+    Returns
+    -------
+    a dictionary where the keys are the cluster labels and the values are lists of GitHub projectIds that fall in that cluster.
     """
     embeddings = pd.read_csv(embedding_input_file, index_col=0)
 
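For readers of this PR, a minimal usage sketch of the function documented above (the import path and all values are illustrative, not part of the diff; main.py aliases this module as `c`):

    # Hypothetical call, mirroring how main.py drives the clustering step.
    import github_analysis.cluster as c  # import path assumed for illustration

    clusters = c.get_embedding_clusters(
        embedding_input_file='./results/embeddings.csv',   # one row per project, one column per feature
        k_for_clustering=10,                                # number of k-means groups
        random_state=1,                                     # fixed seed keeps the clustering reproducible
        output_file='./results/clusters.pickle')            # set to None to skip writing the pickle
    # clusters maps each cluster label to a list of GitHub projectIds in that cluster.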

src/github_analysis/data_layer.py

Lines changed: 3 additions & 0 deletions
@@ -8,6 +8,9 @@ def getUniqueProjectNamesFromDf(df):
     return df.project_name.unique()
 
 class data_layer:
+    """ Read the feather file of commit history data, and group commits per project ID.
+    It provides the option for filtering the projects with a minimum number of commits.
+    """
     def __init__(self, data_path, min_number_commits=None):
         self.data_path = data_path
         self.commits_df = pd.read_feather(data_path)
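A short sketch of how the class documented above is used downstream (the calls mirror the main.py changes in this same PR; the data path and import path are placeholders):

    import github_analysis.data_layer as dl  # module alias as used in main.py; exact import path assumed

    commits_dl = dl.data_layer('./commits.feather', min_number_commits=10)   # drop projects with too few commits
    project_data = commits_dl.getRandomProjects(1000, 1)                     # sample 1000 projects with a fixed seed
    project_ids = dl.getUniqueProjectIdsFromDf(project_data)
    project_groups = commits_dl.getGroupedCommitsByProjectIds(project_ids)   # commits grouped per project ID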

src/github_analysis/freq_graph.py

Lines changed: 26 additions & 58 deletions
@@ -10,11 +10,18 @@
 
 
 def generate_motif_visualisations_by_cluster(input_file_motif_clusters='./results/motifs_by_cluster.pickle', output_file='./results/clustering_output.pdf'):
-    """
-    :param input_file_motif_clusters: string with a filepath to a pickled dictionary where the keys are cluster names and the values
-        are dictionaries where the keys are motifs (nx subgraph) of length k and the values are how many times similar
-        (isomorphic) motifs occur in the graph.
-    :param output_file: string thats a path of a pdf file to output the graphs to
+    """ Visualize the motif clustering result and output the visualization to a pdf file.
+
+    Parameters
+    ----------
+    input_file_motif_clusters: string with a filepath to a pickled dictionary where the keys are cluster names and the values
+                               are dictionaries where the keys are motifs (nx subgraph) of length k and the values are how
+                               many times similar (isomorphic) motifs occur in the graph.
+    output_file: string that is a path of a pdf file to output the graphs to.
+
+    Returns
+    -------
+    Visualization result of motif clustering, by cluster, saved down to a pdf file.
     """
     with open(input_file_motif_clusters, 'rb') as pickle_in:
         motif_clusters = pickle.load(pickle_in)
@@ -30,21 +37,25 @@ def generate_motif_visualisations_by_cluster(input_file_motif_clusters='./result
 
 
 def visualize_motif_samples_bar_graph(motifs, plot_title='Motif Frequency in Dataset', motifs_to_show=8):
-    """
-    Given a collection of motifs and their frequency in a graph, output a file with a bar chart showing the motifs and
-    their associated frequencies.
-
-    :param motifs: dictionary where the keys are motifs (nx subgraph) of length k and the values are how many times similar
-        (isomorphic) motifs occur in the graph.
-    :param plot_title: string thats the tile of your plot.
-    :return: fig that is a bar chart of the most common motifs and how often they occurred
-
+    """ Given a collection of motifs and their frequency in a graph, output a file with a bar chart showing the motifs and
+    their associated frequencies.
+
+    Parameters
+    ----------
+    motifs: dictionary where the keys are motifs (nx subgraph) of length k and the values are how many times similar
+            (isomorphic) motifs occur in the graph.
+    plot_title: string that is the title of your plot.
+
+    Returns
+    -------
+    A bar chart figure of the most common motifs and how often they occurred.
     """
     motifs_sorted = sorted(motifs.items(), key=lambda kv: kv[1], reverse=True)
-
+    single_chain_occurences = 0
     # output files with individual motif images to be used in bar graph
     occurrences = []
     for n, motif in enumerate(motifs_sorted):
+
         # print(motif[1])
         # nx.draw_spectral(motif[0], node_size=500, arrowsize=40, width=6)
         # plt.show()
@@ -97,48 +108,5 @@ def visualize_motif_samples_bar_graph(motifs, plot_title='Motif Frequency in Dat
                  '{}% of Sampled Motifs are a Single Chain'.format(round(100 * single_chain_occurences / number_of_samples, 3)))
     return fig
 
-    # plt.savefig(output_file, pad_inches=2)
-    # plt.close()
-
-
-# def visualize_motif_samples(motifs, output_file):
-#     """
-#     Given a sample of motifs, output a file with their graphs and how often they occurred.
-#
-#     :param motifs: a dictionary where the keys are motifs (nx subgraph) of length k and the keys are how many times similar
-#     (isomorphic) motifs occur in the graph.
-#     :param output_file: string thats apath of a pdf file to output the graphs to
-#     :return: a pdf file with name output_file with the graphs and how often they occured
-#     """
-#     motif_count = sum(motifs.values())
-#     motifs_sorted = sorted(motifs.items(), key=lambda kv: kv[1], reverse=True)
-#     with PdfPages(output_file) as pdf:
-#         for motif in motifs_sorted:
-#             fig = plt.figure()
-#             nx.draw_kamada_kawai(motif[0], node_size=25, arrowsize=5)
-#             fig.suptitle('{} Occurrences ({}%)'.format(motif[1], round(100 * motif[1] / motif_count, 3)))
-#             pdf.savefig(fig)
-#             plt.close()
-#
-
 if __name__ == '__main__':
     main()
-
-
-# try:
-#     makedirs('results/clustering_{}'.format(output_folder_suffix)) # make output folder
-# except FileExistsError:
-#     print('About to overwrite existing output folder and files...')
-#     #TODO: Have user have to type 'y' or something continue, then also delete all files in folder so theres not like one cluster left over from before.
-
-
-
-
-# cluster_visual = visualize_motif_samples_bar_graph(motifs, 'Cluster ' + str(cluster), number_of_samples)
-# pdf.savefig(cluster_visual,pad_inches=2)
-# #visualize_motif_samples(motifs, './results/clustering_{}/cluster_{}.pdf'.format(output_folder_suffix,cluster))
-#
-
-
-# # Sort keys in cluster dictionary so they are outputted
-# sorted_cluster_keys = list(clusters.keys())
-# sorted_cluster_keys.sort()
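As a reference for reviewers, a hedged sketch of calling the two public functions changed above (file paths are placeholders and the import path is assumed; the dictionaries must already exist on disk or in memory):

    import github_analysis.freq_graph as freq_graph  # import path assumed for illustration

    # Render every cluster's motif-frequency bar chart into one pdf.
    freq_graph.generate_motif_visualisations_by_cluster(
        input_file_motif_clusters='./results/motifs_by_cluster.pickle',
        output_file='./results/clustering_output.pdf')

    # Or build a single bar chart from a {motif_subgraph: occurrence_count} dict already in memory:
    # fig = freq_graph.visualize_motif_samples_bar_graph(motifs, plot_title='Motif Frequency in Dataset')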

src/github_analysis/graph2vec.py

Lines changed: 37 additions & 24 deletions
@@ -57,12 +57,17 @@ def extract_features(self, projectGraphs):
         return document_collections
 
     def feature_extractor(self, graph, rounds, name):
-        """
-        Function to extract WL features from a graph.
-        :param graph: The nx graph.
-        :param rounds: Number of WL iterations.
-        :param name: ProjectId to output
-        :return doc: Document collection object.
+        """ Function to extract WL features from a graph.
+
+        Parameters
+        ----------
+        graph: The nx graph.
+        rounds: Number of WL iterations.
+        name: ProjectId to output.
+
+        Returns
+        -------
+        doc: Document collection object.
         """
         features = nx.degree(graph)
         features = {int(k):v for k,v, in features}
@@ -73,9 +78,11 @@ def feature_extractor(self, graph, rounds, name):
         return doc
 
     def get_embeddings(self, n_graphs):
-        """
-        Function to get embeddings from the model.
-        :param n_graphs: The number of graphs used to train the model.
+        """ Function to get embeddings from the model.
+
+        Parameters
+        ----------
+        n_graphs: The number of graphs used to train the model.
         """
         if not self.fitted:
             print("Model has not been fit, run Graph2Vec.fit() before getting embeddings")
@@ -91,11 +98,13 @@ def get_embeddings(self, n_graphs):
         return out
 
     def save_embeddings(self, n_graphs, output_path='./results/embeddings.csv', projectGraphsIndex=None):
-        """
-        Function to save the embedding.
-        :param output_path: Path to the embedding csv.
-        :param n_graphs: The number of graphs used to train the model.
-        :param dimensions: The embedding dimension parameter.
+        """ Function to save the embedding.
+
+        Parameters
+        ----------
+        output_path: Path to the embedding csv.
+        n_graphs: The number of graphs used to train the model.
+        dimensions: The embedding dimension parameter.
         """
         if not self.fitted:
             print("Model has not been fit, run Graph2Vec.fit() before saving embeddings")
@@ -113,11 +122,13 @@ class WeisfeilerLehmanMachine:
     Weisfeiler Lehman feature extractor class.
     """
     def __init__(self, graph, features, iterations):
-        """
-        Initialization method which executes feature extraction.
-        :param graph: The Nx graph object.
-        :param features: Feature hash table.
-        :param iterations: Number of WL iterations.
+        """ Initialization method which executes feature extraction.
+
+        Parameters
+        ----------
+        graph: The Nx graph object.
+        features: Feature hash table.
+        iterations: Number of WL iterations.
         """
         self.iterations = iterations
         self.graph = graph
@@ -127,9 +138,11 @@ def __init__(self, graph, features, iterations):
         self.do_recursions()
 
     def do_a_recursion(self):
-        """
-        The method does a single WL recursion.
-        :return new_features: The hash table with extracted WL features.
+        """ The method does a single WL recursion.
+
+        Returns
+        -------
+        new_features: The hash table with extracted WL features.
         """
         new_features = {}
         for node in self.nodes:
@@ -144,8 +157,8 @@ def do_a_recursion(self):
         return new_features
 
     def do_recursions(self):
-        """
-        The method does a series of WL recursions.
+        """ The method does a series of WL recursions.
+
         """
         for iteration in range(self.iterations):
            self.features = self.do_a_recursion()
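The docstrings above only name the WL recursion; the toy sketch below shows the general idea of one relabeling step (an illustration of the Weisfeiler-Lehman technique under assumed details, not the repository's exact implementation):

    import hashlib
    import networkx as nx

    def wl_relabel_once(graph, features):
        # Illustrative WL step: each node's new label hashes its own label
        # together with the sorted labels of its neighbors.
        new_features = {}
        for node in graph.nodes():
            neighbor_labels = sorted(str(features[nb]) for nb in graph.neighbors(node))
            combined = str(features[node]) + "_" + "_".join(neighbor_labels)
            new_features[node] = hashlib.md5(combined.encode()).hexdigest()
        return new_features

    g = nx.path_graph(4)                 # toy graph: 0-1-2-3
    labels = dict(nx.degree(g))          # initial features are node degrees, as in feature_extractor
    labels = wl_relabel_once(g, labels)  # one recursion produces refined structural labels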

src/github_analysis/main.py

Lines changed: 32 additions & 47 deletions
@@ -27,46 +27,31 @@ def main(args):
 
     commits_dl = dl.data_layer(args.data_path, min_number_commits=args.min_commits)
 
-    project_data = commits_dl.getRandomProjects(args.n_projects, 1)
+    project_data = commits_dl.getRandomProjects(args.n_projects, args.random_state)
     getDataTime = time.time()
-
     logging.info("Query Complete: " + str(getDataTime - startTime) + " seconds")
 
+    project_ids = dl.getUniqueProjectIdsFromDf(project_data)
+    project_groups = commits_dl.getGroupedCommitsByProjectIds(project_ids)
+
+    project_graphs = []
+    project_ids_ordered = []
+    for name, group in project_groups:
+        project_graphs.append(nxutils.git_graph(group))
+        project_ids_ordered.append(name)
+
+    generateGraphsTime = time.time()
+    logging.info("NxGraphs Built: " + str(generateGraphsTime - getDataTime) + " seconds")
 
-    for iter in [2, 3, 4, 5, 6, 7, 8, 9, 10]:
-        embeddings_path = None
-        if args.embeddings_file_path is None: # If embeddings not specified, generate the model and set the path to the output embeddings
-            project_ids = dl.getUniqueProjectIdsFromDf(project_data)
-            project_groups = commits_dl.getGroupedCommitsByProjectIds(project_ids)
-
-            project_graphs = []
-            project_ids_ordered = []
-            for name, group in project_groups:
-                project_graphs.append(nxutils.git_graph(group))
-                project_ids_ordered.append(name)
-
-            # with open("project_graphs.pkl", 'w') as f:
-            #     pickle.dump(project_graphs, f)
-            #
-            # with open("project_ids_ordered.pkl", 'w') as f:
-            #     pickle.dump(project_ids_ordered, f)
-
-            generateGraphsTime = time.time()
-            logging.info("NxGraphs Built: " + str(generateGraphsTime - getDataTime) + " seconds")
-
-            embeddings_path = args.results_path + "embeddings.csv"
-            g2vModel = g2v.Graph2Vec(workers=args.n_workers, size=args.n_neurons, min_count=args.min_count, iter=iter, seed=args.random_state)
-            g2vEmbeddings = g2vModel.fit_transform(project_graphs, project_ids_ordered, output_path=embeddings_path)
-            buildModelTime = time.time()
-            logging.info("G2V Model Built: " + str(buildModelTime - generateGraphsTime) + " seconds")
-        else:
-            embeddings_path = args.embeddings_file_path
-            generateGraphsTime = time.time()
-            buildModelTime = time.time()
-
-        red.reduce_dim(workers=args.n_workers, output_path=args.results_path + str(iter) + "/", input_path=embeddings_path, random_state=args.random_state)
-        reduceTime = time.time()
-        logging.info("Dims Reduced: " + str(reduceTime - buildModelTime) + " seconds")
+    embeddings_path = args.results_path + "embeddings.csv"
+    g2vModel = g2v.Graph2Vec(workers=args.n_workers, size=args.n_neurons, min_count=args.min_count, iter=args.n_iter, seed=args.random_state)
+    g2vEmbeddings = g2vModel.fit_transform(project_graphs, project_ids_ordered, output_path=embeddings_path)
+    buildModelTime = time.time()
+    logging.info("G2V Model Built: " + str(buildModelTime - generateGraphsTime) + " seconds")
+
+    red.reduce_dim(workers=args.n_workers, output_path=args.results_path, input_path=embeddings_path, random_state=args.random_state)
+    reduceTime = time.time()
+    logging.info("Dims Reduced: " + str(reduceTime - buildModelTime) + " seconds")
 
     clusters = c.get_embedding_clusters(embedding_input_file=embeddings_path, output_file=args.results_path + "clusters.pickle", random_state=args.random_state)
     projectClusterTime = time.time()
@@ -97,16 +82,16 @@ def main(args):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument("-rp", "--results_path", help="The folder to output results of the analysis. e.g. embeddings and plots", default="./results/")
-    parser.add_argument("-nw", "--n_workers", help="The number of workers to use when running the analysis.", default=8, type=int)
-    parser.add_argument("-dp", "--data_path", help="The path to the commits.feather file. e.g. /home/user/RStudio-Data-Repository/clean_data/commits.feather", default="./results/")
-    parser.add_argument("-np", "--n_projects", help="The number of projects to sample from the dataset.", default=1000, type=int)
-    parser.add_argument("-mc", "--min_commits", help="The minimum number of commits for a project to be included in the sample.", default=None, type=int)
-    parser.add_argument("-mcount", "--min_count", help="The min_count parameter for the graph2vec model.", default=5, type=int)
-    parser.add_argument("-nps", "--n_personas", help="The number of personas to extract from each cluster.", default=5, type=int)
-    parser.add_argument("-nn", "--n_neurons", help="The number of neurons to use for Graph2Vec (project level)", default=128, type=int)
-    parser.add_argument("-emb", "--embeddings_file_path", help="The file to read the embeddings from. Supplying this parameter skips training of the model.", default=None)
-    parser.add_argument("-rs", "--random_state", help="The random state to initalize all random states.", default=1, type=int)
+    parser.add_argument("-rp", "--results_path", help="The folder to output results of the analysis. e.g. embeddings and plots", default="./results/")
+    parser.add_argument("-nw", "--n_workers", help="The number of workers to use when running the analysis.", default=8, type=int)
+    parser.add_argument("-dp", "--data_path", help="The path to the commits.feather file. e.g. /home/user/RStudio-Data-Repository/clean_data/commits.feather", default="./results/")
+    parser.add_argument("-np", "--n_projects", help="The number of projects to sample from the dataset.", default=1000, type=int)
+    parser.add_argument("-mc", "--min_commits", help="The minimum number of commits for a project to be included in the sample.", default=None, type=int)
+    parser.add_argument("-mcount", "--min_count", help="The min_count parameter for the graph2vec model.", default=5, type=int)
+    parser.add_argument("-nps", "--n_personas", help="The number of personas to extract from each cluster.", default=5, type=int)
+    parser.add_argument("-nn", "--n_neurons", help="The number of neurons to use for Graph2Vec (project level)", default=128, type=int)
+    parser.add_argument("-ni", "--n_iter", help="The number of iterations to use to run the WeisfeilerLehmanMachine.", default=10, type=int)
+    parser.add_argument("-rs", "--random_state", help="The random state to initialize all random states.", default=1, type=int)
 
     args = parser.parse_args()
 
@@ -149,4 +134,4 @@ def main(args):
 # left join `ghtorrent-bq.ght.commit_parents` cp on (cp.commit_id = c.id)
 # where (p.id = """ + str(projectId) + """)
 # """
-#
+#
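With the hard-coded loop over iteration counts removed, a typical reproducible run of the pipeline would look roughly like this (paths are placeholders; the flags come from the argparse block above, and invoking the script directly rather than as a module is an assumption):

    python src/github_analysis/main.py \
        --data_path /path/to/commits.feather \
        --results_path ./results/ \
        --n_projects 1000 \
        --min_commits 10 \
        --n_iter 10 \
        --random_state 1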
