Skip to content

Commit 31a025f

Browse files
committed
update to steady main.py
1 parent 66b259a commit 31a025f

File tree

1 file changed

+32
-47
lines changed

1 file changed

+32
-47
lines changed

src/github_analysis/main.py

Lines changed: 32 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -27,46 +27,31 @@ def main(args):
2727

2828
commits_dl = dl.data_layer(args.data_path, min_number_commits=args.min_commits)
2929

30-
project_data = commits_dl.getRandomProjects(args.n_projects, 1)
30+
project_data = commits_dl.getRandomProjects(args.n_projects, args.random_state)
3131
getDataTime = time.time()
32-
3332
logging.info("Query Complete: " + str(getDataTime - startTime) + " seconds")
3433

34+
project_ids = dl.getUniqueProjectIdsFromDf(project_data)
35+
project_groups = commits_dl.getGroupedCommitsByProjectIds(project_ids)
36+
37+
project_graphs = []
38+
project_ids_ordered = []
39+
for name, group in project_groups:
40+
project_graphs.append(nxutils.git_graph(group))
41+
project_ids_ordered.append(name)
42+
43+
generateGraphsTime = time.time()
44+
logging.info("NxGraphs Built: " + str(generateGraphsTime - getDataTime) + " seconds")
3545

36-
for iter in [2, 3, 4, 5, 6, 7, 8, 9, 10]:
37-
embeddings_path = None
38-
if args.embeddings_file_path is None: # If embeddings not specified, generate the model and set the path to the output embeddings
39-
project_ids = dl.getUniqueProjectIdsFromDf(project_data)
40-
project_groups = commits_dl.getGroupedCommitsByProjectIds(project_ids)
41-
42-
project_graphs = []
43-
project_ids_ordered = []
44-
for name, group in project_groups:
45-
project_graphs.append(nxutils.git_graph(group))
46-
project_ids_ordered.append(name)
47-
48-
# with open("project_graphs.pkl", 'w') as f:
49-
# pickle.dump(project_graphs, f)
50-
#
51-
# with open("project_ids_ordered.pkl", 'w') as f:
52-
# pickle.dump(project_ids_ordered, f)
53-
54-
generateGraphsTime = time.time()
55-
logging.info("NxGraphs Built: " + str(generateGraphsTime - getDataTime) + " seconds")
56-
57-
embeddings_path = args.results_path + "embeddings.csv"
58-
g2vModel = g2v.Graph2Vec(workers=args.n_workers, size=args.n_neurons, min_count=args.min_count, iter=iter, seed=args.random_state)
59-
g2vEmbeddings = g2vModel.fit_transform(project_graphs, project_ids_ordered, output_path=embeddings_path)
60-
buildModelTime = time.time()
61-
logging.info("G2V Model Built: " + str(buildModelTime - generateGraphsTime) + " seconds")
62-
else:
63-
embeddings_path = args.embeddings_file_path
64-
generateGraphsTime = time.time()
65-
buildModelTime = time.time()
66-
67-
red.reduce_dim(workers=args.n_workers, output_path=args.results_path + str(iter) + "/", input_path=embeddings_path, random_state=args.random_state)
68-
reduceTime = time.time()
69-
logging.info("Dims Reduced: " + str(reduceTime - buildModelTime) + " seconds")
46+
embeddings_path = args.results_path + "embeddings.csv"
47+
g2vModel = g2v.Graph2Vec(workers=args.n_workers, size=args.n_neurons, min_count=args.min_count, iter=args.n_iter, seed=args.random_state)
48+
g2vEmbeddings = g2vModel.fit_transform(project_graphs, project_ids_ordered, output_path=embeddings_path)
49+
buildModelTime = time.time()
50+
logging.info("G2V Model Built: " + str(buildModelTime - generateGraphsTime) + " seconds")
51+
52+
red.reduce_dim(workers=args.n_workers, output_path=args.results_path + str(args.n_iter) + "/", input_path=embeddings_path, random_state=args.random_state)
53+
reduceTime = time.time()
54+
logging.info("Dims Reduced: " + str(reduceTime - buildModelTime) + " seconds")
7055

7156
clusters = c.get_embedding_clusters(embedding_input_file=embeddings_path, output_file=args.results_path + "clusters.pickle", random_state=args.random_state)
7257
projectClusterTime = time.time()
@@ -97,16 +82,16 @@ def main(args):
9782

9883
if __name__ == '__main__':
9984
parser = argparse.ArgumentParser()
100-
parser.add_argument("-rp", "--results_path", help="The folder to output results of the analysis. e.g. embeddings and plots", default="./results/")
101-
parser.add_argument("-nw", "--n_workers", help="The number of workers to use when running the analysis.", default=8, type=int)
102-
parser.add_argument("-dp", "--data_path", help="The path to the commits.feather file. e.g. /home/user/RStudio-Data-Repository/clean_data/commits.feather", default="./results/")
103-
parser.add_argument("-np", "--n_projects", help="The number of projects to sample from the dataset.", default=1000, type=int)
104-
parser.add_argument("-mc", "--min_commits", help="The minimum number of commits for a project to be included in the sample.", default=None, type=int)
105-
parser.add_argument("-mcount", "--min_count", help="The min_count parameter for the graph2vec model.", default=5, type=int)
106-
parser.add_argument("-nps", "--n_personas", help="The number of personas to extract from each cluster.", default=5, type=int)
107-
parser.add_argument("-nn", "--n_neurons", help="The number of neurons to use for Graph2Vec (project level)", default=128, type=int)
108-
parser.add_argument("-emb", "--embeddings_file_path", help="The file to read the embeddings from. Supplying this parameter skips training of the model.", default=None)
109-
parser.add_argument("-rs", "--random_state", help="The random state to initalize all random states.", default=1, type=int)
85+
parser.add_argument("-rp", "--results_path", help="The folder to output results of the analysis. e.g. embeddings and plots", default="./results/")
86+
parser.add_argument("-nw", "--n_workers", help="The number of workers to use when running the analysis.", default=8, type=int)
87+
parser.add_argument("-dp", "--data_path", help="The path to the commits.feather file. e.g. /home/user/RStudio-Data-Repository/clean_data/commits.feather", default="./results/")
88+
parser.add_argument("-np", "--n_projects", help="The number of projects to sample from the dataset.", default=1000, type=int)
89+
parser.add_argument("-mc", "--min_commits", help="The minimum number of commits for a project to be included in the sample.", default=None, type=int)
90+
parser.add_argument("-mcount", "--min_count", help="The min_count parameter for the graph2vec model.", default=5, type=int)
91+
parser.add_argument("-nps", "--n_personas", help="The number of personas to extract from each cluster.", default=5, type=int)
92+
parser.add_argument("-nn", "--n_neurons", help="The number of neurons to use for Graph2Vec (project level)", default=128, type=int)
93+
parser.add_argument("-ni", "--n_iter", help="The number of iterations to use when running the WeisfeilerLehmanMachine", default=10, type=int)
94+
parser.add_argument("-rs", "--random_state", help="The random state to initialize all random states.", default=1, type=int)
11095

11196
args = parser.parse_args()
11297

@@ -149,4 +134,4 @@ def main(args):
149134
# left join `ghtorrent-bq.ght.commit_parents` cp on (cp.commit_id = c.id)
150135
# where (p.id = """ + str(projectId) + """)
151136
# """
152-
#
137+
#

0 commit comments

Comments
 (0)