Skip to content

Commit 43e369f

Browse files
committed
Merge branch 'improve_reproducib' into motif
2 parents 9cef81d + 786dbfb commit 43e369f

File tree

2 files changed

+30
-3
lines changed

2 files changed

+30
-3
lines changed

Snakefile

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Run the analysis entry point (src/github_analysis/main.py) on the commits dataset.
#
# The input location can be overridden without editing this file, e.g.
#   snakemake --config data_path=/path/to/commits.feather
# When no override is supplied, the original default path is used, so existing
# invocations behave exactly as before.
#
# Every value under `params` is substituted into the shell command below;
# `python_hash_seed` is exported as PYTHONHASHSEED and `min_commits = None`
# is rendered as the text "None" on the command line.
rule run_analysis:
    input:
        data_path = config.get("data_path", "/Users/richiezitomer/Documents/RStudio-Data-Repository/clean_data/commits.feather")
    output:
        results_path = directory("results/")
    params:
        python_hash_seed = 0,
        n_workers = 8,
        n_projects = 1000,
        min_commits = None,
        min_count = 5,
        n_personas = 5,
        n_neurons = 128,
        n_iter = 10,
        random_state = 1
    shell:
        "PYTHONHASHSEED={params.python_hash_seed} python src/github_analysis/main.py -dp {input.data_path} -rp {output.results_path} -nw {params.n_workers} -np {params.n_projects} -mc {params.min_commits} -mcount {params.min_count} -nps {params.n_personas} -nn {params.n_neurons} -ni {params.n_iter} -rs {params.random_state}"
18+
19+
# Commented out because repo is currently over bandwidth: https://help.github.com/en/articles/about-storage-and-bandwidth-usage
20+
#rule clone_data_repo:
21+
# shell: "git clone https://github.com/UBC-MDS/RStudio-Data-Repository.git"

src/github_analysis/main.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def main(args):
5757
projectClusterTime = time.time()
5858
logging.info("Projects Clustered: " + str(projectClusterTime - reduceTime) + " seconds")
5959

60-
cluster_personas = p.Personas(clusters, commits_dl, args.n_personas, 1, output_path=args.results_path + "cluster_personas.csv")
60+
# cluster_personas = p.Personas(clusters, commits_dl, args.n_personas, 1, output_path=args.results_path + "cluster_personas.csv")
6161
personaGenerationTime = time.time()
6262
logging.info("Personas Generated: " + str(personaGenerationTime - projectClusterTime) + " seconds")
6363

@@ -80,13 +80,19 @@ def main(args):
8080
print("Frequency Graph Time:\t" + str(freqGraphTime - motifTime) + "\tseconds")
8181
print("Total Time:\t\t" + str(freqGraphTime - startTime) + "\tseconds")
8282

83+
84+
def none_or_str(value):
    """Map the literal text 'None' to ``None``; return any other value unchanged.

    Intended as an argparse ``type=`` callable so that a command line can
    spell "no value" as the string ``None``.
    """
    return None if value == 'None' else value
88+
8389
if __name__ == '__main__':
8490
parser = argparse.ArgumentParser()
8591
parser.add_argument("-rp", "--results_path", help="The folder to output results of the analysis. e.g. embeddings and plots", default="./results/")
8692
parser.add_argument("-nw", "--n_workers", help="The number of workers to use when running the analysis.", default=8, type=int)
87-
parser.add_argument("-dp", "--data_path", help="The path to the commits.feather file. e.g. /home/user/RStudio-Data-Repository/clean_data/commits.feather", default="./results/")
93+
parser.add_argument("-dp", "--data_path", help="The path to the commits.feather file. e.g. /home/user/RStudio-Data-Repository/clean_data/commits_by_org.feather", default="/home/user/RStudio-Data-Repository/clean_data/commits_by_org.feather")
8894
parser.add_argument("-np", "--n_projects", help="The number of projects to sample from the dataset.", default=1000, type=int)
89-
parser.add_argument("-mc", "--min_commits", help="The minimum number of commits for a project to be included in the sample.", default=None, type=int)
95+
parser.add_argument("-mc", "--min_commits", help="The minimum number of commits for a project to be included in the sample.", default=None, type=none_or_str)
9096
parser.add_argument("-mcount", "--min_count", help="The min_count parameter for the graph2vec model.", default=5, type=int)
9197
parser.add_argument("-nps", "--n_personas", help="The number of personas to extract from each cluster.", default=5, type=int)
9298
parser.add_argument("-nn", "--n_neurons", help="The number of neurons to use for Graph2Vec (project level)", default=128, type=int)

0 commit comments

Comments
 (0)