
Commit 72eb261

Merge pull request #90 from UBC-MDS/improve_reproducib
Improve reproducib
2 parents 59ac97f + bab6d7a commit 72eb261

File tree

4 files changed: +49 additions, -6 deletions


README.md

Lines changed: 6 additions & 2 deletions

@@ -31,8 +31,12 @@ From the root directory, run:
 python src/python src/github_analysis/main.py -dp "/home/rayce/Assignments/Capstone/RStudio-Data-Repository/clean_data/commits_by_org.feather".py
 ```
 
-## Installation Instructions
-PLACEHOLDER
+## Installation instructions
+To get credentials file for GitHub Torrent Google Cloud (necessary for re-running the pipeline to generate images):
+
+- Follow the instructions here to create and download a credentials file: https://developers.google.com/adwords/api/docs/guides/authentication#generate_oauth2_credentials
+- Change the name of the file to `credentials_file.json` and put it in the root directory of the project (a sample file with the same name is included as a reference).
+
 
 ## Data Repository
 [RStudio-Data-Repository](https://github.com/UBC-MDS/RStudio-Data-Repository)

Snakefile

Lines changed: 21 additions & 0 deletions

@@ -0,0 +1,21 @@
+rule run_analysis:
+    input:
+        data_path = "/Users/richiezitomer/Documents/RStudio-Data-Repository/clean_data/commits.feather"
+    output:
+        results_path = directory("results/")
+    params:
+        python_hash_seed = 0,
+        n_workers = 8,
+        n_projects = 1000,
+        min_commits = None,
+        min_count = 5,
+        n_personas = 5,
+        n_neurons = 128,
+        n_iter = 10,
+        random_state = 1
+    shell:
+        "PYTHONHASHSEED={params.python_hash_seed} python src/github_analysis/main.py -dp {input.data_path} -rp {output.results_path} -nw {params.n_workers} -np {params.n_projects} -mc {params.min_commits} -mcount {params.min_count} -nps {params.n_personas} -nn {params.n_neurons} -ni {params.n_iter} -rs {params.random_state}"
+
+# Commented out because repo is currently over bandwidth: https://help.github.com/en/articles/about-storage-and-bandwidth-usage
+#rule clone_data_repo:
+#    shell: "git clone https://github.com/UBC-MDS/RStudio-Data-Repository.git"
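Note on reproducibility: the rule pins PYTHONHASHSEED before invoking main.py because CPython salts str hashing per process, so anything downstream that depends on hash ordering can change between otherwise identical runs. The sketch below is illustrative only (the child-process check and the 'github' test string are not part of the repo) and shows why fixing the seed matters:

```python
import os
import subprocess
import sys

def hash_in_child(value: str, seed: str) -> str:
    """Hash `value` in a fresh interpreter with PYTHONHASHSEED pinned to `seed`."""
    env = {**os.environ, "PYTHONHASHSEED": seed}
    result = subprocess.run(
        [sys.executable, "-c", f"print(hash({value!r}))"],
        env=env, capture_output=True, text=True, check=True,
    )
    return result.stdout.strip()

# Same seed -> same hash across processes; a different seed generally differs.
print(hash_in_child("github", "0"), hash_in_child("github", "0"))  # equal
print(hash_in_child("github", "1"))                                # usually different
```

With the rule in place, the pipeline can be reproduced with a plain Snakemake call (for example `snakemake --cores 1 run_analysis`, depending on the Snakemake version installed), which expands the shell template above with the listed params.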

credentials_file.json

Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
+{
+  "type": "INSERT_TYPE",
+  "project_id": "INSERT_PROJECT_ID",
+  "private_key_id": "INSERT_PRIVATE_KEY_ID",
+  "private_key": "INSERT_PRIVATE_KEY",
+  "client_email": "INSERT_EMAIL",
+  "client_id": "INSERT_CLIENT_ID",
+  "auth_uri": "INSERT_AUTH_URI",
+  "token_uri": "INSERT_TOKEN_URI",
+  "auth_provider_x509_cert_url": "INSERT_AUTH_CERT_URL",
+  "client_x509_cert_url": "INSERT_CLIENT_CERT_URL"
+}
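The template above is a Google service-account key skeleton for querying GHTorrent on BigQuery (the commented-out SQL at the bottom of main.py references the `ghtorrent-bq.ght` dataset). A minimal sketch of how a filled-in `credentials_file.json` is typically consumed, assuming the `google-cloud-bigquery` and `google-auth` libraries; the table and column names are taken from the public GHTorrent schema and are not guaranteed to match what the pipeline queries:

```python
# Sketch only: assumes google-cloud-bigquery / google-auth are installed and
# credentials_file.json has been filled in as described in the README.
from google.cloud import bigquery
from google.oauth2 import service_account

creds = service_account.Credentials.from_service_account_file("credentials_file.json")
client = bigquery.Client(credentials=creds, project=creds.project_id)

# Example query against the public GHTorrent BigQuery dataset (schema assumed).
sql = "SELECT id, name FROM `ghtorrent-bq.ght.projects` LIMIT 5"
for row in client.query(sql).result():
    print(row.id, row.name)
```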

src/github_analysis/main.py

Lines changed: 10 additions & 4 deletions

@@ -57,7 +57,7 @@ def main(args):
     projectClusterTime = time.time()
     logging.info("Projects Clustered: " + str(projectClusterTime - reduceTime) + " seconds")
 
-    cluster_personas = p.Personas(clusters, commits_dl, args.n_personas, 1, output_path=args.results_path + "cluster_personas.csv")
+    # cluster_personas = p.Personas(clusters, commits_dl, args.n_personas, 1, output_path=args.results_path + "cluster_personas.csv")
     personaGenerationTime = time.time()
     logging.info("Personas Generated: " + str(personaGenerationTime - projectClusterTime) + " seconds")
 

@@ -80,13 +80,19 @@ def main(args):
     print("Frequency Graph Time:\t" + str(freqGraphTime - motifTime) + "\tseconds")
     print("Total Time:\t\t" + str(freqGraphTime - startTime) + "\tseconds")
 
+
+def none_or_int(value):
+    if value == 'None':
+        return None
+    return int(value)
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument("-rp", "--results_path", help="The folder to output results of the analysis. e.g. embeddings and plots", default="./results/")
     parser.add_argument("-nw", "--n_workers", help="The number of workers to use when running the analysis.", default=8, type=int)
-    parser.add_argument("-dp", "--data_path", help="The path to the commits.feather file. e.g. /home/user/RStudio-Data-Repository/clean_data/commits.feather", default="./results/")
+    parser.add_argument("-dp", "--data_path", help="The path to the commits.feather file. e.g. /home/user/RStudio-Data-Repository/clean_data/commits_by_org.feather", default="/home/user/RStudio-Data-Repository/clean_data/commits_by_org.feather")
     parser.add_argument("-np", "--n_projects", help="The number of projects to sample from the dataset.", default=1000, type=int)
-    parser.add_argument("-mc", "--min_commits", help="The minimum number of commits for a project to be included in the sample.", default=None, type=int)
+    parser.add_argument("-mc", "--min_commits", help="The minimum number of commits for a project to be included in the sample.", default=None, type=none_or_int)
     parser.add_argument("-mcount", "--min_count", help="The min_count parameter for the graph2vec model.", default=5, type=int)
     parser.add_argument("-nps", "--n_personas", help="The number of personas to extract from each cluster.", default=5, type=int)
     parser.add_argument("-nn", "--n_neurons", help="The number of neurons to use for Graph2Vec (project level)", default=128, type=int)

@@ -134,4 +140,4 @@ def main(args):
     # left join `ghtorrent-bq.ght.commit_parents` cp on (cp.commit_id = c.id)
     # where (p.id = """ + str(projectId) + """)
     # """
-#
+#
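The new `none_or_int` argparse type closes the loop with the Snakefile above: `min_commits = None` is substituted into the shell command as the literal string "None", which `type=int` would reject. A small standalone sketch (hypothetical parser, not the full CLI in main.py) of the behaviour:

```python
import argparse

def none_or_int(value):
    # Mirrors the helper added in main.py: the literal string "None"
    # (as passed through the Snakefile's shell template) means "no minimum".
    if value == 'None':
        return None
    return int(value)

parser = argparse.ArgumentParser()
parser.add_argument("-mc", "--min_commits", default=None, type=none_or_int)

print(parser.parse_args(["-mc", "None"]).min_commits)  # None
print(parser.parse_args(["-mc", "20"]).min_commits)    # 20
```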
