
Commit 72eb261

Merge pull request #90 from UBC-MDS/improve_reproducib
Improve reproducib
2 parents 59ac97f + bab6d7a commit 72eb261

File tree

4 files changed: +49 additions, -6 deletions


README.md

Lines changed: 6 additions & 2 deletions

@@ -31,8 +31,12 @@ From the root directory, run:
 python src/python src/github_analysis/main.py -dp "/home/rayce/Assignments/Capstone/RStudio-Data-Repository/clean_data/commits_by_org.feather".py
 ```
 
-## Installation Instructions
-PLACEHOLDER
+## Installation instructions
+To get credentials file for GitHub Torrent Google Cloud (necessary for re-running the pipeline to generate images):
+
+- Follow the instructions here to create and download a credentials file: https://developers.google.com/adwords/api/docs/guides/authentication#generate_oauth2_credentials
+- Change the name of the file to `credentials_file.json` and put it in the root directory of the project (a sample file with the same name is included as a reference).
+
 
 ## Data Repository
 [RStudio-Data-Repository](https://github.com/UBC-MDS/RStudio-Data-Repository)

Snakefile

Lines changed: 21 additions & 0 deletions

@@ -0,0 +1,21 @@
+rule run_analysis:
+    input:
+        data_path = "/Users/richiezitomer/Documents/RStudio-Data-Repository/clean_data/commits.feather"
+    output:
+        results_path = directory("results/")
+    params:
+        python_hash_seed = 0,
+        n_workers = 8,
+        n_projects = 1000,
+        min_commits = None,
+        min_count = 5,
+        n_personas = 5,
+        n_neurons = 128,
+        n_iter = 10,
+        random_state = 1
+    shell:
+        "PYTHONHASHSEED={params.python_hash_seed} python src/github_analysis/main.py -dp {input.data_path} -rp {output.results_path} -nw {params.n_workers} -np {params.n_projects} -mc {params.min_commits} -mcount {params.min_count} -nps {params.n_personas} -nn {params.n_neurons} -ni {params.n_iter} -rs {params.random_state}"
+
+# Commented out because repo is currently over bandwidth: https://help.github.com/en/articles/about-storage-and-bandwidth-usage
+#rule clone_data_repo:
+#    shell: "git clone https://github.com/UBC-MDS/RStudio-Data-Repository.git"
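Note on reproducibility: the rule pins PYTHONHASHSEED before invoking main.py because CPython salts str hashing per process, so anything downstream that depends on hash ordering can change between otherwise identical runs. The sketch below is illustrative only (the child-process check and the 'github' test string are not part of the repo) and shows why fixing the seed matters:

```python
import os
import subprocess
import sys

def hash_in_child(value: str, seed: str) -> str:
    """Hash `value` in a fresh interpreter with PYTHONHASHSEED pinned to `seed`."""
    env = {**os.environ, "PYTHONHASHSEED": seed}
    result = subprocess.run(
        [sys.executable, "-c", f"print(hash({value!r}))"],
        env=env, capture_output=True, text=True, check=True,
    )
    return result.stdout.strip()

# Same seed -> same hash across processes; a different seed generally differs.
print(hash_in_child("github", "0"), hash_in_child("github", "0"))  # equal
print(hash_in_child("github", "1"))                                # usually different
```

With the rule in place, the pipeline can be reproduced with a plain Snakemake call (for example `snakemake --cores 1 run_analysis`, depending on the Snakemake version installed), which expands the shell template above with the listed params.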

credentials_file.json

Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
+{
+  "type": "INSERT_TYPE",
+  "project_id": "INSERT_PROJECT_ID",
+  "private_key_id": "INSERT_PRIVATE_KEY_ID",
+  "private_key": "INSERT_PRIVATE_KEY",
+  "client_email": "INSERT_EMAIL",
+  "client_id": "INSERT_CLIENT_ID",
+  "auth_uri": "INSERT_AUTH_URI",
+  "token_uri": "INSERT_TOKEN_URI",
+  "auth_provider_x509_cert_url": "INSERT_AUTH_CERT_URL",
+  "client_x509_cert_url": "INSERT_CLIENT_CERT_URL"
+}
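The template above is a Google service-account key skeleton for querying GHTorrent on BigQuery (the commented-out SQL at the bottom of main.py references the `ghtorrent-bq.ght` dataset). A minimal sketch of how a filled-in `credentials_file.json` is typically consumed, assuming the `google-cloud-bigquery` and `google-auth` libraries; the table and column names are taken from the public GHTorrent schema and are not guaranteed to match what the pipeline queries:

```python
# Sketch only: assumes google-cloud-bigquery / google-auth are installed and
# credentials_file.json has been filled in as described in the README.
from google.cloud import bigquery
from google.oauth2 import service_account

creds = service_account.Credentials.from_service_account_file("credentials_file.json")
client = bigquery.Client(credentials=creds, project=creds.project_id)

# Example query against the public GHTorrent BigQuery dataset (schema assumed).
sql = "SELECT id, name FROM `ghtorrent-bq.ght.projects` LIMIT 5"
for row in client.query(sql).result():
    print(row.id, row.name)
```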

src/github_analysis/main.py

Lines changed: 10 additions & 4 deletions

@@ -57,7 +57,7 @@ def main(args):
     projectClusterTime = time.time()
     logging.info("Projects Clustered: " + str(projectClusterTime - reduceTime) + " seconds")
 
-    cluster_personas = p.Personas(clusters, commits_dl, args.n_personas, 1, output_path=args.results_path + "cluster_personas.csv")
+    # cluster_personas = p.Personas(clusters, commits_dl, args.n_personas, 1, output_path=args.results_path + "cluster_personas.csv")
     personaGenerationTime = time.time()
     logging.info("Personas Generated: " + str(personaGenerationTime - projectClusterTime) + " seconds")
 

@@ -80,13 +80,19 @@ def main(args):
     print("Frequency Graph Time:\t" + str(freqGraphTime - motifTime) + "\tseconds")
     print("Total Time:\t\t" + str(freqGraphTime - startTime) + "\tseconds")
 
+
+def none_or_int(value):
+    if value == 'None':
+        return None
+    return int(value)
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument("-rp", "--results_path", help="The folder to output results of the analysis. e.g. embeddings and plots", default="./results/")
     parser.add_argument("-nw", "--n_workers", help="The number of workers to use when running the analysis.", default=8, type=int)
-    parser.add_argument("-dp", "--data_path", help="The path to the commits.feather file. e.g. /home/user/RStudio-Data-Repository/clean_data/commits.feather", default="./results/")
+    parser.add_argument("-dp", "--data_path", help="The path to the commits.feather file. e.g. /home/user/RStudio-Data-Repository/clean_data/commits_by_org.feather", default="/home/user/RStudio-Data-Repository/clean_data/commits_by_org.feather")
     parser.add_argument("-np", "--n_projects", help="The number of projects to sample from the dataset.", default=1000, type=int)
-    parser.add_argument("-mc", "--min_commits", help="The minimum number of commits for a project to be included in the sample.", default=None, type=int)
+    parser.add_argument("-mc", "--min_commits", help="The minimum number of commits for a project to be included in the sample.", default=None, type=none_or_int)
     parser.add_argument("-mcount", "--min_count", help="The min_count parameter for the graph2vec model.", default=5, type=int)
     parser.add_argument("-nps", "--n_personas", help="The number of personas to extract from each cluster.", default=5, type=int)
     parser.add_argument("-nn", "--n_neurons", help="The number of neurons to use for Graph2Vec (project level)", default=128, type=int)

@@ -134,4 +140,4 @@ def main(args):
     # left join `ghtorrent-bq.ght.commit_parents` cp on (cp.commit_id = c.id)
     # where (p.id = """ + str(projectId) + """)
     # """
-#
+#
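The new `none_or_int` argparse type closes the loop with the Snakefile above: `min_commits = None` is substituted into the shell command as the literal string "None", which `type=int` would reject. A small standalone sketch (hypothetical parser, not the full CLI in main.py) of the behaviour:

```python
import argparse

def none_or_int(value):
    # Mirrors the helper added in main.py: the literal string "None"
    # (as passed through the Snakefile's shell template) means "no minimum".
    if value == 'None':
        return None
    return int(value)

parser = argparse.ArgumentParser()
parser.add_argument("-mc", "--min_commits", default=None, type=none_or_int)

print(parser.parse_args(["-mc", "None"]).min_commits)  # None
print(parser.parse_args(["-mc", "20"]).min_commits)    # 20
```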
