Merge pull request #100 from UBC-MDS/motif

ian-flores · web-flow · commit b521a9cc90f4 · 2019-06-26T13:32:04.000-07:00
Motif
diff --git a/Snakefile b/Snakefile
@@ -1,21 +1,38 @@
+configfile: "config.json"
+
+# Download the commit data from config["data_download_url"] into data/.
+rule get_ght_data:
+    params:
+        download_url = config["data_download_url"]
+    output:
+        output_file = "data/commits_by_org.feather"
+    shell: "python src/github_analysis/download_gh_data.py -du {params.download_url} -of {output.output_file}"
+
+# Run src/github_analysis/main.py on the downloaded data; tunables are read from config.json.
 rule run_analysis:
     input:
-        data_path = "/Users/richiezitomer/Documents/RStudio-Data-Repository/clean_data/commits.feather"
+        data_path = "data/commits_by_org.feather"
     output:
         results_path = directory("results/")
     params:
-        python_hash_seed = 0,
-        n_workers = 8,
-        n_projects = 1000,
-        min_commits = None,
-        min_count = 5,
-        n_personas = 5,
-        n_neurons = 128,
-        n_iter = 10,
-        random_state = 1
+        python_hash_seed = config["python_hash_seed"],
+        n_workers = config["n_workers"],
+        n_projects = config["n_projects"],
+        min_commits = config["min_commits"],
+        min_count = config["min_count"],
+        n_personas = config["n_personas"],
+        n_neurons = config["n_neurons"],
+        n_iter = config["n_iter"],
+        random_state = config["random_state"]
     shell:
         "PYTHONHASHSEED={params.python_hash_seed} python src/github_analysis/main.py -dp {input.data_path} -rp {output.results_path} -nw {params.n_workers} -np {params.n_projects} -mc {params.min_commits} -mcount {params.min_count} -nps {params.n_personas} -nn {params.n_neurons} -ni {params.n_iter} -rs {params.random_state}"
 
-# Commented out because repo is currently over bandwidth: https://help.github.com/en/articles/about-storage-and-bandwidth-usage
-#rule clone_data_repo:
-#    shell: "git clone https://github.com/UBC-MDS/RStudio-Data-Repository.git"
+# Call make_report.py with the commit data and the embeddings file.
+# NOTE(review): no rule lists results/embeddings.csv as an output (run_analysis
+# only declares the results/ directory) -- confirm Snakemake can resolve this input.
+rule generate_images:
+    input:
+        data_path="data/commits_by_org.feather",
+        embedding_path="results/embeddings.csv"
+    shell:
+        "python src/github_analysis/make_report.py -dp {input.data_path} -ep {input.embedding_path}"
diff --git a/config.json b/config.json
@@ -0,0 +1,10 @@
+{"data_download_url": "https://api.figshare.com/v2/file/download/15593951",
+"python_hash_seed": 0,
+"n_workers": 1,
+"n_projects": 1000,
+"min_commits": "None",
+"min_count": 5,
+"n_personas": 5,
+"n_neurons": 128,
+"n_iter": 10,
+"random_state": 1}
diff --git a/src/github_analysis/download_gh_data.py b/src/github_analysis/download_gh_data.py
@@ -0,0 +1,30 @@
+import argparse
+import shutil
+
+import requests
+
+
+def download_file(download_URL, filename):
+    """Stream the content at download_URL into a local file.
+
+    download_URL: URL fetched with an HTTP GET request.
+    filename: path of the file to write (opened in 'wb' mode).
+
+    Returns filename. Raises requests.HTTPError on a 4xx/5xx response so an
+    error page is never saved in place of the data.
+    """
+    with requests.get(download_URL, stream=True) as r:
+        # Abort before opening the output file if the server reported an error.
+        r.raise_for_status()
+        with open(filename, 'wb') as f:
+            shutil.copyfileobj(r.raw, f)
+    return filename
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-du", "--download_URL", help="The URL to download the file.", default='https://api.figshare.com/v2/file/download/15593951')
+    parser.add_argument("-of", "--output_file", help="The path where the downloaded file is saved.", default='data/commits_by_org.feather')
+    args = parser.parse_args()
+
+    download_file(args.download_URL, args.output_file)
diff --git a/src/github_analysis/main.py b/src/github_analysis/main.py
@@ -57,7 +57,7 @@ def main(args):
     projectClusterTime = time.time()
     logging.info("Projects Clustered: " + str(projectClusterTime - reduceTime) + " seconds")
 
-    # cluster_personas = p.Personas(clusters, commits_dl, args.n_personas, 1, output_path=args.results_path + "cluster_personas.csv")
+    cluster_personas = p.Personas(clusters, commits_dl, args.n_personas, 1, output_path=args.results_path + "cluster_personas.csv")
     personaGenerationTime = time.time()
     logging.info("Personas Generated: " + str(personaGenerationTime - projectClusterTime) + " seconds")
 
diff --git a/src/github_analysis/make_dash.py b/src/github_analysis/make_dash.py
diff --git a/src/github_analysis/make_report.py b/src/github_analysis/make_report.py