Skip to content

Commit b521a9c

Browse files
authored
Merge pull request #100 from UBC-MDS/motif
Motif
2 parents 72eb261 + 2f4348e commit b521a9c

File tree

6 files changed

+400
-491
lines changed

6 files changed

+400
-491
lines changed

Snakefile

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,33 @@
1+
configfile: "config.json"
2+
3+
# Produces data/commits_by_org.feather by running a script with the URL stored
# under "data_download_url" in the config (-du) and the output path (-of).
rule get_ght_data:
4+
params:
5+
download_url = config["data_download_url"]
6+
output:
7+
output_file = "data/commits_by_org.feather"
8+
# NOTE(review): -du/-of are the flags defined by the download script's argparse
# block, while make_report.py is invoked elsewhere with -dp/-ep -- confirm that
# this is the intended script path.
shell: "python src/github_analysis/make_report.py -du {params.download_url} -of {output.output_file}"
9+
110
rule run_analysis:
211
input:
3-
data_path = "/Users/richiezitomer/Documents/RStudio-Data-Repository/clean_data/commits.feather"
12+
data_path = "data/commits_by_org.feather"
413
output:
514
results_path = directory("results/")
615
params:
7-
python_hash_seed = 0,
8-
n_workers = 8,
9-
n_projects = 1000,
10-
min_commits = None,
11-
min_count = 5,
12-
n_personas = 5,
13-
n_neurons = 128,
14-
n_iter = 10,
15-
random_state = 1
16+
python_hash_seed = config["python_hash_seed"],
17+
n_workers = config["n_workers"],
18+
n_projects = config["n_projects"],
19+
min_commits = config["min_commits"],
20+
min_count = config["min_count"],
21+
n_personas = config["n_personas"],
22+
n_neurons = config["n_neurons"],
23+
n_iter = config["n_iter"],
24+
random_state = config["random_state"]
1625
shell:
1726
"PYTHONHASHSEED={params.python_hash_seed} python src/github_analysis/main.py -dp {input.data_path} -rp {output.results_path} -nw {params.n_workers} -np {params.n_projects} -mc {params.min_commits} -mcount {params.min_count} -nps {params.n_personas} -nn {params.n_neurons} -ni {params.n_iter} -rs {params.random_state}"
1827

19-
# Commented out because repo is currently over bandwidth: https://help.github.com/en/articles/about-storage-and-bandwidth-usage
20-
#rule clone_data_repo:
21-
# shell: "git clone https://github.com/UBC-MDS/RStudio-Data-Repository.git"
28+
# Runs make_report.py on the commits data (-dp) and the embeddings CSV (-ep).
# NOTE(review): no output: section is declared, so presumably this rule has to
# be requested by name as a target -- confirm how it is meant to be triggered.
rule generate_images:
29+
input:
30+
data_path="data/commits_by_org.feather",
31+
embedding_path="results/embeddings.csv"
32+
shell:
33+
"python src/github_analysis/make_report.py -dp {input.data_path} -ep {input.embedding_path}"

config.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{"data_download_url": "https://api.figshare.com/v2/file/download/15593951",
2+
"python_hash_seed": 0,
3+
"n_workers": 1,
4+
"n_projects": 1000,
5+
"min_commits": "None",
6+
"min_count": 5,
7+
"n_personas": 5,
8+
"n_neurons": 128,
9+
"n_iter": 10,
10+
"random_state": 1}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import requests
2+
import shutil
3+
4+
5+
def download_file(download_URL, filename):
    """Download the resource at ``download_URL`` and save it as ``filename``.

    The response is requested with ``stream=True`` and copied to disk through
    a file object, so the whole body is never held in memory at once.

    Parameters
    ----------
    download_URL : str
        URL to fetch with an HTTP GET request.
    filename : str
        Path of the local file to write (opened in binary write mode).

    Returns
    -------
    str
        ``filename``, unchanged, for the caller's convenience.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    with requests.get(download_URL, stream=True) as r:
        # Fail before touching the output file, so an HTTP error page is
        # never saved in place of the real data.
        r.raise_for_status()
        # The raw urllib3 stream does not undo gzip/deflate transfer encoding
        # unless asked to; without this the file could be stored compressed.
        r.raw.decode_content = True
        with open(filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return filename
12+
13+
14+
if __name__ == '__main__':
    # argparse is needed only by this command-line entry point, so it is
    # imported here; without it the parser construction raises NameError.
    import argparse

    # Command-line interface: both options default to the project's dataset
    # URL and its expected location on disk.
    parser = argparse.ArgumentParser()
    parser.add_argument("-du", "--download_URL", help="The URL to download the file.", default='https://api.figshare.com/v2/file/download/15593951')
    parser.add_argument("-of", "--output_file", help="The path to write the downloaded file to.", default='data/commits_by_org.feather')
    args = parser.parse_args()

    download_file(args.download_URL, args.output_file)

src/github_analysis/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def main(args):
5757
projectClusterTime = time.time()
5858
logging.info("Projects Clustered: " + str(projectClusterTime - reduceTime) + " seconds")
5959

60-
# cluster_personas = p.Personas(clusters, commits_dl, args.n_personas, 1, output_path=args.results_path + "cluster_personas.csv")
60+
cluster_personas = p.Personas(clusters, commits_dl, args.n_personas, 1, output_path=args.results_path + "cluster_personas.csv")
6161
personaGenerationTime = time.time()
6262
logging.info("Personas Generated: " + str(personaGenerationTime - projectClusterTime) + " seconds")
6363

0 commit comments

Comments
 (0)