Skip to content

Commit d49cc67

Browse files
authored
JSON config file now generated (#52)
* Renamed the fdg folder to visuals and matched the source code to the new folder (#49); removed unnecessary folder creation in outputs.
* Implemented JSON output and tested via mocks (#47)
1 parent e6e9f31 commit d49cc67

File tree

9 files changed

+186
-51
lines changed

9 files changed

+186
-51
lines changed

detect.py

Lines changed: 84 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import argparse
2+
import json
23
import os
4+
import sys
35

46
from pandas import Timestamp, read_pickle, ExcelWriter
57

@@ -24,7 +26,7 @@ def year2pandas_earliest_date(year_in):
2426
return Timestamp(year_string)
2527

2628

27-
def get_args():
29+
def get_args(command_line_arguments):
2830
parser = argparse.ArgumentParser(description="create report, wordcloud, and fdg graph for patent texts")
2931

3032
parser.add_argument("-f", "--focus", default=False, action="store_true",
@@ -36,7 +38,8 @@ def get_args():
3638
help="options are <median> <max> <sum> <avg> defaults to sum. Average is over non zero values")
3739
parser.add_argument("-o", "--output", default='report', choices=['fdg', 'wordcloud', 'report', 'table', 'all'],
3840
help="options are: <fdg> <wordcloud> <report> <table> <all>")
39-
41+
parser.add_argument("-j", "--json", default=False, action="store_true",
42+
help="Output configuration as JSON file alongside output report")
4043
parser.add_argument("-yf", "--year_from", type=int, default=2000, help="The first year for the patent cohort")
4144
parser.add_argument("-yt", "--year_to", type=int, default=0, help="The last year for the patent cohort (0 is now)")
4245

@@ -66,7 +69,7 @@ def get_args():
6669

6770
parser.add_argument("-nltk", "--nltk_path", default=None, help="custom path for NLTK data")
6871

69-
args = parser.parse_args()
72+
args = parser.parse_args(command_line_arguments)
7073
return args
7174

7275

@@ -104,55 +107,15 @@ def check_cpc_between_years(args, df):
104107
exit(0)
105108

106109

107-
def get_tfidf(args, pickle_file_name, cpc):
    """Build a TFIDF model from a pickled patent DataFrame.

    Loads the patents from *pickle_file_name*, restricted to the CPC
    classification *cpc* (None for no filter) and to the year window given
    by args.year_from / args.year_to, then returns a TFIDF object over the
    resulting corpus using the configured n-gram range.
    """
    window_start = year2pandas_earliest_date(args.year_from)
    window_end = year2pandas_latest_date(args.year_to)

    patents = PatentsPickle2DataFrame(pickle_file_name, classification=cpc,
                                      date_from=window_start, date_to=window_end).data_frame
    # Aborts the run (exit) when a CPC filter leaves no patents in the window.
    check_cpc_between_years(args, patents)
    return TFIDF(patents, tokenizer=LemmaTokenizer(), ngram_range=(args.min_n, args.max_n))
114117

115118

116-
def main():
117-
paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'json'), os.path.join('outputs', 'wordclouds'),
118-
os.path.join('outputs', 'table')]
119-
for path in paths:
120-
os.makedirs(path, exist_ok=True)
121-
122-
args = get_args()
123-
checkargs(args)
124-
125-
if args.nltk_path:
126-
import nltk
127-
nltk.data.path.append(args.nltk_path)
128-
129-
path = os.path.join('data', args.patent_source + ".pkl.bz2")
130-
tfidf = get_tfidf(args, path, args.cpc_classification)
131-
132-
newtfidf = None
133-
if args.focus or args.output == 'table':
134-
path2 = os.path.join('data', args.focus_source + ".pkl.bz2")
135-
newtfidf = get_tfidf(args, path2, None)
136-
137-
citation_count_dict = None
138-
if args.cite:
139-
citation_count_dict = load_citation_count_dict()
140-
141-
out = args.output
142-
143-
ngram_multiplier = 4
144-
145-
if out == 'report':
146-
run_report(args, ngram_multiplier, tfidf, newtfidf, citation_count_dict=citation_count_dict)
147-
elif out == 'wordcloud' or out == 'all':
148-
run_report(args, ngram_multiplier, tfidf, newtfidf, wordclouds=True, citation_count_dict=citation_count_dict)
149-
elif out == 'table' or out == 'all':
150-
run_table(args, ngram_multiplier, tfidf, newtfidf, citation_count_dict)
151-
152-
if out == 'fdg' or out == 'all':
153-
run_fdg(args, tfidf, newtfidf)
154-
155-
156119
def load_citation_count_dict():
157120
citation_count_dict = read_pickle(FilePaths.us_patents_citation_dictionary_1of2_pickle_name)
158121
citation_count_dict_pt2 = read_pickle(FilePaths.us_patents_citation_dictionary_2of2_pickle_name)
@@ -178,11 +141,11 @@ def run_report(args, ngram_multiplier, tfidf, tfidf_random=None, wordclouds=Fals
178141
number_of_ngrams_to_return=ngram_multiplier * num_ngrams,
179142
pick=args.pick, time=args.time,
180143
citation_count_dict=citation_count_dict)
181-
set_terms = set(terms) if not args.focus \
182-
else tfidf.detect_popular_ngrams_in_corpus_excluding_common(tfidf_random,
183-
number_of_ngrams_to_return=ngram_multiplier * num_ngrams,
184-
pick=args.pick, time=args.time,
185-
citation_count_dict=citation_count_dict)
144+
set_terms = set(terms) if not args.focus else \
145+
tfidf.detect_popular_ngrams_in_corpus_excluding_common(tfidf_random,
146+
number_of_ngrams_to_return=ngram_multiplier * num_ngrams,
147+
pick=args.pick, time=args.time,
148+
citation_count_dict=citation_count_dict)
186149

187150
dict_freqs = dict([((p[1]), p[0]) for p in ngrams_scores_tuple if p[1] in set_terms])
188151

@@ -209,5 +172,76 @@ def run_fdg(args, tf_idf, tf_idf2=None):
209172
graph.save_graph("key-terms", 'data')
210173

211174

175+
def write_config_to_json(args, patent_pickle_file_name):
    """Serialise the run configuration to a JSON file alongside the report.

    The JSON file takes the report's base name with a '.json' extension.
    Both the patent pickle path and the report path are stored as absolute
    paths so the configuration is unambiguous regardless of the cwd.
    """
    data_path = os.path.abspath(patent_pickle_file_name)
    report_path = os.path.abspath(args.report_name)
    config_path = os.path.splitext(report_path)[0] + '.json'

    # An empty string stands in for "no CPC filter" so the value is always a string.
    cpc_value = args.cpc_classification if args.cpc_classification is not None else ''

    config = {
        'paths': {
            'data': data_path,
            'tech_report': report_path
        },
        'year': {
            'from': args.year_from,
            'to': args.year_to
        },
        'parameters': {
            'cpc': cpc_value,
            'pick': args.pick,
            'time': args.time,
            'cite': args.cite,
            'focus': args.focus
        }
    }

    with open(config_path, 'w') as json_file:
        json.dump(config, json_file)
200+
201+
202+
def main():
    """Command-line entry point.

    Parses sys.argv, optionally writes the run configuration to JSON,
    builds the TFIDF model(s) from the pickled patent data, and emits the
    requested outputs: report, wordcloud, table and/or fdg graph.
    """
    output_paths = [os.path.join('outputs', 'reports'),
                    os.path.join('outputs', 'wordclouds'),
                    os.path.join('outputs', 'table')]
    for path in output_paths:
        os.makedirs(path, exist_ok=True)

    args = get_args(sys.argv[1:])
    checkargs(args)

    patent_pickle_file_name = os.path.join('data', args.patent_source + ".pkl.bz2")

    # Emit the JSON config early so it exists even if a later stage exits.
    if args.json:
        write_config_to_json(args, patent_pickle_file_name)

    if args.nltk_path:
        import nltk
        nltk.data.path.append(args.nltk_path)

    tfidf = get_tfidf(args, patent_pickle_file_name, args.cpc_classification)

    # A second (focus) corpus is only needed for focused reports and tables.
    newtfidf = None
    if args.focus or args.output == 'table':
        path2 = os.path.join('data', args.focus_source + ".pkl.bz2")
        newtfidf = get_tfidf(args, path2, None)

    citation_count_dict = None
    if args.cite:
        citation_count_dict = load_citation_count_dict()

    out = args.output
    ngram_multiplier = 4

    # Bug fix: the original if/elif chain made the 'table' branch unreachable
    # for --output=all (the wordcloud "or out == 'all'" elif consumed it).
    # Each output kind is now tested independently so 'all' produces all outputs.
    if out == 'report':
        run_report(args, ngram_multiplier, tfidf, newtfidf, citation_count_dict=citation_count_dict)
    if out == 'wordcloud' or out == 'all':
        run_report(args, ngram_multiplier, tfidf, newtfidf, wordclouds=True, citation_count_dict=citation_count_dict)
    if out == 'table' or out == 'all':
        run_table(args, ngram_multiplier, tfidf, newtfidf, citation_count_dict)
    if out == 'fdg' or out == 'all':
        run_fdg(args, tfidf, newtfidf)


if __name__ == '__main__':
    main()
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

scripts/visualization/graphs/fdgprep.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def __create_graph_json(self):
9494
def save_graph(self, fname, varname):
9595

9696
graph = self.__create_graph_json()
97-
file_name = os.path.join('outputs', 'fdg', fname + '.js')
97+
file_name = os.path.join('outputs', 'visuals', fname + '.js')
9898
with open(file_name, 'w') as js_temp:
9999
js_temp.write(varname + " = '[")
100100
json.dump(graph, js_temp)

tests/test_detect.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import os
2+
import unittest
3+
from unittest import mock
4+
5+
import detect
6+
7+
8+
class TestDetect(unittest.TestCase):
    """Unit tests for detect.py argument parsing and JSON config output."""

    def test_args_json_not_requested(self):
        """JSON output defaults to off."""
        parsed = detect.get_args([])
        self.assertFalse(parsed.json)

    def test_args_json_requested_short(self):
        """-j switches JSON output on."""
        parsed = detect.get_args(['-j'])
        self.assertTrue(parsed.json)

    def test_args_json_requested_long(self):
        """--json switches JSON output on."""
        parsed = detect.get_args(['--json'])
        self.assertTrue(parsed.json)

    def test_args_report_name_requested_long(self):
        """--report_name is passed through verbatim."""
        parsed = detect.get_args(['--report_name=my/test/name.txt'])
        self.assertEqual('my/test/name.txt', parsed.report_name)

    def test_args_patent_source_requested_long(self):
        """--patent_source is passed through verbatim."""
        parsed = detect.get_args(['--patent_source=my-test'])
        self.assertEqual('my-test', parsed.patent_source)

    @mock.patch("detect.json.dump", create=True)
    @mock.patch("detect.open", create=True)
    def test_json_configuration_encoding_minimal(self, mock_open, mock_json_dump):
        """Default options plus -j: default years, empty cpc, all flags off."""
        pickle_name = 'test.pkl'
        pickle_abs_name = os.path.abspath('test.pkl')
        report_name = os.path.join(os.path.abspath(os.sep), 'dummy', 'test.txt')
        expected_json_name = os.path.join(os.path.abspath(os.sep), 'dummy', 'test.json')
        parsed = detect.get_args(['-j', f'--report_name={report_name}'])

        detect.write_config_to_json(parsed, pickle_name)

        self.assertTrue(parsed.json)
        mock_open.assert_called_with(expected_json_name, 'w')

        expected_config = {
            'paths': {
                'data': pickle_abs_name,
                'tech_report': report_name
            },
            'year': {
                'from': 2000,
                'to': 0
            },
            'parameters': {
                'cite': False,
                'cpc': '',
                'focus': False,
                'pick': 'sum',
                'time': False
            }
        }
        self.assertEqual(expected_config, mock_json_dump.call_args[0][0])

    @mock.patch("detect.json.dump", create=True)
    @mock.patch("detect.open", create=True)
    def test_json_configuration_encoding_maximal(self, mock_open, mock_json_dump):
        """All relevant options set: years, cpc and flags reflected in the JSON."""
        pickle_name = os.path.join('dummy', 'test.pkl')
        pickle_abs_name = os.path.abspath(pickle_name)
        report_name = os.path.join(os.path.abspath(os.sep), 'dummy', 'test.txt')
        expected_json_name = os.path.join(os.path.abspath(os.sep), 'dummy', 'test.json')
        parsed = detect.get_args(['-j', f'--report_name={report_name}', '-c', '-t', '-f', '-p=max', '-cpc=Y12',
                                  '-yf=1998', '-yt=2001'])

        detect.write_config_to_json(parsed, pickle_name)

        self.assertTrue(parsed.json)
        mock_open.assert_called_with(expected_json_name, 'w')

        expected_config = {
            'paths': {
                'data': pickle_abs_name,
                'tech_report': report_name
            },
            'year': {
                'from': 1998,
                'to': 2001
            },
            'parameters': {
                'cite': True,
                'cpc': 'Y12',
                'focus': True,
                'pick': 'max',
                'time': True
            }
        }
        self.assertEqual(expected_config, mock_json_dump.call_args[0][0])


if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)