1
1
import argparse
2
+ import json
2
3
import os
4
+ import sys
3
5
4
6
from pandas import Timestamp , read_pickle , ExcelWriter
5
7
@@ -24,7 +26,7 @@ def year2pandas_earliest_date(year_in):
24
26
return Timestamp (year_string )
25
27
26
28
27
- def get_args ():
29
+ def get_args (command_line_arguments ):
28
30
parser = argparse .ArgumentParser (description = "create report, wordcloud, and fdg graph for patent texts" )
29
31
30
32
parser .add_argument ("-f" , "--focus" , default = False , action = "store_true" ,
@@ -36,7 +38,8 @@ def get_args():
36
38
help = "options are <median> <max> <sum> <avg> defaults to sum. Average is over non zero values" )
37
39
parser .add_argument ("-o" , "--output" , default = 'report' , choices = ['fdg' , 'wordcloud' , 'report' , 'table' , 'all' ],
38
40
help = "options are: <fdg> <wordcloud> <report> <table> <all>" )
39
-
41
+ parser .add_argument ("-j" , "--json" , default = False , action = "store_true" ,
42
+ help = "Output configuration as JSON file alongside output report" )
40
43
parser .add_argument ("-yf" , "--year_from" , type = int , default = 2000 , help = "The first year for the patent cohort" )
41
44
parser .add_argument ("-yt" , "--year_to" , type = int , default = 0 , help = "The last year for the patent cohort (0 is now)" )
42
45
@@ -66,7 +69,7 @@ def get_args():
66
69
67
70
parser .add_argument ("-nltk" , "--nltk_path" , default = None , help = "custom path for NLTK data" )
68
71
69
- args = parser .parse_args ()
72
+ args = parser .parse_args (command_line_arguments )
70
73
return args
71
74
72
75
@@ -104,55 +107,15 @@ def check_cpc_between_years(args, df):
104
107
exit (0 )
105
108
106
109
107
def get_tfidf(args, pickle_file_name, cpc):
    """Load a patent cohort from a pickle and build a TFIDF model over it.

    The cohort is restricted to the CLI year range and (optionally) a CPC
    classification; exits via check_cpc_between_years if the filter leaves
    an unusable frame.
    """
    earliest = year2pandas_earliest_date(args.year_from)
    latest = year2pandas_latest_date(args.year_to)

    frame = PatentsPickle2DataFrame(pickle_file_name, classification=cpc,
                                    date_from=earliest, date_to=latest).data_frame
    check_cpc_between_years(args, frame)

    tokenizer = LemmaTokenizer()
    return TFIDF(frame, tokenizer=tokenizer, ngram_range=(args.min_n, args.max_n))
114
117
115
118
116
- def main ():
117
- paths = [os .path .join ('outputs' , 'reports' ), os .path .join ('outputs' , 'json' ), os .path .join ('outputs' , 'wordclouds' ),
118
- os .path .join ('outputs' , 'table' )]
119
- for path in paths :
120
- os .makedirs (path , exist_ok = True )
121
-
122
- args = get_args ()
123
- checkargs (args )
124
-
125
- if args .nltk_path :
126
- import nltk
127
- nltk .data .path .append (args .nltk_path )
128
-
129
- path = os .path .join ('data' , args .patent_source + ".pkl.bz2" )
130
- tfidf = get_tfidf (args , path , args .cpc_classification )
131
-
132
- newtfidf = None
133
- if args .focus or args .output == 'table' :
134
- path2 = os .path .join ('data' , args .focus_source + ".pkl.bz2" )
135
- newtfidf = get_tfidf (args , path2 , None )
136
-
137
- citation_count_dict = None
138
- if args .cite :
139
- citation_count_dict = load_citation_count_dict ()
140
-
141
- out = args .output
142
-
143
- ngram_multiplier = 4
144
-
145
- if out == 'report' :
146
- run_report (args , ngram_multiplier , tfidf , newtfidf , citation_count_dict = citation_count_dict )
147
- elif out == 'wordcloud' or out == 'all' :
148
- run_report (args , ngram_multiplier , tfidf , newtfidf , wordclouds = True , citation_count_dict = citation_count_dict )
149
- elif out == 'table' or out == 'all' :
150
- run_table (args , ngram_multiplier , tfidf , newtfidf , citation_count_dict )
151
-
152
- if out == 'fdg' or out == 'all' :
153
- run_fdg (args , tfidf , newtfidf )
154
-
155
-
156
119
def load_citation_count_dict ():
157
120
citation_count_dict = read_pickle (FilePaths .us_patents_citation_dictionary_1of2_pickle_name )
158
121
citation_count_dict_pt2 = read_pickle (FilePaths .us_patents_citation_dictionary_2of2_pickle_name )
@@ -178,11 +141,11 @@ def run_report(args, ngram_multiplier, tfidf, tfidf_random=None, wordclouds=Fals
178
141
number_of_ngrams_to_return = ngram_multiplier * num_ngrams ,
179
142
pick = args .pick , time = args .time ,
180
143
citation_count_dict = citation_count_dict )
181
- set_terms = set (terms ) if not args .focus \
182
- else tfidf .detect_popular_ngrams_in_corpus_excluding_common (tfidf_random ,
183
- number_of_ngrams_to_return = ngram_multiplier * num_ngrams ,
184
- pick = args .pick , time = args .time ,
185
- citation_count_dict = citation_count_dict )
144
+ set_terms = set (terms ) if not args .focus else \
145
+ tfidf .detect_popular_ngrams_in_corpus_excluding_common (tfidf_random ,
146
+ number_of_ngrams_to_return = ngram_multiplier * num_ngrams ,
147
+ pick = args .pick , time = args .time ,
148
+ citation_count_dict = citation_count_dict )
186
149
187
150
dict_freqs = dict ([((p [1 ]), p [0 ]) for p in ngrams_scores_tuple if p [1 ] in set_terms ])
188
151
@@ -209,5 +172,76 @@ def run_fdg(args, tf_idf, tf_idf2=None):
209
172
graph .save_graph ("key-terms" , 'data' )
210
173
211
174
175
def write_config_to_json(args, patent_pickle_file_name):
    """Record the run configuration as a JSON file next to the output report.

    The JSON file shares the report's base name with a '.json' extension and
    stores absolute paths so the record stays valid regardless of cwd.
    """
    data_path = os.path.abspath(patent_pickle_file_name)
    report_path = os.path.abspath(args.report_name)
    config_path = os.path.splitext(report_path)[0] + '.json'

    # An empty string stands in for "no CPC filter" so the file never holds null.
    cpc = args.cpc_classification if args.cpc_classification is not None else ''

    config = {
        'paths': {
            'data': data_path,
            'tech_report': report_path
        },
        'year': {
            'from': args.year_from,
            'to': args.year_to
        },
        'parameters': {
            'cpc': cpc,
            'pick': args.pick,
            'time': args.time,
            'cite': args.cite,
            'focus': args.focus
        }
    }

    with open(config_path, 'w') as out_file:
        json.dump(config, out_file)
200
+
201
+
202
def main():
    """Command-line entry point: build TF-IDF models for the patent cohort and
    emit the requested outputs (report, wordcloud, table, fdg, or all).
    """
    paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'wordclouds'),
             os.path.join('outputs', 'table')]
    for path in paths:
        os.makedirs(path, exist_ok=True)

    args = get_args(sys.argv[1:])
    checkargs(args)

    patent_pickle_file_name = os.path.join('data', args.patent_source + ".pkl.bz2")

    if args.json:
        write_config_to_json(args, patent_pickle_file_name)

    if args.nltk_path:
        import nltk
        nltk.data.path.append(args.nltk_path)

    tfidf = get_tfidf(args, patent_pickle_file_name, args.cpc_classification)

    # A second (focus/background) corpus is only needed for focus mode or tables.
    newtfidf = None
    if args.focus or args.output == 'table':
        path2 = os.path.join('data', args.focus_source + ".pkl.bz2")
        newtfidf = get_tfidf(args, path2, None)

    citation_count_dict = None
    if args.cite:
        citation_count_dict = load_citation_count_dict()

    out = args.output

    ngram_multiplier = 4

    # BUG FIX: these were an if/elif chain, so out == 'all' matched the
    # wordcloud branch first and run_table was unreachable for 'all'.
    # Independent ifs keep single-option behavior identical (choices are
    # mutually exclusive) while letting 'all' produce every output.
    if out == 'report':
        run_report(args, ngram_multiplier, tfidf, newtfidf, citation_count_dict=citation_count_dict)
    if out == 'wordcloud' or out == 'all':
        run_report(args, ngram_multiplier, tfidf, newtfidf, wordclouds=True,
                   citation_count_dict=citation_count_dict)
    if out == 'table' or out == 'all':
        run_table(args, ngram_multiplier, tfidf, newtfidf, citation_count_dict)
    if out == 'fdg' or out == 'all':
        run_fdg(args, tfidf, newtfidf)
245
+
212
246
# Script entry point: run the pipeline only when executed directly, not on import.
if __name__ == '__main__':
    main()
0 commit comments