@@ -19,10 +19,10 @@
from __future__ import division
from __future__ import print_function

- import gzip
import json
import os
- import tensorflow.compat.v2 as tf
+ import re
+
import tensorflow_datasets.public_api as tfds

_CITATION = """
@@ -54,6 +54,7 @@
"""

- _URL = "https://drive.google.com/uc?export=download&id=1J3mucMFTWrgAYa3LuBZoLRR3CzzYD3fa"
+ # Raw data provided by Eva Sharma (evasharma@ccs.neu.edu).
+ _URL = "https://drive.google.com/uc?export=download&id=1mwH7eSh1kNci31xduR4Da_XcmTE8B8C3"

_DOCUMENT = "description"
_SUMMARY = "abstract"
@@ -83,21 +84,25 @@ def __init__(self, cpc_codes=None, **kwargs):
      **kwargs: keyword arguments forwarded to super.
    """
    super(BigPatentConfig, self).__init__(
-         version=tfds.core.Version("1.0.0"), **kwargs)
+         # 1.0.0 lower cased tokenized words.
+         # 2.0.0 cased raw strings.
+         version=tfds.core.Version("2.0.0", "Updated to cased raw strings."),
+         supported_versions=[tfds.core.Version("1.0.0")],
+         **kwargs)
    self.cpc_codes = cpc_codes


- class BigPatent(tfds.core.GeneratorBasedBuilder):
+ class BigPatent(tfds.core.BeamBasedBuilder):
  """BigPatent datasets."""

  BUILDER_CONFIGS = [
      BigPatentConfig(
-           cpc_codes=list(_CPC_DESCRIPTION),
+           cpc_codes="*",
          name="all",
          description="Patents under all categories."),
  ] + [
      BigPatentConfig(  # pylint:disable=g-complex-comprehension
-           cpc_codes=[k],
+           cpc_codes=k,
          name=k,
          description=("Patents under Cooperative Patent Classification (CPC)"
                       "{0}: {1}".format(k, v)),
@@ -122,7 +127,7 @@ def _split_generators(self, dl_manager):
    dl_path = dl_manager.download_and_extract(_URL)
    split_types = ["train", "val", "test"]
    extract_paths = dl_manager.extract({
-         k: os.path.join(dl_path, "bigPatentData", k + ".tar.gz")
+         k: os.path.join(dl_path, "bigPatentDataNonTokenized", k + ".tar.gz")
        for k in split_types
    })
    extract_paths = {k: os.path.join(extract_paths[k], k) for k in split_types}
@@ -142,16 +147,140 @@ def _split_generators(self, dl_manager):
        ),
    ]

-   def _generate_examples(self, path=None):
-     """Yields examples."""
-     for cpc_code in self.builder_config.cpc_codes:
-       filenames = tf.io.gfile.glob(os.path.join(path, cpc_code, "*"))
-       for filename in filenames:
-         with tf.io.gfile.GFile(filename, "rb") as fin:
-           fin = gzip.GzipFile(fileobj=fin)
-           for row in fin:
-             json_obj = json.loads(row)
-             yield json_obj["publication_number"], {
-                 _DOCUMENT: json_obj[_DOCUMENT],
-                 _SUMMARY: json_obj[_SUMMARY]
-             }
+   def _build_pcollection(self, pipeline, path=None):
+     """Build PCollection of examples."""
+     beam = tfds.core.lazy_imports.apache_beam
+
+     def _process_example(row):
+       json_obj = json.loads(row)
+       yield json_obj["publication_number"], {
+           _DOCUMENT: _bigpatent_clean_description(json_obj[_DOCUMENT]),
+           _SUMMARY: _bigpatent_clean_abstract(json_obj[_SUMMARY])
+       }
+
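+     # builder_config.cpc_codes is either a single _CPC_DESCRIPTION key or "*"
+     # (the "all" config), so this glob covers one or all CPC subdirectories.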
+     file_pattern = os.path.join(path, self.builder_config.cpc_codes, "*")
+     return (pipeline
+             | "ReadTextIO" >> beam.io.textio.ReadFromText(file_pattern)
+             | beam.FlatMap(_process_example))
+
+
+ # The preprocessing functions below are kindly provided by
+ # Eva Sharma (evasharma@ccs.neu.edu).
+ # They are modified in a few ways:
+ # 1) minor code formatting changes, add prefix _bigpatent to those functions.
+ # 2) enchant is replaced with nltk to detect English words.
+ # 3) remove excessive whitespace.
+
+ # Regexes for cleaning the abstract and description fields of unwanted text
+ # spans.
+
+ _FIG_EXP1 = re.compile(r"(FIG.)\s+(\d)(,*)\s*(\d*)")
+ _FIG_EXP2 = re.compile(r"(FIGS.)\s+(\d)(,*)\s*(\d*)")
+ _FIG_EXP3 = re.compile(r"(FIGURE)\s+(\d)(,*)\s*(\d*)")
+
+ _LINE_NUM_EXP = re.compile(r"\[(\d+)\]")
+ _NON_EMPTY_LINES = re.compile(r"^\s*\[(\d+)\]")
+ _TABLE_HEADER = re.compile(r"^(\s*)TABLE\s+\d+(\s+(.*))?$")
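+ # For example, _FIG_EXP1 matches "FIG. 1" or "FIG. 1, 2" (FIGS. and FIGURE in
+ # the other two variants), and _NON_EMPTY_LINES matches leading paragraph
+ # numbers such as "[0012]".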
+
+ _ENGLISH_WORDS = None
+
+
+ def _get_english_words():
+   global _ENGLISH_WORDS
+   if not _ENGLISH_WORDS:
+     _ENGLISH_WORDS = frozenset(tfds.core.lazy_imports.nltk.corpus.words.words())
+   return _ENGLISH_WORDS
+
+
+ def _remove_excessive_whitespace(text):
+   return " ".join([w for w in text.split(" ") if w])
+
+
+ def _bigpatent_clean_abstract(text):
+   """Cleans the abstract text."""
+   text = re.sub(r"[\(\{\[].*?[\}\)\]]", "", text).strip()
+   text = _remove_excessive_whitespace(text)
+   return text
+
+
+ def _bigpatent_remove_references(text):
+   """Remove references from the description text."""
+   text = _FIG_EXP1.sub(r"FIG\2 ", text)
+   text = _FIG_EXP2.sub(r"FIG\2 ", text)
+   text = _FIG_EXP3.sub(r"FIG\2 ", text)
+   return text
+
+
+ def _bigpatent_get_list_of_non_empty_lines(text):
+   """Returns the non-empty lines, with line numbers stripped."""
+   # Split into lines.
+   # Remove empty lines.
+   # Remove line numbers.
+   return [
+       _NON_EMPTY_LINES.sub("", s).strip()
+       for s in text.strip().splitlines(True)
+       if s.strip()
+   ]
+
+
+ def _bigpatent_remove_tables(sentences):
+   """Remove tables from the description text."""
+   # Remove tables from text.
+   new_sentences = []
+   i = 0
+   table_start = 0
+   # A table header is a line starting with "TABLE" after zero or more
+   # whitespace characters, followed by an integer.
+   # After the integer, the line either ends or is followed by whitespace and a
+   # description.
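+   # For example, "TABLE 1" and "  TABLE 2  Composition of samples" both count
+   # as table headers here.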
+   while i < len(sentences):
+     sentence = sentences[i]
+     if table_start == 0:
+       # Not inside a table.
+       # Check if it's the start of a table.
+       if _TABLE_HEADER.match(sentence):
+         table_start = 1
+       else:
+         new_sentences.append(sentence)
+
+     elif table_start == 1:
+       words = sentence.strip("\t ").split(" ")
+       num_eng = 0
+       for w in words:
+         if not w.isalpha():
+           continue
+         if w in _get_english_words():
+           num_eng += 1
+           if num_eng > 20:
+             # Table end condition.
+             table_start = 0
+             new_sentences.append(sentence)
+             break
+     i += 1
+   return new_sentences
+
+
+ def _bigpatent_remove_lines_with_less_words(sentences):
+   """Remove sentences with 10 or fewer unique words."""
+   new_sentences = []
+   for sentence in sentences:
+     words = set(sentence.split(" "))
+     if len(words) > 10:
+       new_sentences.append(sentence)
+   return new_sentences
+
+
+ def _bigpatent_clean_description(text):
+   """Clean the description text."""
+   # Split the text by newlines, keeping only non-empty lines.
+   sentences = _bigpatent_get_list_of_non_empty_lines(text)
+   # Remove tables from the description text.
+   sentences = _bigpatent_remove_tables(sentences)
+   # Remove sentences with 10 or fewer unique words.
+   sentences = _bigpatent_remove_lines_with_less_words(sentences)
+   text = "\n".join(sentences)
+   # Remove references like FIG. 8, FIGS. 8, 8, FIG. 8-d.
+   text = _bigpatent_remove_references(text)
+   # Remove excessive whitespace.
+   text = _remove_excessive_whitespace(text)
+   return text
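For reference, the new cleaning helpers compose on a record roughly as below; this is only a sketch, and the sample fields are invented for illustration:

    sample = {
        "publication_number": "US-0000000-A1",
        "description": "[0001] FIG. 1 shows the device in more detail than "
                       "the prior art figures ever did.",
        "abstract": "A device (10) for cleaning patent text.",
    }
    # The leading "[0001]" is stripped and "FIG. 1" is rewritten to "FIG1".
    print(_bigpatent_clean_description(sample["description"]))
    # The parenthesized "(10)" span is removed from the abstract.
    print(_bigpatent_clean_abstract(sample["abstract"]))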