
Commit a708d50

yaozhaogoogle authored and copybara-github committed
use cased raw text string in bigpatent
PiperOrigin-RevId: 292594706
1 parent 34ae6dd commit a708d50

5 files changed: +151 -21 lines changed

5 files changed

+151
-21
lines changed
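Note: this commit makes 2.0.0 the default dataset version and keeps 1.0.0 as a supported version (see big_patent.py below). A minimal usage sketch, assuming the standard tfds "name/config:version" selector and that both versions have been prepared locally:

import tensorflow_datasets as tfds

# 2.0.0 yields cased, untokenized raw strings.
ds_cased = tfds.load("big_patent/all:2.0.0", split="train")
# 1.0.0 remains available as the older lower-cased, tokenized variant.
ds_legacy = tfds.load("big_patent/all:1.0.0", split="train")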

tensorflow_datasets/summarization/big_patent.py

Lines changed: 150 additions & 21 deletions
@@ -19,10 +19,10 @@
 from __future__ import division
 from __future__ import print_function

-import gzip
 import json
 import os
-import tensorflow.compat.v2 as tf
+import re
+
 import tensorflow_datasets.public_api as tfds

 _CITATION = """
@@ -53,7 +53,8 @@
 """

-_URL = "https://drive.google.com/uc?export=download&id=1J3mucMFTWrgAYa3LuBZoLRR3CzzYD3fa"
+# Raw data provided by Eva Sharma (evasharma@ccs.neu.edu).
+_URL = "https://drive.google.com/uc?export=download&id=1mwH7eSh1kNci31xduR4Da_XcmTE8B8C3"

 _DOCUMENT = "description"
 _SUMMARY = "abstract"
@@ -83,21 +84,25 @@ def __init__(self, cpc_codes=None, **kwargs):
       **kwargs: keyword arguments forwarded to super.
     """
     super(BigPatentConfig, self).__init__(
-        version=tfds.core.Version("1.0.0"), **kwargs)
+        # 1.0.0 lower cased tokenized words.
+        # 2.0.0 cased raw strings.
+        version=tfds.core.Version("2.0.0", "Updated to cased raw strings."),
+        supported_versions=[tfds.core.Version("1.0.0")],
+        **kwargs)
     self.cpc_codes = cpc_codes


-class BigPatent(tfds.core.GeneratorBasedBuilder):
+class BigPatent(tfds.core.BeamBasedBuilder):
   """BigPatent datasets."""

   BUILDER_CONFIGS = [
       BigPatentConfig(
-          cpc_codes=list(_CPC_DESCRIPTION),
+          cpc_codes="*",
           name="all",
           description="Patents under all categories."),
   ] + [
       BigPatentConfig(  # pylint:disable=g-complex-comprehension
-          cpc_codes=[k],
+          cpc_codes=k,
           name=k,
           description=("Patents under Cooperative Patent Classification (CPC)"
                        "{0}: {1}".format(k, v)),
@@ -122,7 +127,7 @@ def _split_generators(self, dl_manager):
     dl_path = dl_manager.download_and_extract(_URL)
     split_types = ["train", "val", "test"]
     extract_paths = dl_manager.extract({
-        k: os.path.join(dl_path, "bigPatentData", k + ".tar.gz")
+        k: os.path.join(dl_path, "bigPatentDataNonTokenized", k + ".tar.gz")
         for k in split_types
     })
     extract_paths = {k: os.path.join(extract_paths[k], k) for k in split_types}
@@ -142,16 +147,140 @@ def _split_generators(self, dl_manager):
         ),
     ]

-  def _generate_examples(self, path=None):
-    """Yields examples."""
-    for cpc_code in self.builder_config.cpc_codes:
-      filenames = tf.io.gfile.glob(os.path.join(path, cpc_code, "*"))
-      for filename in filenames:
-        with tf.io.gfile.GFile(filename, "rb") as fin:
-          fin = gzip.GzipFile(fileobj=fin)
-          for row in fin:
-            json_obj = json.loads(row)
-            yield json_obj["publication_number"], {
-                _DOCUMENT: json_obj[_DOCUMENT],
-                _SUMMARY: json_obj[_SUMMARY]
-            }
+  def _build_pcollection(self, pipeline, path=None):
+    """Build PCollection of examples."""
+    beam = tfds.core.lazy_imports.apache_beam
+
+    def _process_example(row):
+      json_obj = json.loads(row)
+      yield json_obj["publication_number"], {
+          _DOCUMENT: _bigpatent_clean_description(json_obj[_DOCUMENT]),
+          _SUMMARY: _bigpatent_clean_abstract(json_obj[_SUMMARY])
+      }
+
+    file_pattern = os.path.join(path, self.builder_config.cpc_codes, "*")
+    return (pipeline
+            | "ReadTextIO" >> beam.io.textio.ReadFromText(file_pattern)
+            | beam.FlatMap(_process_example))
+
+
+# The preprocessing functions below are kindly provided by
+# Eva Sharma (evasharma@ccs.neu.edu).
+# They are modified in a few ways:
+# 1) minor code formating changes, add prefix _bigpatent to those functions.
+# 2) enchant is replaced with nltk to detect english words.
+# 3) remove excessive white space.
+
+# Regex for cleaning the abstract and description fields of unwanted text
+# spans.
+
+_FIG_EXP1 = re.compile(r"(FIG.)\s+(\d)(,*)\s*(\d*)")
+_FIG_EXP2 = re.compile(r"(FIGS.)\s+(\d)(,*)\s*(\d*)")
+_FIG_EXP3 = re.compile(r"(FIGURE)\s+(\d)(,*)\s*(\d*)")
+
+_LINE_NUM_EXP = re.compile(r"\[(\d+)\]")
+_NON_EMPTY_LINES = re.compile(r"^\s*\[(\d+)\]")
+_TABLE_HEADER = re.compile(r"^(\s*)TABLE\s+\d+(\s+(.*))?$")
+
+_ENGLISH_WORDS = None
+
+
+def _get_english_words():
+  global _ENGLISH_WORDS
+  if not _ENGLISH_WORDS:
+    _ENGLISH_WORDS = frozenset(tfds.core.lazy_imports.nltk.corpus.words.words())
+  return _ENGLISH_WORDS
+
+
+def _remove_excessive_whitespace(text):
+  return " ".join([w for w in text.split(" ") if w])
+
+
+def _bigpatent_clean_abstract(text):
+  """Cleans the abstract text."""
+  text = re.sub(r"[\(\{\[].*?[\}\)\]]", "", text).strip()
+  text = _remove_excessive_whitespace(text)
+  return text
+
+
+def _bigpatent_remove_referenecs(text):
+  """Remove references from description text."""
+  text = _FIG_EXP1.sub(r"FIG\2 ", text)
+  text = _FIG_EXP2.sub(r"FIG\2 ", text)
+  text = _FIG_EXP3.sub(r"FIG\2 ", text)
+  return text
+
+
+def _bigpatent_get_list_of_non_empty_lines(text):
+  """Remove non-empty lines."""
+  # Split into lines
+  # Remove empty lines
+  # Remove line numbers
+  return [
+      _NON_EMPTY_LINES.sub("", s).strip()
+      for s in text.strip().splitlines(True)
+      if s.strip()
+  ]
+
+
+def _bigpatent_remove_tables(sentences):
+  """Remove Tables from description text."""
+  # Remove tables from text
+  new_sentences = []
+  i = 0
+  table_start = 0
+  # A table header will be a line starting with "TABLE" after zero or more
+  # whitespaces, followed by an integer.
+  # After the integer, the line ends, or is followed by whitespace and
+  # description.
+  while i < len(sentences):
+    sentence = sentences[i]
+    if table_start == 0:
+      # Not inside a table
+      # Check if it's start of a table
+      if _TABLE_HEADER.match(sentence):
+        table_start = 1
+      else:
+        new_sentences.append(sentence)

+    elif table_start == 1:
+      words = sentence.strip("\t").split(" ")
+      num_eng = 0
+      for w in words:
+        if not w.isalpha():
+          continue
+        if w in _get_english_words():
+          num_eng += 1
+          if num_eng > 20:
+            # Table end condition
+            table_start = 0
+            new_sentences.append(sentence)
+            break
+    i += 1
+  return new_sentences
+
+
+def _bigpatent_remove_lines_with_less_words(sentences):
+  """Remove sentences with less than 10 words."""
+  new_sentences = []
+  for sentence in sentences:
+    words = set(sentence.split(" "))
+    if len(words) > 10:
+      new_sentences.append(sentence)
+  return new_sentences
+
+
+def _bigpatent_clean_description(text):
+  """Clean the description text."""
+  # split the text by newlines, keep only non-empty lines
+  sentences = _bigpatent_get_list_of_non_empty_lines(text)
+  # remove tables from the description text
+  sentences = _bigpatent_remove_tables(sentences)
+  # remove sentences with less than 10 words
+  sentences = _bigpatent_remove_lines_with_less_words(sentences)
+  text = "\n".join(sentences)
+  # remove references like FIG. 8, FIGS. 8, 8, FIG. 8-d
+  text = _bigpatent_remove_referenecs(text)
+  # remove excessive whitespace
+  text = _remove_excessive_whitespace(text)
+  return text
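To illustrate what the new cleaning helpers do, a rough sketch on invented input (the strings below are not from the dataset; they only exercise the functions added above):

# Bracketed spans such as "(10)" and "[0002]" are stripped from abstracts.
print(_bigpatent_clean_abstract("A widget (10) with a lever [0002] and a spring."))
# -> "A widget with a lever and a spring."

# In descriptions, paragraph numbers are removed, short lines are dropped,
# and figure references such as "FIG. 1, 2" collapse to "FIG1".
raw_description = (
    "[0001] FIG. 1, 2 shows the widget assembly with a housing, a lever "
    "and a spring.\n"
    "[0002] Too short.\n")
print(_bigpatent_clean_description(raw_description))
# -> "FIG1 shows the widget assembly with a housing, a lever and a spring."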
Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
 https://drive.google.com/uc?export=download&id=1J3mucMFTWrgAYa3LuBZoLRR3CzzYD3fa 6448045871 7e1093c7e0d09677c79bd872a07b6a6dd2b3235633207e9918b75056205f04dc
+https://drive.google.com/uc?export=download&id=1mwH7eSh1kNci31xduR4Da_XcmTE8B8C3 10142849928 826f156d43e0b0de0d49a1e61d6f72139d58d8afc275c9d556ed0276d9c6b7a8
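Each checksum entry records the download URL, the size in bytes, and the SHA-256 digest of the archive. A small sketch for checking a downloaded file against such an entry (the file path is hypothetical):

import hashlib

def sha256_and_size(path):
  """Return the SHA-256 hex digest and byte size of a file."""
  digest = hashlib.sha256()
  size = 0
  with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
      digest.update(chunk)
      size += len(chunk)
  return digest.hexdigest(), size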
