Skip to content

Commit aa4178f

Browse files
yaozhaogoogle authored and copybara-github committed
update billsum to 2.0.0
PiperOrigin-RevId: 282681798
1 parent 6012139 commit aa4178f

File tree

6 files changed

+10
-22
lines changed

6 files changed

+10
-22
lines changed

tensorflow_datasets/summarization/billsum.py

Lines changed: 8 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
from __future__ import print_function
2121

2222
import json
23+
import os
2324

2425
import tensorflow as tf
2526
import tensorflow_datasets.public_api as tfds
@@ -47,14 +48,7 @@
4748
- sum_len: number of chars in summary.
4849
"""
4950

50-
_URLS = {
51-
"us_train":
52-
"https://datahub.io/akornilo/billsum/r/us_train_data_final.jsonl",
53-
"us_test":
54-
"https://datahub.io/akornilo/billsum/r/us_test_data_final.jsonl",
55-
"ca_test":
56-
"https://datahub.io/akornilo/billsum/r/ca_test_data_final.jsonl",
57-
}
51+
_URL = "https://datahub.io/akornilo/billsum/r/billsum.zip"
5852

5953
_DOCUMENT = "text"
6054
_SUMMARY = "summary"
@@ -63,7 +57,8 @@
6357
class Billsum(tfds.core.GeneratorBasedBuilder):
6458
"""BillSum Dataset."""
6559

66-
VERSION = tfds.core.Version("1.0.0")
60+
# data source updated to filter near duplicates.
61+
VERSION = tfds.core.Version("2.0.0")
6762

6863
def _info(self):
6964
return tfds.core.DatasetInfo(
@@ -81,26 +76,26 @@ def _info(self):
8176

8277
def _split_generators(self, dl_manager):
8378
"""Returns SplitGenerators."""
84-
dl_paths = dl_manager.download(_URLS)
79+
dl_path = dl_manager.download_and_extract(_URL)
8580
return [
8681
tfds.core.SplitGenerator(
8782
name=tfds.Split.TRAIN,
8883
gen_kwargs={
89-
"path": dl_paths["us_train"],
84+
"path": os.path.join(dl_path, "us_train_data_final_v2.jsonl"),
9085
"key": "bill_id"
9186
},
9287
),
9388
tfds.core.SplitGenerator(
9489
name=tfds.Split.TEST,
9590
gen_kwargs={
96-
"path": dl_paths["us_test"],
91+
"path": os.path.join(dl_path, "us_test_data_final_v2.jsonl"),
9792
"key": "bill_id"
9893
},
9994
),
10095
tfds.core.SplitGenerator(
10196
name="ca_test",
10297
gen_kwargs={
103-
"path": dl_paths["ca_test"],
98+
"path": os.path.join(dl_path, "ca_test_data_final.jsonl"),
10499
"key": "external_id"
105100
},
106101
),

tensorflow_datasets/summarization/billsum_test.py

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -30,12 +30,7 @@ class BillsumTest(testing.DatasetBuilderTestCase):
3030
"test": 1, # Number of fake test example
3131
"ca_test": 1 # Number of fake test example
3232
}
33-
DL_EXTRACT_RESULT = {
34-
"us_train": "us_train.jsonl",
35-
"us_test": "us_test.jsonl",
36-
"ca_test": "ca_test.jsonl"
37-
}
38-
33+
DL_EXTRACT_RESULT = ""
3934

4035
if __name__ == "__main__":
4136
testing.test_main()
Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1 @@
1-
https://datahub.io/akornilo/billsum/r/ca_test_data_final.jsonl 15086790 4db74f4ffed4ba6692a14f8823e719dbc93b4aa79acd9fedf3825e7f656b73ed
2-
https://datahub.io/akornilo/billsum/r/us_test_data_final.jsonl 59068884 6285ff95d86d0c2f4e4b42954a7aae0709439bf3f448da66fb822ab681c69489
3-
https://datahub.io/akornilo/billsum/r/us_train_data_final.jsonl 335338742 baa27a45443541ff77f610f8353c75b121c66ae29517629eeee9154bdce7ab82
1+
https://datahub.io/akornilo/billsum/r/billsum.zip 68989684 f20c03fd95a0b457a39d31b23c7fa677466ce53b3cb9cf8ae4f256e074f01f13

0 commit comments

Comments
 (0)