Skip to content

Commit 286d7a8

Browse files
adarob and copybara-github
authored and committed
Add MANUAL_DOWNLOAD_INSTRUCTIONS for C4.
PiperOrigin-RevId: 283784072
1 parent 38189d9 commit 286d7a8

File tree

1 file changed

+8
-0
lines changed
  • tensorflow_datasets/text

1 file changed

+8
-0
lines changed

tensorflow_datasets/text/c4.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -120,6 +120,14 @@ def __init__(self,
120120
class C4(tfds.core.BeamBasedBuilder):
121121
"""C4 dataset based on Common Crawl."""
122122

123+
MANUAL_DOWNLOAD_INSTRUCTIONS = """\
124+
For the WebText-like config, you must manually download 'OpenWebText.zip'
125+
(from https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ) and the Common Crawl
126+
WET files from August 2018 to July 2019
127+
(https://commoncrawl.org/the-data/get-started/) and place them in the
128+
`manual_dir`.
129+
"""
130+
123131
BUILDER_CONFIGS = [
124132
C4Config(language="en", description="English C4 dataset."),
125133
C4Config(

0 commit comments

Comments
 (0)