This repository was archived by the owner on Aug 26, 2022. It is now read-only.

Commit f4a908a

Fix dataset preprocessor for jsonl format
1 parent c97b268 commit f4a908a

2 files changed: 67 additions and 41 deletions

USAGE.md

Lines changed: 34 additions & 10 deletions

@@ -488,6 +488,10 @@ import os
 from transformers import AutoTokenizer
 from oslo import DatasetPreprocessor
 
+data_names=[
+    "/path/to/wikitext103", "/path/to/lambada", ...
+]
+
 # 1. Create tokenizer
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 tokenizer = AutoTokenizer.from_pretrained(...)
@@ -500,16 +504,36 @@ preprocessor = DatasetPreprocessor(
     append_eod=True,
 )
 
-# 3. Perform preprocessing
-preprocessor.preprocess(
-    data_names=[
-        "/path/to/wikitext103",
-        "/path/to/lambada",
-        ...
-    ],
-    extension=".txt",
-    log_interval=100,
-)
+# 3-1. Perform preprocessing (.txt)
+# save_file_name + '.idx' and '.bin' will be created.
+for data_name in data_names:
+    preprocessor.preprocess(
+        open(data_name + ".txt"),
+        save_file_name=data_name,
+        log_interval=100,
+    )
+
+# 3-2. Perform preprocessing (.jsonl, Megatron-LM format)
+# 1 {"text": "blah blah"}
+# 2 {"text": "blah blah"}
+# 3 ...
+for data_name in data_names:
+    preprocessor.preprocess(
+        preprocessor.open_jsonl(
+            data_name,
+            json_key="text",
+        ),
+        save_file_name=data_name,
+        log_interval=100,
+    )
+
+# 3-3. Perform preprocessing (any other format)
+for data_name in data_names:
+    preprocessor.preprocess(
+        YOUR_OWN_LIST_OF_STRING,
+        save_file_name=data_name,
+        log_interval=100,
+    )
 ```
 
 ### DatasetForCausalLM
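
The jsonl path above assumes a Megatron-LM style file: one JSON object per line with the text stored under a single key. As a quick way to try it end to end, here is a minimal sketch (the file name "my_corpus.jsonl" and the sample texts are made up for illustration) that writes such a file and reads it back the same way open_jsonl does:

```python
import json

# Hypothetical sample corpus; any list of strings works here.
samples = ["first document ...", "second document ..."]

# Write one JSON object per line with the text under the "text" key,
# which is the shape open_jsonl(..., json_key="text") expects.
with open("my_corpus.jsonl", "w") as f:
    for text in samples:
        f.write(json.dumps({"text": text}) + "\n")

# Read it back line by line, mirroring how the preprocessor consumes it.
with open("my_corpus.jsonl") as f:
    for line in f:
        print(json.loads(line)["text"])
```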

oslo/data/preprocess/preprocessor.py

Lines changed: 33 additions & 31 deletions

@@ -92,19 +92,18 @@ def __init__(
         else:
             self.eod_token_id = eod_token_id
 
-    def _preprocess(
+    def preprocess(
         self,
-        data_path: str,
-        index_path: str,
-        log_interval: int,
-        kwargs,
+        iterable,
+        save_file_name: str,
+        log_interval: int = 1000,
     ) -> None:
         """
         Preprocess a dataset
 
         Args:
-            data_path (str): dataset bin path
-            index_path (str): dataset index path
+            iterable: iterable of strings
+            save_file_name (str): save file name
             log_interval (int): logging interval
         """
 
@@ -114,12 +113,12 @@ def _preprocess(
             eod_token_id=self.eod_token_id,
         )
         binarizer = DatasetBinarizer(self.binarization_impl)
-        index_path, builder = binarizer.create_builder(index_path)
+        index_path, builder = binarizer.create_builder(save_file_name)
 
         with ProcessPoolExecutor() as pool:
             iterator = pool.map(
                 encoder.encode,
-                open(data_path, **kwargs),
+                iterable,
                 chunksize=self.chunksize,
             )
 
@@ -130,30 +129,33 @@ def _preprocess(
                 log_interval=log_interval,
            )
 
-    def preprocess(
-        self,
-        data_names: List[str],
-        extension: str = ".txt",
-        log_interval: int = 1000,
-        **kwargs,
-    ):
+    @staticmethod
+    def open_jsonl(file, json_key):
         """
-        Preprocess datasets
+        Open a jsonl file in the Megatron-LM data format
 
-        Args:
-            data_names (List[str]): dataset names
-            extension (str): data file extension
-            log_interval (int): logging interval
+        Examples:
+            1 {'text': 'blah blah blah ...'}
+            2 {'text': 'blah blah blah ...'}
+            3 {'text': 'blah blah blah ...'}
+            4 ...
+
+            >>> DatasetPreprocessor.open_jsonl(
+            ...     file=FILE_NAME, json_key='text'
+            ... )
         """
 
-        if "." not in extension:
-            extension = "." + extension
+        import json
 
-        for name in data_names:
-            logger.info(f"Start to preprocess {name}.")
-            self._preprocess(
-                name + extension,
-                name,
-                log_interval,
-                kwargs,
-            )
+        if file[-6:].lower() != ".jsonl":
+            file = file + ".jsonl"
+
+        source = open(file)
+
+        while True:
+            line = source.readline()
+
+            if not line:
+                break
+            else:
+                yield json.loads(line)[json_key]
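
Since preprocess now accepts any iterable of strings, open_jsonl only has to be a generator that yields one decoded text per line. Below is a standalone sketch of the same streaming idea, not the oslo implementation itself (the function name and example path are hypothetical), using a with block so the file is closed when iteration finishes:

```python
import json
from typing import Iterator

def iter_jsonl_texts(path: str, json_key: str = "text") -> Iterator[str]:
    """Yield one decoded string per line so the whole file never sits in memory."""
    if not path.lower().endswith(".jsonl"):
        path = path + ".jsonl"
    with open(path) as source:
        for line in source:
            if line.strip():  # skip blank lines instead of failing on json.loads("")
                yield json.loads(line)[json_key]

# Hypothetical usage: the generator can be passed straight to preprocessor.preprocess(...)
# texts = iter_jsonl_texts("my_corpus", json_key="text")
```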
