Skip to content

Commit 96dbedf

Browse files
authored
New Splitter trail; revamped io_utils (#51)
* new splitter
* formatting
* revamped io utils
* formatting
* removed unused import
1 parent 2ab965f commit 96dbedf

File tree

7 files changed

+375
-123
lines changed

7 files changed

+375
-123
lines changed

examples/squad.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
)
1515
>> sm.TextToWordsMapper(
1616
fields=["question", "context", "answers"],
17-
splitter="whitespace",
17+
splitter="ws",
1818
)
1919
>> sm.SingleSequenceStriderMapper(
2020
field_to_stride=["context"],

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "smashed"
3-
version = "0.16.0"
3+
version = "0.17.0"
44
description = """\
55
SMASHED is a toolkit designed to apply transformations to samples in \
66
datasets, such as fields extraction, tokenization, prompting, batching, \

src/smashed/mappers/text.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
BlingFireSplitter,
1010
WhitespacePlusSplitter,
1111
WhitespaceSplitter,
12+
WhitespaceTrailSplitter,
1213
)
1314

1415

@@ -69,15 +70,15 @@ class TextToWordsMapper(SingleBaseMapper):
6970
def __init__(
7071
self,
7172
fields: Union[str, Sequence[str]],
72-
splitter: Literal[
73-
"blingfire", "whitespace", "whitespace_plus"
74-
] = "whitespace_plus",
73+
splitter: Literal["blingfire", "ws", "plus", "trail"] = "plus",
7574
):
7675
if splitter == "blingfire":
7776
self.splitter = BlingFireSplitter()
78-
elif splitter == "whitespace_plus":
77+
elif splitter == "plus":
7978
self.splitter = WhitespacePlusSplitter()
80-
elif splitter == "whitespace":
79+
elif splitter == "trail":
80+
self.splitter = WhitespaceTrailSplitter()
81+
elif splitter == "ws":
8182
self.splitter = WhitespaceSplitter()
8283
else:
8384
raise ValueError(f"Unknown splitter: {splitter}")

0 commit comments

Comments
 (0)