Added TruncateMultipleNestedFieldsMapper (#50)

soldni · web-flow · commit 2ab965f06db8 · 2023-01-19T17:31:28.000-08:00
* added TruncateMultipleNestedFieldsMapper

* accidentally committed .dmypy

* formatting

* lowered requirements to 3.8

* added new ref to mapper

* style
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,19 +1,15 @@
 [project]
 name = "smashed"
-version = "0.15.5"
+version = "0.16.0"
 description = """\
 SMASHED is a toolkit designed to apply transformations to samples in \
 datasets, such as fields extraction, tokenization, prompting, batching, \
 and more. Supports datasets from Huggingface, torchdata iterables, or \
 simple lists of dictionaries.\
 """
-# authors = [
-#     {name = "Allen Institute for Artificial Intelligence", email = "contact@allenai.org"},
-#     {name = "Luca Soldaini", email = "luca@soldaini.net"}
-# ]
 license = {text = "Apache-2.0"}
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.8"
 dependencies = [
     "torch>=1.9",
     "transformers>=4.5",
diff --git a/src/smashed/mappers/__init__.py b/src/smashed/mappers/__init__.py
@@ -43,6 +43,7 @@
     FillEncodedPromptMapper,
     FillTextPromptMapper,
     TruncateMultipleFieldsMapper,
+    TruncateMultipleNestedFieldsMapper,
 )
 from .promptsource import FewShotJinjaMapper, JinjaMapper, PromptsourceMapper
 from .shape import (
@@ -112,6 +113,7 @@
     "TokenTypeIdsSequencePaddingMapper",
     "Torch2PythonMapper",
     "TruncateMultipleFieldsMapper",
+    "TruncateMultipleNestedFieldsMapper",
     "TruncateSingleFieldMapper",
     "UnpackingMapper",
     "ValidUnicodeMapper",
diff --git a/src/smashed/mappers/prompting.py b/src/smashed/mappers/prompting.py
@@ -8,13 +8,15 @@
 from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
 
 from ..base import SingleBaseMapper, TransformElementType
+from ..utils.shape_utils import flatten_with_indices, reconstruct_from_indices
 from .tokenize import GetTokenizerOutputFieldsAndNamesMixIn
 
 __all__ = [
     "EncodeFieldsMapper",
     "FillEncodedPromptMapper",
     "FillTextPromptMapper",
     "TruncateMultipleFieldsMapper",
+    "TruncateMultipleNestedFieldsMapper",
 ]
 
 
@@ -291,6 +293,31 @@ def transform(self, data: TransformElementType) -> TransformElementType:
         return output
 
 
+class TruncateMultipleNestedFieldsMapper(TruncateMultipleFieldsMapper):
+    """Like TruncateMultipleFieldsMapper, but works on nested fields."""
+
+    def transform(self, data: TransformElementType) -> TransformElementType:
+        # flatten the fields to truncate into flatted_data, and record
+        # each field's nesting indices in flatted_index
+        flatted_index: dict = {}
+        flatted_data: dict = {}
+
+        for k in self.input_fields:
+            flatted_data[k], flatted_index[k] = flatten_with_indices(data[k])
+
+        flatted_output = super().transform(flatted_data)
+
+        output = {
+            k: (
+                reconstruct_from_indices(flatted_output[k], flatted_index[k])
+                if k in flatted_output
+                else data[k]
+            )
+            for k in data
+        }
+        return output
+
+
 @dataclass
 class PromptSegment:
     """Class to represent a segment of a prompt. Not meant to be used
diff --git a/src/smashed/utils/shape_utils.py b/src/smashed/utils/shape_utils.py
@@ -0,0 +1,118 @@
+from collections.abc import Sequence as SequenceABC
+from typing import Any, List, Sequence, Tuple, TypeVar, Union, cast
+
+from typing_extensions import TypeAlias
+
+T = TypeVar("T")
+
+LocTupleType: TypeAlias = Tuple[int, int]
+KeysType: TypeAlias = Union[LocTupleType, List["KeysType"]]
+NestedSequenceType: TypeAlias = Union[
+    Sequence[T], Sequence["NestedSequenceType[T]"]
+]
+NestedListType: TypeAlias = Union[List[T], List["NestedListType[T]"]]
+
+
+def is_sequence_but_not_str(obj: Any) -> bool:
+    """Check if an object is a sequence but not a str or bytes."""
+    return isinstance(obj, SequenceABC) and not isinstance(obj, (str, bytes))
+
+
+def flatten_with_indices(
+    sequence: NestedSequenceType[T], __offset: int = 0
+) -> Tuple[List[T], Union[KeysType, None]]:
+    """Recursively flatten an iterable of iterables, returning both the
+    flattened list, as well as the indices of the original list.
+
+    Args:
+        sequence (NestedSequenceType[T]): Either a sequence or a sequence
+            of sequences; if a sequence of sequences, will be flattened.
+        __offset (int, optional): Internal offset to keep track of the
+            position in the flattened list. Defaults to 0; DO NOT CHANGE.
+
+    Raises:
+        ValueError: If the sequence contains both sequences and
+            non-sequences.
+
+    Returns:
+        List[T]: The flattened list; if the original list was not nested,
+            will be the same as the original list.
+        Union[KeysType, None]: The indices of the original list; if the
+            original list was not nested, will be None.
+    """
+
+    it = iter(sequence)
+    flattened: list = []
+    keys: list = []
+    is_nested_sequence = is_already_flat = False
+
+    while True:
+        try:
+            item = next(it)
+        except StopIteration:
+            break
+
+        if is_sequence_but_not_str(item):
+            if is_already_flat:
+                raise ValueError(
+                    "Cannot mix sequences and non-sequences when flattening."
+                )
+            is_nested_sequence = True
+
+            offset = len(flattened) + __offset
+            # manual casting bc we know this is a sequence (see function
+            # is_sequence_but_not_str) but if we don't cast mypy is going
+            # to complain.
+            item = cast(NestedSequenceType[T], item)
+
+            # must use type: ignore here because mypy doesn't like using
+            # the __offset kwarg (which is a good idea in general but
+            # we need to use it during recursive calls)
+            sub_flattened, sub_keys = flatten_with_indices(  # type: ignore
+                sequence=item, __offset=offset
+            )
+
+            if sub_keys is None:
+                sub_keys = (offset, offset + len(sub_flattened))
+
+            keys.append(sub_keys)
+            flattened.extend(sub_flattened)
+        else:
+            if is_nested_sequence:
+                raise ValueError(
+                    "Cannot mix sequences and non-sequences when flattening."
+                )
+            is_already_flat = True
+
+            flattened.append(item)
+
+    return flattened, (keys or None)
+
+
+def reconstruct_from_indices(
+    flattened: List[T], keys: Union[KeysType, None]
+) -> NestedListType[T]:
+    """Recursively reconstruct a list from a flattened list and the keys that
+    were returned from flatten_with_indices.
+
+    Args:
+        flattened (List[T]): A flat list of items.
+        keys (Union[KeysType, None]): Indices from flatten_with_indices.
+    """
+
+    if keys is None:
+        return flattened
+
+    reconstructed: list = []
+    for key in keys:
+        if isinstance(key, list):
+            reconstructed.append(reconstruct_from_indices(flattened, key))
+        elif isinstance(key, tuple):
+            start, end = key
+            reconstructed.append(flattened[start:end])
+        else:
+            raise ValueError(
+                f"Invalid key type: expected tuple or list, got {type(key)}"
+            )
+
+    return reconstructed
diff --git a/tests/test_shape_utils.py b/tests/test_shape_utils.py
@@ -0,0 +1,63 @@
+import unittest
+
+from smashed.utils.shape_utils import (
+    flatten_with_indices,
+    reconstruct_from_indices,
+)
+
+
+class TestFlatten(unittest.TestCase):
+    def test_flatten(self):
+        li = [
+            [0, 1, 2, 3],
+            ["4", "5"],
+            [6, 7],
+            ["8"],
+            [9.0, 10.0, 11.0, 12.0, 13.0],
+            [],
+            [14, 15, 16],
+            [17, 18, 19, "20"],
+            [21, "22"],
+            [""],
+            [23, 24, 25, 26, 27, 28, 29, "30"],
+        ]
+
+        fl, idx = flatten_with_indices(li)
+        new_li = reconstruct_from_indices(fl, idx)
+
+        self.assertEqual(li, new_li)
+
+    def test_deeply_nested(self):
+        # a list nested 4 levels deep
+        li = [
+            [[[0, 1, 2, 3], ["4", "5"]], [[6, 7], ["8"]]],
+            [
+                [[9.0, 10.0, 11.0, 12.0, 13.0], []],
+                [[14, 15, 16], [17, 18, 19, "20"], [21, "22"], [""]],
+                [[23, 24, 25, 26, 27, 28, 29, "30"]],
+            ],
+        ]
+
+        fl, idx = flatten_with_indices(li)
+        new_li = reconstruct_from_indices(fl, idx)
+
+        self.assertEqual(li, new_li)
+
+    def test_empty(self):
+        li = []
+        fl, idx = flatten_with_indices(li)
+        new_li = reconstruct_from_indices(fl, idx)
+
+        self.assertEqual(li, new_li)
+
+    def test_already_flat(self):
+        li = [0, 1, 2, 3]
+        fl, idx = flatten_with_indices(li)
+        new_li = reconstruct_from_indices(fl, idx)
+
+        self.assertEqual(li, new_li)
+
+    def test_error_when_mixed(self):
+        li = [0, 1, 2, 3, [4, 5, 6]]
+        with self.assertRaises(ValueError):
+            flatten_with_indices(li)