Skip to content

Commit 000d795

Browse files
authored
add SDF.print(), allow inplace for SDF.update() and SDF.to_topic() (#403)
- Added `SDF.print(pretty: bool, metadata: bool)`, which prints the value, and optionally metadata - Enables `SDF.print()`, `SDF.update()` and `SDF.to_topic()` to be used without reassigning them (and you can chain them as normal) ```python sdf = SDF() sdf = sdf.apply(a_func) sdf.update(other_func).print(metadata=True).to_topic(my_topic) ```
1 parent 0a73ced commit 000d795

File tree

3 files changed

+166
-16
lines changed

3 files changed

+166
-16
lines changed

docs/processing.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -501,15 +501,23 @@ For example, to log input data, or to update a counter in the State.
501501
The return of the callback passed to `.update()` will be ignored, and the original input
502502
will be sent to downstream operations instead.
503503

504+
This operation occurs in-place, meaning reassigning the operation to your `sdf` is
505+
entirely OPTIONAL; the original `StreamingDataFrame` is still returned to allow the
506+
chaining of commands like `sdf.update().print()`.
507+
508+
> Note: chains that include any non-inplace function will still require reassignment:
509+
> `sdf = sdf.update().filter().print()`
510+
504511
**Example:**
505512

506513
```python
507514
# Mutate a list by appending a new item to it
508515
# The updated list will be passed downstream
509516
sdf = sdf.update(lambda some_list: some_list.append(1))
510517

511-
# Using .update() to print a value to the console
512-
sdf = sdf.update(lambda value: print("Received value: ", value))
518+
# OR instead (no reassignment):
519+
sdf.update(lambda some_list: some_list.append(1))
520+
513521
```
514522

515523
### StreamingDataFrame.filter()

quixstreams/dataframe/dataframe.py

Lines changed: 69 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import contextvars
44
import functools
55
import operator
6+
import pprint
67
from copy import deepcopy
78
from datetime import timedelta
89
from typing import (
@@ -282,6 +283,9 @@ def update(
282283
The result of the function will be ignored, and the original value will be
283284
passed downstream.
284285
286+
This operation occurs in-place, meaning reassignment is entirely OPTIONAL: the
287+
original `StreamingDataFrame` is returned for chaining (`sdf.update().print()`).
288+
285289
286290
Example Snippet:
287291
@@ -297,7 +301,8 @@ def func(values: list, state: State):
297301
298302
sdf = StreamingDataframe()
299303
sdf = sdf.update(func, stateful=True)
300-
sdf = sdf.update(lambda value: print("Received value: ", value))
304+
# does not require reassigning
305+
sdf.update(lambda v: v.append(1))
301306
```
302307
303308
:param func: function to update value
@@ -306,6 +311,7 @@ def func(values: list, state: State):
306311
:param metadata: if True, the callback will receive key, timestamp and headers
307312
along with the value.
308313
Default - `False`.
314+
:return: the updated StreamingDataFrame instance (reassignment NOT required).
309315
"""
310316
if stateful:
311317
self._register_store()
@@ -319,15 +325,14 @@ def func(values: list, state: State):
319325
func=cast(UpdateWithMetadataCallbackStateful, with_metadata_func),
320326
processing_context=self._processing_context,
321327
)
322-
stream = self.stream.add_update(
328+
return self._add_update(
323329
cast(UpdateWithMetadataCallback, stateful_func), metadata=True
324330
)
325331
else:
326-
stream = self.stream.add_update(
332+
return self._add_update(
327333
cast(Union[UpdateCallback, UpdateWithMetadataCallback], func),
328334
metadata=metadata,
329335
)
330-
return self.__dataframe_clone__(stream=stream)
331336

332337
@overload
333338
def filter(self, func: FilterCallback) -> Self: ...
@@ -546,6 +551,9 @@ def to_topic(
546551
"""
547552
Produce current value to a topic. You can optionally specify a new key.
548553
554+
This operation occurs in-place, meaning reassignment is entirely OPTIONAL: the
555+
original `StreamingDataFrame` is returned for chaining (`sdf.update().print()`).
556+
549557
Example Snippet:
550558
551559
```python
@@ -560,17 +568,18 @@ def to_topic(
560568
561569
sdf = app.dataframe(input_topic)
562570
sdf = sdf.to_topic(output_topic_0)
563-
sdf = sdf.to_topic(output_topic_1, key=lambda data: data["a_field"])
571+
# does not require reassigning
572+
sdf.to_topic(output_topic_1, key=lambda data: data["a_field"])
564573
```
565574
566575
:param topic: instance of `Topic`
567576
:param key: a callable to generate a new message key, optional.
568577
If passed, the return type of this callable must be serializable
569578
by `key_serializer` defined for this Topic object.
570579
By default, the current message key will be used.
571-
580+
:return: the updated StreamingDataFrame instance (reassignment NOT required).
572581
"""
573-
return self.update(
582+
return self._add_update(
574583
lambda value, orig_key, timestamp, headers: self._produce(
575584
topic=topic,
576585
value=value,
@@ -673,6 +682,48 @@ def _set_headers_callback(
673682
stream = self.stream.add_transform(func=_set_headers_callback)
674683
return self.__dataframe_clone__(stream=stream)
675684

685+
def print(self, pretty: bool = True, metadata: bool = False) -> Self:
    """
    Print out the current message value (and optionally, the message metadata) to
    stdout (console) (like the built-in `print` function).

    Can also output a more dict-friendly format with `pretty=True`.

    This operation occurs in-place, meaning reassignment is entirely OPTIONAL: the
    original `StreamingDataFrame` is returned for chaining (`sdf.update().print()`).

    > NOTE: prints the current (edited) values, not the original values.

    Example Snippet:

    ```python
    from quixstreams import Application


    app = Application()
    input_topic = app.topic("data")

    sdf = app.dataframe(input_topic)
    sdf["edited_col"] = sdf["orig_col"] + "edited"
    # print the updated message value with the newly added column
    sdf.print()
    ```

    :param pretty: Whether to use "pprint" formatting, which uses new-lines and
        indents for easier console reading (but might be worse for log parsing).
    :param metadata: Whether to additionally print the key, timestamp, and headers
    :return: the updated StreamingDataFrame instance (reassignment NOT required).
    """
    # Field names in the order the update callback receives its arguments.
    print_args = ("value", "key", "timestamp", "headers")
    if pretty:
        # sort_dicts=False preserves insertion order so "value" is printed first.
        printer = functools.partial(pprint.pprint, indent=2, sort_dicts=False)
    else:
        printer = print
    return self._add_update(
        # zip() truncates to however many args were actually passed:
        # just "value" without metadata, all four fields with it.
        lambda *args: printer(dict(zip(print_args, args))),
        metadata=metadata,
    )
676727
def compose(
677728
self,
678729
sink: Optional[Callable[[Any, Any, int, Any], None]] = None,
@@ -929,6 +980,14 @@ def _produce(
929980
)
930981
self._producer.produce_row(row=row, topic=topic, key=key, timestamp=timestamp)
931982

983+
def _add_update(
    self,
    func: Union[UpdateCallback, UpdateWithMetadataCallback],
    metadata: bool = False,
) -> Self:
    """
    Register an "update" operation on the underlying stream **in place**.

    Unlike operations that clone the dataframe (via `__dataframe_clone__`),
    this mutates `self._stream` and returns `self`; this is what lets
    `update()`, `to_topic()` and `print()` work without reassignment while
    still supporting chaining.

    :param func: the update callback to register on the stream.
    :param metadata: if True, `func` also receives key, timestamp and headers
        along with the value. Default - `False`.
    :return: this same `StreamingDataFrame` instance (not a clone).
    """
    self._stream = self._stream.add_update(func, metadata=metadata)
    return self
990+
932991
def _register_store(self):
933992
"""
934993
Register the default store for input topic in StateStoreManager
@@ -986,7 +1045,7 @@ def __setitem__(self, item_key: Any, item: Union[Self, object]):
9861045
# Update an item key with a result of another sdf.apply()
9871046
diff = self.stream.diff(item.stream)
9881047
other_sdf_composed = diff.compose_returning()
989-
stream = self.stream.add_update(
1048+
self._add_update(
9901049
lambda value, key, timestamp, headers: operator.setitem(
9911050
value,
9921051
item_key,
@@ -997,18 +1056,15 @@ def __setitem__(self, item_key: Any, item: Union[Self, object]):
9971056
elif isinstance(item, StreamingSeries):
9981057
# Update an item key with a result of another series
9991058
series_composed = item.compose_returning()
1000-
stream = self.stream.add_update(
1059+
self._add_update(
10011060
lambda value, key, timestamp, headers: operator.setitem(
10021061
value, item_key, series_composed(value, key, timestamp, headers)[0]
10031062
),
10041063
metadata=True,
10051064
)
10061065
else:
10071066
# Update an item key with a constant
1008-
stream = self.stream.add_update(
1009-
lambda value: operator.setitem(value, item_key, item)
1010-
)
1011-
self._stream = stream
1067+
self._add_update(lambda value: operator.setitem(value, item_key, item))
10121068

10131069
@overload
10141070
def __getitem__(self, item: str) -> StreamingSeries: ...

tests/test_quixstreams/test_dataframe/test_dataframe.py

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import operator
23
import uuid
34
from collections import namedtuple
@@ -369,6 +370,26 @@ def test_set_headers(self, original_headers, new_headers, dataframe_factory):
369370
)[0]
370371
assert result == expected
371372

373+
@pytest.mark.parametrize(
    "metadata,expected",
    [
        (False, str({"value": {"x": 1}})),
        (
            True,
            str({"value": {"x": 1}, "key": b"key", "timestamp": 0, "headers": []}),
        ),
    ],
)
def test_print(self, dataframe_factory, metadata, expected, capsys):
    """`print()` emits the value (plus metadata when requested) to stdout."""
    sdf = dataframe_factory()
    # No reassignment needed: print() registers an in-place update operation.
    sdf.print(metadata=metadata)

    record = {"x": 1}
    sdf.test(value=record, key=b"key", timestamp=0, headers=[])

    captured = capsys.readouterr()
    assert expected in captured.out
392+
372393

373394
class TestStreamingDataFrameApplyExpand:
374395
def test_apply_expand(self, dataframe_factory):
@@ -418,14 +439,76 @@ def test_setitem_expand_not_allowed(self, dataframe_factory):
418439
_ = sdf[sdf.apply(lambda v: [v, v], expand=True)]
419440

420441

442+
class TestStreamingDataFrameUpdate:
443+
def test_update_no_reassign(self, dataframe_factory):
444+
"""
445+
"Update" operations should be applied regardless of a reassignment,
446+
and anything else requires assignment.
447+
"""
448+
sdf = dataframe_factory()
449+
sdf_tree_1 = sdf.stream.tree()
450+
sdf_id_1 = id(sdf)
451+
452+
# non-update non-reassignment (no change!)
453+
sdf.apply(lambda v: v)
454+
sdf_tree_2 = sdf.stream.tree()
455+
sdf_id_2 = id(sdf)
456+
assert sdf_id_1 == sdf_id_2
457+
assert sdf_tree_1 == sdf_tree_2
458+
459+
# non-update reassignment
460+
sdf = sdf.apply(lambda v: v)
461+
sdf_tree_3 = sdf.stream.tree()
462+
sdf_id_3 = id(sdf)
463+
assert sdf_id_2 != sdf_id_3
464+
assert sdf_tree_2 != sdf_tree_3
465+
466+
# update non-reassignment
467+
sdf.update(lambda v: v)
468+
sdf_tree_4 = sdf.stream.tree()
469+
sdf_id_4 = id(sdf)
470+
assert sdf_id_3 == sdf_id_4
471+
assert sdf_tree_3 != sdf_tree_4
472+
473+
# update reassignment
474+
sdf = sdf.update(lambda v: v)
475+
sdf_tree_5 = sdf.stream.tree()
476+
sdf_id_5 = id(sdf)
477+
assert sdf_id_4 == sdf_id_5
478+
assert sdf_tree_4 != sdf_tree_5
479+
480+
def test_chaining_inplace_with_non_inplace(self, dataframe_factory):
481+
"""
482+
When chaining together inplace and non-inplace, reassigning must happen else
483+
everything starting with the non-inplace will be lost.
484+
"""
485+
sdf = dataframe_factory()
486+
sdf.update(lambda v: v.append(1)).apply(lambda v: v + [2]).update(
487+
lambda v: v.append(3)
488+
)
489+
sdf = sdf.apply(lambda v: v + [4])
490+
491+
value = []
492+
key, timestamp, headers = b"key", 0, []
493+
494+
assert sdf.test(value, key, timestamp, headers)[0] == (
495+
[1, 4],
496+
key,
497+
timestamp,
498+
headers,
499+
)
500+
501+
421502
class TestStreamingDataFrameToTopic:
503+
@pytest.mark.parametrize("reassign", [True, False])
422504
def test_to_topic(
423505
self,
424506
dataframe_factory,
425507
row_consumer_factory,
426508
row_producer_factory,
427509
topic_manager_topic_factory,
428510
message_context_factory,
511+
reassign,
429512
):
430513
topic = topic_manager_topic_factory(
431514
key_serializer="str",
@@ -436,7 +519,10 @@ def test_to_topic(
436519
producer = row_producer_factory()
437520

438521
sdf = dataframe_factory(producer=producer)
439-
sdf = sdf.to_topic(topic)
522+
if reassign:
523+
sdf = sdf.to_topic(topic)
524+
else:
525+
sdf.to_topic(topic)
440526

441527
value = {"x": 1, "y": 2}
442528
key, timestamp = "key", 10

0 commit comments

Comments
 (0)