
Commit c51cdad

Implement timestamp alignment by buffering the incoming Kafka messages (#857)
Co-authored-by: Remy Gwaramadze <gwaramadze@users.noreply.github.com>
Parent: 4b4476e

12 files changed: +1070 −58 lines

quixstreams/app.py

Lines changed: 22 additions & 2 deletions
@@ -69,6 +69,11 @@
 # Enforce idempotent producing for the internal RowProducer
 _default_producer_extra_config = {"enable.idempotence": True}
 
+# Default config for the internal consumer
+_default_consumer_extra_config = {
+    "fetch.queue.backoff.ms": 100,  # Make the consumer fetch data more often
+}
+
 # Force assignment strategy to be "range" for co-partitioning in internal Consumers
 consumer_extra_config_overrides = {"partition.assignment.strategy": "range"}
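For context: `fetch.queue.backoff.ms` is a librdkafka consumer property controlling how long the client backs off before issuing the next fetch request once a partition's fetch-queue thresholds are hit. Setting it well below the librdkafka default should let a just-drained partition receive fresh data sooner, which matters when the buffering introduced in this commit holds messages back to keep timestamps aligned.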

@@ -151,6 +156,7 @@ def __init__(
         request_timeout: float = 30,
         topic_create_timeout: float = 60,
         processing_guarantee: ProcessingGuarantee = "at-least-once",
+        max_partition_buffer_size: int = 10000,
     ):
         """
         :param broker_address: Connection settings for Kafka.
@@ -210,6 +216,11 @@ def __init__(
         :param request_timeout: timeout (seconds) for REST-based requests
         :param topic_create_timeout: timeout (seconds) for topic create finalization
         :param processing_guarantee: Use "exactly-once" or "at-least-once" processing.
+        :param max_partition_buffer_size: the maximum number of messages to buffer per topic partition before it is considered full.
+            Buffering is used to consume messages in order across multiple partitions with the same partition number.
+            It is a soft limit, and the actual number of buffered messages can be up to 2x higher.
+            A lower value decreases memory use but increases latency.
+            Default - `10000`.
 
         <br><br>***Error Handlers***<br>
         To handle errors, `Application` accepts callbacks triggered when
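As a usage sketch (the broker address and buffer size are illustrative placeholders, not recommendations), the new parameter is tuned at construction time:

```python
from quixstreams import Application

# A smaller per-partition buffer trades memory for latency:
# less data is held for timestamp alignment, but the consumer
# may have to wait on the broker more often.
app = Application(
    broker_address="localhost:9092",
    max_partition_buffer_size=1000,
)
```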
@@ -305,7 +316,10 @@ def __init__(
                 **_default_producer_extra_config,
                 **producer_extra_config,
             },
-            consumer_extra_config=consumer_extra_config,
+            consumer_extra_config={
+                **_default_consumer_extra_config,
+                **consumer_extra_config,
+            },
             processing_guarantee=processing_guarantee,
             consumer_poll_timeout=consumer_poll_timeout,
             producer_poll_timeout=producer_poll_timeout,
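The `{**defaults, **user_config}` pattern means user-supplied keys always win over the library defaults; a minimal sketch of the semantics:

```python
# In dict unpacking, later keys override earlier ones, so a
# user-supplied value replaces the library default for the same key.
_default_consumer_extra_config = {"fetch.queue.backoff.ms": 100}
consumer_extra_config = {"fetch.queue.backoff.ms": 500}  # user override

merged = {**_default_consumer_extra_config, **consumer_extra_config}
assert merged == {"fetch.queue.backoff.ms": 500}
```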
@@ -315,6 +329,7 @@ def __init__(
             state_dir=state_dir,
             rocksdb_options=rocksdb_options,
             use_changelog_topics=use_changelog_topics,
+            max_partition_buffer_size=max_partition_buffer_size,
         )
 
         self._on_message_processed = on_message_processed
@@ -634,6 +649,7 @@ def _get_rowconsumer(
             consumer_group=self._config.consumer_group,
             auto_offset_reset=self._config.auto_offset_reset,
             auto_commit_enable=False,  # Disable auto commit and manage commits manually
+            max_partition_buffer_size=self._config.max_partition_buffer_size,
             extra_config=extra_config,
             on_error=on_error,
         )
@@ -905,7 +921,10 @@ def _quix_runtime_init(self):
     def _process_message(self, dataframe_composed):
         # Serve producer callbacks
        self._producer.poll(self._config.producer_poll_timeout)
-        rows = self._consumer.poll_row(timeout=self._config.consumer_poll_timeout)
+        rows = self._consumer.poll_row(
+            timeout=self._config.consumer_poll_timeout,
+            buffered=self._dataframe_registry.requires_time_alignment,
+        )
 
         if rows is None:
             self._run_tracker.set_current_message_tp(None)
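The internals of `poll_row(buffered=True)` are not part of this diff, but the underlying idea can be sketched: buffer messages per partition and only release the one with the smallest timestamp, holding back whenever some partition has nothing to compare against. The names below are hypothetical, not the actual `RowConsumer` implementation:

```python
from collections import deque
from typing import Optional

class AlignedBuffer:
    """Toy model of timestamp-aligned consumption across partitions."""

    def __init__(self) -> None:
        # partition -> FIFO queue of (timestamp_ms, value) tuples
        self._buffers: dict[int, deque] = {}

    def add(self, partition: int, timestamp_ms: int, value: object) -> None:
        self._buffers.setdefault(partition, deque()).append((timestamp_ms, value))

    def pop_aligned(self) -> Optional[tuple[int, object]]:
        # Release nothing until every tracked partition has data;
        # otherwise a lagging partition could later deliver an
        # older timestamp out of order.
        queues = list(self._buffers.values())
        if not queues or any(not q for q in queues):
            return None
        # Pop from the partition whose head message is the oldest.
        oldest = min(queues, key=lambda q: q[0][0])
        return oldest.popleft()

buf = AlignedBuffer()
buf.add(0, 105, "a")
buf.add(1, 100, "b")
assert buf.pop_aligned() == (100, "b")  # partition 1 holds the older message
```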
@@ -1100,6 +1119,7 @@ class ApplicationConfig(BaseSettings):
     state_dir: Path = Path("state")
     rocksdb_options: Optional[RocksDBOptionsType] = None
     use_changelog_topics: bool = True
+    max_partition_buffer_size: int = 10000
 
     @classmethod
     def settings_customise_sources(

quixstreams/dataframe/dataframe.py

Lines changed: 1 addition & 0 deletions
@@ -1611,6 +1611,7 @@ def concat(self, other: "StreamingDataFrame") -> "StreamingDataFrame":
         """
 
         merged_stream = self.stream.merge(other.stream)
+        self._registry.require_time_alignment()
         return self.__dataframe_clone__(
             *self.topics, *other.topics, stream=merged_stream
         )
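In practice, concatenating two dataframes is what flips this flag; a usage sketch (broker address and topic names are placeholders):

```python
from quixstreams import Application

app = Application(broker_address="localhost:9092")
orders = app.dataframe(app.topic("orders"))
refunds = app.dataframe(app.topic("refunds"))

# concat() merges the two streams and marks the topology as
# requiring timestamp-aligned consumption of both topics.
combined = orders.concat(refunds)
```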

quixstreams/dataframe/registry.py

Lines changed: 19 additions & 0 deletions
@@ -26,6 +26,15 @@ def __init__(self) -> None:
         self._repartition_origins: set[str] = set()
         self._topics_to_stream_ids: dict[str, set[str]] = {}
         self._stream_ids_to_topics: dict[str, set[str]] = {}
+        self._requires_time_alignment = False
+
+    @property
+    def requires_time_alignment(self) -> bool:
+        """
+        Check whether the registered StreamingDataFrames require topics to be read in a timestamp-aligned way.
+        This is normally required for operations like `.concat()` and joins.
+        """
+        return self._requires_time_alignment
 
     @property
     def consumer_topics(self) -> list[Topic]:
@@ -131,3 +140,13 @@ def get_topics_for_stream_id(self, stream_id: str) -> list[str]:
         :return: a list of topic names
         """
         return list(self._stream_ids_to_topics[stream_id])
+
+    def require_time_alignment(self):
+        """
+        Require time alignment for the topology.
+
+        This flag is set by individual StreamingDataFrames when operations like
+        `.concat()` or joins are triggered, and it informs the application to consume
+        messages in a timestamp-aligned way for correct processing.
+        """
+        self._requires_time_alignment = True
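The flag is a one-way latch: once any operation requires alignment, the whole topology consumes in aligned mode. A sketch, assuming the class in this file is named `DataFrameRegistry` (the name is not visible in this diff):

```python
registry = DataFrameRegistry()  # assumed class name

assert registry.requires_time_alignment is False
registry.require_time_alignment()  # e.g. triggered by .concat()
assert registry.requires_time_alignment is True
```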

quixstreams/kafka/consumer.py

Lines changed: 16 additions & 0 deletions
@@ -580,6 +580,22 @@ def consumer_group_metadata(self) -> GroupMetadata:
         """
         return self._consumer.consumer_group_metadata()
 
+    def consume(
+        self, num_messages: int = 1, timeout: Optional[float] = None
+    ) -> list[RawConfluentKafkaMessageProto]:
+        """
+        Consumes a list of messages (possibly empty on timeout).
+        Callbacks may be executed as a side effect of calling this method.
+
+        :param num_messages: The maximum number of messages to return.
+            Default: `1`.
+        :param timeout: The maximum time in seconds to block waiting for a message, event, or callback.
+            Default: `None` (infinite).
+        """
+        return self._consumer.consume(
+            num_messages=num_messages, timeout=timeout if timeout is not None else -1
+        )
+
     @property
     def _consumer(self) -> ConfluentConsumer:
         """

quixstreams/rowconsumer/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from .consumer import RowConsumer as RowConsumer
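The `RowConsumer as RowConsumer` spelling follows the PEP 484 convention for marking a name as an explicit re-export, so type checkers such as mypy (under `--no-implicit-reexport`) treat it as part of the package's public surface rather than an incidental import.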
