
Commit b60d4ad

local file source (#601)
Added community local file source. Supports Parquet, Jsonlines with gzip compression, and replaying messages at original produce speed.
1 parent a016b2a commit b60d4ad

File tree

12 files changed: +451 / -0 lines changed


docs/build/build.py

Lines changed: 4 additions & 0 deletions
@@ -133,6 +133,10 @@
     "quixstreams.sources.core.csv",
     "quixstreams.sources.core.kafka.kafka",
     "quixstreams.sources.core.kafka.quix",
+    "quixstreams.sources.community.file.file",
+    "quixstreams.sources.community.file.compressions.gzip",
+    "quixstreams.sources.community.file.formats.json",
+    "quixstreams.sources.community.file.formats.parquet",
 ]
 },
 }
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
# Quix File Source Connector

This source enables reading from a local file source, such as JSONlines or Parquet
files. It also supports file (de)compression.

The resulting messages can be produced in "replay" mode, where the time between
producing records matches the original production timing as closely as possible
(per topic partition only).

The Quix File Source Connector is generally intended to be used alongside the related
Quix File Sink Connector (in terms of expected file and data formatting).

## How to use the File Source

To use the File Source, you need to create an instance of `FileSource`
and pass it to the `app.dataframe()` method.

Note that you should generally point to a single topic folder (rather than a root
folder with many topics); otherwise, topic partitions may not line up correctly.

```python
from quixstreams import Application
from quixstreams.sources.community.file import FileSource

app = Application(broker_address="localhost:9092")
source = FileSource(
    filepath="/path/to/my/topic_folder",
    file_format="json",
    file_compression="gzip",
    as_replay=True,
)
sdf = app.dataframe(source=source).print(metadata=True)

if __name__ == "__main__":
    app.run()
```

## File hierarchy/structure

The Quix File Source Connector expects a folder structure like so:

```
my_sinked_topics/
├── topic_a/              # topic name (use this path to File Source!)
│   ├── 0/                # topic partition number
│   │   ├── 0000.ext      # formatted offset files (ex: JSON)
│   │   └── 0011.ext
│   └── 1/
│       ├── 0003.ext
│       └── 0016.ext
└── topic_b/
    └── etc...
```

This is the default structure generated by the Quix File Sink Connector.

## File data format/schema

The expected data schema largely depends on the file format chosen.

For the easiest use with the Quix File Sink Connector, you can follow these patterns
(an illustrative sketch follows this list):

- For row-based formats (like JSON), the expected data should have records with the
  following fields, where `_value` is the entirety of the message value, ideally as a
  JSON-deserializable item:
    - `_key`
    - `_value`
    - `_timestamp`

- Columnar formats (like Parquet) do not expect an explicit `value` field; instead, all
  value columns should be included individually alongside `_key` and `_timestamp`:
    - `_key`
    - `_timestamp`
    - `field_a`
    - `field_b`
    - etc...

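As a rough, hypothetical sketch of the two layouts described above (the `_key`,
`_value`, and `_timestamp` field names come from this doc, but the file names, sample
values, and the use of `pyarrow` are illustrative assumptions rather than the sink's
actual implementation):

```python
import gzip
import json

import pyarrow as pa
import pyarrow.parquet as pq

# Row-based (JSONlines, gzip-compressed): one record per line, with the whole
# message value nested under "_value".
row_record = {
    "_key": "sensor-1",
    "_value": {"temperature": 21.5},
    "_timestamp": 1_700_000_000_000,
}
with gzip.open("0000.jsonl.gz", "wt") as f:
    f.write(json.dumps(row_record) + "\n")

# Columnar (Parquet): value fields become individual columns alongside
# "_key" and "_timestamp".
table = pa.table(
    {
        "_key": ["sensor-1"],
        "_timestamp": [1_700_000_000_000],
        "temperature": [21.5],
    }
)
pq.write_table(table, "0000.parquet")
```
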
## Topic

The default topic will have a partition count that reflects the partition count found
within the provided topic's folder structure. For example, the `topic_a` folder above,
with partition folders `0/` and `1/`, would result in a default topic with 2 partitions.

mkdocs.yml

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@ nav:
 - Kafka Replicator Source: connectors/sources/kafka-source.md
 - Quix Source: connectors/sources/quix-source.md
 - Creating a Custom Source: connectors/sources/custom-sources.md
+- Local File Source: connectors/source/file-source.md
 - Contribution Guide: 'connectors/contribution-guide.md'
 - Community and Core Connectors: 'connectors/community-and-core.md'
 - Upgrading Guide:
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# ruff: noqa: F403
from .file import *
from .formats import *
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
# ruff: noqa: F403
# ruff: noqa: F405
from .base import *
from .gzip import *

COMPRESSION_MAPPER = {"gz": GZipDecompressor, "gzip": GZipDecompressor}
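
For orientation, a hypothetical usage sketch (not code from this commit; it assumes
this mapper lives in the `quixstreams.sources.community.file.compressions` package, as
the relative imports suggest) showing how a compression name resolves to a decompressor:

```python
from pathlib import Path

from quixstreams.sources.community.file.compressions import COMPRESSION_MAPPER

# Look up the decompressor class by its compression name and instantiate it.
decompressor = COMPRESSION_MAPPER["gzip"]()  # -> GZipDecompressor

# Hypothetical file path; returns the decompressed bytes of the file.
raw_bytes = decompressor.decompress(Path("/path/to/topic_a/0/0000.gz"))
```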
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Literal

__all__ = (
    "Decompressor",
    "CompressionName",
)


CompressionName = Literal["gz", "gzip"]


class Decompressor(ABC):
    @abstractmethod
    def decompress(self, filepath: Path) -> bytes: ...
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
from pathlib import Path

from .base import Decompressor

__all__ = ("GZipDecompressor",)


class GZipDecompressor(Decompressor):
    def __init__(self):
        from gzip import decompress

        self._decompressor = decompress

    def decompress(self, filepath: Path) -> bytes:
        with open(filepath, "rb") as f:
            return self._decompressor(f.read())
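
To illustrate the extension point the `Decompressor` base class provides, here is a
hypothetical sketch (not part of this commit) of a bzip2 variant following the same
pattern; a real addition would also need entries in `COMPRESSION_MAPPER` and the
`CompressionName` literal above:

```python
from bz2 import decompress
from pathlib import Path

from quixstreams.sources.community.file.compressions.base import Decompressor


class BZ2Decompressor(Decompressor):
    """Hypothetical extra decompressor, mirroring GZipDecompressor above."""

    def decompress(self, filepath: Path) -> bytes:
        # Read the whole compressed file and return its decompressed bytes.
        with open(filepath, "rb") as f:
            return decompress(f.read())
```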
Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
import logging
from pathlib import Path
from time import sleep
from typing import Generator, Optional, Union

from quixstreams.models import Topic, TopicConfig
from quixstreams.sources import Source

from .compressions import CompressionName
from .formats import FORMATS, Format, FormatName

__all__ = ("FileSource",)

logger = logging.getLogger(__name__)


class FileSource(Source):
    """
    Ingest a set of local files into kafka by iterating through the provided folder and
    processing all nested files within it.

    Expects folder and file structures as generated by the related Quix Streams File
    Sink Connector:

    my_topics/
    ├── topic_a/
    │   ├── 0/
    │   │   ├── 0000.ext
    │   │   └── 0011.ext
    │   └── 1/
    │       ├── 0003.ext
    │       └── 0016.ext
    └── topic_b/
        └── etc...

    Intended to be used with a single topic (ex: topic_a), but will recursively read
    from whatever entrypoint is passed to it.

    File format structure depends on the file format.

    See the `.formats` and `.compressions` modules to see what is supported.

    Example:

    from quixstreams import Application
    from quixstreams.sources.community.file import FileSource

    app = Application(broker_address="localhost:9092", auto_offset_reset="earliest")
    source = FileSource(
        filepath="/path/to/my/topic_folder",
        file_format="json",
        file_compression="gzip",
    )
    sdf = app.dataframe(source=source).print(metadata=True)

    if __name__ == "__main__":
        app.run()
    """

    def __init__(
        self,
        filepath: Union[str, Path],
        file_format: Union[Format, FormatName],
        file_compression: Optional[CompressionName] = None,
        as_replay: bool = True,
        name: Optional[str] = None,
        shutdown_timeout: float = 10,
    ):
        """
        :param filepath: a filepath to recursively read through; it is recommended to
            provide the path to a given topic folder (ex: `/path/to/topic_a`).
        :param file_format: what format the message files are in (ex: json, parquet).
            Optionally, can provide a `Format` instance if more than file_compression
            is necessary to define (file_compression will then be ignored).
        :param file_compression: what compression is used on the given files, if any.
        :param as_replay: Produce the messages with the original time delay between them.
            Otherwise, produce the messages as fast as possible.
            NOTE: Time delay will only be accurate per partition, NOT overall.
        :param name: The name of the Source application (Default: last folder name).
        :param shutdown_timeout: Time in seconds the application waits for the source
            to gracefully shutdown
        """
        self._filepath = Path(filepath)
        self._formatter = _get_formatter(file_format, file_compression)
        self._as_replay = as_replay
        self._previous_timestamp = None
        self._previous_partition = None
        super().__init__(
            name=name or self._filepath.name, shutdown_timeout=shutdown_timeout
        )

    def _replay_delay(self, current_timestamp: int):
        """
        Apply the replay speed by calculating the delay between messages
        based on their timestamps.
        """
        if self._previous_timestamp is not None:
            time_diff = (current_timestamp - self._previous_timestamp) / 1000
            if time_diff > 0:
                logger.debug(f"Sleeping for {time_diff} seconds...")
                sleep(time_diff)
        self._previous_timestamp = current_timestamp

    def _get_partition_count(self) -> int:
        return len([f for f in self._filepath.iterdir()])

    def default_topic(self) -> Topic:
        """
        Uses the file structure to generate the desired partition count for the
        internal topic.
        :return: the original default topic, with updated partition count
        """
        topic = super().default_topic()
        topic.config = TopicConfig(
            num_partitions=self._get_partition_count(), replication_factor=1
        )
        return topic

    def _check_file_partition_number(self, file: Path):
        """
        Checks whether the next file is the start of a new partition so the timestamp
        tracker can be reset.
        """
        partition = int(file.parent.name)
        if self._previous_partition != partition:
            self._previous_timestamp = None
            self._previous_partition = partition
            logger.debug(f"Beginning reading partition {partition}")

    def _produce(self, record: dict):
        kafka_msg = self._producer_topic.serialize(
            key=record["_key"],
            value=record["_value"],
            timestamp_ms=record["_timestamp"],
        )
        self.produce(
            key=kafka_msg.key, value=kafka_msg.value, timestamp=kafka_msg.timestamp
        )

    def run(self):
        while self._running:
            for file in _file_finder(self._filepath):
                logger.info(f"Reading files from topic {self._filepath.name}")
                self._check_file_partition_number(file)
                for record in self._formatter.file_read(file):
                    if self._as_replay:
                        self._replay_delay(record["_timestamp"])
                    self._produce(record)
            self.flush()
            return


def _get_formatter(
    formatter: Union[Format, FormatName], compression: Optional[CompressionName]
) -> Format:
    if isinstance(formatter, Format):
        return formatter
    elif format_obj := FORMATS.get(formatter):
        return format_obj(compression=compression)

    allowed_formats = ", ".join(FormatName.__args__)
    raise ValueError(
        f'Invalid format name "{formatter}". '
        f"Allowed values: {allowed_formats}, "
        f"or an instance of a subclass of `Format`."
    )


def _file_finder(filepath: Path) -> Generator[Path, None, None]:
    if filepath.is_dir():
        for i in sorted(filepath.iterdir(), key=lambda x: x.name):
            yield from _file_finder(i)
    else:
        yield filepath
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
# ruff: noqa: F403
# ruff: noqa: F405
from .base import *
from .json import *
from .parquet import *

FORMATS = {
    "json": JSONFormat,
    "parquet": ParquetFormat,
}
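
As a hedged sketch of the alternative mentioned in the `FileSource` docstring above
(assuming `JSONFormat` is exported by these star imports and accepts the same
`compression` keyword that `_get_formatter` passes), a preconfigured `Format` instance
can be supplied instead of a format name:

```python
from quixstreams.sources.community.file import FileSource
from quixstreams.sources.community.file.formats import JSONFormat

# Hypothetical: configure the format object directly; file_compression is then ignored.
source = FileSource(
    filepath="/path/to/my/topic_folder",
    file_format=JSONFormat(compression="gzip"),
)
```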
