Skip to content

Commit fbdddd3

Browse files
authored
Streaming dataframe support (#354)
2 parents 1ec5922 + 399b1a7 commit fbdddd3

20 files changed

+585
-37
lines changed

ads/feature_store/common/enums.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,20 @@ class DatasetIngestionMode(Enum):
4949
SQL = "SQL"
5050

5151

52-
class IngestionMode(Enum):
52+
class IngestionType(Enum):
53+
"""
54+
The type of ingestion that can be performed.
55+
56+
Possible values:
57+
* STREAMING: The data is ingested in real time.
58+
* BATCH: The data is ingested in batches.
59+
"""
60+
61+
STREAMING = "STREAMING"
62+
BATCH = "BATCH"
63+
64+
65+
class BatchIngestionMode(Enum):
5366
"""
5467
An enumeration that represents the supported Ingestion Mode in feature store.
5568
@@ -68,6 +81,20 @@ class IngestionMode(Enum):
6881
UPSERT = "UPSERT"
6982

7083

84+
class StreamingIngestionMode(Enum):
85+
"""
86+
Enumeration for stream ingestion modes.
87+
88+
- `COMPLETE`: Represents complete stream ingestion where the entire dataset is replaced.
89+
- `APPEND`: Represents appending new data to the existing dataset.
90+
- `UPDATE`: Represents updating existing data in the dataset.
91+
"""
92+
93+
COMPLETE = "COMPLETE"
94+
APPEND = "APPEND"
95+
UPDATE = "UPDATE"
96+
97+
7198
class JoinType(Enum):
7299
"""Enumeration of supported SQL join types.
73100
@@ -214,6 +241,7 @@ class TransformationMode(Enum):
214241

215242
SQL = "sql"
216243
PANDAS = "pandas"
244+
SPARK = "spark"
217245

218246

219247
class FilterOperators(Enum):

ads/feature_store/common/utils/transformation_utils.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,10 @@ def apply_transformation(
6969
temporary_table_view, **transformation_kwargs_dict
7070
)
7171
)
72-
elif transformation.transformation_mode == TransformationMode.PANDAS.value:
72+
elif transformation.transformation_mode in [
73+
TransformationMode.PANDAS.value,
74+
TransformationMode.SPARK.value,
75+
]:
7376
transformed_data = transformation_function_caller(
7477
dataframe, **transformation_kwargs_dict
7578
)

ads/feature_store/dataset.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,15 @@
2121
ExecutionEngine,
2222
ExpectationType,
2323
EntityType,
24+
BatchIngestionMode,
2425
)
2526
from ads.feature_store.common.exceptions import NotMaterializedError
2627
from ads.feature_store.common.utils.utility import (
2728
get_metastore_id,
2829
validate_delta_format_parameters,
2930
convert_expectation_suite_to_expectation,
3031
)
31-
from ads.feature_store.dataset_job import DatasetJob, IngestionMode
32+
from ads.feature_store.dataset_job import DatasetJob
3233
from ads.feature_store.execution_strategy.engine.spark_engine import SparkEngine
3334
from ads.feature_store.execution_strategy.execution_strategy_provider import (
3435
OciExecutionStrategyProvider,
@@ -779,7 +780,7 @@ def delete(self):
779780
None
780781
"""
781782
# Create DataSet Job and persist it
782-
dataset_job = self._build_dataset_job(IngestionMode.DEFAULT)
783+
dataset_job = self._build_dataset_job(BatchIngestionMode.DEFAULT)
783784

784785
# Create the Job
785786
dataset_job.create()
@@ -874,7 +875,7 @@ def _update_from_oci_dataset_model(self, oci_dataset: OCIDataset) -> "Dataset":
874875

875876
def materialise(
876877
self,
877-
ingestion_mode: IngestionMode = IngestionMode.OVERWRITE,
878+
ingestion_mode: BatchIngestionMode = BatchIngestionMode.OVERWRITE,
878879
feature_option_details: FeatureOptionDetails = None,
879880
):
880881
"""Creates a dataset job.

ads/feature_store/dataset_job.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,17 @@
55
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
66
import logging
77
from copy import deepcopy
8-
from typing import Dict, List, Any
8+
from typing import Dict, List, Any, Union
99

1010
import pandas
1111

1212
from ads.common import utils
13+
from ads.feature_store.common.enums import (
14+
JobConfigurationType,
15+
BatchIngestionMode,
16+
StreamingIngestionMode,
17+
)
1318
from ads.feature_store.feature_option_details import FeatureOptionDetails
14-
from ads.feature_store.common.enums import IngestionMode, JobConfigurationType
1519
from ads.feature_store.service.oci_dataset_job import OCIDatasetJob
1620
from ads.jobs.builders.base import Builder
1721

@@ -225,10 +229,14 @@ def ingestion_mode(self) -> str:
225229
return self.get_spec(self.CONST_INGESTION_MODE)
226230

227231
@ingestion_mode.setter
228-
def ingestion_mode(self, ingestion_mode: IngestionMode) -> "DatasetJob":
232+
def ingestion_mode(
233+
self, ingestion_mode: Union[BatchIngestionMode, StreamingIngestionMode]
234+
) -> "DatasetJob":
229235
return self.with_ingestion_mode(ingestion_mode)
230236

231-
def with_ingestion_mode(self, ingestion_mode: IngestionMode) -> "DatasetJob":
237+
def with_ingestion_mode(
238+
self, ingestion_mode: Union[BatchIngestionMode, StreamingIngestionMode]
239+
) -> "DatasetJob":
232240
"""Sets the mode of the dataset ingestion mode.
233241
234242
Parameters

ads/feature_store/docs/source/dataset.rst

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ Use the ``from_id()`` method from the ``Dataset`` class to load an existing data
7474
7575
from ads.feature_store.dataset import Dataset
7676
77-
dataset = Dataset.from_id("ocid1.dataset..<unique_id>")
77+
dataset = Dataset.from_id("<unique_id>")
7878
7979
Materialise
8080
===========
@@ -138,6 +138,10 @@ Feature store allows you to define expectations on data being materialized into
138138

139139
.. code-block:: python3
140140
141+
from great_expectations.core import ExpectationSuite, ExpectationConfiguration
142+
from ads.feature_store.common.enums import TransformationMode, ExpectationType
143+
from ads.feature_store.feature_group import FeatureGroup
144+
141145
expectation_suite = ExpectationSuite(
142146
expectation_suite_name="expectation_suite_name"
143147
)
@@ -186,6 +190,7 @@ dataset or it can be updated later as well.
186190
.. code-block:: python3
187191
188192
# Define statistics configuration for selected features
193+
from ads.feature_store.statistics_config import StatisticsConfig
189194
stats_config = StatisticsConfig().with_is_enabled(True).with_columns(["column1", "column2"])
190195
191196
@@ -194,6 +199,7 @@ This can be used with dataset instance.
194199
.. code-block:: python3
195200
196201
from ads.feature_store.dataset import Dataset
202+
from ads.feature_store.statistics_config import StatisticsConfig
197203
198204
dataset = (
199205
Dataset

ads/feature_store/docs/source/dataset_job.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ Use the ``from_id()`` method from the ``DatasetJob`` class to load an existing d
6767
6868
from ads.feature_store.dataset_job import DatasetJob
6969
70-
dataset_job = DatasetJob.from_id("ocid1.dataset_job..<unique_id>")
70+
dataset_job = DatasetJob.from_id("<unique_id>")
7171
7272
Delete
7373
======

ads/feature_store/docs/source/entity.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ Use the ``from_id()`` method from the ``Entity`` class to load an existing entit
6666
6767
from ads.feature_store.entity import Entity
6868
69-
entity = Entity.from_id("ocid1.entity..<unique_id>")
69+
entity = Entity.from_id("<unique_id>")
7070
7171
Delete
7272
======

ads/feature_store/docs/source/feature_group.rst

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ Use the ``from_id()`` method from the ``FeatureGroup`` class to load an existing
9393
9494
from ads.feature_store.feature_group import FeatureGroup
9595
96-
feature_group = FeatureGroup.from_id("ocid1.feature_group..<unique_id>")
96+
feature_group = FeatureGroup.from_id("<unique_id>")
9797
9898
9999
Materialise
@@ -122,6 +122,32 @@ The ``.materialise()`` method takes the following parameter:
122122
.. seealso::
123123
Refer :ref:`Data types` supported by feature store
124124

125+
126+
Materialise Stream
127+
==================
128+
You can call the ``materialise_stream() -> FeatureGroupJob`` method of the ``FeatureGroup`` instance to load streaming data into the feature group. To persist the feature group and save its data along with the metadata in the feature store, call the ``materialise_stream()`` method.
129+
130+
The ``.materialise_stream()`` method takes the following parameter:
131+
- ``input_dataframe``: Features in Streaming Dataframe to be saved.
132+
- ``query_name``: It is possible to optionally specify a name for the query to make it easier to recognise in the Spark UI. Defaults to ``None``.
133+
- ``ingestion_mode``: Specifies how data of a streaming DataFrame/Dataset is written to a streaming sink.
134+
- ``append``: Only the new rows in the streaming DataFrame/Dataset will be written to the sink. If the query doesn’t contain aggregations, it will be equivalent to append mode. Defaults to ``"append"``.
135+
- ``complete``: All the rows in the streaming DataFrame/Dataset will be written to the sink every time there is some update.
136+
- ``update``: Only the rows that were updated in the streaming DataFrame/Dataset will be written to the sink every time there are some updates.
137+
- ``await_termination``: Waits for the termination of this query, either by ``query.stop()`` or by an exception. If the query has terminated with an exception, then the exception will be thrown. If timeout is set, it returns whether the query has terminated or not within the timeout seconds. Defaults to ``False``.
138+
- ``timeout``: Only relevant in combination with ``await_termination=True``.
139+
- Defaults to ``None``.
140+
- ``checkpoint_dir``: Checkpoint directory location. This will be used as a reference from where to resume the streaming job. Defaults to ``None``.
141+
- ``write_options``: Additional write options for Spark as key-value pairs.
142+
- Defaults to ``{}``.
143+
144+
.. seealso::
145+
:ref:`Feature Group Job`
146+
147+
.. seealso::
148+
Refer :ref:`Data types` supported by feature store
149+
150+
125151
Delete
126152
======
127153

@@ -173,6 +199,9 @@ With a ``FeatureGroup`` instance, You can save the expectation details using ``w
173199
.. image:: figures/validation.png
174200

175201
.. code-block:: python3
202+
from great_expectations.core import ExpectationSuite, ExpectationConfiguration
203+
from ads.feature_store.common.enums import TransformationMode, ExpectationType
204+
from ads.feature_store.feature_group import FeatureGroup
176205
177206
expectation_suite = ExpectationSuite(
178207
expectation_suite_name="expectation_suite_name"
@@ -221,6 +250,7 @@ feature group or it can be updated later as well.
221250
.. code-block:: python3
222251
223252
# Define statistics configuration for selected features
253+
from ads.feature_store.statistics_config import StatisticsConfig
224254
stats_config = StatisticsConfig().with_is_enabled(True).with_columns(["column1", "column2"])
225255
226256

ads/feature_store/docs/source/feature_group_job.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ Use the ``from_id()`` method from the ``FeatureGroupJob`` class to load an exist
6767
6868
from ads.feature_store.feature_group_job import FeatureGroupJob
6969
70-
feature_group_job = FeatureGroupJob.from_id("ocid1.feature_group_job..<unique_id>")
70+
feature_group_job = FeatureGroupJob.from_id("<unique_id>")
7171
7272
Delete
7373
======

ads/feature_store/docs/source/feature_store.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ Use the ``from_id()`` method from the ``FeatureStore`` class to load an existing
6767
6868
from ads.feature_store.feature_store import FeatureStore
6969
70-
feature_store = FeatureStore.from_id("ocid1.feature_store..<unique_id>")
70+
feature_store = FeatureStore.from_id("<unique_id>")
7171
7272
Delete
7373
======

0 commit comments

Comments
 (0)