
Commit 820b5bd

added as_of interface to the feature store (#279)
2 parents 0e1082f + fa56768 commit 820b5bd

File tree: 8 files changed (+432, -8 lines)

ads/feature_store/common/spark_session_singleton.py

Lines changed: 8 additions & 3 deletions

@@ -84,6 +84,7 @@ def __init__(self, metastore_id: str = None):
             )
             .enableHiveSupport()
         )
+        _managed_table_location = None
 
         if not developer_enabled() and metastore_id:
             # Get the authentication credentials for the OCI data catalog service
@@ -94,12 +95,11 @@ def __init__(self, metastore_id: str = None):
 
             data_catalog_client = OCIClientFactory(**auth).data_catalog
             metastore = data_catalog_client.get_metastore(metastore_id).data
+            _managed_table_location = metastore.default_managed_table_location
             # Configure the Spark session builder object to use the specified metastore
             spark_builder.config(
                 "spark.hadoop.oracle.dcat.metastore.id", metastore_id
-            ).config(
-                "spark.sql.warehouse.dir", metastore.default_managed_table_location
-            ).config(
+            ).config("spark.sql.warehouse.dir", _managed_table_location).config(
                 "spark.driver.memory", "16G"
             )
@@ -114,7 +114,12 @@ def __init__(self, metastore_id: str = None):
 
         self.spark_session.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
         self.spark_session.sparkContext.setLogLevel("OFF")
+        self.managed_table_location = _managed_table_location
 
     def get_spark_session(self):
         """Access method to get the spark session."""
         return self.spark_session
+
+    def get_managed_table_location(self):
+        """Returns the managed table location for the spark"""
+        return self.managed_table_location
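With this change, callers can read the managed table location back from the session singleton instead of re-deriving it from the OCI metastore. A minimal consumption sketch, assuming ADS with feature store support is installed; the metastore OCID below is a placeholder:

from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton

# Placeholder OCID; substitute a real Data Catalog metastore OCID.
session = SparkSessionSingleton(metastore_id="ocid1.datacatalogmetastore.oc1..example")
spark = session.get_spark_session()

# The location stays None when no metastore is configured (e.g. developer mode),
# so guard the call.
location = session.get_managed_table_location()
if location:
    print(f"Managed tables are stored under {location}")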

ads/feature_store/common/utils/transformation_utils.py

Lines changed: 7 additions & 2 deletions

@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8; -*-
 import json
+
 # Copyright (c) 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
@@ -64,9 +65,13 @@ def apply_transformation(
         dataframe.createOrReplaceTempView(temporary_table_view)
 
         transformed_data = spark.sql(
-            transformation_function_caller(temporary_table_view, **transformation_kwargs_dict)
+            transformation_function_caller(
+                temporary_table_view, **transformation_kwargs_dict
+            )
         )
     elif transformation.transformation_mode == TransformationMode.PANDAS.value:
-        transformed_data = transformation_function_caller(dataframe, **transformation_kwargs_dict)
+        transformed_data = transformation_function_caller(
+            dataframe, **transformation_kwargs_dict
+        )
 
     return transformed_data

ads/feature_store/dataset.py

Lines changed: 64 additions & 1 deletion

@@ -8,13 +8,16 @@
 
 import pandas
 from great_expectations.core import ExpectationSuite
+
+from ads import deprecated
 from ads.common import utils
 from ads.common.oci_mixin import OCIModelMixin
 from ads.feature_store.common.enums import (
     ExecutionEngine,
     ExpectationType,
     EntityType,
 )
+from ads.feature_store.common.exceptions import NotMaterializedError
 from ads.feature_store.common.utils.utility import (
     get_metastore_id,
     validate_delta_format_parameters,
@@ -475,6 +478,20 @@ def with_statistics_config(
             self.CONST_STATISTICS_CONFIG, statistics_config_in.to_dict()
         )
 
+    def target_delta_table(self):
+        """
+        Returns the fully-qualified name of the target table for storing delta data.
+
+        The name of the target table is constructed by concatenating the entity ID
+        and the name of the table, separated by a dot. The resulting string has the
+        format 'entity_id.table_name'.
+
+        Returns:
+            str: The fully-qualified name of the target delta table.
+        """
+        target_table = f"{self.entity_id}.{self.name}"
+        return target_table
+
     @property
     def model_details(self) -> "ModelDetails":
         return self.get_spec(self.CONST_MODEL_DETAILS)
@@ -560,7 +577,9 @@ def add_models(self, model_details: ModelDetails) -> "Dataset":
                 f"Dataset update Failed with : {type(ex)} with error message: {ex}"
             )
         if existing_model_details:
-            self.with_model_details(ModelDetails().with_items(existing_model_details["items"]))
+            self.with_model_details(
+                ModelDetails().with_items(existing_model_details["items"])
+            )
         else:
             self.with_model_details(ModelDetails().with_items([]))
         return self
@@ -773,6 +792,7 @@ def materialise(
 
         dataset_execution_strategy.ingest_dataset(self, dataset_job)
 
+    @deprecated(details="preview functionality is deprecated. Please use as_of.")
     def preview(
         self,
         row_count: int = 10,
@@ -797,6 +817,8 @@ def preview(
         spark dataframe
             The preview result in spark dataframe
         """
+        self.check_resource_materialization()
+
         validate_delta_format_parameters(timestamp, version_number)
         target_table = f"{self.entity_id}.{self.name}"
 
@@ -806,6 +828,43 @@ def preview(
 
         return self.spark_engine.sql(sql_query)
 
+    def check_resource_materialization(self):
+        """Checks whether the target Delta table for this resource has been materialized in Spark.
+        If the target Delta table doesn't exist, raises a NotMaterializedError with the type and name of this resource.
+        """
+        if not self.spark_engine.is_delta_table_exists(self.target_delta_table()):
+            raise NotMaterializedError(self.type, self.name)
+
+    def as_of(
+        self,
+        version_number: int = None,
+        commit_timestamp: datetime = None,
+    ):
+        """preview the feature definition and return the response in dataframe.
+
+        Parameters
+        ----------
+        commit_timestamp: datetime
+            commit date time to preview in format yyyy-MM-dd or yyyy-MM-dd HH:mm:ss
+            commit date time is maintained for every ingestion commit using delta lake
+        version_number: int
+            commit version number for the preview. Version numbers are automatically versioned for every ingestion
+            commit using delta lake
+
+        Returns
+        -------
+        spark dataframe
+            The preview result in spark dataframe
+        """
+        self.check_resource_materialization()
+
+        validate_delta_format_parameters(commit_timestamp, version_number)
+        target_table = self.target_delta_table()
+
+        return self.spark_engine.get_time_version_data(
+            target_table, version_number, commit_timestamp
+        )
+
     def profile(self):
@@ -814,6 +873,8 @@ def profile(self):
         spark dataframe
             The profile result in spark dataframe
         """
+        self.check_resource_materialization()
+
         target_table = f"{self.entity_id}.{self.name}"
         sql_query = f"DESCRIBE DETAIL {target_table}"
 
@@ -835,6 +896,8 @@ def restore(self, version_number: int = None, timestamp: datetime = None):
         spark dataframe
             The restore output as spark dataframe
         """
+        self.check_resource_materialization()
+
        validate_delta_format_parameters(timestamp, version_number, True)
        target_table = f"{self.entity_id}.{self.name}"
        if version_number is not None:
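Because preview, profile, restore, and as_of now call check_resource_materialization up front, a missing Delta table surfaces as NotMaterializedError rather than a raw Spark failure. A usage sketch under stated assumptions: dataset is a hypothetical, already-defined Dataset instance, and invoking materialise() with defaults is assumed to be valid here:

from ads.feature_store.common.exceptions import NotMaterializedError

try:
    df = dataset.as_of(version_number=1)
except NotMaterializedError:
    # The backing Delta table does not exist yet; materialise it first.
    dataset.materialise()
    df = dataset.as_of(version_number=0)  # Delta versions start at 0
df.show()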

ads/feature_store/docs/source/dataset.rst

Lines changed: 18 additions & 0 deletions

@@ -248,6 +248,9 @@ You can call the ``get_input_schema_dataframe()`` method of the Dataset instance
 Preview
 ========
 
+.. deprecated:: 1.0.3
+   Use :func:`as_of` instead.
+
 You can call the ``preview()`` method of the Dataset instance to preview the dataset.
 
 The ``.preview()`` method takes the following optional parameter:
@@ -261,6 +264,21 @@ The ``.preview()`` method takes the following optional parameter:
     df = dataset.preview(row_count=50)
     df.show()
 
+as_of
+=======
+
+You can call the ``as_of()`` method of the Dataset instance to get specified point in time and time traveled data.
+
+The ``.as_of()`` method takes the following optional parameter:
+
+- ``commit_timestamp: date-time``. Commit timestamp for dataset
+- ``version_number: int``. Version number for dataset
+
+.. code-block:: python3
+
+  # as_of feature group
+  df = dataset.as_of(version_number=1)
+
 
 Restore
 =======
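The documented example pins a version number; the commit_timestamp parameter drives the same Delta time travel by date. A sketch with an illustrative timestamp, assuming the dataset has an ingestion commit at or before that time:

from datetime import datetime

# Time travel by commit timestamp instead of version number (illustrative value).
df = dataset.as_of(commit_timestamp=datetime(2023, 6, 1))
df.show()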

ads/feature_store/docs/source/feature_group.rst

Lines changed: 19 additions & 0 deletions

@@ -290,9 +290,13 @@ Feature store provides an API similar to Pandas to join feature groups together
     # Filter feature group
     feature_group.filter(feature_group.col1 > 10).show()
 
+
 Preview
 =======
 
+.. deprecated:: 1.0.3
+   Use :func:`as_of` instead.
+
 You can call the ``preview()`` method of the FeatureGroup instance to preview the feature group.
 
 The ``.preview()`` method takes the following optional parameter:
@@ -306,6 +310,21 @@ The ``.preview()`` method takes the following optional parameter:
     # Preview feature group
     df = feature_group.preview(row_count=50)
 
+as_of
+=======
+
+You can call the ``as_of()`` method of the FeatureGroup instance to get specified point in time and time traveled data.
+
+The ``.as_of()`` method takes the following optional parameter:
+
+- ``commit_timestamp: date-time``. Commit timestamp for feature group
+- ``version_number: int``. Version number for feature group
+
+.. code-block:: python3
+
+  # as_of feature group
+  df = feature_group.as_of(version_number=1)
+
 Restore
 =======

ads/feature_store/execution_strategy/engine/spark_engine.py

Lines changed: 67 additions & 1 deletion

@@ -5,6 +5,7 @@
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 import logging
+from datetime import datetime
 
 from ads.common.decorator.runtime_dependency import OptionalDependency
 
@@ -17,7 +18,7 @@
     )
 except Exception as e:
     raise
-from typing import List
+from typing import List, Dict
 
 from ads.feature_store.common.utils.feature_schema_mapper import (
     map_spark_type_to_feature_type,
@@ -36,6 +37,71 @@ def __init__(self, metastore_id: str = None, spark_session: SparkSession = None)
         else:
             self.spark = SparkSessionSingleton(metastore_id).get_spark_session()
 
+        self.managed_table_location = (
+            SparkSessionSingleton().get_managed_table_location()
+        )
+
+    def get_time_version_data(
+        self,
+        delta_table_name: str,
+        version_number: int = None,
+        timestamp: datetime = None,
+    ):
+        split_db_name = delta_table_name.split(".")
+
+        # Get the Delta table path
+        delta_table_path = (
+            f"{self.managed_table_location}/{split_db_name[0].lower()}.db/{split_db_name[1]}"
+            if self.managed_table_location
+            else self._get_delta_table_path(delta_table_name)
+        )
+
+        # Set read options based on version_number and timestamp
+        read_options = {}
+        if version_number is not None:
+            read_options["versionAsOf"] = version_number
+        if timestamp:
+            read_options["timestampAsOf"] = timestamp
+
+        # Load the data from the Delta table using specified read options
+        df = self._read_delta_table(delta_table_path, read_options)
+        return df
+
+    def _get_delta_table_path(self, delta_table_name: str) -> str:
+        """
+        Get the path of the Delta table using DESCRIBE EXTENDED SQL command.
+
+        Args:
+            delta_table_name (str): The name of the Delta table.
+
+        Returns:
+            str: The path of the Delta table.
+        """
+        delta_table_path = (
+            self.spark.sql(f"DESCRIBE EXTENDED {delta_table_name}")
+            .filter("col_name = 'Location'")
+            .collect()[0][1]
+        )
+        return delta_table_path
+
+    def _read_delta_table(self, delta_table_path: str, read_options: Dict):
+        """
+        Read the Delta table using specified read options.
+
+        Args:
+            delta_table_path (str): The path of the Delta table.
+            read_options (dict): Dictionary of read options for Delta table.
+
+        Returns:
+            DataFrame: The loaded DataFrame from the Delta table.
+        """
+        df = (
+            self.spark.read.format("delta")
+            .options(**read_options)
+            .load(delta_table_path)
+        )
+        return df
+
     def sql(
         self,
         query: str,
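The versionAsOf and timestampAsOf options assembled by get_time_version_data are standard Delta Lake time-travel read options, so the engine's read can be reproduced in plain PySpark. A sketch, assuming an active Delta-enabled SparkSession named spark and a placeholder table path:

# Equivalent time-travel read outside the feature store (placeholder path).
df = (
    spark.read.format("delta")
    .option("versionAsOf", 1)  # or: .option("timestampAsOf", "2023-06-01")
    .load("/warehouse/entity_id.db/table_name")
)
df.show()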
