review comments,new Int test for schema mismatch,doc update

najiyacl · najiyacl · commit ce05d8932978 · 2023-07-05T12:41:35.000+05:30
diff --git a/ads/feature_store/common/utils/utility.py b/ads/feature_store/common/utils/utility.py
@@ -264,7 +264,7 @@ def largest_matching_subset_of_primary_keys(left_feature_group, right_feature_gr
 
 def convert_pandas_datatype_with_schema(
         raw_feature_details: List[dict], input_df: pd.DataFrame
-):
+) -> pd.DataFrame:
     feature_detail_map = {}
     columns_to_remove = []
     for feature_details in raw_feature_details:
@@ -280,19 +280,21 @@ def convert_pandas_datatype_with_schema(
                 .where(pd.notnull(input_df[column]), None)
             )
         else:
-            logger.warning("column" + column + "doesnt exist in the input feature details")
+            logger.warning("column" + column + "doesn't exist in the input feature details")
             columns_to_remove.append(column)
     return input_df.drop(columns = columns_to_remove)
 
 
-def validate_spark_dataframe_schema(raw_feature_details: List[dict], input_df: DataFrame):
+def convert_spark_dataframe_with_schema(
+        raw_feature_details: List[dict], input_df: DataFrame
+) -> DataFrame:
     feature_detail_map = {}
     columns_to_remove = []
     for feature_details in raw_feature_details:
         feature_detail_map[feature_details.get("name")] = feature_details
     for column in input_df.columns:
         if column not in feature_detail_map.keys():
-            logger.warning("column" + column + "doesnt exist in the input feature details")
+            logger.warning("column" + column + "doesn't exist in the input feature details")
             columns_to_remove.append(column)
 
     return input_df.drop(*columns_to_remove)
@@ -301,4 +303,4 @@ def validate_spark_dataframe_schema(raw_feature_details: List[dict], input_df: D
 def validate_input_feature_details(input_feature_details, data_frame):
     if isinstance(data_frame, pd.DataFrame):
         return convert_pandas_datatype_with_schema(input_feature_details, data_frame)
-    return validate_spark_dataframe_schema(input_feature_details, data_frame)
+    return convert_spark_dataframe_with_schema(input_feature_details, data_frame)
diff --git a/ads/feature_store/docs/source/feature_group.rst b/ads/feature_store/docs/source/feature_group.rst
@@ -173,7 +173,7 @@ The ``.save_expectation()`` method takes the following optional parameter:
 
 Statistics Results
 ==================
-You can call the ``get_statistics()`` method of the FeatureGroup instance to fetch validation results for a specific ingestion job.
+You can call the ``get_statistics()`` method of the FeatureGroup instance to fetch statistics for a specific ingestion job.
 
 .. code-block:: python3
 
@@ -295,7 +295,8 @@ The data will be stored in a data type native to each store. There is an option
 
     Offline data types
     ###################
-    Please refer to the following mapping when registering a Spark DataFrame, or a Pandas DataFrame.
+    Please refer to the following mapping when registering a Spark DataFrame, or a Pandas DataFrame. For Spark DataFrames, we support
+    all data types; those not specified in the following table will be mapped to the Offline Feature Type COMPLEX.
 
     .. list-table::
        :widths: 20 25 25 40
diff --git a/ads/feature_store/docs/source/release_notes.rst b/ads/feature_store/docs/source/release_notes.rst
@@ -3,7 +3,7 @@
 =============
 Release Notes
 =============
-1.0
+1.1
 ---
 
 .. note::
@@ -25,6 +25,32 @@ Release Notes
         - Par link expires Jan 5, 2026
 
 
+Release notes: July 5, 2023
+
+* [FEATURE] Supporting Offline Feature Type COMPLEX
+* [DOCS] Data Type update for Offline Feature Type COMPLEX
+
+1.0
+---
+
+.. note::
+
+    .. list-table::
+      :header-rows: 1
+
+      * - Package Name
+        - Latest Version
+        - Notes
+      * - Conda pack
+        - `https://objectstorage.us-ashburn-1.oraclecloud.com/n/bigdatadatasciencelarge/b/service-conda-packs-fs/o/service_pack/cpu/PySpark_3.2_and_Feature_Store/1.0/fspyspark32_p38_cpu_v1#conda`
+        -
+      * - SERVICE_VERSION
+        - 0.1.209.master
+        -
+      * - Terraform Stack
+        - `link <https://objectstorage.us-ashburn-1.oraclecloud.com/p/vZogtXWwHqbkGLeqyKiqBmVxdbR4MK4nyOBqDsJNVE4sHGUY5KFi4T3mOFGA3FOy/n/idogsu2ylimg/b/oci-feature-store/o/beta/terraform/feature-store-terraform.zip>`__
+        - Par link expires Jan 5, 2026
+
 Release notes: June 15, 2023
 
 * [FEATURE] Included ``FeatureStore``, ``FeatureGroup``, ``Dataset``, ``Entity`` and ``Transformation`` concepts for feature store.
diff --git a/tests/integration/feature_store/test_input_feature_details.py b/tests/integration/feature_store/test_input_feature_details.py
@@ -0,0 +1,123 @@
+from pyspark.sql.types import StructType, ShortType, IntegerType, LongType, FloatType, DoubleType, \
+    BooleanType, StringType, StructField, ByteType, BinaryType, DecimalType
+from tests.integration.feature_store.test_base import FeatureStoreTestCase
+from ads.feature_store.input_feature_detail import FeatureDetail, FeatureType
+from ads.feature_store.feature_group import FeatureGroup
+from ads.feature_store.feature_group_job import FeatureGroupJob
+import pandas as pd
+import numpy as np
+import pytest
+from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton
+
+
+class TestInputSchema(FeatureStoreTestCase):
+    input_feature_details = [
+        FeatureDetail("A").with_feature_type(FeatureType.STRING).with_order_number(1),
+        FeatureDetail("B").with_feature_type(FeatureType.INTEGER).with_order_number(2)
+    ]
+
+    a = ["value1", "value2"]
+    b = [25, 60]
+    c = [30, 50]
+    pandas_basic_df = pd.DataFrame(
+        {
+            "A": a,
+            "B": b,
+            "C": c
+        }
+    )
+
+    schema = StructType(
+        [StructField("string_col", StringType(), True),
+         StructField("int_col", IntegerType(), True),
+         StructField("long_col", LongType(), True)]
+    )
+
+    input_feature_details_spark = [
+        FeatureDetail("string_col").with_feature_type(FeatureType.STRING).with_order_number(1),
+        FeatureDetail("int_col").with_feature_type(FeatureType.INTEGER).with_order_number(2),
+        FeatureDetail("C").with_feature_type(FeatureType.INTEGER).with_order_number(2),
+        FeatureDetail("B").with_feature_type(FeatureType.INTEGER).with_order_number(2),
+    ]
+
+    data = [
+        ("value1", 100, 1000),
+        ("value2", 200, 2000)
+    ]
+    spark = SparkSessionSingleton(FeatureStoreTestCase.METASTORE_ID).get_spark_session()
+    basic_df = spark.createDataFrame(data, schema)
+
+    def define_feature_group_resource_with_pandas_schema(
+            self, entity_id, feature_store_id
+    ) -> "FeatureGroup":
+        feature_group_pandas_array = (
+            FeatureGroup()
+            .with_description("feature group resource for pandas array types")
+            .with_compartment_id(self.COMPARTMENT_ID)
+            .with_name(self.get_name("feature_group_pandas_array"))
+            .with_entity_id(entity_id)
+            .with_feature_store_id(feature_store_id)
+            .with_primary_keys([])
+            .with_input_feature_details(self.input_feature_details)
+        )
+        return feature_group_pandas_array
+
+    def define_feature_group_resource_with_spark_schema(
+            self, entity_id, feature_store_id
+    ) -> "FeatureGroup":
+        feature_group_spark_schema = (
+            FeatureGroup()
+            .with_description("feature group resource for pandas array types")
+            .with_compartment_id(self.COMPARTMENT_ID)
+            .with_name(self.get_name("feature_group_spark_schema"))
+            .with_entity_id(entity_id)
+            .with_feature_store_id(feature_store_id)
+            .with_primary_keys([])
+            .with_input_feature_details(self.input_feature_details_spark)
+        )
+        return feature_group_spark_schema
+
+    def test_feature_group_pandas_schema_mismatch(self):
+        """Tests pandas schema mismatch"""
+        fs = self.define_feature_store_resource().create()
+        assert fs.oci_fs.id
+
+        entity = self.create_entity_resource(fs)
+        assert entity.oci_fs_entity.id
+
+        feature_group = self.define_feature_group_resource_with_pandas_schema(
+            entity.oci_fs_entity.id, fs.oci_fs.id
+        )
+        feature_group.create()
+        feature_group.materialise(self.pandas_basic_df)
+
+
+        df = feature_group.select().read()
+        assert len(df.columns) == 2
+
+        self.clean_up_feature_group(feature_group)
+        self.clean_up_entity(entity)
+        self.clean_up_feature_store(fs)
+
+    def test_feature_group_spark_schema_mismatch(self):
+        """Tests Spark schema mismatch"""
+        fs = self.define_feature_store_resource().create()
+        assert fs.oci_fs.id
+
+        entity = self.create_entity_resource(fs)
+        assert entity.oci_fs_entity.id
+
+        feature_group = self.define_feature_group_resource_with_spark_schema(
+            entity.oci_fs_entity.id, fs.oci_fs.id
+        )
+        feature_group.create()
+        feature_group.materialise(self.basic_df)
+
+        df = feature_group.select().read()
+        assert len(df.columns) == 2
+
+        self.clean_up_feature_group(feature_group)
+        self.clean_up_entity(entity)
+        self.clean_up_feature_store(fs)
+
+