Skip to content

Commit 8c1e68c

Browse files
addressed review comments
1 parent 5b5ce60 commit 8c1e68c

File tree

5 files changed

+42
-18
lines changed

5 files changed

+42
-18
lines changed

THIRD_PARTY_LICENSES.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,12 @@ pyspark
247247
* Source code: https://github.com/apache/spark/tree/master/python
248248
* Project home: https://spark.apache.org/
249249

250+
pyarrow
251+
* Copyright 2004 and onwards The Apache Software Foundation.
252+
* License: Apache-2.0 LICENSE
253+
* Source code: https://github.com/apache/arrow/tree/main/python
254+
* Project home: https://arrow.apache.org/
255+
250256
python_jsonschema_objects
251257
* Copyright (c) 2014 Chris Wacek
252258
* License: MIT License

ads/feature_store/common/utils/feature_schema_mapper.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@ def map_pandas_type_to_feature_type(feature_name, values):
8787
inferred_dtype = current_dtype
8888
else:
8989
if (
90-
current_dtype != inferred_dtype
91-
and current_dtype is not FeatureType.UNKNOWN
90+
current_dtype != inferred_dtype
91+
and current_dtype is not FeatureType.UNKNOWN
9292
):
9393
raise TypeError(
9494
f"Input feature '{feature_name}' has mixed types, {current_dtype} and {inferred_dtype}. "
@@ -233,7 +233,7 @@ def map_feature_type_to_pandas(feature_type):
233233

234234

235235
def convert_pandas_datatype_with_schema(
236-
raw_feature_details: List[dict], input_df: pd.DataFrame
236+
raw_feature_details: List[dict], input_df: pd.DataFrame
237237
):
238238
feature_detail_map = {}
239239
for feature_details in raw_feature_details:
@@ -245,16 +245,19 @@ def convert_pandas_datatype_with_schema(
245245
pandas_type = map_feature_type_to_pandas(feature_type)
246246
input_df[column] = (
247247
input_df[column]
248-
.astype(pandas_type)
249-
.where(pd.notnull(input_df[column]), None)
248+
.astype(pandas_type)
249+
.where(pd.notnull(input_df[column]), None)
250250
)
251251

252+
252253
def map_spark_type_to_stats_data_type(spark_type):
253-
"""Returns the MLM data type corresponding to SparkType
254-
:param spark_type:
254+
""" Maps the spark data types to MLM library data types
255+
Args:
256+
param spark_type: Spark data type input from the feature dataframe on which we need stats
255257
:return:
258+
Returns the MLM data type corresponding to SparkType
256259
"""
257-
spark_type_to_feature_type = {
260+
spark_type_to_mlm_data_type = {
258261
StringType(): types.DataType.STRING,
259262
IntegerType(): types.DataType.INTEGER,
260263
FloatType(): types.DataType.FLOAT,
@@ -265,13 +268,15 @@ def map_spark_type_to_stats_data_type(spark_type):
265268
LongType(): types.DataType.INTEGER,
266269
}
267270

268-
return spark_type_to_feature_type.get(spark_type)
271+
return spark_type_to_mlm_data_type.get(spark_type)
269272

270273

271274
def map_spark_type_to_stats_variable_type(spark_type):
272-
"""Returns the MLM variable type corresponding to SparkType
273-
:param spark_type:
275+
""" Maps the spark data types to MLM library variable types
276+
Args:
277+
param spark_type: Spark data type input from the feature dataframe on which we need stats
274278
:return:
279+
Returns the MLM variable type corresponding to SparkType
275280
"""
276281
spark_type_to_feature_type = {
277282
StringType(): types.VariableType.NOMINAL,

ads/feature_store/common/utils/transformation_query_validator.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,14 @@
55

66
from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton
77

8+
"""
9+
USER_TRANSFORMATION_FUNCTION template: It is used to transform the user-provided SQL query into the
10+
transformation function
11+
12+
Args:
13+
function_name: Transformation function name
14+
input : The input placeholder for the FROM clause
15+
"""
816
USER_TRANSFORMATION_FUNCTION = \
917
"""def {function_name}(input):
1018
sql_query = f\"\"\"{query}\"\"\"
@@ -19,9 +27,13 @@ def __verify_sql_query_plan(parser_plan, input_symbol: str):
1927
Once the sql parser has parsed the query,
2028
This function takes the parser plan as an input, It checks for the table names
2129
and verifies that there is only a single table, and that it uses the placeholder name
22-
30+
A regex has been added to cater to common table expressions
2331
Args:
2432
parser_plan: A Spark sqlParser ParsePlan object.
33+
parser_plan contains the project and unresolved relation items
34+
project: list of unresolved attributes - table field names
35+
UnresolvedRelation: list of unresolved relation attributes - table names
36+
e.g. : Project ['user_id, 'credit_score], 'UnresolvedRelation [DATA_SOURCE_INPUT], [], false
2537
input_symbol (Transformation): The table name to be matched.
2638
"""
2739
plan_items = json.loads(parser_plan.toJSON())
@@ -58,11 +70,12 @@ def verify_sql_input(query_input: str, input_symbol: str):
5870
parser = spark._jsparkSession.sessionState().sqlParser()
5971
try:
6072
parser_plan = parser.parsePlan(query_input)
61-
# verify if the parser plan has only FROM DATA_SOURCE_INPUT template
62-
TransformationQueryValidator.__verify_sql_query_plan(parser_plan, input_symbol)
6373
except ParseException as pe:
6474
raise ParseException(f"Unable to parse the sql expression, exception occurred: {pe}")
6575

76+
# verify if the parser plan has only FROM DATA_SOURCE_INPUT template
77+
TransformationQueryValidator.__verify_sql_query_plan(parser_plan, input_symbol)
78+
6679
@staticmethod
6780
def create_transformation_template(query: str, input_symbol: str, function_name: str):
6881
"""

ads/feature_store/execution_strategy/spark/spark_execution.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ def _save_offline_dataframe(
229229
# Compute Feature Statistics
230230

231231
feature_statistics = StatisticsService.compute_stats_with_mlm(
232-
statistics_config=feature_group.statistics_config,
232+
statistics_config=feature_group.oci_feature_group.statistics_config,
233233
input_df=featured_data)
234234

235235
except Exception as ex:
@@ -347,7 +347,7 @@ def _save_dataset_input(self, dataset, dataset_job: DatasetJob):
347347

348348
# Compute Feature Statistics
349349
feature_statistics = StatisticsService.compute_stats_with_mlm(
350-
statistics_config=dataset.statistics_config,
350+
statistics_config=dataset.oci_dataset.statistics_config,
351351
input_df=dataset_dataframe,
352352
)
353353

ads/feature_store/feature_statistics/statistics_service.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def compute_stats_with_mlm(
3939
statistics_config: StatisticsConfig, input_df: DataFrame
4040
):
4141
feature_metrics = None
42-
if bool(input_df.head(1)) and statistics_config and statistics_config.get("isEnabled"):
42+
if bool(input_df.head(1)) and statistics_config and statistics_config.is_enabled:
4343
feature_schema = {}
4444
if input_df.schema:
4545
StatisticsService.__get_mlm_supported_schema(feature_schema, input_df, statistics_config)
@@ -66,7 +66,7 @@ def __get_feature_metric(feature_schema: dict, data_frame: DataFrame):
6666

6767
@staticmethod
6868
def __get_mlm_supported_schema(feature_schema: dict, input_df: DataFrame, statistics_config: StatisticsConfig):
69-
relevant_columns = statistics_config.get("columns")
69+
relevant_columns = statistics_config.columns
7070
for field in input_df.schema.fields:
7171
data_type = map_spark_type_to_stats_data_type(field.dataType)
7272
if not data_type:

0 commit comments

Comments
 (0)