
Commit 9fb835e

Stats generation done using MLM and transformation query support for SQL (#233)
Verified with Integration Test Run
2 parents 8e8fa68 + dd1f33e commit 9fb835e

16 files changed: +461 −112 lines changed

THIRD_PARTY_LICENSES.txt

Lines changed: 6 additions & 0 deletions
@@ -247,6 +247,12 @@ pyspark
 * Source code: https://github.com/apache/spark/tree/master/python
 * Project home: https://spark.apache.org/

+pyarrow
+* Copyright 2004 and onwards The Apache Software Foundation.
+* License: Apache-2.0 LICENSE
+* Source code: https://github.com/apache/arrow/tree/main/python
+* Project home: https://arrow.apache.org/
+
 python_jsonschema_objects
 * Copyright (c) 2014 Chris Wacek
 * License: MIT License

ads/common/decorator/runtime_dependency.py

Lines changed: 2 additions & 1 deletion
@@ -65,8 +65,9 @@ class OptionalDependency:
     SPARK = "oracle-ads[spark]"
     HUGGINGFACE = "oracle-ads[huggingface]"
     GREAT_EXPECTATIONS = "oracle-ads[great-expectations]"
-    PYDEEQU = "oracle-ads[pydeequ]"
     GRAPHVIZ = "oracle-ads[graphviz]"
+    MLM_INSIGHTS = "oracle-ads[mlm_insights]"
+    PYARROW = "oracle-ads[pyarrow]"


 def runtime_dependency(
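The two new entries extend the optional-dependency registry that the `runtime_dependency` decorator consults when an import is missing. A minimal usage sketch, assuming the decorator's usual `module`/`install_from` keyword arguments; `profile_features` is a made-up function name, not part of the PR:

from ads.common.decorator.runtime_dependency import (
    OptionalDependency,
    runtime_dependency,
)


@runtime_dependency(module="mlm_insights", install_from=OptionalDependency.MLM_INSIGHTS)
@runtime_dependency(module="pyarrow", install_from=OptionalDependency.PYARROW)
def profile_features(df):
    # If either optional package is absent at call time, the decorator raises
    # an error that points at the matching "oracle-ads[...]" extra to install.
    ...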

ads/feature_store/common/spark_session_singleton.py

Lines changed: 2 additions & 4 deletions
@@ -80,16 +80,14 @@ def __init__(self, metastore_id: str = None):
         if developer_enabled():
             # Configure spark session with delta jars only in developer mode. In other cases,
             # jars should be part of the conda pack
-            spark_builder.config(
-                "spark.jars",
-                "https://repo1.maven.org/maven2/com/amazon/deequ/deequ/2.0.1-spark-3.2/deequ-2.0.1-spark-3.2.jar",
-            )
             self.spark_session = configure_spark_with_delta_pip(
                 spark_builder
             ).getOrCreate()
         else:
             self.spark_session = spark_builder.getOrCreate()

+        self.spark_session.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
+
     def get_spark_session(self):
         """Access method to get the spark session."""
         return self.spark_session
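Setting `spark.sql.execution.arrow.pyspark.enabled` makes Spark use Apache Arrow for columnar transfer between the JVM and Python, which speeds up pandas interchange. A small illustrative sketch, assuming a session obtained through the singleton; the DataFrame contents are made up:

import pandas as pd

from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton

spark = SparkSessionSingleton().get_spark_session()

# Arrow accelerates both directions of the pandas <-> Spark conversion.
pdf = pd.DataFrame({"user_id": [1, 2, 3], "credit_score": [700, 640, 810]})
sdf = spark.createDataFrame(pdf)   # pandas -> Spark, Arrow-backed
roundtrip = sdf.toPandas()         # Spark -> pandas, Arrow-backed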

ads/feature_store/common/utils/feature_schema_mapper.py

Lines changed: 52 additions & 5 deletions
@@ -4,12 +4,15 @@
 # Copyright (c) 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

+import logging
 from typing import List

 import numpy as np
 import pandas as pd

 from ads.common.decorator.runtime_dependency import OptionalDependency
+from mlm_insights.constants import types
+
 from ads.feature_store.common.enums import FeatureType

 try:
@@ -22,6 +25,8 @@
 except Exception as e:
     raise

+logger = logging.getLogger(__name__)
+

 def map_spark_type_to_feature_type(spark_type):
     """Returns the feature type corresponding to SparkType
@@ -82,8 +87,8 @@ def map_pandas_type_to_feature_type(feature_name, values):
             inferred_dtype = current_dtype
         else:
             if (
-                    current_dtype != inferred_dtype
-                    and current_dtype is not FeatureType.UNKNOWN
+                current_dtype != inferred_dtype
+                and current_dtype is not FeatureType.UNKNOWN
             ):
                 raise TypeError(
                     f"Input feature '{feature_name}' has mixed types, {current_dtype} and {inferred_dtype}. "
@@ -228,7 +233,7 @@ def map_feature_type_to_pandas(feature_type):


 def convert_pandas_datatype_with_schema(
-        raw_feature_details: List[dict], input_df: pd.DataFrame
+    raw_feature_details: List[dict], input_df: pd.DataFrame
 ):
     feature_detail_map = {}
     for feature_details in raw_feature_details:
@@ -240,6 +245,48 @@
             pandas_type = map_feature_type_to_pandas(feature_type)
             input_df[column] = (
                 input_df[column]
-                    .astype(pandas_type)
-                    .where(pd.notnull(input_df[column]), None)
+                .astype(pandas_type)
+                .where(pd.notnull(input_df[column]), None)
             )
+
+
+def map_spark_type_to_stats_data_type(spark_type):
+    """Maps the Spark data types to MLM library data types.
+    Args:
+        spark_type: Spark data type of the feature dataframe column for which stats are needed.
+    Returns:
+        The MLM data type corresponding to the Spark type.
+    """
+    spark_type_to_mlm_data_type = {
+        StringType(): types.DataType.STRING,
+        IntegerType(): types.DataType.INTEGER,
+        FloatType(): types.DataType.FLOAT,
+        DoubleType(): types.DataType.FLOAT,
+        BooleanType(): types.DataType.BOOLEAN,
+        DecimalType(): types.DataType.FLOAT,
+        ShortType(): types.DataType.INTEGER,
+        LongType(): types.DataType.INTEGER,
+    }
+
+    return spark_type_to_mlm_data_type.get(spark_type)
+
+
+def map_spark_type_to_stats_variable_type(spark_type):
+    """Maps the Spark data types to MLM library variable types.
+    Args:
+        spark_type: Spark data type of the feature dataframe column for which stats are needed.
+    Returns:
+        The MLM variable type corresponding to the Spark type.
+    """
+    spark_type_to_feature_type = {
+        StringType(): types.VariableType.NOMINAL,
+        IntegerType(): types.VariableType.CONTINUOUS,
+        FloatType(): types.VariableType.CONTINUOUS,
+        DoubleType(): types.VariableType.CONTINUOUS,
+        BooleanType(): types.VariableType.BINARY,
+        DecimalType(): types.VariableType.CONTINUOUS,
+        ShortType(): types.VariableType.CONTINUOUS,
+        LongType(): types.VariableType.CONTINUOUS,
+    }
+
+    return spark_type_to_feature_type.get(spark_type)
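Together, the two new mappers let the statistics layer describe each Spark column to the MLM insights library as a (data type, variable type) pair. A minimal sketch of assembling such a description from a DataFrame schema; the `build_feature_schema` helper is hypothetical and not part of the PR:

from pyspark.sql.types import StructType

from ads.feature_store.common.utils.feature_schema_mapper import (
    map_spark_type_to_stats_data_type,
    map_spark_type_to_stats_variable_type,
)


def build_feature_schema(spark_schema: StructType) -> dict:
    # For each column, pair the MLM data type with its variable type;
    # Spark types not covered by the mappers come back as None and can be skipped.
    return {
        field.name: (
            map_spark_type_to_stats_data_type(field.dataType),
            map_spark_type_to_stats_variable_type(field.dataType),
        )
        for field in spark_schema.fields
    }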
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
+import json
+import re
+
+from pyparsing import ParseException
+
+from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton
+
+"""
+USER_TRANSFORMATION_FUNCTION template: used to transform the user-provided SQL query into the
+transformation function.
+
+Args:
+    function_name: Transformation function name
+    input: The input placeholder for the FROM clause
+"""
+USER_TRANSFORMATION_FUNCTION = \
+    """def {function_name}(input):
+    sql_query = f\"\"\"{query}\"\"\"
+    return sql_query"""
+
+
+class TransformationQueryValidator:
+
+    @staticmethod
+    def __verify_sql_query_plan(parser_plan, input_symbol: str):
+        """
+        Once the SQL parser has parsed the query, this function takes the parser plan as input,
+        checks the table names, and verifies that there is only a single table and that it uses
+        the placeholder name. A regex is used to account for common table expressions.
+        Args:
+            parser_plan: A Spark sqlParser ParsePlan object.
+                parser_plan contains the project and unresolved relation items:
+                Project: list of unresolved attributes - table field names
+                UnresolvedRelation: list of unresolved relation attributes - table names
+                e.g.: Project ['user_id, 'credit_score], 'UnresolvedRelation [DATA_SOURCE_INPUT], [], false
+            input_symbol (Transformation): The table name to be matched.
+        """
+        plan_items = json.loads(parser_plan.toJSON())
+        plan_string = parser_plan.toString()
+        cte = re.findall(r"CTE \[(.*?)\]", plan_string)
+        table_names = []
+        for plan_item in plan_items:
+            if plan_item['class'] == 'org.apache.spark.sql.catalyst.analysis.UnresolvedRelation':
+                table = plan_item['multipartIdentifier']
+                res = table.strip('][').split(', ')
+                if len(res) >= 2:
+                    raise ValueError(
+                        "FROM clause has invalid input {0}".format(table))
+                else:
+                    if res[0].lower() != input_symbol.lower():
+                        raise ValueError(
+                            f"Incorrect table template name, it should be {input_symbol}")
+                if table not in cte:
+                    table_names.append(f"{table}")
+        if len(table_names) > 1:
+            raise ValueError(
+                "Multiple tables are not supported")
+
+    @staticmethod
+    def verify_sql_input(query_input: str, input_symbol: str):
+        """
+        Verifies the query provided by the user to ensure that it is a valid SQL query.
+
+        Args:
+            query_input: A Spark SQL query
+            input_symbol (Transformation): The table name to be matched.
+        """
+        spark = SparkSessionSingleton().get_spark_session()
+        parser = spark._jsparkSession.sessionState().sqlParser()
+        try:
+            parser_plan = parser.parsePlan(query_input)
+        except ParseException as pe:
+            raise ParseException(f"Unable to parse the sql expression, exception occurred: {pe}")
+
+        # verify that the parser plan references only the FROM DATA_SOURCE_INPUT template
+        TransformationQueryValidator.__verify_sql_query_plan(parser_plan, input_symbol)
+
+    @staticmethod
+    def create_transformation_template(query: str, input_symbol: str, function_name: str):
+        """
+        Creates the query transformation function to ensure backend integrity.
+        Args:
+            query: A Spark SQL query
+            input_symbol (Transformation): The table name to be used.
+            function_name: The name of the transformation function
+        """
+        transformation_query = query.replace(input_symbol, "{input}")
+        output = USER_TRANSFORMATION_FUNCTION.format(query=transformation_query, function_name=function_name)
+        return output
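A short usage sketch of the validator: the query and function name below are made up, the placeholder name follows the DATA_SOURCE_INPUT example visible in the docstring, and the import path is an assumption since the new file's name is not shown in this view:

# Hypothetical module path for the new file added in this commit.
from ads.feature_store.common.utils.transformation_query_validator import (
    TransformationQueryValidator,
)

query = "SELECT user_id, credit_score FROM DATA_SOURCE_INPUT WHERE credit_score > 500"

# Raises a ParseException for malformed SQL, or a ValueError if the query
# references anything other than the single DATA_SOURCE_INPUT placeholder table.
TransformationQueryValidator.verify_sql_input(query, "DATA_SOURCE_INPUT")

# Wrap the validated query into a transformation function body, e.g.:
#   def credit_score_filter(input):
#       sql_query = f"""SELECT user_id, credit_score FROM {input} WHERE credit_score > 500"""
#       return sql_query
template = TransformationQueryValidator.create_transformation_template(
    query, "DATA_SOURCE_INPUT", "credit_score_filter"
)
print(template)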

ads/feature_store/execution_strategy/spark/spark_execution.py

Lines changed: 7 additions & 9 deletions
@@ -40,7 +40,7 @@
 from ads.feature_store.feature_group_job import FeatureGroupJob
 from ads.feature_store.transformation import Transformation

-from ads.feature_store.feature_statistics.pydeequ_service import StatisticsService
+from ads.feature_store.feature_statistics.statistics_service import StatisticsService

 logger = logging.getLogger(__name__)

@@ -227,11 +227,10 @@ def _save_offline_dataframe(

             logger.info(f"output features for the FeatureGroup: {output_features}")
             # Compute Feature Statistics
-            feature_statistics = StatisticsService.compute_statistics(
-                spark=self._spark_session,
-                statistics_config=feature_group.statistics_config,
-                input_df=featured_data,
-            )
+
+            feature_statistics = StatisticsService.compute_stats_with_mlm(
+                statistics_config=feature_group.oci_feature_group.statistics_config,
+                input_df=featured_data)

         except Exception as ex:
             error_details = str(ex)
@@ -347,9 +346,8 @@ def _save_dataset_input(self, dataset, dataset_job: DatasetJob):
             logger.info(f"output features for the dataset: {output_features}")

             # Compute Feature Statistics
-            feature_statistics = StatisticsService.compute_statistics(
-                spark=self._spark_session,
-                statistics_config=dataset.statistics_config,
+            feature_statistics = StatisticsService.compute_stats_with_mlm(
+                statistics_config=dataset.oci_dataset.statistics_config,
                 input_df=dataset_dataframe,
             )
ads/feature_store/feature_statistics/pydeequ_service.py

Lines changed: 0 additions & 66 deletions
This file was deleted.
