Commit 2a57246
Stats generation done using MLM Insights and transformation query support for SQL
1 parent 8e8fa68 commit 2a57246

File tree

12 files changed: +412 −102 lines changed


ads/common/decorator/runtime_dependency.py

Lines changed: 2 additions & 1 deletion
@@ -65,8 +65,9 @@ class OptionalDependency:
     SPARK = "oracle-ads[spark]"
     HUGGINGFACE = "oracle-ads[huggingface]"
     GREAT_EXPECTATIONS = "oracle-ads[great-expectations]"
-    PYDEEQU = "oracle-ads[pydeequ]"
     GRAPHVIZ = "oracle-ads[graphviz]"
+    MLM_INSIGHTS = "oracle-ads[mlm_insights]"
+    PYARROW = "oracle-ads[pyarrow]"


 def runtime_dependency(

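For context, a minimal sketch of how these optional-dependency constants are typically consumed: the constant is interpolated into the install hint raised when an optional import is missing. The surrounding try/except here is illustrative only; it mirrors the pattern used in the new statistics service further below.

from ads.common.decorator.runtime_dependency import OptionalDependency

try:
    import mlm_insights  # optional extra: oracle-ads[mlm_insights]
except ModuleNotFoundError:
    # point the user at the extra that provides the missing module
    raise ModuleNotFoundError(
        f"The `mlm_insights` module was not found. Please run `pip install "
        f"{OptionalDependency.MLM_INSIGHTS}`."
    )
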
ads/feature_store/common/spark_session_singleton.py

Lines changed: 2 additions & 4 deletions
@@ -80,16 +80,14 @@ def __init__(self, metastore_id: str = None):
         if developer_enabled():
             # Configure spark session with delta jars only in developer mode. In other cases,
             # jars should be part of the conda pack
-            spark_builder.config(
-                "spark.jars",
-                "https://repo1.maven.org/maven2/com/amazon/deequ/deequ/2.0.1-spark-3.2/deequ-2.0.1-spark-3.2.jar",
-            )
             self.spark_session = configure_spark_with_delta_pip(
                 spark_builder
             ).getOrCreate()
         else:
             self.spark_session = spark_builder.getOrCreate()

+        self.spark_session.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
+
     def get_spark_session(self):
         """Access method to get the spark session."""
         return self.spark_session

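The new `spark.sql.execution.arrow.pyspark.enabled` setting turns on Arrow-backed transfers between Spark and pandas, presumably in support of the pyarrow dependency and MLM-based profiling added in this commit. A small illustration of what the setting accelerates (the sample DataFrame is hypothetical):

import pandas as pd

from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton

spark = SparkSessionSingleton().get_spark_session()

# pandas -> Spark and Spark -> pandas conversions go through Arrow when enabled
sdf = spark.createDataFrame(pd.DataFrame({"id": [1, 2, 3]}))
pdf = sdf.toPandas()
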
ads/feature_store/common/utils/feature_schema_mapper.py

Lines changed: 42 additions & 0 deletions
@@ -4,12 +4,15 @@
 # Copyright (c) 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

+import logging
 from typing import List

 import numpy as np
 import pandas as pd

 from ads.common.decorator.runtime_dependency import OptionalDependency
+from mlm_insights.constants import types
+
 from ads.feature_store.common.enums import FeatureType

 try:

@@ -22,6 +25,8 @@
 except Exception as e:
     raise

+logger = logging.getLogger(__name__)
+

 def map_spark_type_to_feature_type(spark_type):
     """Returns the feature type corresponding to SparkType

@@ -243,3 +248,40 @@ def convert_pandas_datatype_with_schema(
         .astype(pandas_type)
         .where(pd.notnull(input_df[column]), None)
     )
+
+
+def map_spark_type_to_stats_data_type(spark_type):
+    """Returns the MLM data type corresponding to SparkType
+    :param spark_type:
+    :return:
+    """
+    spark_type_to_feature_type = {
+        StringType(): types.DataType.STRING,
+        IntegerType(): types.DataType.INTEGER,
+        FloatType(): types.DataType.FLOAT,
+        DoubleType(): types.DataType.FLOAT,
+        BooleanType(): types.DataType.BOOLEAN,
+        DecimalType(): types.DataType.FLOAT,
+        ShortType(): types.DataType.INTEGER,
+        LongType(): types.DataType.INTEGER,
+    }
+
+    return spark_type_to_feature_type.get(spark_type)
+
+
+def map_spark_type_to_stats_variable_type(spark_type):
+    """Returns the MLM variable type corresponding to SparkType
+    :param spark_type:
+    :return:
+    """
+    spark_type_to_feature_type = {
+        StringType(): types.VariableType.NOMINAL,
+        IntegerType(): types.VariableType.CONTINUOUS,
+        FloatType(): types.VariableType.CONTINUOUS,
+        DoubleType(): types.VariableType.CONTINUOUS,
+        BooleanType(): types.VariableType.BINARY,
+        DecimalType(): types.VariableType.CONTINUOUS,
+        ShortType(): types.VariableType.CONTINUOUS,
+        LongType(): types.VariableType.CONTINUOUS,
+    }
+
+    return spark_type_to_feature_type.get(spark_type)
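
A small illustration (hypothetical field) of how the two new mappers combine into the `FeatureType` consumed by MLM Insights; the dictionary lookups work because simple Spark types such as `StringType()` compare equal by class:

from pyspark.sql.types import StringType
from mlm_insights.constants.types import FeatureType

spark_type = StringType()
feature_type = FeatureType(
    map_spark_type_to_stats_data_type(spark_type),      # types.DataType.STRING
    map_spark_type_to_stats_variable_type(spark_type),  # types.VariableType.NOMINAL
)
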
Lines changed: 77 additions & 0 deletions
import json
import re

from pyparsing import ParseException

from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton

USER_TRANSFORMATION_FUNCTION = \
    """def {function_name}(input):
    sql_query = f\"\"\"{query}\"\"\"
    return sql_query"""


class TransformationQueryValidator:

    @staticmethod
    def __verify_sql_query_plan(parser_plan, input_symbol: str):
        """
        Once the SQL parser has parsed the query, this method takes the parser plan as input,
        collects the referenced table names, and verifies that only a single table is used
        and that it carries the placeholder name.

        Args:
            parser_plan: A Spark sqlParser ParsePlan object.
            input_symbol (str): The table name to be matched.
        """
        plan_items = json.loads(parser_plan.toJSON())
        plan_string = parser_plan.toString()
        cte = re.findall(r"CTE \[(.*?)\]", plan_string)
        table_names = []
        for plan_item in plan_items:
            if plan_item['class'] == 'org.apache.spark.sql.catalyst.analysis.UnresolvedRelation':
                table = plan_item['multipartIdentifier']
                res = table.strip('][').split(', ')
                if len(res) >= 2:
                    raise ValueError(
                        "FROM Clause has invalid input {0}".format(table))
                else:
                    if res[0].lower() != input_symbol.lower():
                        raise ValueError(
                            f"Incorrect table template name, It should be {input_symbol}")
                if table not in cte:
                    table_names.append(f"{table}")
        if len(table_names) > 1:
            raise ValueError(
                "Multiple tables are not supported ")

    @staticmethod
    def verify_sql_input(query_input: str, input_symbol: str):
        """
        Verifies that the query provided by the user is a valid Spark SQL query.

        Args:
            query_input: A Spark SQL query.
            input_symbol (str): The table name to be matched.
        """
        spark = SparkSessionSingleton().get_spark_session()
        parser = spark._jsparkSession.sessionState().sqlParser()
        try:
            parser_plan = parser.parsePlan(query_input)
            # verify that the parser plan only reads FROM the DATA_SOURCE_INPUT template
            TransformationQueryValidator.__verify_sql_query_plan(parser_plan, input_symbol)
        except ParseException as pe:
            raise ParseException(f"Unable to parse the sql expression, exception occurred: {pe}")

    @staticmethod
    def create_transformation_template(query: str, input_symbol: str, function_name: str):
        """
        Creates the query transformation function to ensure backend integrity.

        Args:
            query: A Spark SQL query.
            input_symbol (str): The table name to be used.
            function_name: The name of the transformation function.
        """
        transformation_query = query.replace(input_symbol, "{input}")
        output = USER_TRANSFORMATION_FUNCTION.format(query=transformation_query, function_name=function_name)
        return output

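A usage sketch for the validator above; the query and placeholder name are illustrative (the `DATA_SOURCE_INPUT` name is taken from the comment in `verify_sql_input`):

query = "SELECT col_a, col_b FROM DATA_SOURCE_INPUT WHERE col_a IS NOT NULL"

# raises ParseException / ValueError if the query is malformed or references other tables
TransformationQueryValidator.verify_sql_input(query, input_symbol="DATA_SOURCE_INPUT")

# wrap the query into a transformation function whose FROM clause becomes an {input} placeholder
source = TransformationQueryValidator.create_transformation_template(
    query, input_symbol="DATA_SOURCE_INPUT", function_name="sql_transformation"
)
# source:
# def sql_transformation(input):
#     sql_query = f"""SELECT col_a, col_b FROM {input} WHERE col_a IS NOT NULL"""
#     return sql_query
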
ads/feature_store/execution_strategy/spark/spark_execution.py

Lines changed: 5 additions & 7 deletions
@@ -40,7 +40,7 @@
 from ads.feature_store.feature_group_job import FeatureGroupJob
 from ads.feature_store.transformation import Transformation

-from ads.feature_store.feature_statistics.pydeequ_service import StatisticsService
+from ads.feature_store.feature_statistics.statistics_service import StatisticsService

 logger = logging.getLogger(__name__)

@@ -227,11 +227,10 @@ def _save_offline_dataframe(

             logger.info(f"output features for the FeatureGroup: {output_features}")
             # Compute Feature Statistics
-            feature_statistics = StatisticsService.compute_statistics(
-                spark=self._spark_session,
+
+            feature_statistics = StatisticsService.compute_stats_with_mlm(
                 statistics_config=feature_group.statistics_config,
-                input_df=featured_data,
-            )
+                input_df=featured_data)

         except Exception as ex:
             error_details = str(ex)

@@ -347,8 +346,7 @@ def _save_dataset_input(self, dataset, dataset_job: DatasetJob):
             logger.info(f"output features for the dataset: {output_features}")

             # Compute Feature Statistics
-            feature_statistics = StatisticsService.compute_statistics(
-                spark=self._spark_session,
+            feature_statistics = StatisticsService.compute_stats_with_mlm(
                 statistics_config=dataset.statistics_config,
                 input_df=dataset_dataframe,
             )

ads/feature_store/feature_statistics/pydeequ_service.py

Lines changed: 0 additions & 66 deletions
This file was deleted.
Lines changed: 84 additions & 0 deletions
#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
import json
import logging

from ads.feature_store.statistics_config import StatisticsConfig
from ads.feature_store.common.utils.feature_schema_mapper import *

try:
    from pyspark.sql import DataFrame
except ModuleNotFoundError:
    raise ModuleNotFoundError(
        f"The `pyspark` module was not found. Please run `pip install "
        f"{OptionalDependency.SPARK}`."
    )

try:
    from mlm_insights.builder.builder_component import EngineDetail
    from mlm_insights.builder.insights_builder import InsightsBuilder
    from mlm_insights.constants.types import FeatureType
except ModuleNotFoundError:
    raise ModuleNotFoundError(
        f"The `mlm_insights` module was not found. Please run `pip install "
        f"{OptionalDependency.MLM_INSIGHTS}`."
    )

logger = logging.getLogger(__name__)
CONST_FEATURE_METRICS = "feature_metrics"


class StatisticsService:
    """StatisticsService computes feature statistics using the MLM Insights column profiler."""

    @staticmethod
    def compute_stats_with_mlm(
        statistics_config: StatisticsConfig, input_df: DataFrame
    ):
        feature_metrics = None
        if bool(input_df.head(1)) and statistics_config and statistics_config.get("isEnabled"):
            feature_schema = {}
            if input_df.schema:
                StatisticsService.__get_mlm_supported_schema(feature_schema, input_df, statistics_config)
                feature_metrics = StatisticsService.__get_feature_metric(feature_schema, input_df)
            else:
                raise ValueError("Dataframe schema is missing")
        return feature_metrics

    @staticmethod
    def __get_feature_metric(feature_schema: dict, data_frame: DataFrame):
        feature_metrics = None
        runner = InsightsBuilder(). \
            with_input_schema(input_schema=feature_schema). \
            with_data_frame(data_frame=data_frame). \
            with_engine(engine=EngineDetail(engine_name="spark")). \
            build()
        result = runner.run()
        if result and result.profile:
            profile = result.profile
            feature_metrics = json.dumps(profile.to_json()[CONST_FEATURE_METRICS])
        else:
            logger.warning(f"stats computation failed with MLM for schema {feature_schema}")
        return feature_metrics

    @staticmethod
    def __get_mlm_supported_schema(feature_schema: dict, input_df: DataFrame, statistics_config: StatisticsConfig):
        relevant_columns = statistics_config.get("columns")
        for field in input_df.schema.fields:
            data_type = map_spark_type_to_stats_data_type(field.dataType)
            if not data_type:
                logger.warning(f"Unable to map spark data type fields to MLM fields, "
                               f"Actual data type {field.dataType}")
            elif relevant_columns:
                if field.name in relevant_columns:
                    feature_schema[field.name] = FeatureType(
                        data_type, map_spark_type_to_stats_variable_type(field.dataType)
                    )
            else:
                feature_schema[field.name] = FeatureType(
                    data_type, map_spark_type_to_stats_variable_type(field.dataType)
                )

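A minimal usage sketch for the new service. The sample DataFrame is hypothetical and the `StatisticsConfig` keyword arguments are assumed; the service only relies on the config answering `get("isEnabled")` and `get("columns")`:

from ads.feature_store.statistics_config import StatisticsConfig
from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton

spark = SparkSessionSingleton().get_spark_session()
input_df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])

# assumed constructor; statistics can be limited to specific columns
stats_config = StatisticsConfig(is_enabled=True, columns=["id", "label"])

# returns a JSON string with the profile's "feature_metrics" section, or None
feature_metrics = StatisticsService.compute_stats_with_mlm(
    statistics_config=stats_config, input_df=input_df
)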