Skip to content

Commit 52d463f

Browse files
authored
code changes to tie ingestion and validation (#247)
2 parents 09c31ec + db35130 commit 52d463f

File tree

3 files changed

+168
-28
lines changed

3 files changed

+168
-28
lines changed

ads/feature_store/common/utils/utility.py

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env python
22
# -*- coding: utf-8; -*-
3-
3+
import copy
44
# Copyright (c) 2023 Oracle and/or its affiliates.
55
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
66

@@ -41,6 +41,7 @@
4141
from ads.feature_engineering.feature_type import datetime
4242

4343
logger = logging.getLogger(__name__)
44+
logger.setLevel(logging.INFO)
4445

4546

4647
def get_execution_engine_type(
@@ -117,6 +118,87 @@ def validate_delta_format_parameters(
117118
raise Exception(f"version number cannot be negative")
118119

119120

121+
def show_ingestion_summary(
    entity_id: str,
    entity_type: EntityType = EntityType.FEATURE_GROUP,
    error_details: str = None,
):
    """
    Displays an ingestion summary table with the given entity type and error details.

    The summary is emitted via the module logger as a single `tabulate`
    "fancy_grid" table; ingestion status is derived from `error_details`
    (any non-empty value means the ingestion failed).

    Args:
        entity_id (str): Identifier of the entity being ingested.
        entity_type (EntityType, optional): The type of entity being ingested. Defaults to EntityType.FEATURE_GROUP.
        error_details (str, optional): Details of any errors that occurred during ingestion. Defaults to None.
    """
    # Imported lazily so the module can load even when tabulate is absent.
    from tabulate import tabulate

    table_headers = ["entity_id", "entity_type", "ingestion_status", "error_details"]
    ingestion_status = "Failed" if error_details else "Succeeded"

    table_values = [
        entity_id,
        entity_type.value,
        ingestion_status,
        error_details if error_details else "None",
    ]

    logger.info(
        "Ingestion Summary \n"
        + tabulate(
            [table_values],
            headers=table_headers,
            tablefmt="fancy_grid",
            numalign="center",
            stralign="center",
        )
    )
156+
157+
158+
def show_validation_summary(ingestion_status: str, validation_output, expectation_type):
    """
    Logs summary tables for a validation run: one row of aggregate statistics
    and one table with the per-rule outcomes.

    Args:
        ingestion_status (str): Current ingestion status to display next to the statistics.
        validation_output (dict): Validation result with "statistics" and "results" keys
            (presumably a Great Expectations validation result — TODO confirm with caller).
        expectation_type: The expectation type that was applied (e.g. "STRICT", "LENIENT").
    """
    # Imported lazily so the module can load even when tabulate is absent.
    from tabulate import tabulate

    statistics = validation_output["statistics"]

    table_headers = (
        ["expectation_type"] + list(statistics.keys()) + ["ingestion_status"]
    )
    table_values = [expectation_type] + list(statistics.values()) + [ingestion_status]

    logger.info(
        "Validation Summary \n"
        + tabulate(
            [table_values],
            headers=table_headers,
            tablefmt="fancy_grid",
            numalign="center",
            stralign="center",
        )
    )

    rule_table_headers = ["rule_type", "arguments", "status"]

    rule_table_values = [
        [
            rule_output["expectation_config"].get("expectation_type"),
            # Drop "batch_id": it is internal bookkeeping, not a user-supplied argument.
            {
                key: value
                for key, value in rule_output["expectation_config"]["kwargs"].items()
                if key != "batch_id"
            },
            rule_output.get("success"),
        ]
        for rule_output in validation_output["results"]
    ]

    logger.info(
        "Validations Rules Summary \n"
        + tabulate(
            rule_table_values,
            headers=rule_table_headers,
            tablefmt="fancy_grid",
            numalign="center",
            stralign="center",
        )
    )
200+
201+
120202
def get_features(
121203
output_columns: List[dict],
122204
parent_id: str,

ads/feature_store/data_validation/great_expectation.py

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def apply_validations(expectation_details, expectation_suite_name, dataframe):
115115
str
116116
A string representation of the validation result.
117117
"""
118-
validation_output = None
118+
expectation_response = None
119119
if (
120120
expectation_details
121121
and expectation_details.get("expectationType")
@@ -126,14 +126,4 @@ def apply_validations(expectation_details, expectation_suite_name, dataframe):
126126
expectation_details, expectation_suite_name, dataframe
127127
)
128128

129-
validation_output = str(expectation_response)
130-
131-
if expectation_details["expectationType"] == ExpectationType.STRICT.value:
132-
if not expectation_response["success"]:
133-
raise Exception(
134-
"Expectation failed with statistics: {0} ... Aborting ingestion.".format(
135-
expectation_response["statistics"]
136-
)
137-
)
138-
139-
return validation_output
129+
return expectation_response

ads/feature_store/execution_strategy/spark/spark_execution.py

Lines changed: 83 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/usr/bin/env python
22
# -*- coding: utf-8; -*-
3+
import json
34

45
# Copyright (c) 2023 Oracle and/or its affiliates.
56
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
@@ -8,7 +9,11 @@
89
import pandas as pd
910

1011
from ads.common.decorator.runtime_dependency import OptionalDependency
11-
from ads.feature_store.common.utils.utility import get_features
12+
from ads.feature_store.common.utils.utility import (
13+
get_features,
14+
show_ingestion_summary,
15+
show_validation_summary,
16+
)
1217
from ads.feature_store.execution_strategy.engine.spark_engine import SparkEngine
1318

1419
try:
@@ -25,6 +30,7 @@
2530
FeatureStoreJobType,
2631
LifecycleState,
2732
EntityType,
33+
ExpectationType,
2834
)
2935
from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton
3036
from ads.feature_store.common.utils.transformation_utils import TransformationUtils
@@ -145,6 +151,36 @@ def delete_dataset(self, dataset, dataset_job: DatasetJob):
145151
output_details=output_details,
146152
)
147153

154+
@staticmethod
def _validate_expectation(expectation_type, validation_output):
    """
    Validates the expectation based on the given expectation type and the validation output.

    A validation summary is always logged via ``show_validation_summary``,
    regardless of success. Ingestion is aborted only when the validation
    failed AND the expectation type is STRICT; a failing non-strict
    expectation is reported in the summary but does not stop ingestion.

    Args:
        expectation_type (str): The type of expectation to validate (e.g., "STRICT", "LENIENT").
        validation_output (dict): The output of the validation containing success status and statistics.

    Raises:
        Exception: If the expectation fails in strict mode.
    """
    error_message = None
    ingestion_status = "Ingestion in progress"

    if not validation_output["success"]:
        if expectation_type == ExpectationType.STRICT.value:
            # No placeholders needed, so a plain string (was an f-string with
            # nothing to interpolate).
            error_message = "Expectation failed with Insufficient Success Rate, Aborting ingestion"
            ingestion_status = "Insufficient Success Rate, Aborting ingestion"

    # Summary is logged before raising so the failure details are visible.
    show_validation_summary(ingestion_status, validation_output, expectation_type)

    if error_message:
        raise Exception(error_message)
183+
148184
def _save_offline_dataframe(
149185
self, data_frame, feature_group, feature_group_job: FeatureGroupJob
150186
):
@@ -182,12 +218,22 @@ def _save_offline_dataframe(
182218

183219
# TODO: Get event timestamp column and apply filtering basis from and to timestamp
184220

185-
# Apply validations
186-
validation_output = ExpectationService.apply_validations(
187-
expectation_details=feature_group.expectation_details,
188-
expectation_suite_name=feature_group.name,
189-
dataframe=data_frame,
190-
)
221+
if feature_group.expectation_details:
222+
expectation_type = feature_group.expectation_details["expectationType"]
223+
logger.info(f"Validation expectation type: {expectation_type}")
224+
225+
# Apply validations
226+
validation_output = ExpectationService.apply_validations(
227+
expectation_details=feature_group.expectation_details,
228+
expectation_suite_name=feature_group.name,
229+
dataframe=data_frame,
230+
)
231+
232+
if validation_output:
233+
self._validate_expectation(
234+
expectation_type=expectation_type,
235+
validation_output=validation_output,
236+
)
191237

192238
# Apply the transformation
193239
if feature_group.transformation_id:
@@ -238,9 +284,15 @@ def _save_offline_dataframe(
238284
f"FeatureGroup Materialization Failed with : {type(ex)} with error message: {ex}"
239285
)
240286

287+
show_ingestion_summary(
288+
entity_id=feature_group.id,
289+
entity_type=EntityType.FEATURE_GROUP,
290+
error_details=error_details,
291+
)
292+
241293
output_details = {
242294
"error_details": error_details,
243-
"validation_output": validation_output,
295+
"validation_output": str(validation_output),
244296
"commit_id": "commit_id",
245297
"feature_statistics": feature_statistics,
246298
}
@@ -323,12 +375,22 @@ def _save_dataset_input(self, dataset, dataset_job: DatasetJob):
323375
# Execute the SQL query on the spark and load the dataframe.
324376
dataset_dataframe = self.spark_engine.sql(dataset.query)
325377

326-
# Apply validations
327-
validation_output = ExpectationService.apply_validations(
328-
expectation_details=dataset.expectation_details,
329-
expectation_suite_name=dataset.name,
330-
dataframe=dataset_dataframe,
331-
)
378+
if dataset.expectation_details:
379+
expectation_type = dataset.expectation_details["expectationType"]
380+
logger.info(f"Validation expectation type: {expectation_type}")
381+
382+
# Apply validations
383+
validation_output = ExpectationService.apply_validations(
384+
expectation_details=dataset.expectation_details,
385+
expectation_suite_name=dataset.name,
386+
dataframe=dataset_dataframe,
387+
)
388+
389+
if validation_output:
390+
self._validate_expectation(
391+
expectation_type=expectation_type,
392+
validation_output=validation_output,
393+
)
332394

333395
self.delta_lake_service.save_delta_dataframe(
334396
dataset_dataframe,
@@ -357,9 +419,15 @@ def _save_dataset_input(self, dataset, dataset_job: DatasetJob):
357419
f"Dataset Materialization Failed with : {type(ex)} with error message: {ex}"
358420
)
359421

422+
show_ingestion_summary(
423+
entity_id=dataset.id,
424+
entity_type=EntityType.DATASET,
425+
error_details=error_details,
426+
)
427+
360428
output_details = {
361429
"error_details": error_details,
362-
"validation_output": validation_output,
430+
"validation_output": str(validation_output),
363431
"commit_id": "commit_id",
364432
"feature_statistics": feature_statistics,
365433
}

0 commit comments

Comments
 (0)