review comment changes

guptadivyank · guptadivyank · commit b8f9c16b277c · 2023-08-22T19:53:41.000+05:30
diff --git a/ads/feature_store/dataset.py b/ads/feature_store/dataset.py
@@ -835,7 +835,33 @@ def materialise(
 
         dataset_execution_strategy.ingest_dataset(self, dataset_job)
 
-    @deprecated(details="preview functionality is deprecated. Please use as_of.")
+    def get_last_job(self) -> "DatasetJob":
+        """Gets the details of the last job run for this dataset.
+
+        Returns:
+            DatasetJob
+        """
+
+        if not self.id:
+            raise ValueError(
+                "Dataset needs to be saved to the feature store before getting associated jobs."
+            )
+
+        if not self.job_id:
+            ds_job = DatasetJob.list(
+                dataset_id=self.id,
+                compartment_id=self.compartment_id,
+                sort_by="timeCreated",
+                limit="1",
+            )
+            if not ds_job:
+                raise ValueError(
+                    "Unable to retrieve the associated last job. Please make sure you materialized the data."
+                )
+            self.with_job_id(ds_job[0].id)
+            return ds_job[0]
+        return DatasetJob.from_id(self.job_id)
+
     def preview(
         self,
         row_count: int = 10,
@@ -990,14 +1016,8 @@ def get_statistics(self, job_id: str = None) -> "Statistics":
             raise ValueError(
                 "Dataset needs to be saved to the feature store before retrieving the statistics"
             )
-        stat_job_id = job_id
-        if job_id is None:
-            if self.job_id is None:
-                raise ValueError(
-                    "Unable to retrieve the last job,please provide the job id,make sure you materialised the data'"
-                )
-            else:
-                stat_job_id = self.job_id
+
+        stat_job_id = job_id if job_id is not None else self.get_last_job().id
 
         # TODO: take the one in memory or will list down job ids and find the latest
         dataset_job = DatasetJob.from_id(stat_job_id)
@@ -1023,14 +1043,8 @@ def get_validation_output(self, job_id: str = None) -> "ValidationOutput":
             raise ValueError(
                 "Dataset needs to be saved to the feature store before retrieving the validation report"
             )
-        validation_job_id = job_id
-        if job_id is None:
-            if self.job_id is None:
-                raise ValueError(
-                    "Unable to retrieve the last job,please provide the job id,make sure you materialised the data'"
-                )
-            else:
-                validation_job_id = self.job_id
+
+        validation_job_id = job_id if job_id is not None else self.get_last_job().id
 
         # retrieve the validation output JSON from data_flow_batch_execution_output
         dataset_job = DatasetJob.from_id(validation_job_id)
diff --git a/ads/feature_store/feature_group.py b/ads/feature_store/feature_group.py
@@ -930,7 +930,7 @@ def get_last_job(self) -> "FeatureGroupJob":
             )
             if not fg_job:
                 raise ValueError(
-                    "Associated jobs cannot be retrieved before calling 'materialise' or 'delete'."
+                    "Unable to retrieve the associated last job. Please make sure you materialized the data."
                 )
             self.with_job_id(fg_job[0].id)
             return fg_job[0]
@@ -1353,7 +1353,7 @@ def get_statistics(self, job_id: str = None) -> "Statistics":
                 "FeatureGroup needs to be saved to the feature store before retrieving the statistics"
             )
 
-        stat_job_id = self._get_job_id(job_id)
+        stat_job_id = job_id if job_id is not None else self.get_last_job().id
 
         # TODO: take the one in memory or will list down job ids and find the latest
         fg_job = FeatureGroupJob.from_id(stat_job_id)
@@ -1382,7 +1382,7 @@ def get_validation_output(self, job_id: str = None) -> "ValidationOutput":
                 "FeatureGroup needs to be saved to the feature store before retrieving the validation report"
             )
 
-        validation_job_id = self._get_job_id(job_id)
+        validation_job_id = job_id if job_id is not None else self.get_last_job().id
 
         # Retrieve the validation output JSON from data_flow_batch_execution_output.
         fg_job = FeatureGroupJob.from_id(validation_job_id)
diff --git a/tests/integration/feature_store/test_feature_group_dataset_listing.py b/tests/integration/feature_store/test_feature_group_dataset_listing.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+import pytest
+from ads.feature_store.feature_group_job import FeatureGroupJob
+
+from ads.feature_store.dataset import Dataset
+from ads.feature_store.feature_group import FeatureGroup
+from tests.integration.feature_store.test_base import FeatureStoreTestCase
+
+
+class TestFeatureGroupDatasetListing(FeatureStoreTestCase):
+    """Contains integration tests for Feature Group and Dataset Listing."""
+
+    def define_feature_group_resource_with_default_config(
+        self, entity_id, feature_store_id
+    ) -> "FeatureGroup":
+        feature_group_resource = (
+            FeatureGroup()
+            .with_description("feature group with default stats config")
+            .with_compartment_id(self.COMPARTMENT_ID)
+            .with_name(self.get_name("petals3"))
+            .with_entity_id(entity_id)
+            .with_feature_store_id(feature_store_id)
+            .with_primary_keys([])
+            .with_input_feature_details(self.INPUT_FEATURE_DETAILS)
+        )
+        return feature_group_resource
+
+    def define_feature_group_resource_with_stats_disabled(
+        self, entity_id, feature_store_id
+    ) -> "FeatureGroup":
+        feature_group_resource = (
+            FeatureGroup()
+            .with_description("feature group with statistics disabled")
+            .with_compartment_id(self.COMPARTMENT_ID)
+            .with_name(self.get_name("petals2"))
+            .with_entity_id(entity_id)
+            .with_feature_store_id(feature_store_id)
+            .with_primary_keys([])
+            .with_input_feature_details(self.INPUT_FEATURE_DETAILS)
+            .with_statistics_config(False)
+        )
+        return feature_group_resource
+
+    def define_dataset_resource_with_default_config(
+        self, entity_id, feature_store_id, feature_group_name
+    ) -> "Dataset":
+        name = self.get_name("petals1")
+        dataset_resource = (
+            Dataset()
+            .with_description("dataset with default statistics configuration")
+            .with_compartment_id(self.COMPARTMENT_ID)
+            .with_name(self.get_name("petals_ds_default_stat"))
+            .with_entity_id(entity_id)
+            .with_feature_store_id(feature_store_id)
+            .with_query(f"SELECT * FROM `{entity_id}`.{feature_group_name}")
+        )
+        return dataset_resource
+
+    def define_dataset_resource_with_stats_disabled(
+        self, entity_id, feature_store_id, feature_group_name
+    ) -> "Dataset":
+        name = self.get_name("petals4")
+        dataset_resource = (
+            Dataset()
+            .with_description("dataset with statistics disabled")
+            .with_compartment_id(self.COMPARTMENT_ID)
+            .with_name(self.get_name("petals_ds_stat_disabled"))
+            .with_entity_id(entity_id)
+            .with_feature_store_id(feature_store_id)
+            .with_query(f"SELECT * FROM `{entity_id}`.{feature_group_name}")
+            .with_statistics_config(False)
+        )
+        return dataset_resource
+
+    def test_feature_group_listing_without_limit(self):
+        """Tests listing of feature group resources without any limit."""
+        fs = self.define_feature_store_resource().create()
+        assert fs.oci_fs.id
+
+        entity = self.create_entity_resource(fs)
+        assert entity.oci_fs_entity.id
+
+        fg1 = self.define_feature_group_resource_with_default_config(
+            entity.oci_fs_entity.id, fs.oci_fs.id
+        ).create()
+        assert fg1.oci_feature_group.id
+        fg1.materialise(self.data)
+        fg1.materialise(self.data2)
+
+        fg1_job_list = FeatureGroupJob.list(compartment_id=self.COMPARTMENT_ID)
+        assert fg1_job_list is not None
+        assert len(fg1_job_list) == 2
+
+        fg2 = self.define_feature_group_resource_with_stats_disabled(
+            entity.oci_fs_entity.id, fs.oci_fs.id
+        ).create()
+        assert fg2.oci_feature_group.id
+        fg2.materialise(self.data3)
+
+        fg_list = FeatureGroup.list(compartment_id=self.COMPARTMENT_ID)
+        assert fg_list is not None
+        assert len(fg_list) == 2
+
+        self.clean_up_feature_group(fg1)
+        self.clean_up_feature_group(fg2)
+        self.clean_up_entity(entity)
+        self.clean_up_feature_store(fs)
+
+    def test_feature_group_listing_with_limit(self):
+        """Tests listing of feature group resources with a user-defined limit."""
+        fs = self.define_feature_store_resource().create()
+        assert fs.oci_fs.id
+
+        entity = self.create_entity_resource(fs)
+        assert entity.oci_fs_entity.id
+
+        fg1 = self.define_feature_group_resource_with_default_config(
+            entity.oci_fs_entity.id, fs.oci_fs.id
+        ).create()
+        assert fg1.oci_feature_group.id
+        fg1.materialise(self.data)
+        fg1.materialise(self.data2)
+
+        fg1_job_list = FeatureGroupJob.list(
+            compartment_id=self.COMPARTMENT_ID,
+            feature_group_id=fg1.id,
+            sort_by="timeCreated",
+            limit="1",
+        )
+        assert fg1_job_list is not None
+        assert len(fg1_job_list) == 1
+
+        fg2 = self.define_feature_group_resource_with_stats_disabled(
+            entity.oci_fs_entity.id, fs.oci_fs.id
+        ).create()
+        assert fg2.oci_feature_group.id
+        fg2.materialise(self.data3)
+
+        fg_list = FeatureGroup.list(
+            compartment_id=self.COMPARTMENT_ID,
+            sort_by="timeCreated",
+            limit="1",
+        )
+        assert fg_list is not None
+        assert len(fg_list) == 1
+
+        self.clean_up_feature_group(fg1)
+        self.clean_up_feature_group(fg2)
+        self.clean_up_entity(entity)
+        self.clean_up_feature_store(fs)
+
+    def test_dataset_listing_without_limit(self):
+        """Tests listing of dataset resources without any limit."""
+        fs = self.define_feature_store_resource().create()
+        assert fs.oci_fs.id
+
+        entity = self.create_entity_resource(fs)
+        assert entity.oci_fs_entity.id
+
+        fg = self.define_feature_group_resource(
+            entity.oci_fs_entity.id, fs.oci_fs.id
+        ).create()
+        assert fg.oci_feature_group.id
+
+        fg.materialise(self.data)
+
+        dataset = self.define_dataset_resource(
+            entity.oci_fs_entity.id, fs.oci_fs.id, fg.oci_feature_group.name
+        ).create()
+        assert dataset.oci_dataset.id
+
+        dataset.materialise()
+        ds_list = Dataset.list(compartment_id=self.COMPARTMENT_ID)
+        assert ds_list is not None
+        assert len(ds_list) == 1
+
+        self.clean_up_dataset(dataset)
+        self.clean_up_feature_group(fg)
+        self.clean_up_entity(entity)
+        self.clean_up_feature_store(fs)
diff --git a/tests/unitary/with_extras/feature_store/test_dataset.py b/tests/unitary/with_extras/feature_store/test_dataset.py
@@ -43,6 +43,14 @@
     "ingestionMode": "OVERWRITE",
 }
 
+DATASET_JOB_RESPONSE_PAYLOAD = {
+    "compartmentId": "ocid1.compartment.oc1.iad.xxx",
+    "datasetId": "861AA4E9C8E811A79D74C464A01CDF42",
+    "id": "d40265b7-d66e-49a3-ae26-699012e0df5d",
+    "ingestionMode": "OVERWRITE",
+    "lifecycleState": "SUCCEEDED",
+}
+
 
 @pytest.fixture
 def dataframe_fixture_basic():
@@ -259,12 +267,12 @@ def test__to_oci_fs_entity(self, mock_load_key_file, mock_config_from_file):
     @patch.object(SparkSessionSingleton, "__init__", return_value=None)
     @patch.object(SparkSessionSingleton, "get_spark_session")
     def test_materialise(self, spark, get_spark_session, mock_update):
-            with patch.object(DatasetJob, "create") as mock_dataset_job:
-                with patch.object(FeatureStore, "from_id"):
-                    with patch.object(DatasetJob, "_mark_job_complete"):
-                        mock_dataset_job.return_value = self.mock_dsc_dataset_job
-                        self.mock_dsc_dataset.with_id(DATASET_OCID)
-                        self.mock_dsc_dataset.materialise()
+        with patch.object(DatasetJob, "create") as mock_dataset_job:
+            with patch.object(FeatureStore, "from_id"):
+                with patch.object(DatasetJob, "_mark_job_complete"):
+                    mock_dataset_job.return_value = self.mock_dsc_dataset_job
+                    self.mock_dsc_dataset.with_id(DATASET_OCID)
+                    self.mock_dsc_dataset.materialise()
 
     @patch.object(SparkSessionSingleton, "__init__", return_value=None)
     @patch.object(SparkSessionSingleton, "get_spark_session")
@@ -306,3 +314,13 @@ def test_restore(self, spark, get_spark_session, feature_store, mock_update):
             self.mock_dsc_dataset.with_id(DATASET_OCID)
             self.mock_dsc_dataset.restore(1)
             mock_execution_strategy.assert_called_once()
+
+    def test_get_last_job(self):
+        """Tests getting most recent dataset job for a dataset."""
+        with patch.object(DatasetJob, "list") as mock_dataset_job:
+            self.mock_dsc_dataset.with_id(DATASET_OCID)
+            mock_dataset_job.return_value = [
+                DatasetJob.from_dict({"spec": DATASET_JOB_RESPONSE_PAYLOAD})
+            ]
+            ds_job = self.mock_dsc_dataset.get_last_job()
+            assert ds_job is not None

Original file line number	Diff line number	Diff line change
`@@ -930,7 +930,7 @@ def get_last_job(self) -> "FeatureGroupJob":`
`930`	`930`	`)`
`931`	`931`	`if not fg_job:`
`932`	`932`	`raise ValueError(`
`933`		`- "Associated jobs cannot be retrieved before calling 'materialise' or 'delete'."`
	`933`	`+ "Unable to retrieve the associated last job. Please make sure you materialized the data."`
`934`	`934`	`)`
`935`	`935`	`self.with_job_id(fg_job[0].id)`
`936`	`936`	`return fg_job[0]`
`@@ -1353,7 +1353,7 @@ def get_statistics(self, job_id: str = None) -> "Statistics":`
`1353`	`1353`	`"FeatureGroup needs to be saved to the feature store before retrieving the statistics"`
`1354`	`1354`	`)`
`1355`	`1355`
`1356`		`- stat_job_id = self._get_job_id(job_id)`
	`1356`	`+ stat_job_id = job_id if job_id is not None else self.get_last_job().id`
`1357`	`1357`
`1358`	`1358`	`# TODO: take the one in memory or will list down job ids and find the latest`
`1359`	`1359`	`fg_job = FeatureGroupJob.from_id(stat_job_id)`
`@@ -1382,7 +1382,7 @@ def get_validation_output(self, job_id: str = None) -> "ValidationOutput":`
`1382`	`1382`	`"FeatureGroup needs to be saved to the feature store before retrieving the validation report"`
`1383`	`1383`	`)`
`1384`	`1384`
`1385`		`- validation_job_id = self._get_job_id(job_id)`
	`1385`	`+ validation_job_id = job_id if job_id is not None else self.get_last_job().id`
`1386`	`1386`
`1387`	`1387`	`# Retrieve the validation output JSON from data_flow_batch_execution_output.`
`1388`	`1388`	`fg_job = FeatureGroupJob.from_id(validation_job_id)`