Skip to content

Commit 95eaab8

Browse files
committed
Merge branch 'feature/feature-store' of github.com:oracle/accelerated-data-science into ODSC-44567/Interoperability-with-model-catalog
2 parents 57ce096 + d28c13d commit 95eaab8

File tree

67 files changed

+2435
-181
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+2435
-181
lines changed

.gitleaks.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@ useDefault = true
99
# Paths listed in allowlist will not be scanned.
1010
[allowlist]
1111
description = "Global allow list"
12-
stopwords = ["test_password", "sample_key"]
1312
regexes = [
1413
'''example-password''',
1514
'''this-is-not-the-secret''',
16-
'''<redacted>'''
15+
'''<redacted>''',
16+
# NVIDIA_GPGKEY_SUM from public documentation:
17+
# https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/10.1/centos7/base/Dockerfile
18+
'''d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87'''
1719
]
1820
paths = [
1921
'''tests/integration/tests_configs.yaml'''

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ repos:
4040
- id: gitleaks
4141
# Oracle copyright checker
4242
- repo: https://github.com/oracle-samples/oci-data-science-ai-samples/
43-
rev: cbe0136
43+
rev: cbe0136f7aaffe463b31ddf3f34b0e16b4b124ff
4444
hooks:
4545
- id: check-copyright
4646
name: check-copyright

ads/common/oci_client.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717
from oci.resource_search import ResourceSearchClient
1818
from oci.secrets import SecretsClient
1919
from oci.vault import VaultsClient
20-
from oci.feature_store import FeatureStoreClient
21-
2220
logger = logging.getLogger(__name__)
2321

2422

@@ -65,10 +63,15 @@ def _client_impl(self, client):
6563
"ai_language": AIServiceLanguageClient,
6664
"data_labeling_dp": DataLabelingClient,
6765
"data_labeling_cp": DataLabelingManagementClient,
68-
"feature_store": FeatureStoreClient,
6966
"resource_search": ResourceSearchClient,
7067
"data_catalog": DataCatalogClient
7168
}
69+
try:
70+
from oci.feature_store import FeatureStoreClient
71+
client_map["feature_store"] = FeatureStoreClient
72+
except ImportError:
73+
logger.warning("OCI SDK with feature store support is not installed")
74+
pass
7275

7376
assert (
7477
client in client_map

ads/common/oci_mixin.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ def _parse_kwargs(attribute_map: dict, **kwargs):
230230

231231
return parsed_kwargs
232232

233-
@classmethod
233+
@class_or_instance_method
234234
def deserialize(cls, data, to_cls):
235235
"""De-serialize data from dictionary to an OCI model"""
236236
if cls.type_mappings is None:
@@ -549,7 +549,7 @@ def from_dict(cls, data):
549549
"""
550550
return cls.create_instance(**data)
551551

552-
@classmethod
552+
@class_or_instance_method
553553
def deserialize(cls, data: dict, to_cls: str = None):
554554
"""Deserialize data
555555
@@ -726,7 +726,7 @@ def update_from_oci_model(
726726
for attr in self.swagger_types.keys():
727727
if (
728728
hasattr(oci_model_instance, attr)
729-
and getattr(oci_model_instance, attr) is not None
729+
and getattr(oci_model_instance, attr)
730730
and (
731731
not hasattr(self, attr)
732732
or not getattr(self, attr)

ads/common/serializer.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import yaml
1616

1717
from ads.common import logger
18+
from ads.common.auth import default_signer
1819

1920
try:
2021
from yaml import CSafeDumper as dumper
@@ -134,6 +135,14 @@ def _read_from_file(uri: str, **kwargs) -> str:
134135
-------
135136
string: Contents in file specified by URI
136137
"""
138+
# Add default signer if the uri is an object storage uri, and
139+
# the user does not specify config or signer.
140+
if (
141+
uri.startswith("oci://")
142+
and "config" not in kwargs
143+
and "signer" not in kwargs
144+
):
145+
kwargs.update(default_signer())
137146
with fsspec.open(uri, "r", **kwargs) as f:
138147
return f.read()
139148

ads/feature_store/dataset.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -896,11 +896,7 @@ def get_validation_output(self, job_id: str = None) -> "ValidationOutput":
896896
validation_output = (
897897
output_details.get("validationOutput") if output_details else None
898898
)
899-
validation_output_json = (
900-
json.loads(validation_output) if validation_output else None
901-
)
902-
903-
return ValidationOutput(validation_output_json)
899+
return ValidationOutput(validation_output)
904900

905901
@classmethod
906902
def list_df(cls, compartment_id: str = None, **kwargs) -> "pandas.DataFrame":

ads/feature_store/docs/source/conf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
sys.path.insert(0, os.path.abspath("../../"))
1111

12-
version = "1.0"
12+
version = "1.2"
1313
release = version
1414

1515

@@ -54,7 +54,7 @@
5454

5555
# Get version
5656

57-
version = "1.0"
57+
version = "1.2"
5858
release = version
5959

6060
# Unless we want to expose real buckets and namespaces

ads/feature_store/docs/source/dataset.rst

Lines changed: 74 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -119,46 +119,97 @@ With a Dataset instance, we can get the last dataset job details using ``get_las
119119

120120
.. code-block:: python3
121121
122-
# Fetch validation results for a dataset
123122
dataset_job = dataset.get_last_job()
124-
df = dataset_job.get_validation_output().to_dataframe()
125-
df.show()
126-
127123
128124
Save expectation entity
129125
=======================
126+
Feature store allows you to define expectations on data being materialized into a dataset instance. With a ``Dataset`` instance, you can save the expectation details using ``with_expectation_suite()`` with parameters
130127

131-
With a Dataset instance, we can save the expectation entity using ``save_expectation()``
128+
- ``expectation_suite: ExpectationSuite``. ExpectationSuite of Great Expectations
129+
- ``expectation_type: ExpectationType``. Type of expectation
130+
- ``ExpectationType.STRICT``: Fail the job if expectation not met
131+
- ``ExpectationType.LENIENT``: Pass the job even if expectation not met
132132

133133
.. note::
134134

135135
Great Expectations is a Python-based open-source library for validating, documenting, and profiling your data. It helps you to maintain data quality and improve communication about data between teams. Software developers have long known that automated testing is essential for managing complex codebases.
136136

137137
.. image:: figures/validation.png
138138

139+
.. code-block:: python3
139140
140-
The ``.save_expectation()`` method takes the following optional parameter:
141+
expectation_suite = ExpectationSuite(
142+
expectation_suite_name="expectation_suite_name"
143+
)
144+
expectation_suite.add_expectation(
145+
ExpectationConfiguration(
146+
expectation_type="expect_column_values_to_not_be_null",
147+
kwargs={"column": "<column>"},
148+
)
149+
150+
dataset_resource = (
151+
Dataset()
152+
.with_description("dataset description")
153+
.with_compartment_id(<compartment_id>)
154+
.with_name(<name>)
155+
.with_entity_id(entity_id)
156+
.with_feature_store_id(feature_store_id)
157+
.with_query(f"SELECT * FROM `{entity_id}`.{feature_group_name}")
158+
.with_expectation_suite(
159+
expectation_suite=expectation_suite,
160+
expectation_type=ExpectationType.STRICT,
161+
)
162+
)
163+
164+
You can call the ``get_validation_output()`` method of the Dataset instance to fetch validation results for a specific ingestion job.
165+
The ``get_validation_output()`` method takes the following optional parameter:
141166

142-
- ``expectation_suite: ExpectationSuite``. Expectation suite of great expectation
143-
- ``expectation_type: ExpectationType``. Type of expectation
144-
- ``ExpectationType.STRICT``: Fail the job if expectation not met
145-
- ``ExpectationType.LENIENT``: Pass the job even if expectation not met
167+
- ``job_id: string``. Id of dataset job
168+
169+
``get_validation_output().to_pandas()`` will output the validation results for each expectation as pandas dataframe
170+
171+
.. image:: figures/dataset_validation_results.png
172+
173+
``get_validation_output().to_summary()`` will output the overall summary of validation as pandas dataframe.
174+
175+
.. image:: figures/dataset_validation_summary.png
176+
177+
.. seealso::
178+
179+
:ref:`Feature Validation`
180+
181+
Statistics Computation
182+
========================
183+
During materialization, the feature store performs computation of statistical metrics for all the features by default. This can be configured using a ``StatisticsConfig`` object, which can be passed at the creation of
184+
dataset or it can be updated later as well.
146185

147186
.. code-block:: python3
148187
149-
dataset.save_expectation(expectation_suite, expectation_type="STRICT")
188+
# Define statistics configuration for selected features
189+
stats_config = StatisticsConfig().with_is_enabled(True).with_columns(["column1", "column2"])
150190
151191
152-
Statistics Results
153-
==================
154-
You can call the ``get_statistics()`` method of the Dataset instance to fetch feature statistics results of a dataset job.
192+
This can be used with a dataset instance.
155193

156-
.. note::
194+
.. code-block:: python3
157195
158-
PyDeequ is a Python API for Deequ, a library built on top of Apache Spark for defining "unit tests for data", which measure data quality in large datasets.
196+
from ads.feature_store.dataset import Dataset
159197
198+
dataset = (
199+
Dataset
200+
.with_name("<dataset_name>")
201+
.with_entity_id(<entity_id>)
202+
.with_feature_store_id("<feature_store_id>")
203+
.with_description("<dataset_description>")
204+
.with_compartment_id("<compartment_id>")
205+
.with_dataset_ingestion_mode(DatasetIngestionMode.SQL)
206+
.with_query('SELECT col FROM <entity_id>.<feature_group_name>')
207+
.with_statistics_config(stats_config)
208+
)
209+
210+
You can call the ``get_statistics()`` method of the dataset to fetch metrics for a specific ingestion job.
160211

161-
The ``.get_statistics()`` method takes the following optional parameter:
212+
The ``get_statistics()`` method takes the following optional parameter:
162213

163214
- ``job_id: string``. Id of dataset job
164215

@@ -167,6 +218,12 @@ The ``.get_statistics()`` method takes the following optional parameter:
167218
# Fetch stats results for a dataset job
168219
df = dataset.get_statistics(job_id).to_pandas()
169220
221+
.. image:: figures/dataset_statistics.png
222+
223+
.. seealso::
224+
225+
:ref:`Statistics`
226+
170227

171228
Get features
172229
============

ads/feature_store/docs/source/feature_group.rst

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -191,10 +191,54 @@ With a ``FeatureGroup`` instance, You can save the expectation details using ``w
191191
)
192192
193193
You can call the ``get_validation_output()`` method of the FeatureGroup instance to fetch validation results for a specific ingestion job.
194+
The ``get_validation_output()`` method takes the following optional parameter:
194195

195-
Statistics Results
196-
==================
197-
You can call the ``get_statistics()`` method of the FeatureGroup instance to fetch statistics for a specific ingestion job.
196+
- ``job_id: string``. Id of feature group job
197+
``get_validation_output().to_pandas()`` will output the validation results for each expectation as pandas dataframe
198+
199+
.. image:: figures/validation_results.png
200+
201+
``get_validation_output().to_summary()`` will output the overall summary of validation as pandas dataframe.
202+
203+
.. image:: figures/validation_summary.png
204+
.. seealso::
205+
206+
:ref:`Feature Validation`
207+
208+
209+
Statistics Computation
210+
========================
211+
During materialization, the feature store performs computation of statistical metrics for all the features by default. This can be configured using a ``StatisticsConfig`` object, which can be passed at the creation of
212+
feature group or it can be updated later as well.
213+
214+
.. code-block:: python3
215+
216+
# Define statistics configuration for selected features
217+
stats_config = StatisticsConfig().with_is_enabled(True).with_columns(["column1", "column2"])
218+
219+
220+
This can be used with a feature group instance.
221+
222+
.. code-block:: python3
223+
224+
# Fetch stats results for a feature group job
225+
from ads.feature_store.feature_group import FeatureGroup
226+
227+
feature_group_resource = (
228+
FeatureGroup()
229+
.with_feature_store_id(feature_store.id)
230+
.with_primary_keys(["<key>"])
231+
.with_name("<name>")
232+
.with_entity_id(entity.id)
233+
.with_compartment_id(<compartment_id>)
234+
.with_schema_details_from_dataframe(<dataframe>)
235+
.with_statistics_config(stats_config)
236+
237+
You can call the ``get_statistics()`` method of the feature group to fetch metrics for a specific ingestion job.
238+
239+
The ``get_statistics()`` method takes the following optional parameter:
240+
241+
- ``job_id: string``. Id of feature group job
198242

199243
.. code-block:: python3
200244
@@ -203,6 +247,10 @@ You can call the ``get_statistics()`` method of the FeatureGroup instance to fet
203247
204248
.. image:: figures/stats_1.png
205249

250+
.. seealso::
251+
252+
:ref:`Statistics`
253+
206254
Get last feature group job
207255
==========================
208256
Feature group job is the execution instance of a feature group. Each feature group job will include validation results and statistics results.
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
.. _Feature Validation:
2+
3+
Feature Validation
4+
******************
5+
6+
Feature validation is the process of checking the quality and accuracy of the features used in a machine learning model. This is important because features that are not accurate or reliable can lead to poor model performance.
7+
Feature store allows you to define expectations on the data being materialized into feature groups and datasets. This is achieved using the open-source library Great Expectations.
8+
9+
.. note::
10+
`Great Expectations <https://docs.greatexpectations.io/docs/0.15.50/>`_ is a Python-based open-source library for validating, documenting, and profiling your data. It helps you to maintain data quality and improve communication about data between teams. Software developers have long known that automated testing is essential for managing complex codebases.
11+
12+
13+
Expectations
14+
============
15+
An Expectation is a verifiable assertion about your data. You can define an expectation as below:
16+
17+
.. code-block:: python3
18+
19+
from great_expectations.core.expectation_configuration import ExpectationConfiguration
20+
21+
# Create an Expectation
22+
expect_config = ExpectationConfiguration(
23+
# Name of expectation type being added
24+
expectation_type="expect_table_columns_to_match_ordered_list",
25+
# These are the arguments of the expectation
26+
# The keys allowed in the dictionary are Parameters and
27+
# Keyword Arguments of this Expectation Type
28+
kwargs={
29+
"column_list": [
30+
"column1",
31+
"column2",
32+
"column3",
33+
"column4",
34+
]
35+
},
36+
# This is how you can optionally add a comment about this expectation.
37+
meta={
38+
"notes": {
39+
"format": "markdown",
40+
"content": "details about this expectation. **Markdown** `Supported`",
41+
}
42+
},
43+
)
44+
45+
Expectations Suite
46+
===================
47+
48+
An Expectation Suite is a collection of verifiable assertions, i.e., expectations about your data. You can define an expectation suite as below:
49+
50+
.. code-block:: python3
51+
52+
# Create an Expectation Suite
53+
expectation_suite = ExpectationSuite(
54+
expectation_suite_name=<expectation_suite_name>
55+
)
56+
expectation_suite.add_expectation(expect_config)

0 commit comments

Comments
 (0)