Skip to content

Commit a424945

Browse files
committed
Merge branch 'feature/feature-store' of github.com:oracle/accelerated-data-science into ODSC-44566/validation-output
2 parents 158fe0c + de655f7 commit a424945

37 files changed

+2321
-191
lines changed

.gitleaks.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@ useDefault = true
99
# Paths listed in allowlist will not be scanned.
1010
[allowlist]
1111
description = "Global allow list"
12-
stopwords = ["test_password", "sample_key"]
1312
regexes = [
1413
'''example-password''',
1514
'''this-is-not-the-secret''',
16-
'''<redacted>'''
15+
'''<redacted>''',
16+
# NVIDIA_GPGKEY_SUM from public documentation:
17+
# https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/10.1/centos7/base/Dockerfile
18+
'''d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87'''
1719
]
1820
paths = [
1921
'''tests/integration/tests_configs.yaml'''

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ repos:
4040
- id: gitleaks
4141
# Oracle copyright checker
4242
- repo: https://github.com/oracle-samples/oci-data-science-ai-samples/
43-
rev: cbe0136
43+
rev: cbe0136f7aaffe463b31ddf3f34b0e16b4b124ff
4444
hooks:
4545
- id: check-copyright
4646
name: check-copyright

ads/common/oci_client.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717
from oci.resource_search import ResourceSearchClient
1818
from oci.secrets import SecretsClient
1919
from oci.vault import VaultsClient
20-
from oci.feature_store import FeatureStoreClient
21-
2220
logger = logging.getLogger(__name__)
2321

2422

@@ -65,10 +63,15 @@ def _client_impl(self, client):
6563
"ai_language": AIServiceLanguageClient,
6664
"data_labeling_dp": DataLabelingClient,
6765
"data_labeling_cp": DataLabelingManagementClient,
68-
"feature_store": FeatureStoreClient,
6966
"resource_search": ResourceSearchClient,
7067
"data_catalog": DataCatalogClient
7168
}
69+
try:
70+
from oci.feature_store import FeatureStoreClient
71+
client_map["feature_store"] = FeatureStoreClient
72+
except ImportError:
73+
logger.warning("OCI SDK with feature store support is not installed")
74+
pass
7275

7376
assert (
7477
client in client_map

ads/common/oci_mixin.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ def _parse_kwargs(attribute_map: dict, **kwargs):
230230

231231
return parsed_kwargs
232232

233-
@classmethod
233+
@class_or_instance_method
234234
def deserialize(cls, data, to_cls):
235235
"""De-serialize data from dictionary to an OCI model"""
236236
if cls.type_mappings is None:
@@ -549,7 +549,7 @@ def from_dict(cls, data):
549549
"""
550550
return cls.create_instance(**data)
551551

552-
@classmethod
552+
@class_or_instance_method
553553
def deserialize(cls, data: dict, to_cls: str = None):
554554
"""Deserialize data
555555
@@ -726,7 +726,7 @@ def update_from_oci_model(
726726
for attr in self.swagger_types.keys():
727727
if (
728728
hasattr(oci_model_instance, attr)
729-
and getattr(oci_model_instance, attr) is not None
729+
and getattr(oci_model_instance, attr)
730730
and (
731731
not hasattr(self, attr)
732732
or not getattr(self, attr)

ads/common/serializer.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import yaml
1616

1717
from ads.common import logger
18+
from ads.common.auth import default_signer
1819

1920
try:
2021
from yaml import CSafeDumper as dumper
@@ -134,6 +135,14 @@ def _read_from_file(uri: str, **kwargs) -> str:
134135
-------
135136
string: Contents in file specified by URI
136137
"""
138+
# Add default signer if the uri is an object storage uri, and
139+
# the user does not specify config or signer.
140+
if (
141+
uri.startswith("oci://")
142+
and "config" not in kwargs
143+
and "signer" not in kwargs
144+
):
145+
kwargs.update(default_signer())
137146
with fsspec.open(uri, "r", **kwargs) as f:
138147
return f.read()
139148

ads/feature_store/docs/source/dataset.rst

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -124,41 +124,60 @@ With a Dataset instance, we can get the last dataset job details using ``get_las
124124
df = dataset_job.get_validation_output().to_dataframe()
125125
df.show()
126126
127-
128127
Save expectation entity
129128
=======================
129+
Feature store allows you to define expectations on data being materialized into feature group instance. With a ``FeatureGroup`` instance, we can save the expectation entity using ``save_expectation()``
130130

131-
With a Dataset instance, we can save the expectation entity using ``save_expectation()``
132-
133-
.. note::
134-
135-
Great Expectations is a Python-based open-source library for validating, documenting, and profiling your data. It helps you to maintain data quality and improve communication about data between teams. Software developers have long known that automated testing is essential for managing complex codebases.
136131

137132
.. image:: figures/validation.png
138133

139-
140134
The ``.save_expectation()`` method takes the following optional parameter:
141135

142-
- ``expectation_suite: ExpectationSuite``. Expectation suite of great expectation
136+
- ``expectation: Expectation``. Expectation of great expectation
143137
- ``expectation_type: ExpectationType``. Type of expectation
144138
- ``ExpectationType.STRICT``: Fail the job if expectation not met
145139
- ``ExpectationType.LENIENT``: Pass the job even if expectation not met
146140

147141
.. code-block:: python3
148142
149-
dataset.save_expectation(expectation_suite, expectation_type="STRICT")
143+
feature_group.save_expectation(expectation_suite, expectation_type="STRICT")
144+
145+
.. seealso::
146+
147+
:ref:`Feature Validation`
148+
149+
Statistics Computation
150+
========================
151+
During materialization, the feature store computes statistical metrics for all features by default. This can be configured using the ``StatisticsConfig`` object, which can be passed at the creation of
152+
dataset or it can be updated later as well.
153+
154+
.. code-block:: python3
155+
156+
# Define statistics configuration for selected features
157+
stats_config = StatisticsConfig().with_is_enabled(True).with_columns(["column1", "column2"])
158+
150159
160+
This can be used with dataset instance.
151161

152-
Statistics Results
153-
==================
154-
You can call the ``get_statistics()`` method of the Dataset instance to fetch feature statistics results of a dataset job.
162+
.. code-block:: python3
155163
156-
.. note::
164+
from ads.feature_store.dataset import Dataset
157165
158-
PyDeequ is a Python API for Deequ, a library built on top of Apache Spark for defining "unit tests for data", which measure data quality in large datasets.
166+
dataset = (
167+
Dataset
168+
.with_name("<dataset_name>")
169+
.with_entity_id(<entity_id>)
170+
.with_feature_store_id("<feature_store_id>")
171+
.with_description("<dataset_description>")
172+
.with_compartment_id("<compartment_id>")
173+
.with_dataset_ingestion_mode(DatasetIngestionMode.SQL)
174+
.with_query('SELECT col FROM <entity_id>.<feature_group_name>')
175+
.with_statistics_config(stats_config)
176+
)
159177
178+
You can call the ``get_statistics()`` method of the dataset to fetch metrics for a specific ingestion job.
160179

161-
The ``.get_statistics()`` method takes the following optional parameter:
180+
The ``get_statistics()`` method takes the following optional parameter:
162181

163182
- ``job_id: string``. Id of dataset job
164183

@@ -167,6 +186,12 @@ The ``.get_statistics()`` method takes the following optional parameter:
167186
# Fetch stats results for a dataset job
168187
df = dataset.get_statistics(job_id).to_pandas()
169188
189+
.. image:: figures/stats_1.png
190+
191+
.. seealso::
192+
193+
:ref:`Statistics`
194+
170195

171196
Get features
172197
============

ads/feature_store/docs/source/feature_group.rst

Lines changed: 45 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -152,49 +152,60 @@ Feature store provides an API similar to Pandas to join feature groups together
152152
153153
Save expectation entity
154154
=======================
155-
With a ``FeatureGroup`` instance, You can save the expectation details using ``with_expectation_suite()`` with parameters
155+
Feature store allows you to define expectations on data being materialized into feature group instance. With a ``FeatureGroup`` instance, we can save the expectation entity using ``save_expectation()``
156156

157-
- ``expectation_suite: ExpectationSuite``. ExpectationSuit of great expectation
157+
158+
.. image:: figures/validation.png
159+
160+
The ``.save_expectation()`` method takes the following optional parameter:
161+
162+
- ``expectation: Expectation``. Expectation of great expectation
158163
- ``expectation_type: ExpectationType``. Type of expectation
159164
- ``ExpectationType.STRICT``: Fail the job if expectation not met
160165
- ``ExpectationType.LENIENT``: Pass the job even if expectation not met
161166

162-
.. note::
167+
.. code-block:: python3
163168
164-
Great Expectations is a Python-based open-source library for validating, documenting, and profiling your data. It helps you to maintain data quality and improve communication about data between teams. Software developers have long known that automated testing is essential for managing complex codebases.
169+
feature_group.save_expectation(expectation_suite, expectation_type="STRICT")
170+
171+
.. seealso::
172+
173+
:ref:`Feature Validation`
165174

166-
.. image:: figures/validation.png
175+
176+
Statistics Computation
177+
========================
178+
During materialization, the feature store computes statistical metrics for all features by default. This can be configured using the ``StatisticsConfig`` object, which can be passed at the creation of
179+
feature group or it can be updated later as well.
167180

168181
.. code-block:: python3
169182
170-
expectation_suite = ExpectationSuite(
171-
expectation_suite_name="expectation_suite_name"
172-
)
173-
expectation_suite.add_expectation(
174-
ExpectationConfiguration(
175-
expectation_type="expect_column_values_to_not_be_null",
176-
kwargs={"column": "<column>"},
177-
)
183+
# Define statistics configuration for selected features
184+
stats_config = StatisticsConfig().with_is_enabled(True).with_columns(["column1", "column2"])
178185
179-
feature_group_resource = (
180-
FeatureGroup()
181-
.with_feature_store_id(feature_store.id)
182-
.with_primary_keys(["<key>"])
183-
.with_name("<name>")
184-
.with_entity_id(entity.id)
185-
.with_compartment_id(<compartment_id>)
186-
.with_schema_details_from_dataframe(<datframe>)
187-
.with_expectation_suite(
188-
expectation_suite=expectation_suite,
189-
expectation_type=ExpectationType.STRICT,
190-
)
191-
)
192186
193-
You can call the ``get_validation_output()`` method of the FeatureGroup instance to fetch validation results for a specific ingestion job.
187+
This can be used with feature group instance.
188+
189+
.. code-block:: python3
190+
191+
# Fetch stats results for a feature group job
192+
from ads.feature_store.feature_group import FeatureGroup
194193
195-
Statistics Results
196-
==================
197-
You can call the ``get_statistics()`` method of the FeatureGroup instance to fetch statistics for a specific ingestion job.
194+
feature_group_resource = (
195+
FeatureGroup()
196+
.with_feature_store_id(feature_store.id)
197+
.with_primary_keys(["<key>"])
198+
.with_name("<name>")
199+
.with_entity_id(entity.id)
200+
.with_compartment_id(<compartment_id>)
201+
.with_schema_details_from_dataframe(<dataframe>)
202+
.with_statistics_config(stats_config)
203+
204+
You can call the ``get_statistics()`` method of the feature group to fetch metrics for a specific ingestion job.
205+
206+
The ``get_statistics()`` method takes the following optional parameter:
207+
208+
- ``job_id: string``. Id of feature group job
198209

199210
.. code-block:: python3
200211
@@ -203,6 +214,10 @@ You can call the ``get_statistics()`` method of the FeatureGroup instance to fet
203214
204215
.. image:: figures/stats_1.png
205216

217+
.. seealso::
218+
219+
:ref:`Statistics`
220+
206221
Get last feature group job
207222
==========================
208223
Feature group job is the execution instance of a feature group. Each feature group job will include validation results and statistics results.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
.. _Feature Validation:
2+
3+
Feature Validation
4+
******************
5+
6+
Feature validation is the process of checking the quality and accuracy of the features used in a machine learning model. This is important because features that are not accurate or reliable can lead to poor model performance.
7+
Feature store allows you to define expectations on the data being materialized into feature groups and datasets. This is achieved using the open-source library Great Expectations.
8+
9+
.. note::
10+
`Great Expectations <https://docs.greatexpectations.io/docs/>`_ is a Python-based open-source library for validating, documenting, and profiling your data. It helps you to maintain data quality and improve communication about data between teams. Software developers have long known that automated testing is essential for managing complex codebases.
11+
12+
13+
Expectations
14+
============
15+
An Expectation is a verifiable assertion about your data. You can define an expectation as below:
16+
17+
.. code-block:: python3
18+
19+
from great_expectations.core.expectation_configuration import ExpectationConfiguration
20+
21+
# Create an Expectation
22+
expect_config = ExpectationConfiguration(
23+
# Name of expectation type being added
24+
expectation_type="expect_table_columns_to_match_ordered_list",
25+
# These are the arguments of the expectation
26+
# The keys allowed in the dictionary are Parameters and
27+
# Keyword Arguments of this Expectation Type
28+
kwargs={
29+
"column_list": [
30+
"column1",
31+
"column2",
32+
"column3",
33+
"column4",
34+
]
35+
},
36+
# This is how you can optionally add a comment about this expectation.
37+
meta={
38+
"notes": {
39+
"format": "markdown",
40+
"content": "details about this expectation. **Markdown** `Supported`",
41+
}
42+
},
43+
)
44+
45+
Expectations Suite
46+
==================
47+
48+
An Expectation Suite is a collection of verifiable assertions, i.e., expectations about your data. You can define an expectation suite as below:
49+
50+
.. code-block:: python3
51+
52+
# Create an Expectation Suite
53+
suite = context.add_expectation_suite(expectation_suite_name="example_suite")
54+
suite.add_expectation(expect_config)
Loading

ads/feature_store/docs/source/index.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ Welcome to oci-feature-store's documentation!
1616
feature_group_job
1717
dataset
1818
dataset_job
19+
statistics
20+
feature_validation
1921
demo
2022
notebook
2123
release_notes

0 commit comments

Comments
 (0)