Skip to content

Commit f04a4a6

Browse files
authored
Feature store marketplace operator (#441)
2 parents fcab1a5 + 24e2c1c commit f04a4a6

File tree

98 files changed

+4968
-932
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

98 files changed

+4968
-932
lines changed

.github/workflows/run-unittests-py38-cov-report.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,9 @@ jobs:
7171
- name: "Install Forecasting dependencies"
7272
run: |
7373
pip install -e ".[forecast]"
74-
74+
- name: "Install featurestore marketplace dependencies"
75+
run: |
76+
pip install -e ".[feature-store-marketplace]"
7577
- name: "Run unitary tests folder with maximum ADS dependencies"
7678
timeout-minutes: 40
7779
shell: bash

ads/common/decorator/runtime_dependency.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env python
22
# -*- coding: utf-8; -*-
33

4-
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
4+
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
55
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
66

77
"""
@@ -65,6 +65,7 @@ class OptionalDependency:
6565
SPARK = "oracle-ads[spark]"
6666
HUGGINGFACE = "oracle-ads[huggingface]"
6767
FORECAST = "oracle-ads[forecast]"
68+
FEATURE_STORE_MARKETPLACE = "oracle-ads[feature-store-marketplace]"
6869
PII = "oracle-ads[pii]"
6970
FEATURE_STORE = "oracle-ads[feature-store]"
7071
GRAPHVIZ = "oracle-ads[graphviz]"

ads/common/extended_enum.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env python
22
# -*- coding: utf-8; -*-
33

4-
# Copyright (c) 2022 Oracle and/or its affiliates.
4+
# Copyright (c) 2022, 2024 Oracle and/or its affiliates.
55
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
66

77

@@ -70,4 +70,4 @@ def values(cls):
7070

7171
@classmethod
7272
def keys(cls):
73-
return sorted(map(lambda c: c.name, cls))
73+
return sorted(map(lambda c: c.name, cls))

ads/common/oci_client.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,21 @@
11
#!/usr/bin/env python
22
# -*- coding: utf-8; -*-
33

4-
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
4+
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
55
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
66

77
import logging
88

9+
import oci.artifacts
910
from oci.ai_language import AIServiceLanguageClient
11+
from oci.artifacts import ArtifactsClient
1012
from oci.data_catalog import DataCatalogClient
1113
from oci.data_flow import DataFlowClient
1214
from oci.data_labeling_service import DataLabelingManagementClient
1315
from oci.data_labeling_service_dataplane import DataLabelingClient
1416
from oci.data_science import DataScienceClient
1517
from oci.identity import IdentityClient
18+
from oci.marketplace import MarketplaceClient
1619
from oci.object_storage import ObjectStorageClient
1720
from oci.resource_search import ResourceSearchClient
1821
from oci.secrets import SecretsClient
@@ -25,7 +28,7 @@ class OCIClientFactory:
2528

2629
"""
2730
A factory class to create OCI client objects. The constructor takes in config, signer and client_kwargs. `client_kwargs` is passed
28-
to the client constructor as key word argutments.
31+
to the client constructor as key word arguments.
2932
3033
Examples
3134
--------
@@ -48,12 +51,15 @@ class OCIClientFactory:
4851
oc.OCIClientFactory(**auth).object_storage # Creates Object storage client using instance principal authentication
4952
"""
5053

51-
def __init__(self, config={}, signer=None, client_kwargs=None):
54+
def __init__(self, config=None, signer=None, client_kwargs=None):
55+
if not config:
56+
config = {}
5257
self.config = config
5358
self.signer = signer
5459
self.client_kwargs = client_kwargs
5560

56-
def _client_impl(self, client):
61+
@staticmethod
62+
def _client_impl(client):
5763
client_map = {
5864
"object_storage": ObjectStorageClient,
5965
"data_science": DataScienceClient,
@@ -66,21 +72,17 @@ def _client_impl(self, client):
6672
"data_labeling_cp": DataLabelingManagementClient,
6773
"resource_search": ResourceSearchClient,
6874
"data_catalog": DataCatalogClient,
75+
"marketplace": MarketplaceClient,
76+
"artifacts": ArtifactsClient,
6977
}
70-
try:
71-
from oci.feature_store import FeatureStoreClient
72-
73-
client_map["feature_store"] = FeatureStoreClient
74-
except ImportError:
75-
logger.debug("OCI SDK with feature store support is not installed")
76-
pass
7778

7879
assert (
7980
client in client_map
8081
), f"Invalid client name. Client name not found in {client_map.keys()}"
8182
return client_map[client]
8283

83-
def _validate_auth_param(self, auth):
84+
@staticmethod
85+
def _validate_auth_param(auth):
8486
if not isinstance(auth, dict):
8587
raise ValueError("auth parameter should be of type dictionary")
8688
if "config" in auth and not isinstance(auth["config"], dict):
@@ -135,10 +137,6 @@ def ai_language(self):
135137
def data_labeling_cp(self):
136138
return self.create_client("data_labeling_cp")
137139

138-
@property
139-
def feature_store(self):
140-
return self.create_client("feature_store")
141-
142140
@property
143141
def data_labeling_dp(self):
144142
return self.create_client("data_labeling_dp")
@@ -150,3 +148,11 @@ def resource_search(self):
150148
@property
151149
def data_catalog(self):
152150
return self.create_client("data_catalog")
151+
152+
@property
153+
def marketplace(self):
154+
return self.create_client("marketplace")
155+
156+
@property
157+
def artifacts(self) -> oci.artifacts.ArtifactsClient:
158+
return self.create_client("artifacts")

ads/feature_store/README.md

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Oracle Feature Store (ADS)
2+
3+
[![Python](https://img.shields.io/badge/python-3.8-blue?style=for-the-badge&logo=pypi&logoColor=white)](https://pypi.org/project/oracle-ads/) [![PysparkConda](https://img.shields.io/badge/fspyspark32_p38_cpu_v2-1.0-blue?style=for-the-badge&logo=pypi&logoColor=white)](https://docs.oracle.com/en-us/iaas/data-science/using/conda-pyspark-fam.htm) [![Notebook Examples](https://img.shields.io/badge/docs-notebook--examples-blue?style=for-the-badge&logo=pypi&logoColor=white)](https://github.com/oracle-samples/oci-data-science-ai-samples/tree/master/notebook_examples) [![Delta](https://img.shields.io/badge/delta-2.0.1-blue?style=for-the-badge&logo=pypi&logoColor=white)](https://delta.io/) [![PySpark](https://img.shields.io/badge/pyspark-3.2.1-blue?style=for-the-badge&logo=pypi&logoColor=white)](https://spark.apache.org/docs/3.2.1/api/python/index.html) [![Great Expectations](https://img.shields.io/badge/greatexpectations-0.17.19-blue?style=for-the-badge&logo=pypi&logoColor=white)](https://greatexpectations.io/) [![Pandas](https://img.shields.io/badge/pandas-1.5.3-blue?style=for-the-badge&logo=pypi&logoColor=white)](https://pandas.pydata.org/) [![PyArrow](https://img.shields.io/badge/pyarrow-11.0.0-blue?style=for-the-badge&logo=pypi&logoColor=white)](https://arrow.apache.org/docs/python/index.html)
4+
5+
Managing many datasets, data sources, and transformations for machine learning is complex and costly. Poorly cleaned data, data issues, bugs in transformations, data drift, and training-serving skew all lead to increased model development time and poor model performance. Feature store solves many of these problems because it is a centralized way to transform and access data at training and serving time. Feature stores help define a standardised pipeline for ingestion and querying of data.
6+
7+
ADS feature store is a stack-based solution that is deployed in your tenancy using OCI Resource Manager.
8+
9+
Following are brief descriptions of key concepts and the main components of ADS feature store.
10+
11+
- ``Feature Vector``: A set of feature values for any one primary or identifier key. For example, all or a subset of the features of customer ID 2536 can be called one feature vector.
12+
- ``Feature``: A feature is an individual measurable property or characteristic of an event being observed.
13+
- ``Entity``: An entity is a group of semantically related features. The first step a consumer of features would typically do when accessing the feature store service is to list the entities and the entities associated with features. Another way to look at it is that an entity is an object or concept that's described by its features. Examples of entities are customer, product, transaction, review, image, document, and so on.
14+
- ``Feature Group``: A feature group in a feature store is a collection of related features that are often used together in ML models. It serves as an organizational unit within the feature store for users to manage, version, and share features across different ML projects. By organizing features into groups, data scientists and ML engineers can efficiently discover, reuse, and collaborate on features reducing the redundant work and ensuring consistency in feature engineering.
15+
- ``Feature Group Job``: Feature group jobs are the processing instance of a feature group. Each feature group job includes validation results and statistics results.
16+
- ``Dataset``: A dataset is a collection of features that are used together to either train a model or perform model inference.
17+
- ``Dataset Job``: A dataset job is the processing instance of a dataset. Each dataset job includes validation results and statistics results.
18+
19+
## Documentation
20+
21+
- [Oracle Feature Store SDK (ADS) Documentation](https://feature-store-accelerated-data-science.readthedocs.io/en/latest/)
22+
- [OCI Data Science and AI services Examples](https://github.com/oracle/oci-data-science-ai-samples)
23+
- [Oracle AI & Data Science Blog](https://blogs.oracle.com/ai-and-datascience/)
24+
- [OCI Documentation](https://docs.oracle.com/en-us/iaas/data-science/using/data-science.htm)
25+
26+
## Examples
27+
28+
### Quick start examples
29+
30+
| Jupyter Notebook | Description |
31+
|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
32+
| [Feature store querying](https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/notebook_examples/feature_store_querying.ipynb) | - Ingestion, querying and exploration of data. |
33+
| [Feature store quickstart](https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/notebook_examples/feature_store_quickstart.ipynb) | - Ingestion, querying and exploration of data. |
34+
| [Schema enforcement and schema evolution](https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/notebook_examples/feature_store_schema_evolution.ipynb) | - `Schema evolution` allows you to easily change a table's current schema to accommodate data that is changing over time. `Schema enforcement`, also known as schema validation, is a safeguard in Delta Lake that ensures data quality by rejecting writes to a table that don't match the table's schema. |
35+
| [Storage of medical records in feature store](https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/notebook_examples/feature_store_ehr_data.ipynb) | Example to demonstrate storage of medical records in feature store |
36+
37+
### Big data operations using OCI DataFlow
38+
39+
| Jupyter Notebook | Description |
40+
|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|
41+
| [Big data operations with feature store](https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/notebook_examples/feature_store_spark_magic.ipynb) | - Ingestion of data using Spark Magic, querying and exploration of data using Spark Magic. |
42+
43+
### LLM Use cases
44+
45+
| Jupyter Notebook | Description |
46+
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
47+
| [Embeddings in Feature Store](https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/notebook_examples/feature_store_embeddings.ipynb) | - `Embedding feature stores` are optimized for fast and efficient retrieval of embeddings. This is important because embeddings can be high-dimensional and computationally expensive to calculate. By storing them in a dedicated store, you can avoid the need to recalculate embeddings for the same data repeatedly. |
48+
| [Synthetic data generation in feature store using OpenAI and FewShotPromptTemplate](https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/notebook_examples/feature_store_medical_synthetic_data_openai.ipynb) | - `Synthetic data` is artificially generated data, rather than data collected from real-world events. It's used to simulate real data without compromising privacy or encountering real-world limitations. |
49+
| [PII Data redaction, Summarise Content and Translate content using doctran and open AI](https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/notebook_examples/feature_store_pii_redaction_and_transformation.ipynb) | - One way to think of Doctran is an LLM-powered black box where messy strings go in and nice, clean, labelled strings come out. Another way to think about it is a modular, declarative wrapper over OpenAI's function calling feature that significantly improves the developer experience.                               |
50+
| [OpenAI embeddings in feature store](https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/notebook_examples/feature_store_embeddings_openai.ipynb) | - `Embedding feature stores` are optimized for fast and efficient retrieval of embeddings. This is important because embeddings can be high-dimensional and computationally expensive to calculate. By storing them in a dedicated store, you can avoid the need to recalculate embeddings for the same data repeatedly. |
51+
52+
53+
## Contributing
54+
55+
This project welcomes contributions from the community. Before submitting a pull request, please [review our contribution guide](./../../CONTRIBUTING.md).
56+
57+
Find Getting Started instructions for developers in [README-development.md](https://github.com/oracle/accelerated-data-science/blob/main/README-development.md)
58+
59+
## Security
60+
61+
Consult the security guide [SECURITY.md](https://github.com/oracle/accelerated-data-science/blob/main/SECURITY.md) for our responsible security vulnerability disclosure process.
62+
63+
## License
64+
65+
Copyright (c) 2020, 2022 Oracle and/or its affiliates. Licensed under the [Universal Permissive License v1.0](https://oss.oracle.com/licenses/upl/)

ads/feature_store/common/utils/transformation_utils.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
#!/usr/bin/env python
22
# -*- coding: utf-8; -*-
3+
4+
# Copyright (c) 2024 Oracle and/or its affiliates.
5+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
6+
37
import json
48

59
# Copyright (c) 2023 Oracle and/or its affiliates.
@@ -52,9 +56,7 @@ def apply_transformation(
5256
# Execute the function under namespace
5357
execution_namespace = {}
5458
exec(transformation_function, execution_namespace)
55-
transformation_function_caller = execution_namespace.get(
56-
transformation.display_name
57-
)
59+
transformation_function_caller = execution_namespace.get(transformation.name)
5860
transformed_data = None
5961

6062
transformation_kwargs_dict = json.loads(transformation_kwargs)

ads/feature_store/dataset.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
)
5151
from ads.feature_store.validation_output import ValidationOutput
5252

53-
# Copyright (c) 2023 Oracle and/or its affiliates.
53+
# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
5454
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
5555

5656
logger = logging.getLogger(__name__)
@@ -422,8 +422,8 @@ def with_expectation_suite(
422422
423423
Returns
424424
-------
425-
Pipeline
426-
The Expectation instance (self).
425+
Dataset
426+
The Dataset instance (self).
427427
"""
428428
return self.set_spec(
429429
self.CONST_EXPECTATION_DETAILS,

ads/feature_store/dataset_job.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
#!/usr/bin/env python
2-
# -*- coding: utf-8; -*-
32

4-
# Copyright (c) 2023 Oracle and/or its affiliates.
3+
# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
54
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
5+
66
import logging
77
from copy import deepcopy
88
from typing import Dict, List, Any, Union
@@ -54,7 +54,7 @@ class DatasetJob(Builder):
5454
>>> dataset_run = dataset_run.DatasetJob()
5555
>>> .with_compartment_id(os.environ["PROJECT_COMPARTMENT_OCID"])
5656
>>> .with_dataset_id("dataset_id")
57-
>>> .with_ingestion_mode(IngestionMode.SQL)
57+
>>> .with_ingestion_mode(BatchIngestionMode.SQL)
5858
>>> dataset_run.create()
5959
"""
6060

ads/feature_store/docs/requirements.txt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
1-
autodoc
2-
nbsphinx
1+
autodoc nbsphinx
32
sphinx
43
sphinxcontrib-napoleon
54
sphinx_copybutton
65
sphinx_code_tabs
76
sphinx-autobuild
87
sphinx-autorun
9-
oracle_ads==2.9.0rc0
8+
oracle_ads
109
furo
1110
IPython
1211
pandoc

0 commit comments

Comments
 (0)