Skip to content

Commit a305343

Browse files
authored
DataType Changes, validation output transpose (#246)
2 parents 432955f + 103637a commit a305343

19 files changed

+434
-193
lines changed

ads/feature_store/common/enums.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,7 @@ class FeatureType(Enum):
295295
STRING_BINARY_MAP = "STRING_BINARY_MAP"
296296
STRING_BOOLEAN_MAP = "STRING_BOOLEAN_MAP"
297297
UNKNOWN = "UNKNOWN"
298+
COMPLEX = "COMPLEX"
298299

299300

300301
class EntityType(Enum):

ads/feature_store/common/spark_session_singleton.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ def __init__(self, metastore_id: str = None):
7575
"spark.hadoop.oracle.dcat.metastore.id", metastore_id
7676
).config(
7777
"spark.sql.warehouse.dir", metastore.default_managed_table_location
78-
)
78+
)\
79+
.config("spark.driver.memory", "16G")
7980

8081
if developer_enabled():
8182
# Configure spark session with delta jars only in developer mode. In other cases,

ads/feature_store/common/utils/feature_schema_mapper.py

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def map_spark_type_to_feature_type(spark_type):
7171
if spark_type in spark_type_to_feature_type:
7272
return spark_type_to_feature_type.get(spark_type)
7373
else:
74-
return FeatureType.UNKNOWN
74+
return FeatureType.COMPLEX
7575

7676

7777
def map_pandas_type_to_feature_type(feature_name, values):
@@ -180,7 +180,7 @@ def map_feature_type_to_spark_type(feature_type):
180180
if feature_type_in in spark_types:
181181
return spark_types.get(feature_type_in)
182182
else:
183-
return "UNKNOWN"
183+
return "COMPLEX"
184184

185185

186186
def get_raw_data_source_schema(raw_feature_details: List[dict]):
@@ -225,30 +225,22 @@ def map_feature_type_to_pandas(feature_type):
225225
FeatureType.INTEGER: "int32",
226226
FeatureType.DECIMAL: "object",
227227
FeatureType.DATE: "object",
228+
FeatureType.STRING_ARRAY: "object",
229+
FeatureType.INTEGER_ARRAY: "object",
230+
FeatureType.LONG_ARRAY: "object",
231+
FeatureType.FLOAT_ARRAY: "object",
232+
FeatureType.DOUBLE_ARRAY: "object",
233+
FeatureType.TIMESTAMP_ARRAY: "object",
234+
FeatureType.BOOLEAN_ARRAY: "object",
235+
# FeatureType.DECIMAL_ARRAY: "object",
236+
FeatureType.DATE_ARRAY: "object",
228237
}
229238
if feature_type_in in supported_feature_type:
230239
return supported_feature_type.get(feature_type_in)
231240
else:
232241
raise TypeError(f"Feature Type {feature_type} is not supported for pandas")
233242

234243

235-
def convert_pandas_datatype_with_schema(
236-
raw_feature_details: List[dict], input_df: pd.DataFrame
237-
):
238-
feature_detail_map = {}
239-
for feature_details in raw_feature_details:
240-
feature_detail_map[feature_details.get("name")] = feature_details
241-
for column in input_df.columns:
242-
if column in feature_detail_map.keys():
243-
feature_details = feature_detail_map[column]
244-
feature_type = feature_details.get("featureType")
245-
pandas_type = map_feature_type_to_pandas(feature_type)
246-
input_df[column] = (
247-
input_df[column]
248-
.astype(pandas_type)
249-
.where(pd.notnull(input_df[column]), None)
250-
)
251-
252244

253245
def map_spark_type_to_stats_data_type(spark_type):
254246
"""Maps the spark data types to MLM library data types

ads/feature_store/common/utils/utility.py

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@
1111
from ads.common.decorator.runtime_dependency import OptionalDependency
1212
from ads.feature_store.common.utils.feature_schema_mapper import (
1313
map_spark_type_to_feature_type,
14-
map_pandas_type_to_feature_type,
14+
map_feature_type_to_pandas,
1515
)
1616
from ads.feature_store.feature import Feature, DatasetFeature
1717
from ads.feature_store.feature_group_expectation import Rule, Expectation
1818
from ads.feature_store.input_feature_detail import FeatureDetail
19+
from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton
1920

2021
try:
2122
from pyspark.pandas import DataFrame
@@ -154,18 +155,9 @@ def get_features(
154155

155156

156157
def get_schema_from_pandas_df(df: pd.DataFrame):
157-
schema_details = []
158-
159-
for order_number, field in enumerate(df.columns, start=1):
160-
details = {
161-
"name": field,
162-
"feature_type": map_pandas_type_to_feature_type(field, df[field]),
163-
"order_number": order_number,
164-
}
165-
166-
schema_details.append(details)
167-
168-
return schema_details
158+
spark = SparkSessionSingleton().get_spark_session()
159+
converted_df = spark.createDataFrame(df)
160+
return get_schema_from_spark_df(converted_df)
169161

170162

171163
def get_schema_from_spark_df(df: DataFrame):
@@ -268,3 +260,47 @@ def largest_matching_subset_of_primary_keys(left_feature_group, right_feature_gr
268260
common_keys = left_primary_keys.intersection(right_primary_keys)
269261

270262
return common_keys
263+
264+
265+
def convert_pandas_datatype_with_schema(
266+
raw_feature_details: List[dict], input_df: pd.DataFrame
267+
) -> pd.DataFrame:
268+
feature_detail_map = {}
269+
columns_to_remove = []
270+
for feature_details in raw_feature_details:
271+
feature_detail_map[feature_details.get("name")] = feature_details
272+
for column in input_df.columns:
273+
if column in feature_detail_map.keys():
274+
feature_details = feature_detail_map[column]
275+
feature_type = feature_details.get("featureType")
276+
pandas_type = map_feature_type_to_pandas(feature_type)
277+
input_df[column] = (
278+
input_df[column]
279+
.astype(pandas_type)
280+
.where(pd.notnull(input_df[column]), None)
281+
)
282+
else:
283+
logger.warning("column" + column + "doesn't exist in the input feature details")
284+
columns_to_remove.append(column)
285+
return input_df.drop(columns = columns_to_remove)
286+
287+
288+
def convert_spark_dataframe_with_schema(
289+
raw_feature_details: List[dict], input_df: DataFrame
290+
) -> DataFrame:
291+
feature_detail_map = {}
292+
columns_to_remove = []
293+
for feature_details in raw_feature_details:
294+
feature_detail_map[feature_details.get("name")] = feature_details
295+
for column in input_df.columns:
296+
if column not in feature_detail_map.keys():
297+
logger.warning("column" + column + "doesn't exist in the input feature details")
298+
columns_to_remove.append(column)
299+
300+
return input_df.drop(*columns_to_remove)
301+
302+
303+
def validate_input_feature_details(input_feature_details, data_frame):
304+
if isinstance(data_frame, pd.DataFrame):
305+
return convert_pandas_datatype_with_schema(input_feature_details, data_frame)
306+
return convert_spark_dataframe_with_schema(input_feature_details, data_frame)

ads/feature_store/docs/source/feature_group.rst

Lines changed: 40 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -152,32 +152,49 @@ Feature store provides an API similar to Pandas to join feature groups together
152152
153153
Save expectation entity
154154
=======================
155-
With a ``FeatureGroup`` instance, we can save the expectation entity using ``save_expectation()``
155+
With a ``FeatureGroup`` instance, you can save the expectation details using ``with_expectation_suite()`` with the following parameters:
156+
157+
- ``expectation_suite: ExpectationSuite``. ExpectationSuite from the Great Expectations library
158+
- ``expectation_type: ExpectationType``. Type of expectation
159+
- ``ExpectationType.STRICT``: Fail the job if expectation not met
160+
- ``ExpectationType.LENIENT``: Pass the job even if expectation not met
156161

157162
.. note::
158163

159164
Great Expectations is a Python-based open-source library for validating, documenting, and profiling your data. It helps you to maintain data quality and improve communication about data between teams. Software developers have long known that automated testing is essential for managing complex codebases.
160165

161166
.. image:: figures/validation.png
162167

163-
The ``.save_expectation()`` method takes the following optional parameter:
168+
.. code-block:: python3
164169
165-
- ``expectation: Expectation``. Expectation of great expectation
166-
- ``expectation_type: ExpectationType``. Type of expectation
167-
- ``ExpectationType.STRICT``: Fail the job if expectation not met
168-
- ``ExpectationType.LENIENT``: Pass the job even if expectation not met
170+
expectation_suite = ExpectationSuite(
171+
expectation_suite_name="expectation_suite_name"
172+
)
173+
expectation_suite.add_expectation(
174+
ExpectationConfiguration(
175+
expectation_type="expect_column_values_to_not_be_null",
176+
kwargs={"column": "<column>"},
177+
)
169178
170-
.. code-block:: python3
179+
feature_group_resource = (
180+
FeatureGroup()
181+
.with_feature_store_id(feature_store.id)
182+
.with_primary_keys(["<key>"])
183+
.with_name("<name>")
184+
.with_entity_id(entity.id)
185+
.with_compartment_id(<compartment_id>)
186+
.with_schema_details_from_dataframe(<dataframe>)
187+
.with_expectation_suite(
188+
expectation_suite=expectation_suite,
189+
expectation_type=ExpectationType.STRICT,
190+
)
191+
)
171192
172-
feature_group.save_expectation(expectation_suite, expectation_type="STRICT")
193+
You can call the ``get_validation_output()`` method of the FeatureGroup instance to fetch validation results for a specific ingestion job.
173194

174195
Statistics Results
175196
==================
176-
You can call the ``get_statistics()`` method of the FeatureGroup instance to fetch validation results for a specific ingestion job.
177-
178-
.. note::
179-
180-
PyDeequ is a Python API for Deequ, a library built on top of Apache Spark for defining "unit tests for data", which measure data quality in large datasets.
197+
You can call the ``get_statistics()`` method of the FeatureGroup instance to fetch statistics for a specific ingestion job.
181198

182199
.. code-block:: python3
183200
@@ -196,26 +213,16 @@ With a FeatureGroup instance, we can get the last feature group job details usin
196213
197214
# Fetch validation results for a feature group
198215
feature_group_job = feature_group.get_last_job()
199-
df = feature_group_job.get_validation().to_pandas()
200-
df.show()
201216
202217
Get features
203218
=============
204-
You can call the ``get_features_dataframe()`` method of the FeatureGroup instance to fetch features in a feature group
219+
You can call the ``get_features_df()`` method of the FeatureGroup instance to fetch features in a feature group
205220

206221
.. code-block:: python3
207222
208223
# Fetch features for a feature group
209-
df = feature_group.get_features_dataframe()
210-
211-
Get input schema details
212-
==========================
213-
You can call the ``get_input_schema_dataframe()`` method of the FeatureGroup instance to fetch input schema details of a feature group
224+
df = feature_group.get_features_df()
214225
215-
.. code-block:: python3
216-
217-
# Fetch features for a feature group
218-
df = feature_group.get_input_schema_dataframe()
219226
220227
Filter
221228
======
@@ -308,7 +315,8 @@ The data will be stored in a data type native to each store. There is an option
308315

309316
Offline data types
310317
###################
311-
Please refer to the following mapping when registering a Spark DataFrame, or a Pandas DataFrame.
318+
Please refer to the following mapping when registering a Spark DataFrame or a Pandas DataFrame. For Spark DataFrames, we support
319+
all the data types and the ones which are not specified in the following table will be mapped to Offline Feature Type COMPLEX
312320

313321
.. list-table::
314322
:widths: 20 25 25 40
@@ -363,31 +371,31 @@ The data will be stored in a data type native to each store. There is an option
363371
- STRING
364372
- Textual data
365373
* - ArrayType(IntegerType())
366-
- object (list), object (np.ndarray) - not supported
374+
- object (list), object (np.ndarray)
367375
- INTEGER_ARRAY
368376
- List of values
369377
* - ArrayType(LongType())
370-
- object (list), object (np.ndarray) - not supported
378+
- object (list), object (np.ndarray)
371379
- LONG_ARRAY
372380
- List of values
373381
* - ArrayType(FloatType())
374-
- object (list), object (np.ndarray) - not supported
382+
- object (list), object (np.ndarray)
375383
- FLOAT_ARRAY
376384
- List of values
377385
* - ArrayType(DoubleType())
378-
- object (list), object (np.ndarray) - not supported
386+
- object (list), object (np.ndarray)
379387
- DOUBLE_ARRAY
380388
- List of values
381389
* - ArrayType(BinaryType())
382390
- object (list), object (np.ndarray) - not supported
383391
- BINARY_ARRAY
384392
- List of values
385393
* - ArrayType(DateType())
386-
- object (list), object (np.ndarray) - not supported
394+
- object (list), object (np.ndarray)
387395
- DATE_ARRAY
388396
- List of values
389397
* - ArrayType(TimestampType())
390-
- object (list), object (np.ndarray) - not supported
398+
- object (list), object (np.ndarray)
391399
- TIMESTAMP_ARRAY
392400
- List of values
393401
* - StructType

ads/feature_store/docs/source/release_notes.rst

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,33 @@
33
=============
44
Release Notes
55
=============
6+
1.1
7+
---
8+
9+
.. note::
10+
11+
.. list-table::
12+
:header-rows: 1
13+
14+
* - Package Name
15+
- Latest Version
16+
- Notes
17+
* - Conda pack
18+
- `https://objectstorage.us-ashburn-1.oraclecloud.com/n/bigdatadatasciencelarge/b/service-conda-packs-fs/o/service_pack/cpu/PySpark_3.2_and_Feature_Store/1.0/fspyspark32_p38_cpu_v1#conda`
19+
-
20+
* - SERVICE_VERSION
21+
- 0.1.212.master
22+
-
23+
* - Terraform Stack
24+
- `link <https://objectstorage.us-ashburn-1.oraclecloud.com/p/vZogtXWwHqbkGLeqyKiqBmVxdbR4MK4nyOBqDsJNVE4sHGUY5KFi4T3mOFGA3FOy/n/idogsu2ylimg/b/oci-feature-store/o/beta/terraform/feature-store-terraform.zip>`__
25+
- Par link expires Jan 5, 2026
26+
27+
28+
Release notes: July 5, 2023
29+
30+
* [FEATURE] Supporting Offline Feature Type COMPLEX
31+
* [[DOCS] Data Type update for Offline Feature Type COMPLEX
32+
633
1.0
734
---
835

@@ -21,10 +48,9 @@ Release Notes
2148
- 0.1.209.master
2249
-
2350
* - Terraform Stack
24-
- `link <https://objectstorage.us-ashburn-1.oraclecloud.com/p/MokUwWRFZLj1Dgfev7D_0ALc8YL7jEBmM8FIeWH3AysnH2zxavptSobzR6ezErfZ/n/idogsu2ylimg/b/oci-feature-store/o/beta/terraform/feature-store-terraform.zip>`__
51+
- `link <https://objectstorage.us-ashburn-1.oraclecloud.com/p/vZogtXWwHqbkGLeqyKiqBmVxdbR4MK4nyOBqDsJNVE4sHGUY5KFi4T3mOFGA3FOy/n/idogsu2ylimg/b/oci-feature-store/o/beta/terraform/feature-store-terraform.zip>`__
2552
- Par link expires Jan 5, 2026
2653

27-
2854
Release notes: June 15, 2023
2955

3056
* [FEATURE] Included ``FeatureStore``, ``FeatureGroup``, ``Dataset``, ``Entity`` and ``Transformation`` concepts for feature store.

ads/feature_store/docs/source/terraform.rst

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,21 +30,21 @@ Feature Store users need to provide the following access permissions in order to
3030
3131
define tenancy <feature store service tenancy> as <feature store service tenancy ocid>
3232
endorse group <feature store user group> to read repos in tenancy <feature store service tenancy>
33-
allow group <feature store user group> to manage orm-stacks in compartment <compartmentId>
34-
allow group <feature store user group> to manage orm-jobs in compartment <compartmentId>
35-
allow group <feature store user group> to manage object-family in compartment <compartmentId>
36-
allow group <feature store user group> to manage users in compartment <compartmentId>
37-
allow group <feature store user group> to manage instance-family in compartment <compartmentId>
38-
allow group <feature store user group> to manage tag-namespaces in compartment <compartmentId>
39-
allow group <feature store user group> to manage groups in compartment <compartmentId>
40-
allow group <feature store user group> to manage policies in compartment <compartmentId>
41-
allow group <feature store user group> to manage dynamic-groups in compartment <compartmentId>
42-
allow group <feature store user group> to manage virtual-network-family in compartment <compartmentId>
43-
allow group <feature store user group> to manage functions-family in compartment <compartmentId>
44-
allow group <feature store user group> to inspect compartments in compartment <compartmentId>
45-
allow group <feature store user group> to manage cluster-family in compartment <compartmentId>
46-
allow group <feature store user group> to manage mysql-family in compartment <compartmentId>
47-
allow group <feature store user group> to manage api-gateway-family in compartment <compartmentId>
33+
allow group <feature store user group> to manage orm-stacks in compartment <compartmentName>
34+
allow group <feature store user group> to manage orm-jobs in compartment <compartmentName>
35+
allow group <feature store user group> to manage object-family in compartment <compartmentName>
36+
allow group <feature store user group> to manage users in compartment <compartmentName>
37+
allow group <feature store user group> to manage instance-family in compartment <compartmentName>
38+
allow group <feature store user group> to manage tag-namespaces in compartment <compartmentName>
39+
allow group <feature store user group> to manage groups in compartment <compartmentName>
40+
allow group <feature store user group> to manage policies in compartment <compartmentName>
41+
allow group <feature store user group> to manage dynamic-groups in compartment <compartmentName>
42+
allow group <feature store user group> to manage virtual-network-family in compartment <compartmentName>
43+
allow group <feature store user group> to manage functions-family in compartment <compartmentName>
44+
allow group <feature store user group> to inspect compartments in compartment <compartmentName>
45+
allow group <feature store user group> to manage cluster-family in compartment <compartmentName>
46+
allow group <feature store user group> to manage mysql-family in compartment <compartmentName>
47+
allow group <feature store user group> to manage api-gateway-family in compartment <compartmentName>
4848
4949
Deploy Using Oracle Resource Manager
5050
====================================

0 commit comments

Comments
 (0)