Skip to content

Adding catalog attribute in sourceDetails and TargetDetails inside onboarding json #175

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Apr 1, 2025
Merged
18 changes: 12 additions & 6 deletions demo/conf/cloudfiles-onboarding.template
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
},
"source_schema_path": "{uc_volume_path}/demo/resources/data/afam/ddl/customers.ddl"
},
"bronze_catalog_demo": "{uc_catalog_name}",
"bronze_database_demo": "{uc_catalog_name}.{bronze_schema}",
"bronze_table": "customers",
"bronze_reader_options": {
Expand All @@ -30,7 +31,8 @@
"pipelines.autoOptimize.zOrderCols": "id, email"
},
"bronze_data_quality_expectations_json_demo": "{uc_volume_path}/demo/conf/dqe/customers/bronze_data_quality_expectations.json",
"bronze_database_quarantine_demo": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_quarantine_demo": "{uc_catalog_name}",
"bronze_database_quarantine_demo": "{bronze_schema}",
"bronze_quarantine_table": "customers_quarantine",
"bronze_quarantine_table_properties": {
"pipelines.reset.allowed": "false",
Expand All @@ -53,7 +55,8 @@
"once": false
}
],
"silver_database_demo": "{uc_catalog_name}.{silver_schema}",
"silver_catalog_demo": "{uc_catalog_name}",
"silver_database_demo": "{silver_schema}",
"silver_table": "customers",
"silver_transformation_json_demo": "{uc_volume_path}/demo/conf/afam_silver_transformations.json",
"silver_table_properties": {
Expand Down Expand Up @@ -91,8 +94,9 @@
}
},
"source_schema_path": "{uc_volume_path}/demo/resources/data/afam/ddl/transactions.ddl"
},
"bronze_database_demo": "{uc_catalog_name}.{bronze_schema}",
},
"bronze_catalog_demo": "{uc_catalog_name}",
"bronze_database_demo": "{bronze_schema}",
"bronze_table": "transactions",
"bronze_reader_options": {
"cloudFiles.format": "json",
Expand All @@ -104,7 +108,8 @@
"pipelines.autoOptimize.zOrderCols": "id, customer_id"
},
"bronze_data_quality_expectations_json_demo": "{uc_volume_path}/demo/conf/dqe/transactions/bronze_data_quality_expectations.json",
"bronze_database_quarantine_demo": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_quarantine_demo": "{uc_catalog_name}",
"bronze_database_quarantine_demo": "{bronze_schema}",
"bronze_quarantine_table": "transactions_quarantine",
"bronze_quarantine_table_properties": {
"pipelines.reset.allowed": "true",
Expand All @@ -128,7 +133,8 @@
"once": false
}
],
"silver_database_demo": "{uc_catalog_name}.{silver_schema}",
"silver_catalog_demo": "{uc_catalog_name}",
"silver_database_demo": "{silver_schema}",
"silver_table": "transactions",
"silver_cdc_apply_changes": {
"keys": [
Expand Down
6 changes: 4 additions & 2 deletions demo/conf/cloudfiles-onboarding_A2.template
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
},
"source_schema_path": "{uc_volume_path}/demo/resources/data/afam/ddl/customers.ddl"
},
"bronze_database_demo": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_demo": "{uc_catalog_name}",
"bronze_database_demo": "{bronze_schema}",
"bronze_table": "customers_delta",
"bronze_reader_options": {
"cloudFiles.format": "json",
Expand All @@ -28,7 +29,8 @@
"pipelines.autoOptimize.zOrderCols": "id, email"
},
"bronze_data_quality_expectations_json_demo": "{uc_volume_path}/demo/conf/dqe/customers/bronze_data_quality_expectations.json",
"bronze_database_quarantine_demo": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_quarantine_demo": "{uc_catalog_name}",
"bronze_database_quarantine_demo": "{bronze_schema}",
"bronze_quarantine_table": "customers_delta_quarantine",
"bronze_quarantine_table_properties": {
"pipelines.reset.allowed": "false",
Expand Down
6 changes: 4 additions & 2 deletions demo/conf/eventhub-onboarding.template
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@
"kafka.request.timeout.ms": "60000",
"kafka.session.timeout.ms": "60000"
},
"bronze_database_demo": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_demo": "{uc_catalog_name}",
"bronze_database_demo": "{bronze_schema}",
"bronze_table": "bronze_{run_id}_iot",
"bronze_partition_columns": "date",
"bronze_data_quality_expectations_json_demo": "{uc_volume_path}/demo/conf/dqe/iot/bronze_data_quality_expectations.json",
"bronze_database_quarantine_demo": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_quarantine_demo": "{uc_catalog_name}",
"bronze_database_quarantine_demo": "{bronze_schema}",
"bronze_quarantine_table": "bronze_{run_id}_iot_quarantine",
"bronze_quarantine_table_path_demo": "{uc_volume_path}/data/bronze/iot_quarantine",
"bronze_append_flows": [
Expand Down
6 changes: 4 additions & 2 deletions demo/conf/kafka-sink-onboarding.template
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@
"kafka.request.timeout.ms": "60000",
"kafka.session.timeout.ms": "60000"
},
"bronze_database_demo": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_demo": "{uc_catalog_name}",
"bronze_database_demo": "{bronze_schema}",
"bronze_table": "bronze_{run_id}_iot",
"bronze_partition_columns": "date",
"bronze_table_path_demo": "{uc_volume_path}/data/bronze/iot",
"bronze_data_quality_expectations_json_demo": "{uc_volume_path}/demo/conf/dqe/iot/bronze_data_quality_expectations.json",
"bronze_database_quarantine_demo": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_quarantine_demo": "{uc_catalog_name}",
"bronze_database_quarantine_demo": "{bronze_schema}",
"bronze_quarantine_table": "bronze_{run_id}_iot_quarantine",
"bronze_quarantine_table_path_demo": "{uc_volume_path}/data/bronze/iot_quarantine",
"bronze_sinks": [
Expand Down
30 changes: 20 additions & 10 deletions demo/conf/onboarding.template
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
"source_path_prod": "{uc_volume_path}/demo/resources/data/customers",
"source_schema_path": "{uc_volume_path}/demo/resources/ddl/customers.ddl"
},
"bronze_database_prod": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_prod": "{uc_catalog_name}",
"bronze_database_prod": "{bronze_schema}",
"bronze_table": "customers",
"bronze_table_path_prod": "{uc_volume_path}/data/bronze/customers",
"bronze_reader_options": {
Expand All @@ -20,10 +21,12 @@
},
"bronze_cluster_by":["customer_id"],
"bronze_data_quality_expectations_json_prod": "{uc_volume_path}/demo/conf/dqe/customers.json",
"bronze_database_quarantine_prod": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_quarantine_prod": "{uc_catalog_name}",
"bronze_database_quarantine_prod": "{bronze_schema}",
"bronze_quarantine_table": "customers_quarantine",
"bronze_quarantine_table_path_prod": "{uc_volume_path}/data/bronze/customers_quarantine",
"silver_database_prod": "{uc_catalog_name}.{silver_schema}",
"silver_catalog_prod": "{uc_catalog_name}",
"silver_database_prod": "{silver_schema}",
"silver_table": "customers",
"silver_table_path_prod": "{uc_volume_path}/data/silver/customers",
"silver_cdc_apply_changes": {
Expand Down Expand Up @@ -55,7 +58,8 @@
"source_path_prod": "{uc_volume_path}/demo/resources/data/transactions",
"source_schema_path": "{uc_volume_path}/demo/resources/ddl/transactions.ddl"
},
"bronze_database_prod": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_prod": "{uc_catalog_name}",
"bronze_database_prod": "{bronze_schema}",
"bronze_table": "transactions",
"bronze_table_path_prod": "{uc_volume_path}/data/bronze/transactions",
"bronze_reader_options": {
Expand All @@ -67,7 +71,8 @@
"bronze_database_quarantine_prod": "{uc_catalog_name}.{bronze_schema}",
"bronze_quarantine_table": "transactions_quarantine",
"bronze_quarantine_table_path_prod": "{uc_volume_path}/demo/resources/data/bronze/transactions_quarantine",
"silver_database_prod": "{uc_catalog_name}.{silver_schema}",
"silver_catalog_prod": "{uc_catalog_name}",
"silver_database_prod": "{silver_schema}",
"silver_table": "transactions",
"silver_table_path_prod": "{uc_volume_path}/data/silver/transactions",
"silver_cdc_apply_changes": {
Expand Down Expand Up @@ -98,7 +103,8 @@
"source_path_prod": "{uc_volume_path}/demo/resources/data/products",
"source_schema_path": "{uc_volume_path}/demo/resources/ddl/products.ddl"
},
"bronze_database_prod": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_prod": "{uc_catalog_name}",
"bronze_database_prod": "{bronze_schema}",
"bronze_table": "products",
"bronze_table_path_prod": "{uc_volume_path}/data/bronze/products",
"bronze_reader_options": {
Expand All @@ -111,7 +117,8 @@
"bronze_database_quarantine_prod": "{uc_catalog_name}.{bronze_schema}",
"bronze_quarantine_table": "products_quarantine",
"bronze_quarantine_table_path_prod": "{uc_volume_path}/demo/resources/data/bronze/products_quarantine",
"silver_database_prod": "{uc_catalog_name}.{silver_schema}",
"silver_catalog_prod": "{uc_catalog_name}",
"silver_database_prod": "{silver_schema}",
"silver_table": "products",
"silver_table_path_prod": "{uc_volume_path}/data/silver/products",
"silver_cdc_apply_changes": {
Expand Down Expand Up @@ -142,7 +149,8 @@
"source_path_prod": "{uc_volume_path}/demo/resources/data/stores",
"source_schema_path": "{uc_volume_path}/demo/resources/ddl/stores.ddl"
},
"bronze_database_prod": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_prod": "{uc_catalog_name}",
"bronze_database_prod": "{bronze_schema}",
"bronze_table": "stores",
"bronze_table_path_prod": "{uc_volume_path}/data/bronze/stores",
"bronze_reader_options": {
Expand All @@ -152,10 +160,12 @@
},
"bronze_table_path_prod": "{uc_volume_path}/demo/resources/data/bronze/stores",
"bronze_data_quality_expectations_json_prod": "{uc_volume_path}/demo/conf/dqe/stores.json",
"bronze_database_quarantine_prod": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_quarantine_prod": "{uc_catalog_name}",
"bronze_database_quarantine_prod": "{bronze_schema}",
"bronze_quarantine_table": "stores_quarantine",
"bronze_quarantine_table_path_prod": "{uc_volume_path}/demo/resources/data/bronze/stores_quarantine",
"silver_database_prod": "{uc_catalog_name}.{silver_schema}",
"silver_catalog_prod": "{uc_catalog_name}",
"silver_database_prod": "{silver_schema}",
"silver_table": "stores",
"silver_table_path_prod": "{uc_volume_path}/data/silver/stores",
"silver_cdc_apply_changes": {
Expand Down
6 changes: 4 additions & 2 deletions demo/conf/onboarding_cars.template
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,16 @@
"source_details": {
"source_path_demo": "{uc_volume_path}/demo/resources/data/cars"
},
"bronze_database_demo": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_demo": "{uc_catalog_name}",
"bronze_database_demo": "{bronze_schema}",
"bronze_table": "cars",
"bronze_reader_options": {
"cloudFiles.format": "csv",
"cloudFiles.rescuedDataColumn": "_rescued_data",
"header": "true"
},
"silver_database_demo": "{uc_catalog_name}.{silver_schema}",
"silver_catalog_demo": "{uc_catalog_name}",
"silver_database_demo": "{silver_schema}",
"silver_table": "cars_usa",
"silver_transformation_json_demo": "{uc_volume_path}/demo/conf/silver_transformations_cars.json"
}
Expand Down
18 changes: 12 additions & 6 deletions demo/conf/onboarding_fanout_cars.template
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,33 @@
{
"data_flow_id": "101",
"data_flow_group": "A1",
"bronze_database_demo": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_demo": "{uc_catalog_name}",
"bronze_database_demo": "{bronze_schema}",
"bronze_table": "cars",
"silver_database_demo": "{uc_catalog_name}.{silver_schema}",
"silver_catalog_demo": "{uc_catalog_name}",
"silver_database_demo": "{silver_schema}",
"silver_table": "cars_germany",
"silver_transformation_json_demo": "{uc_volume_path}/demo/conf/silver_transformations_cars.json"
},
{
"data_flow_id": "102",
"data_flow_group": "A1",
"bronze_database_demo": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_demo": "{uc_catalog_name}",
"bronze_database_demo": "{bronze_schema}",
"bronze_table": "cars",
"silver_database_demo": "{uc_catalog_name}.{silver_schema}",
"silver_catalog_demo": "{uc_catalog_name}",
"silver_database_demo": "{silver_schema}",
"silver_table": "cars_uk",
"silver_transformation_json_demo": "{uc_volume_path}/demo/conf/silver_transformations_cars.json"
},
{
"data_flow_id": "103",
"data_flow_group": "A1",
"bronze_database_demo": "{uc_catalog_name}.{bronze_schema}",
"bronze_catalog_demo": "{uc_catalog_name}",
"bronze_database_demo": "{bronze_schema}",
"bronze_table": "cars",
"silver_database_demo": "{uc_catalog_name}.{silver_schema}",
"silver_catalog_demo": "{uc_catalog_name}",
"silver_database_demo": "{silver_schema}",
"silver_table": "cars_japan",
"silver_transformation_json_demo": "{uc_volume_path}/demo/conf/silver_transformations_cars.json"
}
Expand Down
10 changes: 6 additions & 4 deletions demo/conf/snapshot-onboarding.template
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
},
"bronze_reader_options": {
"header": "true"
},
"bronze_database_demo": "{uc_catalog_name}.{bronze_schema}",
},
"bronze_catalog_demo": "{uc_catalog_name}",
"bronze_database_demo": "{bronze_schema}",
"bronze_table": "products",
"bronze_apply_changes_from_snapshot": {
"keys": [
Expand All @@ -31,8 +32,9 @@
},
"bronze_reader_options": {
"header": "true"
},
"bronze_database_demo": "{uc_catalog_name}.{bronze_schema}",
},
"bronze_catalog_demo": "{uc_catalog_name}",
"bronze_database_demo": "{bronze_schema}",
"bronze_table": "stores",
"bronze_apply_changes_from_snapshot": {
"keys": [
Expand Down
5 changes: 4 additions & 1 deletion docs/content/getting_started/metadatapreperation.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ The `onboarding.json` file contains links to [silver_transformations.json](https
| data_flow_id | This is unique identifier for pipeline |
| data_flow_group | This is group identifier for launching multiple pipelines under single DLT |
| source_format | Source format e.g `cloudFiles`, `eventhub`, `kafka`, `delta`, `snapshot` |
| source_details | This map Type captures all source details for cloudfiles = `source_schema_path`, `source_path_{env}`, `source_database`, `source_metadata` For eventhub= `source_schema_path` , `eventhub.accessKeyName`, `eventhub.accessKeySecretName`, `eventhub.name` , `eventhub.secretsScopeName` , `kafka.sasl.mechanism`, `kafka.security.protocol`, `eventhub.namespace`, `eventhub.port`. For Source schema file spark DDL schema format parsing is supported <br> In case of custom schema format then write schema parsing function `bronze_schema_mapper(schema_file_path, spark):Schema` and provide to `OnboardDataflowspec` initialization <br> e.g `onboardDataFlowSpecs = OnboardDataflowspec(spark, dict_obj,bronze_schema_mapper).onboardDataFlowSpecs()`.<br> For cloudFiles option _metadata columns addtiion there is `source_metadata` tag with attributes: `include_autoloader_metadata_column` flag (`True` or `False` value) will add _metadata column to target bronze dataframe, `autoloader_metadata_col_name` if this provided then will be used to rename _metadata to this value otherwise default is `source_metadata`,`select_metadata_cols:{key:value}` will be used to extract columns from _metadata. key is target dataframe column name and value is expression used to add column from _metadata column. <br> for snapshot= `snapshot_format`, `source_path_{env}` |
| source_details | This map Type captures all source details for cloudfiles = `source_schema_path`, `source_path_{env}`, `source_catalog`, `source_database`, `source_metadata` For eventhub= `source_schema_path` , `eventhub.accessKeyName`, `eventhub.accessKeySecretName`, `eventhub.name` , `eventhub.secretsScopeName` , `kafka.sasl.mechanism`, `kafka.security.protocol`, `eventhub.namespace`, `eventhub.port`. For Source schema file spark DDL schema format parsing is supported <br> In case of custom schema format then write schema parsing function `bronze_schema_mapper(schema_file_path, spark):Schema` and provide to `OnboardDataflowspec` initialization <br> e.g `onboardDataFlowSpecs = OnboardDataflowspec(spark, dict_obj,bronze_schema_mapper).onboardDataFlowSpecs()`.<br> For cloudFiles option _metadata columns addition there is `source_metadata` tag with attributes: `include_autoloader_metadata_column` flag (`True` or `False` value) will add _metadata column to target bronze dataframe, `autoloader_metadata_col_name` if this is provided then will be used to rename _metadata to this value otherwise default is `source_metadata`,`select_metadata_cols:{key:value}` will be used to extract columns from _metadata. key is target dataframe column name and value is expression used to add column from _metadata column. <br> for snapshot= `snapshot_format`, `source_path_{env}` |
| bronze_catalog_{env} | Unity Catalog name for the bronze database. |
| bronze_database_{env} | Delta lake bronze database name. |
| bronze_table | Delta lake bronze table name |
| bronze_reader_options | Reader options which can be provided to spark reader <br> e.g multiline=true,header=true in json format |
Expand All @@ -40,13 +41,15 @@ The `onboarding.json` file contains links to [silver_transformations.json](https
| bronze_table_properties | DLT table properties map. e.g. `{"pipelines.autoOptimize.managed": "false" , "pipelines.autoOptimize.zOrderCols": "year,month", "pipelines.reset.allowed": "false" }` |
| bronze_sink | DLT Sink API properties: e.g Delta: `{"name": "bronze_sink","format": "delta","options": {"tableName": "my_catalog.my_schema.my_table"}}`, Kafka:`{"name": "bronze_sink","format": "kafka","options": { "kafka.bootstrap.servers": "host:port","subscribe": "my_topic"}}` |
| bronze_data_quality_expectations_json | Bronze table data quality expectations |
| bronze_catalog_quarantine_{env} | Unity Catalog name for the bronze quarantine database. |
| bronze_database_quarantine_{env} | Bronze database for quarantine data which fails expectations. |
| bronze_quarantine_table | Bronze table for quarantine data which fails expectations |
| bronze_quarantine_table_path_{env} | Bronze quarantine table path for data which fails expectations. |
| bronze_quarantine_table_partitions | Bronze quarantine tables partition cols |
| bronze_quarantine_table_cluster_by | Bronze quarantine tables cluster cols |
| bronze_quarantine_table_properties | DLT table properties map. e.g. `{"pipelines.autoOptimize.managed": "false" , "pipelines.autoOptimize.zOrderCols": "year,month", "pipelines.reset.allowed": "false" }` |
| bronze_append_flows | Bronze table append flows json. e.g.`"bronze_append_flows":[{"name":"customer_bronze_flow", "create_streaming_table": false,"source_format": "cloudFiles", "source_details": {"source_database": "APP","source_table":"CUSTOMERS", "source_path_dev": "tests/resources/data/customers", "source_schema_path": "tests/resources/schema/customer_schema.ddl"},"reader_options": {"cloudFiles.format": "json","cloudFiles.inferColumnTypes": "true","cloudFiles.rescuedDataColumn": "_rescued_data"},"once": true}]` |
| silver_catalog_{env} | Unity Catalog name for the silver database. |
| silver_database_{env} | Silver database name. |
| silver_table | Silver table name |
| silver_partition_columns | Silver table partition columns list |
Expand Down
Loading
Loading