-
Notifications
You must be signed in to change notification settings - Fork 5
Fix Issue #3: Add Support for Identifying Identical Table Names Across Different Data Sets #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
171ed8a
3f03bda
e9b32d8
458ce30
5e3a7f0
ccd575b
f6ade7f
73b1108
4d2f8ab
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,57 +17,74 @@ const commonAssertionsResult = commonAssertions({ | |
}, | ||
}, | ||
rowConditions: { | ||
"first_table": { | ||
"id_not_null": "id IS NOT NULL", | ||
"id_strict_positive": "id > 0" | ||
}, | ||
"second_table": { | ||
"id_in_accepted_values": "id IN (1, 2, 3)" | ||
// Format: "schema": { "table": { "conditionName": "conditionQuery", ... }, ... } | ||
"dataform": { | ||
"first_table": { | ||
"id_not_null": "id IS NOT NULL", | ||
"id_strict_positive": "id > 0" | ||
}, | ||
"second_table": { | ||
"id_in_accepted_values": "id IN (1, 2, 3)" | ||
} | ||
} | ||
}, | ||
uniqueKeyConditions: { | ||
"first_table": ["id"], | ||
"second_table": ["id", "updated_date"] | ||
// Format: "schema": { "table": [column1, column2, ...], ... } | ||
"dataform": { | ||
"first_table": ["id"], | ||
"second_table": ["id", "updated_date"] | ||
} | ||
}, | ||
dataFreshnessConditions: { | ||
"first_table": { | ||
"dateColumn": "updated_date", | ||
"timeUnit": "DAY", | ||
"delayCondition": 1, | ||
"timeZone": "America/Los_Angeles" | ||
}, | ||
"second_table": { | ||
// If timeUnit is not DAY, WEEK, MONTH, QUARTER, or YEAR, dateColumn should be a TIMESTAMP. | ||
// Check here for valid Date time units: https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions#date_diff | ||
// Check here for valid Timestamp time units: https://cloud.google.com/bigquery/docs/reference/standard-sql/timestamp_functions#timestamp_diff | ||
"dateColumn": "TIMESTAMP(updated_date)", | ||
"timeUnit": "HOUR", | ||
"delayCondition": 3, | ||
"timeZone": "-08" | ||
// Format: "schema": { "table": { "dateColumn", "timeUnit", "delayCondition" }, ... } | ||
"dataform": { | ||
"first_table": { | ||
"dateColumn": "updated_date", | ||
"timeUnit": "DAY", | ||
"delayCondition": 1, | ||
"timeZone": "America/Los_Angeles" | ||
}, | ||
"second_table": { | ||
// If timeUnit is not DAY, WEEK, MONTH, QUARTER, or YEAR, dateColumn should be a TIMESTAMP. | ||
// Check here for valid Date time units: https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions#date_diff | ||
// Check here for valid Timestamp time units: https://cloud.google.com/bigquery/docs/reference/standard-sql/timestamp_functions#timestamp_diff | ||
"dateColumn": "TIMESTAMP(updated_date)", | ||
"timeUnit": "HOUR", | ||
"delayCondition": 3, | ||
"timeZone": "-08" | ||
} | ||
} | ||
}, | ||
dataCompletenessConditions: { | ||
"first_table": { | ||
// Format: "column": allowedPercentageNull | ||
"updated_date": 1, // 1% of null values allowed in the updated_date column | ||
"id": 20 | ||
}, | ||
"second_table": { | ||
"id": 30 | ||
// Format: "schema": { "table": { "column": allowedPercentageNull, ... }, ... } | ||
"dataform": { | ||
"first_table": { | ||
// Format: "column": allowedPercentageNull | ||
"updated_date": 1, // 1% of null values allowed in the updated_date column | ||
"id": 20 | ||
}, | ||
"second_table": { | ||
"id": 30 | ||
} | ||
} | ||
}, | ||
referentialIntegrityConditions: { | ||
"first_table": [{ | ||
"parentKey": "id", | ||
"childTable": "second_table", | ||
"childKey": "id" | ||
}, | ||
{ | ||
"parentKey": "id", | ||
"childTable": "third_table", | ||
"childKey": "parent_id" | ||
} | ||
] | ||
// Format: "parentSchema": { "parentTable": [{ parentKey, childSchema, childTable, childKey }, ...], ... } | ||
"dataform": { | ||
"first_table": [{ | ||
"parentKey": "id", | ||
"childSchema": "dataform", | ||
"childTable": "second_table", | ||
Comment on lines
+76
to
+77
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add |
||
"childKey": "id" | ||
}, | ||
{ | ||
"parentKey": "id", | ||
"childSchema": "dataform", | ||
"childTable": "third_table", | ||
"childKey": "parent_id" | ||
} | ||
] | ||
} | ||
} | ||
}); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,30 +11,30 @@ | |
|
||
/** | ||
* @param {Object} globalParams - See index.js for details. | ||
* @param {string} filter - The condition to filter the data. | ||
* @param {string} schemaName - The name of the schema containing the table to check for data completeness. | |
* @param {string} tableName - The name of the table to check for data completeness. | ||
* @param {string} filter - The condition to filter the data. | ||
* @param {Object} columnConditions - An object mapping column names to their allowed percentage of null values. If a value is an object, it should have an `allowedPercentageNull` property. | ||
*/ | ||
|
||
const assertions = []; | ||
|
||
const createDataCompletenessAssertion = (globalParams, filter, tableName, columnConditions) => { | ||
|
||
const createDataCompletenessAssertion = (globalParams, schemaName, tableName, filter, columnConditions) => { | ||
for (let columnName in columnConditions) { | ||
const allowedPercentageNull = columnConditions[columnName]; | ||
|
||
const assertion = assert(`assert_data_completeness_${tableName}_${columnName}`) | ||
const assertion = assert(`assert_data_completeness_${schemaName}_${tableName}_${columnName}`) | ||
.database(globalParams.database) | ||
.schema(globalParams.schema) | ||
.description(`Check data completeness for ${tableName}.${columnName}, allowed percentage of null values: ${allowedPercentageNull}`) | ||
.description(`Check data completeness for ${schemaName}.${tableName}.${columnName}, allowed percentage of null values: ${allowedPercentageNull}`) | ||
.tags("assert-data-completeness") | ||
.query(ctx => ` | ||
WITH | ||
filtering AS ( | ||
SELECT | ||
* | ||
FROM | ||
${ctx.ref(tableName)} | ||
${ctx.ref(schemaName, tableName)} | ||
WHERE | ||
${filter} | ||
) | ||
|
@@ -55,11 +55,13 @@ const createDataCompletenessAssertion = (globalParams, filter, tableName, column | |
|
||
module.exports = (globalParams, config, dataCompletenessConditions) => { | ||
// Loop through dataCompletenessConditions to create data completeness check assertions. | ||
for (let tableName in dataCompletenessConditions) { | ||
const columnConditions = dataCompletenessConditions[tableName]; | ||
const filter = config[tableName]?.where ?? true; | ||
createDataCompletenessAssertion(globalParams, filter, tableName, columnConditions); | ||
for (let schemaName in dataCompletenessConditions) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
const tableNames = dataCompletenessConditions[schemaName]; | ||
for (let tableName in tableNames) { | ||
const columnConditions = tableNames[tableName]; | ||
const filter = config[tableName]?.where ?? true; | ||
createDataCompletenessAssertion(globalParams, schemaName, tableName, filter, columnConditions); | ||
} | ||
} | ||
|
||
return assertions; | ||
}; |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,8 +9,9 @@ | |
|
||
/** | ||
* @param {Object} globalParams - See index.js for details. | ||
* @param {string} filter - The condition to filter the data. | ||
* @param {string} schemaName - The name of the schema containing the table to check for data freshness. | |
* @param {string} tableName - The name of the table to check for data freshness. | ||
* @param {string} filter - The condition to filter the data. | ||
* @param {number} delayCondition - The maximum allowed delay (in units specified by `timeUnit`) for the data to be considered fresh. | ||
* @param {string} timeUnit - The unit of time to use for the delay condition. This should be a string that is valid in a SQL `DATE_DIFF` function, such as 'DAY', 'HOUR', etc. | ||
* @param {string} dateColumn - The name of the date column to check for data freshness. | ||
|
@@ -19,20 +20,19 @@ | |
|
||
const assertions = []; | ||
|
||
const createDataFreshnessAssertion = (globalParams, filter, tableName, delayCondition, timeUnit, dateColumn, timeZone = "UTC") => { | ||
|
||
const assertion = assert(`assert_freshness_${tableName}`) | ||
const createDataFreshnessAssertion = (globalParams, schemaName, tableName, filter, delayCondition, timeUnit, dateColumn) => { | ||
const assertion = assert(`assert_freshness_${schemaName}_${tableName}`) | ||
.database(globalParams.database) | ||
.schema(globalParams.schema) | ||
.description(`Assert that data in ${tableName} is fresh with a delay less than ${delayCondition} ${timeUnit}`) | ||
.description(`Assert that data in ${schemaName}.${tableName} is fresh with a delay less than ${delayCondition} ${timeUnit}`) | ||
.tags("assert-data-freshness") | ||
.query(ctx => ` | ||
WITH | ||
filtering AS ( | ||
SELECT | ||
* | ||
FROM | ||
${ctx.ref(tableName)} | ||
${ctx.ref(schemaName, tableName)} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. reference |
||
WHERE | ||
${filter} | ||
), | ||
|
@@ -59,18 +59,20 @@ const createDataFreshnessAssertion = (globalParams, filter, tableName, delayCond | |
assertions.push(assertion); | ||
}; | ||
|
||
module.exports = (globalParams, config, freshnessConditions) => { | ||
|
||
module.exports = (globalParams, config, freshnessConditions) => { | ||
// Loop through freshnessConditions to create assertions. | ||
for (let tableName in freshnessConditions) { | ||
const { | ||
delayCondition, | ||
timeUnit, | ||
dateColumn, | ||
timeZone | ||
} = freshnessConditions[tableName]; | ||
const filter = config[tableName]?.where ?? true; | ||
createDataFreshnessAssertion(globalParams, filter, tableName, delayCondition, timeUnit, dateColumn, timeZone); | ||
for (let schemaName in freshnessConditions) { | ||
const tableNames = freshnessConditions[schemaName]; | ||
for (let tableName in tableNames) { | ||
const { | ||
delayCondition, | ||
timeUnit, | ||
dateColumn | ||
} = tableNames[tableName]; | ||
const filter = config[tableName]?.where ?? true; | ||
createDataFreshnessAssertion(globalParams, schemaName, tableName, delayCondition, timeUnit, dateColumn); | ||
} | ||
} | ||
|
||
return assertions; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Expected Format is here.