Skip to content

Fix Issue #3: Add Support for Identifying Identical Table Names Across Different Data Sets #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Sep 20, 2024
97 changes: 57 additions & 40 deletions definitions/example.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,57 +17,74 @@ const commonAssertionsResult = commonAssertions({
},
},
rowConditions: {
"first_table": {
"id_not_null": "id IS NOT NULL",
"id_strict_positive": "id > 0"
},
"second_table": {
"id_in_accepted_values": "id IN (1, 2, 3)"
// Format: "schema": { "table": { "conditionName": "conditionQuery", ... }, ... }
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Expected Format is here.

"dataform": {
"first_table": {
"id_not_null": "id IS NOT NULL",
"id_strict_positive": "id > 0"
},
"second_table": {
"id_in_accepted_values": "id IN (1, 2, 3)"
}
}
},
uniqueKeyConditions: {
"first_table": ["id"],
"second_table": ["id", "updated_date"]
// Format: "schema": { "table": [column1, column2, ...], ... }
"dataform": {
"first_table": ["id"],
"second_table": ["id", "updated_date"]
}
},
dataFreshnessConditions: {
"first_table": {
"dateColumn": "updated_date",
"timeUnit": "DAY",
"delayCondition": 1,
"timeZone": "America/Los_Angeles"
},
"second_table": {
// If timeUnit is not DAY, WEEK, MONTH, QUARTER, or YEAR, dateColumn should be a TIMESTAMP.
// Check here for valid Date time units: https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions#date_diff
// Check here for valid Timestamp time units: https://cloud.google.com/bigquery/docs/reference/standard-sql/timestamp_functions#timestamp_diff
"dateColumn": "TIMESTAMP(updated_date)",
"timeUnit": "HOUR",
"delayCondition": 3,
"timeZone": "-08"
// Format: "schema": { "table": { "dateColumn", "timeUnit", "delayCondition" }, ... }
"dataform": {
"first_table": {
"dateColumn": "updated_date",
"timeUnit": "DAY",
"delayCondition": 1,
"timeZone": "America/Los_Angeles"
},
"second_table": {
// If timeUnit is not DAY, WEEK, MONTH, QUARTER, or YEAR, dateColumn should be a TIMESTAMP.
// Check here for valid Date time units: https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions#date_diff
// Check here for valid Timestamp time units: https://cloud.google.com/bigquery/docs/reference/standard-sql/timestamp_functions#timestamp_diff
"dateColumn": "TIMESTAMP(updated_date)",
"timeUnit": "HOUR",
"delayCondition": 3,
"timeZone": "-08"
}
}
},
dataCompletenessConditions: {
"first_table": {
// Format: "column": allowedPercentageNull
"updated_date": 1, // 1% of null values allowed in the updated_date column
"id": 20
},
"second_table": {
"id": 30
// Format: "schema": { "table": { "column": allowedPercentageNull, ... }, ... }
"dataform": {
"first_table": {
// Format: "column": allowedPercentageNull
"updated_date": 1, // 1% of null values allowed in the updated_date column
"id": 20
},
"second_table": {
"id": 30
}
}
},
referentialIntegrityConditions: {
"first_table": [{
"parentKey": "id",
"childTable": "second_table",
"childKey": "id"
},
{
"parentKey": "id",
"childTable": "third_table",
"childKey": "parent_id"
}
]
// Format: "parentSchema": { "parentTable": [{ parentKey, childSchema, childTable, childKey }, ...], ... }
"dataform": {
"first_table": [{
"parentKey": "id",
"childSchema": "dataform",
"childTable": "second_table",
Comment on lines +76 to +77
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add childSchema

"childKey": "id"
},
{
"parentKey": "id",
"childSchema": "dataform",
"childTable": "third_table",
"childKey": "parent_id"
}
]
}
}
});

Expand Down
24 changes: 13 additions & 11 deletions includes/data_completeness_assertions.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,30 +11,30 @@

/**
* @param {Object} globalParams - See index.js for details.
* @param {string} filter - The condition to filter the data.
* @param {string} schemaName - The name of the schema that contains the table to check for data completeness.
* @param {string} tableName - The name of the table to check for data completeness.
* @param {string} filter - The condition to filter the data.
* @param {Object} columnConditions - An object mapping column names to their allowed percentage of null values. If a value is an object, it should have an `allowedPercentageNull` property.
*/

const assertions = [];

const createDataCompletenessAssertion = (globalParams, filter, tableName, columnConditions) => {

const createDataCompletenessAssertion = (globalParams, schemaName, tableName, filter, columnConditions) => {
for (let columnName in columnConditions) {
const allowedPercentageNull = columnConditions[columnName];

const assertion = assert(`assert_data_completeness_${tableName}_${columnName}`)
const assertion = assert(`assert_data_completeness_${schemaName}_${tableName}_${columnName}`)
.database(globalParams.database)
.schema(globalParams.schema)
.description(`Check data completeness for ${tableName}.${columnName}, allowed percentage of null values: ${allowedPercentageNull}`)
.description(`Check data completeness for ${schemaName}.${tableName}.${columnName}, allowed percentage of null values: ${allowedPercentageNull}`)
.tags("assert-data-completeness")
.query(ctx => `
WITH
filtering AS (
SELECT
*
FROM
${ctx.ref(tableName)}
${ctx.ref(schemaName, tableName)}
WHERE
${filter}
)
Expand All @@ -55,11 +55,13 @@ const createDataCompletenessAssertion = (globalParams, filter, tableName, column

module.exports = (globalParams, config, dataCompletenessConditions) => {
// Loop through dataCompletenessConditions to create data completeness check assertions.
for (let tableName in dataCompletenessConditions) {
const columnConditions = dataCompletenessConditions[tableName];
const filter = config[tableName]?.where ?? true;
createDataCompletenessAssertion(globalParams, filter, tableName, columnConditions);
for (let schemaName in dataCompletenessConditions) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • get schemaName
  • get tableNames

const tableNames = dataCompletenessConditions[schemaName];
for (let tableName in tableNames) {
const columnConditions = tableNames[tableName];
const filter = config[tableName]?.where ?? true;
createDataCompletenessAssertion(globalParams, schemaName, tableName, filter, columnConditions);
}
}

return assertions;
};
34 changes: 18 additions & 16 deletions includes/data_freshness_assertions.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@

/**
* @param {Object} globalParams - See index.js for details.
* @param {string} filter - The condition to filter the data.
* @param {string} schemaName - The name of the schema that contains the table to check for data freshness.
* @param {string} tableName - The name of the table to check for data freshness.
* @param {string} filter - The condition to filter the data.
* @param {number} delayCondition - The maximum allowed delay (in units specified by `timeUnit`) for the data to be considered fresh.
* @param {string} timeUnit - The unit of time to use for the delay condition. This should be a string that is valid in a SQL `DATE_DIFF` function, such as 'DAY', 'HOUR', etc.
* @param {string} dateColumn - The name of the date column to check for data freshness.
Expand All @@ -19,20 +20,19 @@

const assertions = [];

const createDataFreshnessAssertion = (globalParams, filter, tableName, delayCondition, timeUnit, dateColumn, timeZone = "UTC") => {

const assertion = assert(`assert_freshness_${tableName}`)
const createDataFreshnessAssertion = (globalParams, schemaName, tableName, filter, delayCondition, timeUnit, dateColumn) => {
const assertion = assert(`assert_freshness_${schemaName}_${tableName}`)
.database(globalParams.database)
.schema(globalParams.schema)
.description(`Assert that data in ${tableName} is fresh with a delay less than ${delayCondition} ${timeUnit}`)
.description(`Assert that data in ${schemaName}.${tableName} is fresh with a delay less than ${delayCondition} ${timeUnit}`)
.tags("assert-data-freshness")
.query(ctx => `
WITH
filtering AS (
SELECT
*
FROM
${ctx.ref(tableName)}
${ctx.ref(schemaName, tableName)}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reference schemaName.tableName

WHERE
${filter}
),
Expand All @@ -59,18 +59,20 @@ const createDataFreshnessAssertion = (globalParams, filter, tableName, delayCond
assertions.push(assertion);
};

module.exports = (globalParams, config, freshnessConditions) => {

module.exports = (globalParams, config, freshnessConditions) => {
// Loop through freshnessConditions to create assertions.
for (let tableName in freshnessConditions) {
const {
delayCondition,
timeUnit,
dateColumn,
timeZone
} = freshnessConditions[tableName];
const filter = config[tableName]?.where ?? true;
createDataFreshnessAssertion(globalParams, filter, tableName, delayCondition, timeUnit, dateColumn, timeZone);
for (let schemaName in freshnessConditions) {
const tableNames = freshnessConditions[schemaName];
for (let tableName in tableNames) {
const {
delayCondition,
timeUnit,
dateColumn
} = tableNames[tableName];
const filter = config[tableName]?.where ?? true;
createDataFreshnessAssertion(globalParams, schemaName, tableName, delayCondition, timeUnit, dateColumn);
}
}

return assertions;
Expand Down
52 changes: 34 additions & 18 deletions includes/referential_integrity_assertions.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,21 @@

/**
* @param {Object} globalParams - See index.js for details.
* @param {string} parentFilter - The condition to filter the data of parent table.
* @param {string} childFilter - The condition to filter the data of child table.
* @param {string} parentSchema - The name of the schema that contains the parent table.
* @param {Object} parentTable - The name of the parent table in the foreign key relationship.
* @param {Object} parentKey - The name of the column in the parent table that is the primary key.
* @param {string} parentFilter - The condition to filter the data of parent table.
* @param {string} childSchema - The name of the schema that contains the child table.
* @param {Object} childTable - The name of the child table in the foreign key relationship.
* @param {Object} childKey - The name of the column in the child table that is the foreign key.
* @param {string} childFilter - The condition to filter the data of child table.
*/

const assertions = [];

const createReferentialIntegrityAssertion = (globalParams, parentFilter, childFilter, parentTable, parentKey, childTable, childKey) => {
const createReferentialIntegrityAssertion = (globalParams, parentSchema, parentTable, parentKey, parentFilter, childSchema, childTable, childKey, childFilter) => {

const assertion = assert(`assert_referential_integrity_${parentTable}_${childTable}`)
const assertion = assert(`assert_referential_integrity_${parentSchema}_${parentTable}_${childSchema}_${childTable}`)
.database(globalParams.database)
.schema(globalParams.schema)
.description(`Check referential integrity for ${childTable}.${childKey} referencing ${parentTable}.${parentKey}`)
Expand All @@ -34,7 +36,7 @@ const createReferentialIntegrityAssertion = (globalParams, parentFilter, childFi
SELECT
*
FROM
${ctx.ref(parentTable)}
${ctx.ref(parentSchema, parentTable)}
WHERE
${parentFilter}
),
Expand All @@ -43,7 +45,7 @@ const createReferentialIntegrityAssertion = (globalParams, parentFilter, childFi
SELECT
*
FROM
${ctx.ref(childTable)}
${ctx.ref(childSchema, childTable)}
WHERE
${childFilter}
)
Expand All @@ -62,18 +64,32 @@ const createReferentialIntegrityAssertion = (globalParams, parentFilter, childFi
};

module.exports = (globalParams, config, referentialIntegrityConditions) => {
for (let parentTable in referentialIntegrityConditions) {
const relationships = referentialIntegrityConditions[parentTable];
const parentFilter = config[parentTable]?.where ?? true;
for (let parentSchema in referentialIntegrityConditions) {
const parentTables = referentialIntegrityConditions[parentSchema];
for (let parentTable in parentTables) {
const relationships = parentTables[parentTable];
const parentFilter = config[parentTable]?.where ?? true;

relationships.forEach(({
parentKey,
childTable,
childKey
}) => {
const childFilter = config[childTable]?.where ?? true;
createReferentialIntegrityAssertion(globalParams, parentFilter, childFilter, parentTable, parentKey, childTable, childKey);
})
}
relationships.forEach(({
parentKey,
childSchema,
childTable,
childKey
}) => {
const childFilter = config[childTable]?.where ?? true;
createReferentialIntegrityAssertion(
globalParams,
parentSchema,
parentTable,
parentKey,
parentFilter,
childSchema,
childTable,
childKey,
childFilter
);
})
}
};
return assertions;
};
32 changes: 21 additions & 11 deletions includes/row_condition_assertions.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,28 @@

/**
* @param {Object} globalParams - See index.js for details.
* @param {string} filter - The condition to filter the data.
* @param {string} schemaName - The name of the schema that contains the table to check for row conditions.
* @param {string} tableName - The name of the table to check for row conditions.
* @param {string} filter - The condition to filter the data.
* @param {string} conditionName - The name of the condition to check.
* @param {string} conditionQuery - The SQL query that defines the condition to check.
*/

const assertions = [];

const createRowConditionAssertion = (globalParams, filter, tableName, conditionName, conditionQuery) => {
const assertion = assert(`assert_${conditionName.replace(/-/g , "_")}_${tableName}`)
const createRowConditionAssertion = (globalParams, schemaName, tableName, filter, conditionName, conditionQuery) => {
const assertion = assert(`assert_${conditionName.replace(/-/g , "_")}${schemaName}_${tableName}`)
.database(globalParams.database)
.schema(globalParams.schema)
.description(`Assert that rows in ${tableName} meet ${conditionName}`)
.description(`Assert that rows in ${schemaName}.${tableName} meet ${conditionName}`)
.tags("assert-row-condition")
.query(ctx => `
WITH
filtering AS (
SELECT
*
FROM
${ctx.ref(tableName)}
${ctx.ref(schemaName, tableName)}
WHERE
${filter}
)
Expand All @@ -50,13 +51,22 @@ const createRowConditionAssertion = (globalParams, filter, tableName, conditionN
module.exports = (globalParams, config, rowConditions) => {

// Loop through rowConditions to create assertions.
for (let tableName in rowConditions) {
for (let conditionName in rowConditions[tableName]) {
const conditionQuery = rowConditions[tableName][conditionName];
const filter = config[tableName]?.where ?? true;
createRowConditionAssertion(globalParams, filter, tableName, conditionName, conditionQuery);
for (let schemaName in rowConditions) {
const tableNames = rowConditions[schemaName];
for (let tableName in tableNames) {
for (let conditionName in tableNames[tableName]) {
const conditionQuery = tableNames[tableName][conditionName];
const filter = config[tableName]?.where ?? true;
createRowConditionAssertion(
globalParams,
schemaName,
tableName,
filter,
conditionName,
conditionQuery
);
}
}
}

return assertions;
}
Loading