Skip to content

Fix Issue #3: Add Support for Identifying Identical Table Names Across Different Data Sets #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Sep 20, 2024
87 changes: 52 additions & 35 deletions definitions/example.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,52 +12,69 @@ const commonAssertionsResult = commonAssertions({
// "disabledInEnvs": ["dv", "qa"]
},
rowConditions: {
"first_table": {
"id_not_null": "id IS NOT NULL",
"id_strict_positive": "id > 0"
},
"second_table": {
"id_in_accepted_values": "id IN (1, 2, 3)"
// Format: "schema": { "table": { "conditionName": "conditionQuery", ... }, ... }
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The expected format is documented here.

"dataform": {
"first_table": {
"id_not_null": "id IS NOT NULL",
"id_strict_positive": "id > 0"
},
"second_table": {
"id_in_accepted_values": "id IN (1, 2, 3)"
}
}
},
uniqueKeyConditions: {
"first_table": ["id"],
"second_table": ["id", "updated_date"]
// Format: "schema": { "table": [column1, column2, ...], ... }
"dataform": {
"first_table": ["id"],
"second_table": ["id", "updated_date"]
}
},
dataFreshnessConditions: {
"first_table": {
"dateColumn": "updated_date",
"timeUnit": "DAY",
"delayCondition": 1,
},
"second_table": {
"dateColumn": "updated_date",
"timeUnit": "MONTH",
"delayCondition": 3,
// Format: "schema": { "table": { "dateColumn", "timeUnit", "delayCondition" }, ... }
"dataform": {
"first_table": {
"dateColumn": "updated_date",
"timeUnit": "DAY",
"delayCondition": 1,
},
"second_table": {
"dateColumn": "updated_date",
"timeUnit": "MONTH",
"delayCondition": 3,
}
}
},
dataCompletenessConditions: {
"first_table": {
// Format: "column": allowedPercentageNull
"updated_date": 1, // 1% of null values allowed in the updated_date column
"id": 20
},
"second_table": {
"id": 30
// Format: "schema": { "table": { "column": allowedPercentageNull, ... }, ... }
"dataform": {
"first_table": {
// Format: "column": allowedPercentageNull
"updated_date": 1, // 1% of null values allowed in the updated_date column
"id": 20
},
"second_table": {
"id": 30
}
}
},
referentialIntegrityConditions: {
"first_table": [{
"parentKey": "id",
"childTable": "second_table",
"childKey": "id"
},
{
"parentKey": "id",
"childTable": "third_table",
"childKey": "parent_id"
}
]
// Format: "parentSchema": { "parentTable": [{ parentKey, childSchema, childTable, childKey }, ...], ... }
"dataform": {
"first_table": [{
"parentKey": "id",
"childSchema": "dataform",
"childTable": "second_table",
Comment on lines +76 to +77
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add childSchema

"childKey": "id"
},
{
"parentKey": "id",
"childSchema": "dataform",
"childTable": "third_table",
"childKey": "parent_id"
}
]
}
}
});

Expand Down
19 changes: 11 additions & 8 deletions includes/data_completeness_assertions.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,26 @@

/**
* @param {Object} globalParams - See index.js for details.
* @param {string} schemaName - The name of the schema containing the table to check for data completeness.
* @param {string} tableName - The name of the table to check for data completeness.
* @param {Object} columnConditions - An object mapping column names to their allowed percentage of null values. If a value is an object, it should have an `allowedPercentageNull` property.
*/

const assertions = [];

const createDataCompletenessAssertion = (globalParams, tableName, columnConditions) => {
const createDataCompletenessAssertion = (globalParams, schemaName, tableName, columnConditions) => {

for (let columnName in columnConditions) {
const allowedPercentageNull = columnConditions[columnName];

const assertion = assert(`assert_data_completeness_${tableName}_${columnName}`)
const assertion = assert(`assert_data_completeness_${schemaName}_${tableName}_${columnName}`)
.database(globalParams.database)
.schema(globalParams.schema)
.description(`Check data completeness for ${tableName}.${columnName}, allowed percentage of null values: ${allowedPercentageNull}`)
.description(`Check data completeness for ${schemaName}.${tableName}.${columnName}, allowed percentage of null values: ${allowedPercentageNull}`)
.tags("assert-data-completeness")
.query(ctx => `SELECT COUNT(*) AS total_rows,
SUM(CASE WHEN ${columnName} IS NULL THEN 1 ELSE 0 END) AS null_count
FROM ${ctx.ref(tableName)}
FROM ${ctx.ref(schemaName, tableName)}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reference schemaName.tableName

HAVING SAFE_DIVIDE(null_count, total_rows) > ${allowedPercentageNull / 100} AND null_count > 0 AND total_rows > 0`);

(globalParams.tags && globalParams.tags.forEach((tag) => assertion.tags(tag)));
Expand All @@ -43,10 +44,12 @@ const createDataCompletenessAssertion = (globalParams, tableName, columnConditio

module.exports = (globalParams, dataCompletenessConditions) => {
// Loop through dataCompletenessConditions to create data completeness check assertions.
for (let tableName in dataCompletenessConditions) {
const columnConditions = dataCompletenessConditions[tableName];
createDataCompletenessAssertion(globalParams, tableName, columnConditions);
for (let schemaName in dataCompletenessConditions) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • get schemaName
  • get tableNames

const tableNames = dataCompletenessConditions[schemaName];
for (let tableName in tableNames) {
const columnConditions = tableNames[tableName];
createDataCompletenessAssertion(globalParams, schemaName, tableName, columnConditions);
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed the nesting structure.

}

return assertions;
};
27 changes: 15 additions & 12 deletions includes/data_freshness_assertions.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

/**
* @param {Object} globalParams - See index.js for details.
* @param {string} schemaName - The name of the schema containing the table to check for data freshness.
* @param {string} tableName - The name of the table to check for data freshness.
* @param {number} delayCondition - The maximum allowed delay (in units specified by `timeUnit`) for the data to be considered fresh.
* @param {string} timeUnit - The unit of time to use for the delay condition. This should be a string that is valid in a SQL `DATE_DIFF` function, such as 'DAY', 'HOUR', etc.
Expand All @@ -17,19 +18,19 @@

const assertions = [];

const createDataFreshnessAssertion = (globalParams, tableName, delayCondition, timeUnit, dateColumn) => {
const assertion = assert(`assert_freshness_${tableName}`)
const createDataFreshnessAssertion = (globalParams, schemaName, tableName, delayCondition, timeUnit, dateColumn) => {
const assertion = assert(`assert_freshness_${schemaName}_${tableName}`)
.database(globalParams.database)
.schema(globalParams.schema)
.description(`Assert that data in ${tableName} is fresh with a delay less than ${delayCondition} ${timeUnit}`)
.description(`Assert that data in ${schemaName}.${tableName} is fresh with a delay less than ${delayCondition} ${timeUnit}`)
.tags("assert-data-freshness")
.query(ctx => `
WITH
freshness AS (
SELECT
DATE_DIFF(CURRENT_DATE(), MAX(${dateColumn}), ${timeUnit}) AS delay
FROM
${ctx.ref(tableName)}
${ctx.ref(schemaName, tableName)}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reference schemaName.tableName

)
SELECT
*
Expand All @@ -47,15 +48,17 @@ const createDataFreshnessAssertion = (globalParams, tableName, delayCondition, t
};

module.exports = (globalParams, freshnessConditions) => {

// Loop through freshnessConditions to create assertions.
for (let tableName in freshnessConditions) {
const {
delayCondition,
timeUnit,
dateColumn
} = freshnessConditions[tableName];
createDataFreshnessAssertion(globalParams, tableName, delayCondition, timeUnit, dateColumn);
for (let schemaName in freshnessConditions) {
const tableNames = freshnessConditions[schemaName];
for (let tableName in tableNames) {
const {
delayCondition,
timeUnit,
dateColumn
} = tableNames[tableName];
createDataFreshnessAssertion(globalParams, schemaName, tableName, delayCondition, timeUnit, dateColumn);
}
}

return assertions;
Expand Down
42 changes: 27 additions & 15 deletions includes/referential_integrity_assertions.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,17 @@

const assertions = [];

const createReferentialIntegrityAssertion = (globalParams, parentTable, parentKey, childTable, childKey) => {
const createReferentialIntegrityAssertion = (globalParams, parentSchema, parentTable, parentKey, childSchema, childTable, childKey) => {

const assertion = assert(`assert_referential_integrity_${parentTable}_${childTable}`)
const assertion = assert(`assert_referential_integrity_${parentSchema}_${parentTable}_${childSchema}_${childTable}`)
.database(globalParams.database)
.schema(globalParams.schema)
.description(`Check referential integrity for ${childTable}.${childKey} referencing ${parentTable}.${parentKey}`)
.tags("assert-referential-integrity")
.query(ctx => `
SELECT pt.${parentKey}
FROM ${ctx.ref(parentTable)} AS pt
LEFT JOIN ${ctx.ref(childTable)} AS t ON t.${childKey} = pt.${parentKey}
FROM ${ctx.ref(parentSchema, parentTable)} AS pt
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reference parentSchema.parentTable

LEFT JOIN ${ctx.ref(childSchema, childTable)} AS t ON t.${childKey} = pt.${parentKey}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reference childSchema.childTable

WHERE t.${childKey} IS NULL
`);

Expand All @@ -41,16 +41,28 @@ const createReferentialIntegrityAssertion = (globalParams, parentTable, parentKe
};

module.exports = (globalParams, referentialIntegrityConditions) => {
for (let parentTable in referentialIntegrityConditions) {
const relationships = referentialIntegrityConditions[parentTable];

relationships.forEach(({
parentKey,
childTable,
childKey
}) => {
createReferentialIntegrityAssertion(globalParams, parentTable, parentKey, childTable, childKey);
})
}
for (let parentSchema in referentialIntegrityConditions) {
const parentTables = referentialIntegrityConditions[parentSchema];
for (let parentTable in parentTables) {
const relationships = parentTables[parentTable];

relationships.forEach(({
parentKey,
childSchema,
childTable,
childKey
}) => {
createReferentialIntegrityAssertion(
globalParams,
parentSchema,
parentTable,
parentKey,
childSchema,
childTable,
childKey
);
})
}
};
return assertions;
};
29 changes: 19 additions & 10 deletions includes/row_condition_assertions.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,22 @@

/**
* @param {Object} globalParams - See index.js for details.
* @param {string} schemaName - The name of the schema containing the table to check for row conditions.
* @param {string} tableName - The name of the table to check for row conditions.
* @param {string} conditionName - The name of the condition to check.
* @param {string} conditionQuery - The SQL query that defines the condition to check.
*/

const assertions = [];

const createRowConditionAssertion = (globalParams, tableName, conditionName, conditionQuery) => {
const assertion = assert(`assert_${conditionName.replace(/-/g , "_")}_${tableName}`)
const createRowConditionAssertion = (globalParams, schemaName, tableName, conditionName, conditionQuery) => {
const assertion = assert(`assert_${conditionName.replace(/-/g , "_")}${schemaName}_${tableName}`)
.database(globalParams.database)
.schema(globalParams.schema)
.description(`Assert that rows in ${tableName} meet ${conditionName}`)
.description(`Assert that rows in ${schemaName}.${tableName} meet ${conditionName}`)
.tags("assert-row-condition")
.query(ctx => `SELECT "Condition not met: ${conditionQuery}, Table: ${ctx.ref(tableName)}" AS assertion_description
FROM ${ctx.ref(tableName)}
.query(ctx => `SELECT "Condition not met: ${conditionQuery}, Table: ${ctx.ref(schemaName, tableName)}" AS assertion_description
FROM ${ctx.ref(schemaName, tableName)}
WHERE NOT (${conditionQuery})`);

(globalParams.tags && globalParams.tags.forEach((tag) => assertion.tags(tag)));
Expand All @@ -38,12 +39,20 @@ const createRowConditionAssertion = (globalParams, tableName, conditionName, con
module.exports = (globalParams, rowConditions) => {

// Loop through rowConditions to create assertions.
for (let tableName in rowConditions) {
for (let conditionName in rowConditions[tableName]) {
const conditionQuery = rowConditions[tableName][conditionName];
createRowConditionAssertion(globalParams, tableName, conditionName, conditionQuery);
for (let schemaName in rowConditions) {
const tableNames = rowConditions[schemaName];
for (let tableName in tableNames) {
for (let conditionName in tableNames[tableName]) {
const conditionQuery = tableNames[tableName][conditionName];
createRowConditionAssertion(
globalParams,
schemaName,
tableName,
conditionName,
conditionQuery
);
}
}
}

return assertions;
}
18 changes: 11 additions & 7 deletions includes/unique_key_assertions.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,23 @@

/**
* @param {Object} globalParams - See index.js for details.
* @param {string} schemaName - The name of the schema to check for unique keys.
* @param {string} tableName - The name of the table to check for unique keys.
* @param {Array} columns - An array of column names that should form a unique key.
*/

const assertions = [];

const createUniqueKeyAssertion = (globalParams, tableName, columns) => {
const createUniqueKeyAssertion = (globalParams, schemaName, tableName, columns) => {
const uniqueColumns = columns.join(', ');

const assertion = assert(`assert_unique_key_${tableName}`)
const assertion = assert(`assert_unique_key_${schemaName}_${tableName}`)
.database(globalParams.database)
.schema(globalParams.schema)
.description(`Check that values in columns (${uniqueColumns}) in ${tableName} form a unique key`)
.description(`Check that values in columns (${uniqueColumns}) in ${schemaName}.${tableName} form a unique key`)
.tags("assert-unique-key")
.query(ctx => `SELECT ${uniqueColumns}
FROM ${ctx.ref(tableName)}
FROM ${ctx.ref(schemaName, tableName)}
GROUP BY ${uniqueColumns}
HAVING COUNT(*) > 1`);

Expand All @@ -40,9 +41,12 @@ const createUniqueKeyAssertion = (globalParams, tableName, columns) => {
module.exports = (globalParams, uniqueKeyConditions) => {

// Loop through uniqueKeyConditions to create unique key check assertions.
for (let tableName in uniqueKeyConditions) {
const columns = uniqueKeyConditions[tableName];
createUniqueKeyAssertion(globalParams, tableName, columns);
for (let schemaName in uniqueKeyConditions) {
const tableNames = uniqueKeyConditions[schemaName];
for (let tableName in tableNames) {
const columns = tableNames[tableName];
createUniqueKeyAssertion(globalParams, schemaName, tableName, columns);
}
}

return assertions;
Expand Down