From f58aa2482071077b707b8f4aa354316701426d4e Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Mon, 21 Oct 2024 19:58:11 +0200
Subject: [PATCH 1/3] Promote workflow linter to non-experimental

---
 README.md                                  |  4 +---
 .../labs/ucx/assessment/workflows.py       | 18 ++++++++------
 src/databricks/labs/ucx/runtime.py         |  4 ++--
 .../labs/ucx/source_code/workflows.py      | 24 +++++++++++--------
 tests/integration/source_code/test_jobs.py |  6 ++---
 5 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index a530b3d25a..1c2d109147 100644
--- a/README.md
+++ b/README.md
@@ -728,9 +728,7 @@ in the Migration dashboard.
 
 ## Jobs Static Code Analysis Workflow
 
-> Please note that this is an experimental workflow.
-
-The `experimental-workflow-linter` workflow lints accessible code from 2 sources:
+The `workflow-linter` workflow lints accessible code from 2 sources:
  - all workflows/jobs present in the workspace
  - all dashboards/queries present in the workspace
 The linting emits problems indicating what to resolve for making the code Unity Catalog compatible.
diff --git a/src/databricks/labs/ucx/assessment/workflows.py b/src/databricks/labs/ucx/assessment/workflows.py
index 8d5dd16a32..54a4c48fe7 100644
--- a/src/databricks/labs/ucx/assessment/workflows.py
+++ b/src/databricks/labs/ucx/assessment/workflows.py
@@ -183,17 +183,21 @@ def crawl_groups(self, ctx: RuntimeContext):
         """Scans all groups for the local group migration scope"""
         ctx.group_manager.snapshot()
 
+    @job_task
+    def assess_workflows(self, ctx: RuntimeContext):
+        """Scans all jobs for migration issues in notebooks jobs.
+
+        Also, stores direct filesystem accesses for display in the migration dashboard.
+        """
+        ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database)
+
     @job_task
     def assess_dashboards(self, ctx: RuntimeContext):
         """Scans all dashboards for migration issues in SQL code of embedded widgets.
-        Also stores direct filesystem accesses for display in the migration dashboard."""
-        ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database)
 
-    @job_task
-    def assess_workflows(self, ctx: RuntimeContext):
-        """Scans all jobs for migration issues in notebooks.
-        Also stores direct filesystem accesses for display in the migration dashboard."""
-        ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database)
+        Also, stores direct filesystem accesses for display in the migration dashboard.
+ """ + ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) class Failing(Workflow): diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index cfb03dce64..0e2977c386 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -20,7 +20,7 @@ ) from databricks.labs.ucx.progress.workflows import MigrationProgress from databricks.labs.ucx.recon.workflows import MigrationRecon -from databricks.labs.ucx.source_code.workflows import ExperimentalWorkflowLinter +from databricks.labs.ucx.source_code.workflows import WorkflowLinter from databricks.labs.ucx.workspace_access.workflows import ( GroupMigration, PermissionsMigrationAPI, @@ -58,7 +58,7 @@ def all(cls): ScanTablesInMounts(), MigrateTablesInMounts(), PermissionsMigrationAPI(), - ExperimentalWorkflowLinter(), + WorkflowLinter(), MigrationRecon(), Failing(), ] diff --git a/src/databricks/labs/ucx/source_code/workflows.py b/src/databricks/labs/ucx/source_code/workflows.py index 9fd3046070..6acbf46cb0 100644 --- a/src/databricks/labs/ucx/source_code/workflows.py +++ b/src/databricks/labs/ucx/source_code/workflows.py @@ -2,18 +2,22 @@ from databricks.labs.ucx.framework.tasks import Workflow, job_task -class ExperimentalWorkflowLinter(Workflow): +class WorkflowLinter(Workflow): def __init__(self): - super().__init__('experimental-workflow-linter') + super().__init__('workflow-linter') - @job_task(job_cluster="table_migration") - def lint_all_workflows(self, ctx: RuntimeContext): - """[EXPERIMENTAL] Analyses all jobs for source code compatibility problems. This is an experimental feature, - that is not yet fully supported.""" + @job_task + def assess_workflows(self, ctx: RuntimeContext): + """Scans all jobs for migration issues in notebooks jobs. + + Also, stores direct filesystem accesses for display in the migration dashboard. + """ ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) - @job_task(job_cluster="table_migration") - def lint_all_queries(self, ctx: RuntimeContext): - """[EXPERIMENTAL] Analyses all jobs for source code compatibility problems. This is an experimental feature, - that is not yet fully supported.""" + @job_task + def assess_dashboards(self, ctx: RuntimeContext): + """Scans all dashboards for migration issues in SQL code of embedded widgets. + + Also, stores direct filesystem accesses for display in the migration dashboard. + """ ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index f61f37e263..f9bd224595 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -38,14 +38,14 @@ def test_running_real_workflow_linter_job(installation_ctx, make_job) -> None: job = make_job(content="spark.read.table('a_table').write.csv('/mnt/things/e/f/g')\n") ctx = installation_ctx.replace(config_transform=lambda wc: replace(wc, include_job_ids=[job.job_id])) ctx.workspace_installation.run() - ctx.deployed_workflows.run_workflow("experimental-workflow-linter") - ctx.deployed_workflows.validate_step("experimental-workflow-linter") + ctx.deployed_workflows.run_workflow("workflow-linter") + ctx.deployed_workflows.validate_step("workflow-linter") # This test merely tests that the workflows produces records of the expected types; record content is not checked. 
     cursor = ctx.sql_backend.fetch(f"SELECT COUNT(*) AS count FROM {ctx.inventory_database}.workflow_problems")
     result = next(cursor)
     if result['count'] == 0:
-        installation_ctx.deployed_workflows.relay_logs("experimental-workflow-linter")
+        installation_ctx.deployed_workflows.relay_logs("workflow-linter")
         assert False, "No workflow problems found"
     dfsa_records = installation_ctx.directfs_access_crawler_for_paths.snapshot()
     used_table_records = installation_ctx.used_tables_crawler_for_paths.snapshot()

From dfee24d835733738eae4a88bcb5a08f29089a8e6 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Tue, 22 Oct 2024 12:48:49 +0200
Subject: [PATCH 2/3] Remove WorkflowLinter

---
 README.md                                  | 22 +++---------------
 .../labs/ucx/assessment/workflows.py       | 12 +++++-----
 src/databricks/labs/ucx/runtime.py         |  2 --
 .../labs/ucx/source_code/workflows.py      | 23 -------------------
 tests/integration/source_code/test_jobs.py | 20 ----------------
 5 files changed, 9 insertions(+), 70 deletions(-)
 delete mode 100644 src/databricks/labs/ucx/source_code/workflows.py

diff --git a/README.md b/README.md
index 1c2d109147..6f89e8f0f0 100644
--- a/README.md
+++ b/README.md
@@ -401,9 +401,8 @@ which can be used for further analysis and decision-making through the [assessme
 9. `assess_pipelines`: This task scans through all the Pipelines and identifies those pipelines that have Azure Service Principals embedded in their configurations. A list of all the pipelines with matching configurations is stored in the `$inventory.pipelines` table.
 10. `assess_azure_service_principals`: This task scans through all the clusters configurations, cluster policies, job cluster configurations, Pipeline configurations, and Warehouse configuration and identifies all the Azure Service Principals who have been given access to the Azure storage accounts via spark configurations referred in those entities. The list of all the Azure Service Principals referred in those configurations is saved in the `$inventory.azure_service_principals` table.
 11. `assess_global_init_scripts`: This task scans through all the global init scripts and identifies if there is an Azure Service Principal who has been given access to the Azure storage accounts via spark configurations referred in those scripts.
-12. `assess_dashboards`: This task scans through all the dashboards and analyzes embedded queries for migration problems. It also collects direct filesystem access patterns that require attention.
-13. `assess_workflows`: This task scans through all the jobs and tasks and analyzes notebooks and files for migration problems. It also collects direct filesystem access patterns that require attention.
-
+12. `assess_dashboards`: This task scans through all the dashboards and analyzes embedded queries for migration problems which it persists in `$inventory_database.query_problems`. It also collects direct filesystem access patterns that require attention which it persists in `$inventory_database.directfs_in_queries`.
+13. `assess_workflows`: This task scans through all the jobs and tasks and analyzes notebooks and files for migration problems which it persists in `$inventory_database.workflow_problems`. It also collects direct filesystem access patterns that require attention which it persists in `$inventory_database.directfs_in_paths`.
 
 ![report](docs/assessment-report.png)
 
@@ -726,25 +725,10 @@ in the Migration dashboard.
 
 [[back to top](#databricks-labs-ucx)]
 
-## Jobs Static Code Analysis Workflow
-
-The `workflow-linter` workflow lints accessible code from 2 sources:
- - all workflows/jobs present in the workspace
- - all dashboards/queries present in the workspace
-The linting emits problems indicating what to resolve for making the code Unity Catalog compatible.
-The linting also locates direct filesystem access that need to be migrated.
-
-Once the workflow completes:
- - problems are stored in the `$inventory_database.workflow_problems`/`$inventory_database.query_problems` table
- - direct filesystem access are stored in the `$inventory_database.directfs_in_paths`/`$inventory_database.directfs_in_queries` table
- - all the above are displayed in the Migration dashboard.
+### Linter message codes
 
 ![code compatibility problems](docs/code_compatibility_problems.png)
 
-[[back to top](#databricks-labs-ucx)]
-
-### Linter message codes
-
 Here's the detailed explanation of the linter message codes:
 
 #### `cannot-autofix-table-reference`
diff --git a/src/databricks/labs/ucx/assessment/workflows.py b/src/databricks/labs/ucx/assessment/workflows.py
index 54a4c48fe7..a36b7bbbb6 100644
--- a/src/databricks/labs/ucx/assessment/workflows.py
+++ b/src/databricks/labs/ucx/assessment/workflows.py
@@ -184,20 +184,20 @@ def crawl_groups(self, ctx: RuntimeContext):
         ctx.group_manager.snapshot()
 
     @job_task
-    def assess_workflows(self, ctx: RuntimeContext):
-        """Scans all jobs for migration issues in notebooks jobs.
+    def assess_dashboards(self, ctx: RuntimeContext):
+        """Scans all dashboards for migration issues in SQL code of embedded widgets.
 
         Also, stores direct filesystem accesses for display in the migration dashboard.
         """
-        ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database)
+        ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database)
 
     @job_task
-    def assess_dashboards(self, ctx: RuntimeContext):
-        """Scans all dashboards for migration issues in SQL code of embedded widgets.
+    def assess_workflows(self, ctx: RuntimeContext):
+        """Scans all jobs for migration issues in notebooks jobs.
 
         Also, stores direct filesystem accesses for display in the migration dashboard.
""" - ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) + ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) class Failing(Workflow): diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index 0e2977c386..c5954afc62 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -20,7 +20,6 @@ ) from databricks.labs.ucx.progress.workflows import MigrationProgress from databricks.labs.ucx.recon.workflows import MigrationRecon -from databricks.labs.ucx.source_code.workflows import WorkflowLinter from databricks.labs.ucx.workspace_access.workflows import ( GroupMigration, PermissionsMigrationAPI, @@ -58,7 +57,6 @@ def all(cls): ScanTablesInMounts(), MigrateTablesInMounts(), PermissionsMigrationAPI(), - WorkflowLinter(), MigrationRecon(), Failing(), ] diff --git a/src/databricks/labs/ucx/source_code/workflows.py b/src/databricks/labs/ucx/source_code/workflows.py deleted file mode 100644 index 6acbf46cb0..0000000000 --- a/src/databricks/labs/ucx/source_code/workflows.py +++ /dev/null @@ -1,23 +0,0 @@ -from databricks.labs.ucx.contexts.workflow_task import RuntimeContext -from databricks.labs.ucx.framework.tasks import Workflow, job_task - - -class WorkflowLinter(Workflow): - def __init__(self): - super().__init__('workflow-linter') - - @job_task - def assess_workflows(self, ctx: RuntimeContext): - """Scans all jobs for migration issues in notebooks jobs. - - Also, stores direct filesystem accesses for display in the migration dashboard. - """ - ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) - - @job_task - def assess_dashboards(self, ctx: RuntimeContext): - """Scans all dashboards for migration issues in SQL code of embedded widgets. - - Also, stores direct filesystem accesses for display in the migration dashboard. - """ - ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index f9bd224595..72756a7945 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -32,26 +32,6 @@ from tests.unit.source_code.test_graph import _TestDependencyGraph -@retried(on=[NotFound], timeout=timedelta(minutes=5)) -def test_running_real_workflow_linter_job(installation_ctx, make_job) -> None: - # Deprecated file system path in call to: /mnt/things/e/f/g - job = make_job(content="spark.read.table('a_table').write.csv('/mnt/things/e/f/g')\n") - ctx = installation_ctx.replace(config_transform=lambda wc: replace(wc, include_job_ids=[job.job_id])) - ctx.workspace_installation.run() - ctx.deployed_workflows.run_workflow("workflow-linter") - ctx.deployed_workflows.validate_step("workflow-linter") - - # This test merely tests that the workflows produces records of the expected types; record content is not checked. 
-    cursor = ctx.sql_backend.fetch(f"SELECT COUNT(*) AS count FROM {ctx.inventory_database}.workflow_problems")
-    result = next(cursor)
-    if result['count'] == 0:
-        installation_ctx.deployed_workflows.relay_logs("workflow-linter")
-        assert False, "No workflow problems found"
-    dfsa_records = installation_ctx.directfs_access_crawler_for_paths.snapshot()
-    used_table_records = installation_ctx.used_tables_crawler_for_paths.snapshot()
-    assert dfsa_records and used_table_records
-
-
 @retried(on=[NotFound], timeout=timedelta(minutes=2))
 def test_linter_from_context(simple_ctx, make_job) -> None:
     # This code is similar to test_running_real_workflow_linter_job, but it's executed on the caller side and is easier

From 769bd2e0fb42c82392931667fff192f9f66b94d2 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 23 Oct 2024 13:30:56 +0200
Subject: [PATCH 3/3] Remove workflow linter from table persistence overview

---
 docs/table_persistence.md | 56 +++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/docs/table_persistence.md b/docs/table_persistence.md
index d01d743eb9..12cac2a2c6 100644
--- a/docs/table_persistence.md
+++ b/docs/table_persistence.md
@@ -4,34 +4,34 @@ List of all UCX objects and their respective metadata.
 
 ## Overview
 
-Table Utilization:
-
-| Table | Generate Assessment | Update Migration Progress | Migrate Groups | Migrate External Tables | Upgrade Jobs | Migrate tables | Migrate Data Reconciliation | Workflow linter |
-|--------------------------|---------------------|---------------------------|----------------|-------------------------|--------------|----------------|-----------------------------|-----------------|
-| tables | RW | RW | | RO | | RO | | |
-| grants | RW | RW | | RW | | RW | | |
-| mounts | RW | | | RO | RO | RO | | |
-| permissions | RW | | RW | RO | | RO | | |
-| jobs | RW | RW | | | RO | | | |
-| clusters | RW | RW | | | | | | |
-| directfs_in_paths | RW | RW | | | | | | RW |
-| directfs_in_queries | RW | RW | | | | | | RW |
-| external_locations | RW | | | RO | | | | |
-| workspace | RW | | RO | | RO | | | |
-| workspace_objects | RW | | | | | | | |
-| azure_service_principals | RW | | | | | | | |
-| global_init_scripts | RW | | | | | | | |
-| pipelines | RW | RW | | | | | | |
-| groups | RW | | RO | | | | | |
-| table_size | RW | | | | | | | |
-| submit_runs | RW | | | | | | | |
-| policies | RW | RW | | | | | | |
-| migration_status | | RW | | RW | | RW | | |
-| query_problems | RW | RW | | | | | | RW |
-| workflow_problems | RW | RW | | | | | | RW |
-| udfs | RW | RW | RO | | | | | |
-| logs | RW | | RW | RW | | RW | RW | |
-| recon_results | | | | | | | RW | |
+Table utilization per workflow:
+
+| Table | Generate Assessment | Update Migration Progress | Migrate Groups | Migrate External Tables | Upgrade Jobs | Migrate tables | Migrate Data Reconciliation |
+|--------------------------|---------------------|---------------------------|----------------|-------------------------|--------------|----------------|-----------------------------|
+| tables | RW | RW | | RO | | RO | |
+| grants | RW | RW | | RW | | RW | |
+| mounts | RW | | | RO | RO | RO | |
+| permissions | RW | | RW | RO | | RO | |
+| jobs | RW | RW | | | RO | | |
+| clusters | RW | RW | | | | | |
+| directfs_in_paths | RW | RW | | | | | |
+| directfs_in_queries | RW | RW | | | | | |
+| external_locations | RW | | | RO | | | |
+| workspace | RW | | RO | | RO | | |
+| workspace_objects | RW | | | | | | |
+| azure_service_principals | RW | | | | | | |
+| global_init_scripts | RW | | | | | | |
+| pipelines | RW | RW | | | | | |
+| groups | RW | | RO | | | | |
+| table_size | RW | | | | | | |
+| submit_runs | RW | | | | | | |
+| policies | RW | RW | | | | | |
+| migration_status | | RW | | RW | | RW | |
+| query_problems | RW | RW | | | | | |
+| workflow_problems | RW | RW | | | | | |
+| udfs | RW | RW | RO | | | | |
+| logs | RW | | RW | RW | | RW | RW |
+| recon_results | | | | | | | RW |
 
 **RW** - Read/Write, the job generates or updates the table.
 **RO** - Read Only