Skip to content

Commit 1629eff

Browse files
authored
Assess source code as part of the assessment (#2678)
## Changes

Adds 2 tasks to the assessment workflow:
- assess source code from dashboards -> widgets -> SQL queries
- assess source code from workflows -> jobs -> tasks -> notebooks/files

### Linked issues

Progresses #2595

### Functionality

- [x] modified existing command: `databricks labs install ucx` by linting source code on top of the existing assessment

### Tests

- [x] ran integration tests

---------

Co-authored-by: Eric Vergnaud <eric.vergnaud@databricks.com>
1 parent 8ef0da0 commit 1629eff

File tree

8 files changed

+59
-13
lines changed

8 files changed

+59
-13
lines changed

src/databricks/labs/ucx/assessment/workflows.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,18 @@ def crawl_groups(self, ctx: RuntimeContext):
181181
"""Scans all groups for the local group migration scope"""
182182
ctx.group_manager.snapshot()
183183

184+
@job_task
185+
def assess_dashboards(self, ctx: RuntimeContext):
186+
"""Scans all dashboards for migration issues in SQL code of embedded widgets.
187+
Also stores direct filesystem accesses for display in the migration dashboard."""
188+
ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database)
189+
190+
@job_task
191+
def assess_workflows(self, ctx: RuntimeContext):
192+
"""Scans all jobs for migration issues in notebooks.
193+
Also stores direct filesystem accesses for display in the migration dashboard."""
194+
ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database)
195+
184196

185197
class Failing(Workflow):
186198
def __init__(self):

src/databricks/labs/ucx/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ class WorkspaceConfig: # pylint: disable=too-many-instance-attributes
7070
# [INTERNAL ONLY] Whether the assessment should capture only specific object permissions.
7171
include_object_permissions: list[str] | None = None
7272

73+
# [INTERNAL ONLY] Whether the assessment should lint only specific dashboards.
74+
include_dashboard_ids: list[str] | None = None
75+
7376
def replace_inventory_variable(self, text: str) -> str:
7477
return text.replace("$inventory", f"hive_metastore.{self.inventory_database}")
7578

src/databricks/labs/ucx/contexts/application.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,7 @@ def query_linter(self):
435435
self.workspace_client,
436436
TableMigrationIndex([]), # TODO: bring back self.tables_migrator.index()
437437
self.directfs_access_crawler_for_queries,
438+
self.config.include_dashboard_ids,
438439
)
439440

440441
@cached_property

src/databricks/labs/ucx/source_code/queries.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,12 @@ def __init__(
4040
ws: WorkspaceClient,
4141
migration_index: TableMigrationIndex,
4242
directfs_crawler: DirectFsAccessCrawler,
43+
include_dashboard_ids: list[str] | None,
4344
):
4445
self._ws = ws
4546
self._migration_index = migration_index
4647
self._directfs_crawler = directfs_crawler
48+
self._include_dashboard_ids = include_dashboard_ids
4749

4850
def refresh_report(self, sql_backend: SqlBackend, inventory_database: str):
4951
assessment_start = datetime.now(timezone.utc)
@@ -53,16 +55,12 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str):
5355
all_problems: list[QueryProblem] = []
5456
all_dfsas: list[DirectFsAccess] = []
5557
# first lint and collect queries from dashboards
56-
for dashboard in all_dashboards:
57-
if not dashboard.id:
58-
continue
59-
dashboard = self._ws.dashboards.get(dashboard_id=dashboard.id)
58+
for dashboard_id in self._dashboard_ids_in_scope():
59+
dashboard = self._ws.dashboards.get(dashboard_id=dashboard_id)
6060
problems, dfsas = self._lint_and_collect_from_dashboard(dashboard, linted_queries)
6161
all_problems.extend(problems)
6262
all_dfsas.extend(dfsas)
63-
for query in self._ws.queries_legacy.list():
64-
if query.id is None:
65-
continue
63+
for query in self._queries_in_scope():
6664
if query.id in linted_queries:
6765
continue
6866
linted_queries.add(query.id)
@@ -88,6 +86,18 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str):
8886
]
8987
self._directfs_crawler.dump_all(all_dfsas)
9088

89+
def _dashboard_ids_in_scope(self) -> list[str]:
90+
if self._include_dashboard_ids is not None: # an empty list is accepted
91+
return self._include_dashboard_ids
92+
all_dashboards = self._ws.dashboards.list()
93+
return [dashboard.id for dashboard in all_dashboards if dashboard.id]
94+
95+
def _queries_in_scope(self):
96+
if self._include_dashboard_ids is not None: # an empty list is accepted
97+
return []
98+
all_queries = self._ws.queries_legacy.list()
99+
return [query for query in all_queries if query.id]
100+
91101
def _lint_and_collect_from_dashboard(
92102
self, dashboard: Dashboard, linted_queries: set[str]
93103
) -> tuple[Iterable[QueryProblem], Iterable[DirectFsAccess]]:

tests/integration/assessment/test_ext_hms.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import dataclasses
22
import datetime as dt
3+
import io
34

45
from databricks.labs.lsql.backends import CommandExecutionBackend
56
from databricks.sdk.service.iam import PermissionLevel
@@ -11,6 +12,9 @@ def test_running_real_assessment_job_ext_hms(
1112
env_or_skip,
1213
make_cluster_policy,
1314
make_cluster_policy_permissions,
15+
make_notebook,
16+
make_job,
17+
make_dashboard,
1418
):
1519
cluster_id = env_or_skip('TEST_EXT_HMS_CLUSTER_ID')
1620
ext_hms_ctx = installation_ctx.replace(
@@ -39,6 +43,12 @@ def test_running_real_assessment_job_ext_hms(
3943

4044
# Under ideal circumstances this can take 10-16 minutes (depending on whether there are compute instances available
4145
# via the integration pool). Allow some margin to reduce spurious failures.
46+
notebook_path = make_notebook(content=io.BytesIO(b"import xyz"))
47+
job = make_job(notebook_path=notebook_path)
48+
installation_ctx.config.include_job_ids = [job.job_id]
49+
50+
dashboard = make_dashboard()
51+
installation_ctx.config.include_dashboard_ids = [dashboard.id]
4252
ext_hms_ctx.deployed_workflows.run_workflow("assessment", max_wait=dt.timedelta(minutes=25))
4353

4454
# assert the workflow is successful. the tasks on sql warehouse will fail so skip checking them
Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
from datetime import timedelta
23

34
from databricks.sdk.errors import NotFound, InvalidParameterValue
@@ -6,19 +7,28 @@
67

78

89
@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8))
9-
def test_running_real_assessment_job(ws, installation_ctx, make_cluster_policy, make_cluster_policy_permissions):
10-
ws_group_a, _ = installation_ctx.make_ucx_group()
10+
def test_running_real_assessment_job(
11+
ws, installation_ctx, make_cluster_policy, make_cluster_policy_permissions, make_job, make_notebook, make_dashboard
12+
):
13+
ws_group, _ = installation_ctx.make_ucx_group()
1114
cluster_policy = make_cluster_policy()
1215
make_cluster_policy_permissions(
1316
object_id=cluster_policy.policy_id,
1417
permission_level=PermissionLevel.CAN_USE,
15-
group_name=ws_group_a.display_name,
18+
group_name=ws_group.display_name,
1619
)
1720
installation_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"]
1821
installation_ctx.workspace_installation.run()
1922

23+
notebook_path = make_notebook(content=io.BytesIO(b"import xyz"))
24+
job = make_job(notebook_path=notebook_path)
25+
installation_ctx.config.include_job_ids = [job.job_id]
26+
27+
dashboard = make_dashboard()
28+
installation_ctx.config.include_dashboard_ids = [dashboard.id]
29+
2030
installation_ctx.deployed_workflows.run_workflow("assessment")
2131
assert installation_ctx.deployed_workflows.validate_step("assessment")
2232

2333
after = installation_ctx.generic_permissions_support.load_as_dict("cluster-policies", cluster_policy.policy_id)
24-
assert after[ws_group_a.display_name] == PermissionLevel.CAN_USE
34+
assert after[ws_group.display_name] == PermissionLevel.CAN_USE

tests/integration/source_code/test_queries.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
def test_query_linter_lints_queries_and_stores_dfsas(simple_ctx, ws, sql_backend, make_query, make_dashboard):
77
query = make_query(sql_query="SELECT * from csv.`dbfs://some_folder/some_file.csv`")
88
_dashboard = make_dashboard(query=query)
9-
linter = QueryLinter(ws, TableMigrationIndex([]), simple_ctx.directfs_access_crawler_for_queries)
9+
linter = QueryLinter(ws, TableMigrationIndex([]), simple_ctx.directfs_access_crawler_for_queries, None)
1010
linter.refresh_report(sql_backend, simple_ctx.inventory_database)
1111
all_problems = sql_backend.fetch("SELECT * FROM query_problems", schema=simple_ctx.inventory_database)
1212
problems = [row for row in all_problems if row["query_name"] == query.name]

tests/unit/source_code/test_queries.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def test_query_linter_collects_dfsas_from_queries(name, query, dfsa_paths, is_re
2626
ws = create_autospec(WorkspaceClient)
2727
crawlers = create_autospec(DirectFsAccessCrawler)
2828
query = LegacyQuery.from_dict({"parent": "workspace", "name": name, "query": query})
29-
linter = QueryLinter(ws, migration_index, crawlers)
29+
linter = QueryLinter(ws, migration_index, crawlers, None)
3030
dfsas = linter.collect_dfsas_from_query(query)
3131
ws.assert_not_called()
3232
crawlers.assert_not_called()

0 commit comments

Comments (0)