Skip to content

Commit e7fb7ed

Browse files
committed
Make force_refresh of assessment parameter driven
1 parent 20be701 commit e7fb7ed

File tree

1 file changed

+20
-19
lines changed

1 file changed

+20
-19
lines changed

src/databricks/labs/ucx/assessment/workflows.py

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88

99

1010
class Assessment(Workflow): # pylint: disable=too-many-public-methods
11-
def __init__(self):
11+
def __init__(self, force_refresh: bool = False):
12+
self._force_refresh = force_refresh
1213
super().__init__('assessment')
1314

1415
@job_task
@@ -18,14 +19,14 @@ def crawl_tables(self, ctx: RuntimeContext):
1819
`$inventory_database.tables`. Note that the `inventory_database` is set in the configuration file. The metadata
1920
stored is then used in the subsequent tasks and workflows to, for example, find all Hive Metastore tables that
2021
cannot easily be migrated to Unity Catalog."""
21-
ctx.tables_crawler.snapshot(force_refresh=True)
22+
ctx.tables_crawler.snapshot(force_refresh=self._force_refresh)
2223

2324
@job_task
2425
def crawl_udfs(self, ctx: RuntimeContext):
2526
"""Iterates over all UDFs in the Hive Metastore of the current workspace and persists their metadata in the
2627
table named `$inventory_database.udfs`. This inventory is currently used when scanning securable objects for
2728
issues with grants that cannot be migrated to Unity Catalog."""
28-
ctx.udfs_crawler.snapshot(force_refresh=True)
29+
ctx.udfs_crawler.snapshot(force_refresh=self._force_refresh)
2930

3031
@job_task(job_cluster="tacl")
3132
def setup_tacl(self, ctx: RuntimeContext):
@@ -40,15 +41,15 @@ def crawl_grants(self, ctx: RuntimeContext):
4041
4142
Note: This job runs on a separate cluster (named `tacl`) as it requires the proper configuration to have the Table
4243
ACLs enabled and available for retrieval."""
43-
ctx.grants_crawler.snapshot(force_refresh=True)
44+
ctx.grants_crawler.snapshot(force_refresh=self._force_refresh)
4445

4546
@job_task(depends_on=[crawl_tables])
4647
def estimate_table_size_for_migration(self, ctx: RuntimeContext):
4748
"""Scans the previously created Delta table named `$inventory_database.tables` and locate tables that cannot be
4849
"synced". These tables will have to be cloned in the migration process.
4950
Assesses the size of these tables and create `$inventory_database.table_size` table to list these sizes.
5051
The table size is a factor in deciding whether to clone these tables."""
51-
ctx.table_size_crawler.snapshot(force_refresh=True)
52+
ctx.table_size_crawler.snapshot(force_refresh=self._force_refresh)
5253

5354
@job_task
5455
def crawl_mounts(self, ctx: RuntimeContext):
@@ -58,7 +59,7 @@ def crawl_mounts(self, ctx: RuntimeContext):
5859
5960
The assessment involves scanning the workspace to compile a list of all existing mount points and subsequently
6061
storing this information in the `$inventory.mounts` table. This is crucial for planning the migration."""
61-
ctx.mounts_crawler.snapshot(force_refresh=True)
62+
ctx.mounts_crawler.snapshot(force_refresh=self._force_refresh)
6263

6364
@job_task(depends_on=[crawl_mounts, crawl_tables])
6465
def guess_external_locations(self, ctx: RuntimeContext):
@@ -70,7 +71,7 @@ def guess_external_locations(self, ctx: RuntimeContext):
7071
- Extracting all the locations associated with tables that do not use DBFS directly, but a mount point instead
7172
- Scanning all these locations to identify folders that can act as shared path prefixes
7273
- These identified external locations will be created subsequently prior to the actual table migration"""
73-
ctx.external_locations.snapshot(force_refresh=True)
74+
ctx.external_locations.snapshot(force_refresh=self._force_refresh)
7475

7576
@job_task
7677
def assess_jobs(self, ctx: RuntimeContext):
@@ -83,7 +84,7 @@ def assess_jobs(self, ctx: RuntimeContext):
8384
- Clusters with incompatible Spark config tags
8485
- Clusters referencing DBFS locations in one or more config options
8586
"""
86-
ctx.jobs_crawler.snapshot(force_refresh=True)
87+
ctx.jobs_crawler.snapshot(force_refresh=self._force_refresh)
8788

8889
@job_task
8990
def assess_clusters(self, ctx: RuntimeContext):
@@ -96,7 +97,7 @@ def assess_clusters(self, ctx: RuntimeContext):
9697
- Clusters with incompatible spark config tags
9798
- Clusters referencing DBFS locations in one or more config options
9899
"""
99-
ctx.clusters_crawler.snapshot(force_refresh=True)
100+
ctx.clusters_crawler.snapshot(force_refresh=self._force_refresh)
100101

101102
@job_task
102103
def assess_pipelines(self, ctx: RuntimeContext):
@@ -109,7 +110,7 @@ def assess_pipelines(self, ctx: RuntimeContext):
109110
110111
Subsequently, a list of all the pipelines with matching configurations are stored in the
111112
`$inventory.pipelines` table."""
112-
ctx.pipelines_crawler.snapshot(force_refresh=True)
113+
ctx.pipelines_crawler.snapshot(force_refresh=self._force_refresh)
113114

114115
@job_task
115116
def assess_incompatible_submit_runs(self, ctx: RuntimeContext):
@@ -122,7 +123,7 @@ def assess_incompatible_submit_runs(self, ctx: RuntimeContext):
122123
It also combines several submit runs under a single pseudo_id based on hash of the submit run configuration.
123124
Subsequently, a list of all the incompatible runs with failures are stored in the
124125
`$inventory.submit_runs` table."""
125-
ctx.submit_runs_crawler.snapshot(force_refresh=True)
126+
ctx.submit_runs_crawler.snapshot(force_refresh=self._force_refresh)
126127

127128
@job_task
128129
def crawl_cluster_policies(self, ctx: RuntimeContext):
@@ -133,7 +134,7 @@ def crawl_cluster_policies(self, ctx: RuntimeContext):
133134
134135
Subsequently, a list of all the policies with matching configurations are stored in the
135136
`$inventory.policies` table."""
136-
ctx.policies_crawler.snapshot(force_refresh=True)
137+
ctx.policies_crawler.snapshot(force_refresh=self._force_refresh)
137138

138139
@job_task(cloud="azure")
139140
def assess_azure_service_principals(self, ctx: RuntimeContext):
@@ -147,7 +148,7 @@ def assess_azure_service_principals(self, ctx: RuntimeContext):
147148
Subsequently, the list of all the Azure Service Principals referred in those configurations are saved
148149
in the `$inventory.azure_service_principals` table."""
149150
if ctx.is_azure:
150-
ctx.azure_service_principal_crawler.snapshot(force_refresh=True)
151+
ctx.azure_service_principal_crawler.snapshot(force_refresh=self._force_refresh)
151152

152153
@job_task
153154
def assess_global_init_scripts(self, ctx: RuntimeContext):
@@ -156,7 +157,7 @@ def assess_global_init_scripts(self, ctx: RuntimeContext):
156157
157158
It looks in:
158159
- the list of all the global init scripts is saved in the `$inventory.global_init_scripts` table."""
159-
ctx.global_init_scripts_crawler.snapshot(force_refresh=True)
160+
ctx.global_init_scripts_crawler.snapshot(force_refresh=self._force_refresh)
160161

161162
@job_task
162163
def workspace_listing(self, ctx: RuntimeContext):
@@ -168,7 +169,7 @@ def workspace_listing(self, ctx: RuntimeContext):
168169
if not ctx.config.use_legacy_permission_migration:
169170
logger.info("Skipping workspace listing as legacy permission migration is disabled.")
170171
return
171-
ctx.workspace_listing.snapshot(force_refresh=True)
172+
ctx.workspace_listing.snapshot(force_refresh=self._force_refresh)
172173

173174
@job_task(depends_on=[crawl_grants, workspace_listing])
174175
def crawl_permissions(self, ctx: RuntimeContext):
@@ -182,22 +183,22 @@ def crawl_permissions(self, ctx: RuntimeContext):
182183
return
183184
permission_manager = ctx.permission_manager
184185
permission_manager.reset()
185-
permission_manager.snapshot(force_refresh=True)
186+
permission_manager.snapshot(force_refresh=self._force_refresh)
186187

187188
@job_task
188189
def crawl_groups(self, ctx: RuntimeContext):
189190
"""Scans all groups for the local group migration scope"""
190-
ctx.group_manager.snapshot(force_refresh=True)
191+
ctx.group_manager.snapshot(force_refresh=self._force_refresh)
191192

192193
@job_task
193194
def crawl_redash_dashboards(self, ctx: RuntimeContext):
194195
"""Scans all Redash dashboards."""
195-
ctx.redash_crawler.snapshot(force_refresh=True)
196+
ctx.redash_crawler.snapshot(force_refresh=self._force_refresh)
196197

197198
@job_task
198199
def crawl_lakeview_dashboards(self, ctx: RuntimeContext):
199200
"""Scans all Lakeview dashboards."""
200-
ctx.lakeview_crawler.snapshot(force_refresh=True)
201+
ctx.lakeview_crawler.snapshot(force_refresh=self._force_refresh)
201202

202203
@job_task(depends_on=[crawl_redash_dashboards, crawl_lakeview_dashboards])
203204
def assess_dashboards(self, ctx: RuntimeContext):

0 commit comments

Comments
 (0)