
Commit f547968

Full refresh all crawlers for each assessment run
1 parent 14c8223 commit f547968

1 file changed: +18 −18 lines


src/databricks/labs/ucx/assessment/workflows.py

@@ -18,14 +18,14 @@ def crawl_tables(self, ctx: RuntimeContext):
         `$inventory_database.tables`. Note that the `inventory_database` is set in the configuration file. The metadata
         stored is then used in the subsequent tasks and workflows to, for example, find all Hive Metastore tables that
         cannot easily be migrated to Unity Catalog."""
-        ctx.tables_crawler.snapshot()
+        ctx.tables_crawler.snapshot(force_refresh=True)
 
     @job_task
     def crawl_udfs(self, ctx: RuntimeContext):
         """Iterates over all UDFs in the Hive Metastore of the current workspace and persists their metadata in the
         table named `$inventory_database.udfs`. This inventory is currently used when scanning securable objects for
         issues with grants that cannot be migrated to Unit Catalog."""
-        ctx.udfs_crawler.snapshot()
+        ctx.udfs_crawler.snapshot(force_refresh=True)
 
     @job_task(job_cluster="tacl")
     def setup_tacl(self, ctx: RuntimeContext):
@@ -40,15 +40,15 @@ def crawl_grants(self, ctx: RuntimeContext):
 
         Note: This job runs on a separate cluster (named `tacl`) as it requires the proper configuration to have the Table
         ACLs enabled and available for retrieval."""
-        ctx.grants_crawler.snapshot()
+        ctx.grants_crawler.snapshot(force_refresh=True)
 
     @job_task(depends_on=[crawl_tables])
     def estimate_table_size_for_migration(self, ctx: RuntimeContext):
         """Scans the previously created Delta table named `$inventory_database.tables` and locate tables that cannot be
         "synced". These tables will have to be cloned in the migration process.
         Assesses the size of these tables and create `$inventory_database.table_size` table to list these sizes.
         The table size is a factor in deciding whether to clone these tables."""
-        ctx.table_size_crawler.snapshot()
+        ctx.table_size_crawler.snapshot(force_refresh=True)
 
     @job_task
     def crawl_mounts(self, ctx: RuntimeContext):
@@ -58,7 +58,7 @@ def crawl_mounts(self, ctx: RuntimeContext):
 
         The assessment involves scanning the workspace to compile a list of all existing mount points and subsequently
         storing this information in the `$inventory.mounts` table. This is crucial for planning the migration."""
-        ctx.mounts_crawler.snapshot()
+        ctx.mounts_crawler.snapshot(force_refresh=True)
 
     @job_task(depends_on=[crawl_mounts, crawl_tables])
     def guess_external_locations(self, ctx: RuntimeContext):
@@ -70,7 +70,7 @@ def guess_external_locations(self, ctx: RuntimeContext):
         - Extracting all the locations associated with tables that do not use DBFS directly, but a mount point instead
         - Scanning all these locations to identify folders that can act as shared path prefixes
         - These identified external locations will be created subsequently prior to the actual table migration"""
-        ctx.external_locations.snapshot()
+        ctx.external_locations.snapshot(force_refresh=True)
 
     @job_task
     def assess_jobs(self, ctx: RuntimeContext):
@@ -83,7 +83,7 @@ def assess_jobs(self, ctx: RuntimeContext):
         - Clusters with incompatible Spark config tags
         - Clusters referencing DBFS locations in one or more config options
         """
-        ctx.jobs_crawler.snapshot()
+        ctx.jobs_crawler.snapshot(force_refresh=True)
 
     @job_task
     def assess_clusters(self, ctx: RuntimeContext):
@@ -96,7 +96,7 @@ def assess_clusters(self, ctx: RuntimeContext):
         - Clusters with incompatible spark config tags
         - Clusters referencing DBFS locations in one or more config options
         """
-        ctx.clusters_crawler.snapshot()
+        ctx.clusters_crawler.snapshot(force_refresh=True)
 
     @job_task
     def assess_pipelines(self, ctx: RuntimeContext):
@@ -109,7 +109,7 @@ def assess_pipelines(self, ctx: RuntimeContext):
 
         Subsequently, a list of all the pipelines with matching configurations are stored in the
         `$inventory.pipelines` table."""
-        ctx.pipelines_crawler.snapshot()
+        ctx.pipelines_crawler.snapshot(force_refresh=True)
 
     @job_task
     def assess_incompatible_submit_runs(self, ctx: RuntimeContext):
@@ -122,7 +122,7 @@ def assess_incompatible_submit_runs(self, ctx: RuntimeContext):
         It also combines several submit runs under a single pseudo_id based on hash of the submit run configuration.
         Subsequently, a list of all the incompatible runs with failures are stored in the
         `$inventory.submit_runs` table."""
-        ctx.submit_runs_crawler.snapshot()
+        ctx.submit_runs_crawler.snapshot(force_refresh=True)
 
     @job_task
     def crawl_cluster_policies(self, ctx: RuntimeContext):
@@ -133,7 +133,7 @@ def crawl_cluster_policies(self, ctx: RuntimeContext):
 
         Subsequently, a list of all the policies with matching configurations are stored in the
         `$inventory.policies` table."""
-        ctx.policies_crawler.snapshot()
+        ctx.policies_crawler.snapshot(force_refresh=True)
 
     @job_task(cloud="azure")
     def assess_azure_service_principals(self, ctx: RuntimeContext):
@@ -147,7 +147,7 @@ def assess_azure_service_principals(self, ctx: RuntimeContext):
         Subsequently, the list of all the Azure Service Principals referred in those configurations are saved
         in the `$inventory.azure_service_principals` table."""
         if ctx.is_azure:
-            ctx.azure_service_principal_crawler.snapshot()
+            ctx.azure_service_principal_crawler.snapshot(force_refresh=True)
 
     @job_task
     def assess_global_init_scripts(self, ctx: RuntimeContext):
@@ -156,7 +156,7 @@ def assess_global_init_scripts(self, ctx: RuntimeContext):
 
         It looks in:
         - the list of all the global init scripts are saved in the `$inventory.global_init_scripts` table."""
-        ctx.global_init_scripts_crawler.snapshot()
+        ctx.global_init_scripts_crawler.snapshot(force_refresh=True)
 
     @job_task
     def workspace_listing(self, ctx: RuntimeContext):
@@ -168,7 +168,7 @@ def workspace_listing(self, ctx: RuntimeContext):
         if not ctx.config.use_legacy_permission_migration:
             logger.info("Skipping workspace listing as legacy permission migration is disabled.")
             return
-        ctx.workspace_listing.snapshot()
+        ctx.workspace_listing.snapshot(force_refresh=True)
 
     @job_task(depends_on=[crawl_grants, workspace_listing])
     def crawl_permissions(self, ctx: RuntimeContext):
@@ -182,22 +182,22 @@ def crawl_permissions(self, ctx: RuntimeContext):
             return
         permission_manager = ctx.permission_manager
         permission_manager.reset()
-        permission_manager.snapshot()
+        permission_manager.snapshot(force_refresh=True)
 
     @job_task
     def crawl_groups(self, ctx: RuntimeContext):
         """Scans all groups for the local group migration scope"""
-        ctx.group_manager.snapshot()
+        ctx.group_manager.snapshot(force_refresh=True)
 
     @job_task
     def crawl_redash_dashboards(self, ctx: RuntimeContext):
         """Scans all Redash dashboards."""
-        ctx.redash_crawler.snapshot()
+        ctx.redash_crawler.snapshot(force_refresh=True)
 
     @job_task
     def crawl_lakeview_dashboards(self, ctx: RuntimeContext):
         """Scans all Lakeview dashboards."""
-        ctx.lakeview_crawler.snapshot()
+        ctx.lakeview_crawler.snapshot(force_refresh=True)
 
     @job_task(depends_on=[crawl_redash_dashboards, crawl_lakeview_dashboards])
     def assess_dashboards(self, ctx: RuntimeContext):
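
For context on why every call site changes the same way: UCX crawlers persist their results in the `$inventory_database` tables, and a plain `snapshot()` call reuses whatever a previous run persisted, so repeated assessment runs could report stale inventory. Passing `force_refresh=True` makes each run re-crawl the live workspace. Below is a minimal sketch of that cache-or-recrawl pattern, assuming a CrawlerBase-style design; the class name `CachedCrawler`, the in-memory cache, and the `_try_fetch`/`_crawl` helpers are illustrative stand-ins, not the actual UCX implementation.

    from collections.abc import Iterable, Sequence


    class CachedCrawler:
        """Illustrative crawler: results are cached and only re-computed
        when the cache is empty or a refresh is forced."""

        def __init__(self) -> None:
            self._cache: tuple[str, ...] = ()  # stands in for the persisted $inventory table

        def _try_fetch(self) -> Sequence[str]:
            # In UCX this would read rows back from the inventory database.
            return self._cache

        def _crawl(self) -> Iterable[str]:
            # In UCX this would scan the live workspace (tables, jobs, clusters, ...).
            return ["object-a", "object-b"]

        def snapshot(self, *, force_refresh: bool = False) -> Sequence[str]:
            if not force_refresh:
                cached = self._try_fetch()
                if cached:
                    return cached  # reuse the previous run's inventory
            # Refresh forced, or nothing cached yet: re-crawl and persist.
            self._cache = tuple(self._crawl())
            return self._cache


    crawler = CachedCrawler()
    crawler.snapshot()                    # first run: crawls and caches
    crawler.snapshot(force_refresh=True)  # what this commit does on every assessment run

Under that assumption, the 18 one-line changes above all have the same effect: each assessment task rebuilds its slice of the inventory from the live workspace instead of returning the previous run's snapshot.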
