Commit a8d1fa2

Add a parameter to the assessment job for force_refresh of data
1 parent e7fb7ed commit a8d1fa2

3 files changed: +51 -24 lines changed

src/databricks/labs/ucx/assessment/workflows.py

Lines changed: 42 additions & 22 deletions
@@ -1,5 +1,7 @@
 import logging
 
+from databricks.sdk.service.jobs import JobParameterDefinition
+
 from databricks.labs.ucx.contexts.workflow_task import RuntimeContext
 from databricks.labs.ucx.framework.tasks import Workflow, job_task
 
@@ -8,25 +10,27 @@
 
 
 class Assessment(Workflow):  # pylint: disable=too-many-public-methods
-    def __init__(self, force_refresh: bool = False):
-        self._force_refresh = force_refresh
-        super().__init__('assessment')
+    def __init__(self):
+        self._force_refresh_param = JobParameterDefinition(name="force_refresh", default=False)
+        super().__init__('assessment', [self._force_refresh_param])
 
     @job_task
     def crawl_tables(self, ctx: RuntimeContext):
         """Iterates over all tables in the Hive Metastore of the current workspace and persists their metadata, such
         as _database name_, _table name_, _table type_, _table location_, etc., in the Delta table named
         `$inventory_database.tables`. Note that the `inventory_database` is set in the configuration file. The metadata
-        stored is then used in the subsequent tasks and workflows to, for example, find all Hive Metastore tables that
+        stored is then used in the subsequent tasks and workflows to, for example, find all Hive Metastore tables that
         cannot easily be migrated to Unity Catalog."""
-        ctx.tables_crawler.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.tables_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task
     def crawl_udfs(self, ctx: RuntimeContext):
         """Iterates over all UDFs in the Hive Metastore of the current workspace and persists their metadata in the
         table named `$inventory_database.udfs`. This inventory is currently used when scanning securable objects for
         issues with grants that cannot be migrated to Unit Catalog."""
-        ctx.udfs_crawler.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.udfs_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task(job_cluster="tacl")
     def setup_tacl(self, ctx: RuntimeContext):
@@ -41,15 +45,17 @@ def crawl_grants(self, ctx: RuntimeContext):
 
         Note: This job runs on a separate cluster (named `tacl`) as it requires the proper configuration to have the Table
         ACLs enabled and available for retrieval."""
-        ctx.grants_crawler.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.grants_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task(depends_on=[crawl_tables])
     def estimate_table_size_for_migration(self, ctx: RuntimeContext):
         """Scans the previously created Delta table named `$inventory_database.tables` and locate tables that cannot be
         "synced". These tables will have to be cloned in the migration process.
         Assesses the size of these tables and create `$inventory_database.table_size` table to list these sizes.
         The table size is a factor in deciding whether to clone these tables."""
-        ctx.table_size_crawler.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.table_size_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task
     def crawl_mounts(self, ctx: RuntimeContext):
@@ -59,7 +65,8 @@ def crawl_mounts(self, ctx: RuntimeContext):
 
         The assessment involves scanning the workspace to compile a list of all existing mount points and subsequently
         storing this information in the `$inventory.mounts` table. This is crucial for planning the migration."""
-        ctx.mounts_crawler.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.mounts_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task(depends_on=[crawl_mounts, crawl_tables])
     def guess_external_locations(self, ctx: RuntimeContext):
@@ -71,7 +78,8 @@ def guess_external_locations(self, ctx: RuntimeContext):
         - Extracting all the locations associated with tables that do not use DBFS directly, but a mount point instead
         - Scanning all these locations to identify folders that can act as shared path prefixes
         - These identified external locations will be created subsequently prior to the actual table migration"""
-        ctx.external_locations.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.external_locations.snapshot(force_refresh=force_refresh)
 
     @job_task
     def assess_jobs(self, ctx: RuntimeContext):
@@ -84,7 +92,8 @@ def assess_jobs(self, ctx: RuntimeContext):
         - Clusters with incompatible Spark config tags
         - Clusters referencing DBFS locations in one or more config options
         """
-        ctx.jobs_crawler.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.jobs_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task
     def assess_clusters(self, ctx: RuntimeContext):
@@ -97,7 +106,8 @@ def assess_clusters(self, ctx: RuntimeContext):
         - Clusters with incompatible spark config tags
         - Clusters referencing DBFS locations in one or more config options
         """
-        ctx.clusters_crawler.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.clusters_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task
     def assess_pipelines(self, ctx: RuntimeContext):
@@ -110,7 +120,8 @@ def assess_pipelines(self, ctx: RuntimeContext):
 
         Subsequently, a list of all the pipelines with matching configurations are stored in the
         `$inventory.pipelines` table."""
-        ctx.pipelines_crawler.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.pipelines_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task
     def assess_incompatible_submit_runs(self, ctx: RuntimeContext):
@@ -123,7 +134,8 @@ def assess_incompatible_submit_runs(self, ctx: RuntimeContext):
         It also combines several submit runs under a single pseudo_id based on hash of the submit run configuration.
         Subsequently, a list of all the incompatible runs with failures are stored in the
         `$inventory.submit_runs` table."""
-        ctx.submit_runs_crawler.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.submit_runs_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task
     def crawl_cluster_policies(self, ctx: RuntimeContext):
@@ -134,7 +146,8 @@ def crawl_cluster_policies(self, ctx: RuntimeContext):
 
         Subsequently, a list of all the policies with matching configurations are stored in the
         `$inventory.policies` table."""
-        ctx.policies_crawler.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.policies_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task(cloud="azure")
     def assess_azure_service_principals(self, ctx: RuntimeContext):
@@ -148,7 +161,8 @@ def assess_azure_service_principals(self, ctx: RuntimeContext):
         Subsequently, the list of all the Azure Service Principals referred in those configurations are saved
         in the `$inventory.azure_service_principals` table."""
         if ctx.is_azure:
-            ctx.azure_service_principal_crawler.snapshot(force_refresh=self._force_refresh)
+            force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+            ctx.azure_service_principal_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task
     def assess_global_init_scripts(self, ctx: RuntimeContext):
@@ -157,7 +171,8 @@ def assess_global_init_scripts(self, ctx: RuntimeContext):
 
         It looks in:
         - the list of all the global init scripts are saved in the `$inventory.global_init_scripts` table."""
-        ctx.global_init_scripts_crawler.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.global_init_scripts_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task
     def workspace_listing(self, ctx: RuntimeContext):
@@ -169,7 +184,8 @@ def workspace_listing(self, ctx: RuntimeContext):
         if not ctx.config.use_legacy_permission_migration:
             logger.info("Skipping workspace listing as legacy permission migration is disabled.")
             return
-        ctx.workspace_listing.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.workspace_listing.snapshot(force_refresh=force_refresh)
 
     @job_task(depends_on=[crawl_grants, workspace_listing])
     def crawl_permissions(self, ctx: RuntimeContext):
@@ -183,22 +199,26 @@ def crawl_permissions(self, ctx: RuntimeContext):
             return
         permission_manager = ctx.permission_manager
         permission_manager.reset()
-        permission_manager.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        permission_manager.snapshot(force_refresh=force_refresh)
 
     @job_task
     def crawl_groups(self, ctx: RuntimeContext):
         """Scans all groups for the local group migration scope"""
-        ctx.group_manager.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.group_manager.snapshot(force_refresh=force_refresh)
 
     @job_task
     def crawl_redash_dashboards(self, ctx: RuntimeContext):
         """Scans all Redash dashboards."""
-        ctx.redash_crawler.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.redash_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task
     def crawl_lakeview_dashboards(self, ctx: RuntimeContext):
         """Scans all Lakeview dashboards."""
-        ctx.lakeview_crawler.snapshot(force_refresh=self._force_refresh)
+        force_refresh = ctx.named_parameters.get("force_refresh", "False").lower() in ["true", "1"]
+        ctx.lakeview_crawler.snapshot(force_refresh=force_refresh)
 
     @job_task(depends_on=[crawl_redash_dashboards, crawl_lakeview_dashboards])
     def assess_dashboards(self, ctx: RuntimeContext):
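
Every task above repeats the same idiom because job parameter values reach the task as strings, not booleans. A minimal standalone sketch of that parsing, using a hypothetical parse_force_refresh helper that is not part of this commit:

    def parse_force_refresh(named_parameters: dict[str, str]) -> bool:
        # Job parameter values always arrive as strings; only "true"/"1"
        # (case-insensitive) count as truthy, anything else is falsy.
        return named_parameters.get("force_refresh", "False").lower() in ["true", "1"]

    assert parse_force_refresh({"force_refresh": "True"}) is True
    assert parse_force_refresh({"force_refresh": "1"}) is True
    assert parse_force_refresh({"force_refresh": "yes"}) is False  # not an accepted spelling
    assert parse_force_refresh({}) is False  # absent parameter falls back to "False"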

src/databricks/labs/ucx/framework/tasks.py

Lines changed: 8 additions & 2 deletions
@@ -6,7 +6,7 @@
 from databricks.labs.lsql.backends import SqlBackend
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.core import Config
-from databricks.sdk.service.jobs import CronSchedule
+from databricks.sdk.service.jobs import CronSchedule, JobParameterDefinition
 
 from databricks.labs.ucx.config import WorkspaceConfig
 
@@ -65,8 +65,9 @@ def parse_args(*argv) -> dict[str, str]:
 
 
 class Workflow:
-    def __init__(self, name: str):
+    def __init__(self, name: str, named_parameters: list[JobParameterDefinition] | None = None):
         self._name = name
+        self._named_parameters = named_parameters
 
     @property
     def name(self):
@@ -77,6 +78,11 @@ def schedule(self) -> CronSchedule | None:
         """The default (cron) schedule for this workflow, or None if it is not scheduled."""
         return None
 
+    @property
+    def parameters(self) -> list[JobParameterDefinition] | None:
+        """Named parameters for this workflow, or None if there are no parameters."""
+        return self._named_parameters
+
     def tasks(self) -> Iterable[Task]:
         # return __task__ from every method in this class that has this attribute
         for attr in dir(self):
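
The base-class change makes named parameters available to any Workflow subclass, not just Assessment. A minimal sketch under that assumption; MyWorkflow and its dry_run parameter are hypothetical, while Workflow and JobParameterDefinition are the real classes this commit touches:

    from databricks.sdk.service.jobs import JobParameterDefinition

    from databricks.labs.ucx.framework.tasks import Workflow


    class MyWorkflow(Workflow):  # hypothetical subclass for illustration
        def __init__(self):
            # Declared parameters end up in the job definition; at runtime,
            # tasks read them back as strings from ctx.named_parameters.
            dry_run = JobParameterDefinition(name="dry_run", default=False)
            super().__init__('my-workflow', [dry_run])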

src/databricks/labs/ucx/installer/workflows.py

Lines changed: 1 addition & 0 deletions
@@ -855,6 +855,7 @@ def _job_settings(self, workflow_name: str, remote_wheels: list[str]) -> dict[st
             "email_notifications": email_notifications,
             "schedule": workflow.schedule,
             "tasks": job_tasks,
+            "parameters": workflow.parameters,
         }
 
     def _job_task(self, task: Task, remote_wheels: list[str]) -> jobs.Task:
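
With workflow.parameters wired into the job settings, the declared default can be overridden per run. A sketch using the Databricks Python SDK's run_now with job_parameters; the job id is a placeholder, not something this commit provides:

    from databricks.sdk import WorkspaceClient

    w = WorkspaceClient()
    job_id = 1234  # placeholder: id of the installed assessment job
    # job_parameters values are strings; the task-side parsing above
    # treats "true" or "1" (case-insensitive) as truthy.
    w.jobs.run_now(job_id=job_id, job_parameters={"force_refresh": "true"})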
