
Commit 05593cd

Let WorkflowLinter.refresh_report lint jobs from JobsCrawler (#3732)
## Changes

Let `WorkflowLinter.refresh_report` lint jobs from `JobsCrawler` so that we only lint what is within scope.

### Linked issues

Resolves #3662
Progresses #3722

### Functionality

- [x] modified workflow linting code
- [x] modified existing workflow: `assessment`

### Tests

- [x] modified unit tests
- [x] modified integration tests
1 parent 05c2d6a commit 05593cd
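
In short, the linter's constructor loses its two scoping knobs and gains the crawler. A summary sketch of the signature change, assembled from the diffs below (a reading aid, not code from the commit):

# Before (per the removed lines in src/databricks/labs/ucx/source_code/linters/jobs.py):
WorkflowLinter(ws, resolver, path_lookup, migration_index,
               directfs_crawler, used_tables_crawler,
               include_job_ids=None, debug_listing_upper_limit=None)

# After: job scoping is delegated to the crawler's inventory.
WorkflowLinter(ws, jobs_crawler, resolver, path_lookup, migration_index,
               directfs_crawler, used_tables_crawler)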

File tree: 6 files changed (+41 −40 lines)

src/databricks/labs/ucx/assessment/workflows.py

Lines changed: 1 addition & 1 deletion

@@ -207,7 +207,7 @@ def assess_dashboards(self, ctx: RuntimeContext):
         """
         ctx.query_linter.refresh_report()
 
-    @job_task
+    @job_task(depends_on=[assess_jobs])
     def assess_workflows(self, ctx: RuntimeContext):
         """Scans all jobs for migration issues in notebooks jobs.
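
Why `assess_workflows` now declares `depends_on=[assess_jobs]`: `refresh_report` reads jobs from the `JobsCrawler` inventory, which only exists once `assess_jobs` has crawled. A minimal sketch of that ordering contract; the body of `assess_jobs` and the `ctx` attributes are assumptions for illustration, not part of this diff:

@job_task
def assess_jobs(self, ctx: RuntimeContext):
    ctx.jobs_crawler.snapshot()  # assumed: populates the jobs inventory

@job_task(depends_on=[assess_jobs])  # per this diff: lint only after the crawl
def assess_workflows(self, ctx: RuntimeContext):
    ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database)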

src/databricks/labs/ucx/contexts/application.py

Lines changed: 1 addition & 2 deletions

@@ -589,13 +589,12 @@ def dependency_resolver(self) -> DependencyResolver:
     def workflow_linter(self) -> WorkflowLinter:
         return WorkflowLinter(
             self.workspace_client,
+            self.jobs_crawler,
             self.dependency_resolver,
             self.path_lookup,
             TableMigrationIndex([]),  # TODO: bring back self.tables_migrator.index()
             self.directfs_access_crawler_for_paths,
             self.used_tables_crawler_for_paths,
-            self.config.include_job_ids,
-            self.config.debug_listing_upper_limit,
         )
 
     @cached_property
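
With `include_job_ids` and `debug_listing_upper_limit` dropped from this call site, whichever property builds `self.jobs_crawler` now owns job scoping. A hedged sketch of the wiring this relies on; the real `jobs_crawler` property lives elsewhere in application.py and its constructor arguments here are assumptions:

@cached_property
def jobs_crawler(self) -> JobsCrawler:
    # Assumed shape: a crawler over the workspace's jobs that persists to the
    # inventory schema; any include_job_ids-style scoping would live here now.
    return JobsCrawler(self.workspace_client, self.sql_backend, self.inventory_database)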

src/databricks/labs/ucx/source_code/linters/jobs.py

Lines changed: 4 additions & 15 deletions

@@ -12,6 +12,7 @@
 from databricks.sdk.errors import NotFound
 from databricks.sdk.service import jobs
 
+from databricks.labs.ucx.assessment.jobs import JobsCrawler
 from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex
 from databricks.labs.ucx.source_code.base import (
     DirectFsAccess,
@@ -40,37 +41,25 @@ class WorkflowLinter:
     def __init__(
         self,
         ws: WorkspaceClient,
+        jobs_crawler: JobsCrawler,
         resolver: DependencyResolver,
         path_lookup: PathLookup,
         migration_index: TableMigrationIndex,
         directfs_crawler: DirectFsAccessCrawler,
         used_tables_crawler: UsedTablesCrawler,
-        include_job_ids: list[int] | None = None,
-        debug_listing_upper_limit: int | None = None,
     ):
         self._ws = ws
+        self._jobs_crawler = jobs_crawler
         self._resolver = resolver
         self._path_lookup = path_lookup
         self._migration_index = migration_index
         self._directfs_crawler = directfs_crawler
         self._used_tables_crawler = used_tables_crawler
-        self._include_job_ids = include_job_ids
-        self._debug_listing_upper_limit = debug_listing_upper_limit
 
     def refresh_report(self, sql_backend: SqlBackend, inventory_database: str) -> None:
         tasks = []
-        items_listed = 0
-        for job in self._ws.jobs.list():
-            if self._include_job_ids is not None and job.job_id not in self._include_job_ids:
-                logger.info(f"Skipping job_id={job.job_id}")
-                continue
-            if self._debug_listing_upper_limit is not None and items_listed >= self._debug_listing_upper_limit:
-                logger.warning(f"Debug listing limit reached: {self._debug_listing_upper_limit}")
-                break
-            if job.settings is not None and job.settings.name is not None:
-                logger.info(f"Found job_id={job.job_id}: {job.settings.name}")
+        for job in self._jobs_crawler.snapshot():
             tasks.append(functools.partial(self.lint_job, job.job_id))
-            items_listed += 1
         logger.info(f"Running {len(tasks)} linting tasks in parallel...")
         job_results, errors = Threads.gather('linting workflows', tasks)
         job_problems: list[JobProblem] = []
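
The net contract of the new loop: `refresh_report` lints exactly what the crawler returns, one task per inventory row, with no listing or filtering of its own. Restated as a comprehension over the same names used above:

# Each snapshot row becomes one bound lint task; Threads.gather runs them in
# parallel and returns (results, errors) rather than raising on the first failure.
tasks = [functools.partial(self.lint_job, job.job_id) for job in self._jobs_crawler.snapshot()]
job_results, errors = Threads.gather('linting workflows', tasks)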

tests/integration/assessment/test_workflows.py

Lines changed: 2 additions & 2 deletions

@@ -47,5 +47,5 @@ def test_running_real_assessment_job(
     assert actual_tables == expected_tables
 
     query = f"SELECT * FROM {installation_ctx.inventory_database}.workflow_problems"
-    for row in sql_backend.fetch(query):
-        assert row['path'] != 'UNKNOWN'
+    workflow_problems_without_path = [problem for problem in sql_backend.fetch(query) if problem["path"] == "UNKNOWN"]
+    assert not workflow_problems_without_path
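
The collect-then-assert rewrite is about failure output: the old loop stopped at the first bad row, while the list comprehension makes pytest report every offending row at once. A standalone illustration of the pattern, with hypothetical rows:

rows = [{"path": "UNKNOWN"}, {"path": "/Workspace/ok"}, {"path": "UNKNOWN"}]
workflow_problems_without_path = [row for row in rows if row["path"] == "UNKNOWN"]
assert not workflow_problems_without_path, workflow_problems_without_path  # names all offenders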

tests/integration/source_code/test_directfs_access.py

Lines changed: 3 additions & 5 deletions

@@ -78,8 +78,6 @@ def test_lakeview_query_dfsa_ownership(runtime_ctx) -> None:
 
 def test_path_dfsa_ownership(
     runtime_ctx,
-    make_notebook,
-    make_job,
     make_directory,
     inventory_schema,
     sql_backend,
@@ -88,18 +86,18 @@ def test_path_dfsa_ownership(
 
     # A job with a notebook task that contains direct filesystem access.
     notebook_source = b"display(spark.read.csv('/mnt/things/e/f/g'))"
-    notebook = make_notebook(path=f"{make_directory()}/notebook.py", content=notebook_source)
-    job = make_job(notebook_path=notebook)
+    notebook = runtime_ctx.make_notebook(path=f"{make_directory()}/notebook.py", content=notebook_source)
+    runtime_ctx.make_job(notebook_path=notebook)
 
     # Produce a DFSA record for the job.
     linter = WorkflowLinter(
         runtime_ctx.workspace_client,
+        runtime_ctx.jobs_crawler,
         runtime_ctx.dependency_resolver,
         runtime_ctx.path_lookup,
         TableMigrationIndex([]),
         runtime_ctx.directfs_access_crawler_for_paths,
         runtime_ctx.used_tables_crawler_for_paths,
-        include_job_ids=[job.job_id],
     )
     linter.refresh_report(sql_backend, inventory_schema)
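
Two things change together here: the job is created through `runtime_ctx` rather than the bare `make_notebook`/`make_job` fixtures, and `include_job_ids=[job.job_id]` disappears. A plausible reading (hedged; the fixture internals are not shown in this diff) is that the context-created job is what `runtime_ctx.jobs_crawler` picks up, making the explicit id list redundant:

# Illustration under that assumption; the crawler record's job_id type is assumed.
job = runtime_ctx.make_job(notebook_path=notebook)
snapshot_ids = {info.job_id for info in runtime_ctx.jobs_crawler.snapshot()}
assert str(job.job_id) in {str(job_id) for job_id in snapshot_ids}  # this test's job is in scope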

tests/unit/source_code/test_jobs.py

Lines changed: 30 additions & 15 deletions

@@ -5,20 +5,24 @@
 from unittest.mock import create_autospec
 
 import pytest
+from databricks.labs.blueprint.paths import DBFSPath, WorkspacePath
 from databricks.labs.lsql.backends import MockBackend
-from databricks.sdk.service.compute import LibraryInstallStatus
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.errors import NotFound
+from databricks.sdk.service import compute, jobs
 from databricks.sdk.service.jobs import Job, SparkPythonTask
-from databricks.sdk.service.pipelines import NotebookLibrary, GetPipelineResponse, PipelineLibrary, FileLibrary
+from databricks.sdk.service.pipelines import (
+    GetPipelineResponse,
+    FileLibrary,
+    NotebookLibrary,
+    PipelineLibrary,
+    PipelineSpec,
+)
+from databricks.sdk.service.workspace import ExportFormat, Language, ObjectInfo
 
-from databricks.labs.blueprint.paths import DBFSPath, WorkspacePath
+from databricks.labs.ucx.assessment.jobs import JobsCrawler
 from databricks.labs.ucx.source_code.base import CurrentSessionState
 from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler
-from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
-from databricks.sdk import WorkspaceClient
-from databricks.sdk.errors import NotFound
-from databricks.sdk.service import compute, jobs, pipelines
-from databricks.sdk.service.workspace import ExportFormat, ObjectInfo, Language
-
 from databricks.labs.ucx.source_code.files import FileLoader, ImportFileResolver
 from databricks.labs.ucx.source_code.graph import (
     Dependency,
@@ -27,7 +31,8 @@
 )
 from databricks.labs.ucx.source_code.jobs import JobProblem, WorkflowTaskContainer
 from databricks.labs.ucx.source_code.linters.jobs import WorkflowLinter
-from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader
+from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, NotebookResolver
+from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
 from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
 
 
@@ -228,10 +233,17 @@ def test_workflow_linter_lint_job_logs_problems(dependency_resolver, mock_path_l
     expected_message = "Found job problems:\nUNKNOWN:-1 [library-install-failed] 'pip --disable-pip-version-check install unknown-library"
 
     ws = create_autospec(WorkspaceClient)
+    jobs_crawler = create_autospec(JobsCrawler)
     directfs_crawler = create_autospec(DirectFsAccessCrawler)
     used_tables_crawler = create_autospec(UsedTablesCrawler)
     linter = WorkflowLinter(
-        ws, dependency_resolver, mock_path_lookup, empty_index, directfs_crawler, used_tables_crawler
+        ws,
+        jobs_crawler,
+        dependency_resolver,
+        mock_path_lookup,
+        empty_index,
+        directfs_crawler,
+        used_tables_crawler,
     )
 
     libraries = [compute.Library(pypi=compute.PythonPyPiLibrary(package="unknown-library-name"))]
@@ -243,6 +255,7 @@ def test_workflow_linter_lint_job_logs_problems(dependency_resolver, mock_path_l
     with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.jobs"):
        linter.lint_job(1234)
 
+    jobs_crawler.assert_not_called()  # Only called through refresh_report
     directfs_crawler.assert_not_called()
     used_tables_crawler.assert_not_called()
     assert any(message.startswith(expected_message) for message in caplog.messages), caplog.messages
@@ -326,7 +339,7 @@ def test_workflow_task_container_with_existing_cluster_builds_dependency_graph_p
                 whl=None,
             ),
             messages=None,
-            status=LibraryInstallStatus.PENDING,
+            status=compute.LibraryInstallStatus.PENDING,
         )
     ]
 
@@ -446,7 +459,7 @@ def test_workflow_linter_dlt_pipeline_task(graph) -> None:
     ws.pipelines.get.return_value = GetPipelineResponse(
         pipeline_id=pipeline.pipeline_id,
         name="test-pipeline",
-        spec=pipelines.PipelineSpec(continuous=False),
+        spec=PipelineSpec(continuous=False),
     )
 
     workflow_task_container = WorkflowTaskContainer(ws, task, Job())
@@ -456,7 +469,7 @@ def test_workflow_linter_dlt_pipeline_task(graph) -> None:
     ws.pipelines.get.return_value = GetPipelineResponse(
         pipeline_id=pipeline.pipeline_id,
         name="test-pipeline",
-        spec=pipelines.PipelineSpec(
+        spec=PipelineSpec(
             libraries=[
                 PipelineLibrary(
                     jar="some.jar",
@@ -549,19 +562,21 @@ def test_workflow_linter_refresh_report(dependency_resolver, mock_path_lookup, m
     ws.jobs.get.return_value = Job(job_id=2, settings=settings)
 
     sql_backend = MockBackend()
+    jobs_crawler = create_autospec(JobsCrawler)
     directfs_crawler = DirectFsAccessCrawler.for_paths(sql_backend, "test")
     used_tables_crawler = UsedTablesCrawler.for_paths(sql_backend, "test")
     linter = WorkflowLinter(
         ws,
+        jobs_crawler,
         dependency_resolver,
         mock_path_lookup,
         migration_index,
         directfs_crawler,
         used_tables_crawler,
-        [1],
     )
     linter.refresh_report(sql_backend, 'test')
 
+    jobs_crawler.snapshot.assert_called_once()
     sql_backend.has_rows_written_for('test.workflow_problems')
     sql_backend.has_rows_written_for('hive_metastore.test.used_tables_in_paths')
     sql_backend.has_rows_written_for('hive_metastore.test.directfs_in_paths')
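
One caveat in `test_workflow_linter_refresh_report` as written: `create_autospec(JobsCrawler)` leaves `snapshot()` returning a `MagicMock`, which iterates as empty, so no lint tasks are actually dispatched. A hedged sketch of stubbing the snapshot to drive `lint_job`; the `JobInfo` field names are assumptions about ucx's assessment records, not part of this diff:

from databricks.labs.ucx.assessment.jobs import JobInfo, JobsCrawler

jobs_crawler = create_autospec(JobsCrawler)
jobs_crawler.snapshot.return_value = [JobInfo(job_id="2", success=1, failures="[]")]  # assumed fields
# refresh_report would now schedule one lint task for job 2.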
