Skip to content

Commit f329875

Browse files
authored
Improve error message inference from failed workflow runs (#753)
This PR adds heuristics to determine the actual remote error type based on message content. This reduces integration test flakiness for tests that call `run_workflow`.
1 parent b616354 commit f329875

File tree

3 files changed

+160
-17
lines changed

3 files changed

+160
-17
lines changed

src/databricks/labs/ucx/install.py

Lines changed: 68 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
import logging
44
import os
5+
import re
56
import sys
67
import time
78
import webbrowser
@@ -12,15 +13,33 @@
1213
import yaml
1314
from databricks.labs.blueprint.entrypoint import get_logger
1415
from databricks.labs.blueprint.installer import InstallState
15-
from databricks.labs.blueprint.parallel import Threads
16+
from databricks.labs.blueprint.parallel import ManyError, Threads
1617
from databricks.labs.blueprint.tui import Prompts
1718
from databricks.labs.blueprint.wheels import ProductInfo, Wheels, find_project_root
1819
from databricks.sdk import WorkspaceClient
1920
from databricks.sdk.errors import (
21+
Aborted,
22+
AlreadyExists,
23+
BadRequest,
24+
Cancelled,
25+
DatabricksError,
26+
DataLoss,
27+
DeadlineExceeded,
28+
InternalError,
2029
InvalidParameterValue,
2130
NotFound,
31+
NotImplemented,
2232
OperationFailed,
2333
PermissionDenied,
34+
RequestLimitExceeded,
35+
ResourceAlreadyExists,
36+
ResourceConflict,
37+
ResourceDoesNotExist,
38+
ResourceExhausted,
39+
TemporarilyUnavailable,
40+
TooManyRequests,
41+
Unauthenticated,
42+
Unknown,
2443
)
2544
from databricks.sdk.service import compute, jobs
2645
from databricks.sdk.service.jobs import RunLifeCycleState, RunResultState
@@ -248,16 +267,18 @@ def run_workflow(self, step: str):
248267
job_run_waiter = self._ws.jobs.run_now(job_id)
249268
try:
250269
job_run_waiter.result()
251-
except OperationFailed:
270+
except OperationFailed as err:
252271
# currently we don't have any good message from API, so we have to work around it.
253272
job_run = self._ws.jobs.get_run(job_run_waiter.run_id)
254-
messages = []
273+
errors: list[DatabricksError] = []
274+
timeouts: list[DeadlineExceeded] = []
255275
assert job_run.tasks is not None
256276
for run_task in job_run.tasks:
257277
if not run_task.state:
258278
continue
259279
if run_task.state.result_state == jobs.RunResultState.TIMEDOUT:
260-
messages.append(f"{run_task.task_key}: The run was stopped after reaching the timeout")
280+
msg = f"{run_task.task_key}: The run was stopped after reaching the timeout"
281+
timeouts.append(DeadlineExceeded(msg))
261282
continue
262283
if run_task.state.result_state != jobs.RunResultState.FAILED:
263284
continue
@@ -267,13 +288,51 @@ def run_workflow(self, step: str):
267288
if run_output and run_output.error_trace:
268289
sys.stderr.write(run_output.error_trace)
269290
if run_output and run_output.error:
270-
messages.append(f"{run_task.task_key}: {run_output.error}")
271-
else:
272-
messages.append(f"{run_task.task_key}: output unavailable")
291+
errors.append(self._infer_task_exception(f"{run_task.task_key}: {run_output.error}"))
273292
assert job_run.state is not None
274293
assert job_run.state.state_message is not None
275-
msg = f'{job_run.state.state_message.rstrip(".")}: {", ".join(messages)}'
276-
raise OperationFailed(msg) from None
294+
if len(errors) == 1:
295+
raise errors[0] from err
296+
all_errors = errors + timeouts
297+
if len(all_errors) == 0:
298+
raise Unknown(job_run.state.state_message) from err
299+
raise ManyError(all_errors) from err
300+
301+
@staticmethod
302+
def _infer_task_exception(haystack: str) -> DatabricksError:
303+
needles = [
304+
BadRequest,
305+
Unauthenticated,
306+
PermissionDenied,
307+
NotFound,
308+
ResourceConflict,
309+
TooManyRequests,
310+
Cancelled,
311+
InternalError,
312+
NotImplemented,
313+
TemporarilyUnavailable,
314+
DeadlineExceeded,
315+
InvalidParameterValue,
316+
ResourceDoesNotExist,
317+
Aborted,
318+
AlreadyExists,
319+
ResourceAlreadyExists,
320+
ResourceExhausted,
321+
RequestLimitExceeded,
322+
Unknown,
323+
DataLoss,
324+
]
325+
constructors: dict[re.Pattern, type[DatabricksError]] = {
326+
re.compile(r".*\[TABLE_OR_VIEW_NOT_FOUND] (.*)"): NotFound,
327+
re.compile(r".*\[SCHEMA_NOT_FOUND] (.*)"): NotFound,
328+
}
329+
for klass in needles:
330+
constructors[re.compile(f".*{klass.__name__}: (.*)")] = klass
331+
for pattern, klass in constructors.items():
332+
match = pattern.match(haystack)
333+
if match:
334+
return klass(match.group(1))
335+
return Unknown(haystack)
277336

278337
def _create_dashboards(self):
279338
logger.info("Creating dashboards...")

tests/integration/test_installation.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import pytest
88
from databricks.labs.blueprint.parallel import Threads
9-
from databricks.sdk.errors import InvalidParameterValue, NotFound, OperationFailed
9+
from databricks.sdk.errors import InvalidParameterValue, NotFound
1010
from databricks.sdk.retries import retried
1111
from databricks.sdk.service.iam import PermissionLevel
1212

@@ -67,7 +67,7 @@ def test_job_failure_propagates_correct_error_message_and_logs(ws, sql_backend,
6767

6868
sql_backend.execute(f"DROP SCHEMA {install.current_config.inventory_database} CASCADE")
6969

70-
with pytest.raises(OperationFailed) as failure:
70+
with pytest.raises(NotFound) as failure:
7171
install.run_workflow("099-destroy-schema")
7272

7373
assert "cannot be found" in str(failure.value)
@@ -76,7 +76,7 @@ def test_job_failure_propagates_correct_error_message_and_logs(ws, sql_backend,
7676
assert len(workflow_run_logs) == 1
7777

7878

79-
@retried(on=[NotFound, InvalidParameterValue, OperationFailed], timeout=timedelta(minutes=10))
79+
@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=6))
8080
def test_running_real_assessment_job(
8181
ws, new_installation, make_ucx_group, make_cluster_policy, make_cluster_policy_permissions
8282
):
@@ -130,7 +130,7 @@ def test_running_real_migrate_groups_job(
130130
assert found[f"{install.current_config.renamed_group_prefix}{ws_group_a.display_name}"] == PermissionLevel.CAN_USE
131131

132132

133-
@retried(on=[NotFound, InvalidParameterValue, OperationFailed], timeout=timedelta(minutes=5))
133+
@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=5))
134134
def test_running_real_remove_backup_groups_job(ws, sql_backend, new_installation, make_ucx_group):
135135
ws_group_a, acc_group_a = make_ucx_group()
136136

@@ -149,12 +149,12 @@ def test_running_real_remove_backup_groups_job(ws, sql_backend, new_installation
149149
ws.groups.get(ws_group_a.id)
150150

151151

152-
@retried(on=[NotFound, InvalidParameterValue, OperationFailed], timeout=timedelta(minutes=10))
152+
@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=10))
153153
def test_repair_run_workflow_job(ws, mocker, new_installation, sql_backend):
154154
install = new_installation()
155155
mocker.patch("webbrowser.open")
156156
sql_backend.execute(f"DROP SCHEMA {install.current_config.inventory_database} CASCADE")
157-
with pytest.raises(OperationFailed):
157+
with pytest.raises(NotFound):
158158
install.run_workflow("099-destroy-schema")
159159

160160
sql_backend.execute(f"CREATE SCHEMA IF NOT EXISTS {install.current_config.inventory_database}")

tests/unit/test_install.py

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,15 @@
66
import pytest
77
import yaml
88
from databricks.labs.blueprint.installer import InstallState
9+
from databricks.labs.blueprint.parallel import ManyError
910
from databricks.labs.blueprint.tui import MockPrompts
1011
from databricks.labs.blueprint.wheels import Wheels, find_project_root
1112
from databricks.sdk.errors import (
1213
InvalidParameterValue,
1314
NotFound,
1415
OperationFailed,
1516
PermissionDenied,
17+
Unknown,
1618
)
1719
from databricks.sdk.service import iam, jobs, sql
1820
from databricks.sdk.service.compute import (
@@ -248,10 +250,92 @@ def result():
248250
ws.jobs.get_run_output.return_value = jobs.RunOutput(error="does not compute", error_trace="# goes to stderr")
249251
installer = WorkspaceInstaller(ws)
250252
installer._state.jobs = {"foo": "111"}
251-
with pytest.raises(OperationFailed) as failure:
253+
with pytest.raises(Unknown) as failure:
252254
installer.run_workflow("foo")
253255

254-
assert "Stuff happens: stuff: does not compute" == str(failure.value)
256+
assert "stuff: does not compute" == str(failure.value)
257+
258+
259+
def test_run_workflow_creates_failure_from_mapping(ws, mocker):
260+
def run_now(job_id):
261+
assert 111 == job_id
262+
263+
def result():
264+
raise OperationFailed(...)
265+
266+
waiter = mocker.Mock()
267+
waiter.result = result
268+
waiter.run_id = "qux"
269+
return waiter
270+
271+
ws.jobs.run_now = run_now
272+
ws.jobs.get_run.return_value = jobs.Run(
273+
state=jobs.RunState(state_message="Stuff happens."),
274+
tasks=[
275+
jobs.RunTask(
276+
task_key="stuff",
277+
state=jobs.RunState(result_state=jobs.RunResultState.FAILED),
278+
run_id=123,
279+
)
280+
],
281+
)
282+
ws.jobs.get_run_output.return_value = jobs.RunOutput(
283+
error="something: PermissionDenied: does not compute", error_trace="# goes to stderr"
284+
)
285+
installer = WorkspaceInstaller(ws)
286+
installer._state.jobs = {"foo": "111"}
287+
with pytest.raises(PermissionDenied) as failure:
288+
installer.run_workflow("foo")
289+
290+
assert str(failure.value) == "does not compute"
291+
292+
293+
def test_run_workflow_creates_failure_many_error(ws, mocker):
294+
def run_now(job_id):
295+
assert 111 == job_id
296+
297+
def result():
298+
raise OperationFailed(...)
299+
300+
waiter = mocker.Mock()
301+
waiter.result = result
302+
waiter.run_id = "qux"
303+
return waiter
304+
305+
ws.jobs.run_now = run_now
306+
ws.jobs.get_run.return_value = jobs.Run(
307+
state=jobs.RunState(state_message="Stuff happens."),
308+
tasks=[
309+
jobs.RunTask(
310+
task_key="stuff",
311+
state=jobs.RunState(result_state=jobs.RunResultState.FAILED),
312+
run_id=123,
313+
),
314+
jobs.RunTask(
315+
task_key="things",
316+
state=jobs.RunState(result_state=jobs.RunResultState.TIMEDOUT),
317+
run_id=124,
318+
),
319+
jobs.RunTask(
320+
task_key="some",
321+
state=jobs.RunState(result_state=jobs.RunResultState.FAILED),
322+
run_id=125,
323+
),
324+
],
325+
)
326+
ws.jobs.get_run_output.return_value = jobs.RunOutput(
327+
error="something: DataLoss: does not compute", error_trace="# goes to stderr"
328+
)
329+
installer = WorkspaceInstaller(ws)
330+
installer._state.jobs = {"foo": "111"}
331+
with pytest.raises(ManyError) as failure:
332+
installer.run_workflow("foo")
333+
334+
assert str(failure.value) == (
335+
"Detected 3 failures: "
336+
"DataLoss: does not compute, "
337+
"DeadlineExceeded: things: The run was stopped after reaching the timeout"
338+
)
255339

256340

257341
def test_save_config(ws):

0 commit comments

Comments
 (0)