1287 purldb scan worker update (#1320)

JonoYang · tdruez · web-flow · commit fdf151994a2b · 2024-08-02T11:31:26.000+04:00
* Split the WebhookSubscription model in 2 models #1325 WebhookSubscription defines the Webhook behavior WebhookDelivery stores historical data about deliveries Signed-off-by: tdruez <tdruez@nexb.com> * Add model migrations #1325 Signed-off-by: tdruez <tdruez@nexb.com> * Display the Webhook deliveries in the run details view #1325 Signed-off-by: tdruez <tdruez@nexb.com> * Add support for new Webhook model in API #1325 Signed-off-by: tdruez <tdruez@nexb.com> * Improve the Webhook related tests #1325 Signed-off-by: tdruez <tdruez@nexb.com> * Using proper webhookdeliveries for the related name #1325 Signed-off-by: tdruez <tdruez@nexb.com> * Relate a webhook delivery with a pipeline run #1325 Signed-off-by: tdruez <tdruez@nexb.com> * Refine API support for webhooks #1325 Signed-off-by: tdruez <tdruez@nexb.com> * Send scan results to purldb in pipeline run #1287 * Update purldb-scan-worker.py to only create scan projects from download urls and to not track project status Signed-off-by: Jono Yang <jyang@nexb.com> * Filter using Q objects #1287 Signed-off-by: Jono Yang <jyang@nexb.com> * Move pipeline function logic to scanpipe.pipes.purldb #1287 * Remove poll_run_status Signed-off-by: Jono Yang <jyang@nexb.com> * Create test for check_project_run_statuses #1287 Signed-off-by: Jono Yang <jyang@nexb.com> * Update test for check_project_run_statuses #1287 Signed-off-by: Jono Yang <jyang@nexb.com> * Remove test for get_run_status #1287 Signed-off-by: Jono Yang <jyang@nexb.com> * Update project extra_data before executing it #1287 * Print message detailing new jobs from purldb Signed-off-by: Jono Yang <jyang@nexb.com> * Use existing queryset methods to get runs #1287 Signed-off-by: Jono Yang <jyang@nexb.com> * Update CHANGELOG.rst #1287 Signed-off-by: Jono Yang <jyang@nexb.com> * Display message on continue #1287 Signed-off-by: Jono Yang <jyang@nexb.com> * Add webhook subscription when creating scan project #1287 Signed-off-by: Jono Yang <jyang@nexb.com> * Update URLs #1287 
Signed-off-by: Jono Yang <jyang@nexb.com> * Remove send_project_results pipeline #1287 Signed-off-by: Jono Yang <jyang@nexb.com> * Fix logic display logic #1287 Signed-off-by: Jono Yang <jyang@nexb.com> * Get webhook_url from purldb #1287 Signed-off-by: Jono Yang <jyang@nexb.com> * Update test expectations #1287 * Bump packageurl-python to 0.15.6 Signed-off-by: Jono Yang <jyang@nexb.com> * Update CHANGELOG.rst #1287 Signed-off-by: Jono Yang <jyang@nexb.com> * Fix test #1287 Signed-off-by: Jono Yang <jyang@nexb.com> --------- Signed-off-by: tdruez <tdruez@nexb.com> Signed-off-by: Jono Yang <jyang@nexb.com> Co-authored-by: tdruez <tdruez@nexb.com>
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -26,6 +26,15 @@ v34.7.2 (unreleased)
 - Add a new Dependency view that renders the project dependencies as a tree.
   https://github.com/nexB/scancode.io/issues/1145
 
+- The ``purldb-scan-worker`` command has been updated to send project results
+  back using the Project webhook subscriptions. The main task loop no longer
+  has to monitor a single project run for completion in order to return data,
+  so multiple scan projects can be active at once while ``purldb-scan-worker``
+  is running. A new option ``--max-concurrent-projects`` has
+  been added to set the number of purldb packages that can be requested and
+  processed at once.
+  https://github.com/nexB/scancode.io/issues/1287
+
 v34.7.1 (2024-07-15)
 --------------------
 
diff --git a/scanpipe/management/commands/purldb-scan-worker.py b/scanpipe/management/commands/purldb-scan-worker.py
@@ -27,7 +27,8 @@
 
 from scanpipe.management.commands import AddInputCommandMixin
 from scanpipe.management.commands import CreateProjectCommandMixin
-from scanpipe.pipes import output
+from scanpipe.management.commands import execute_project
+from scanpipe.models import Run
 from scanpipe.pipes import purldb
 
 
@@ -47,40 +48,76 @@ def add_arguments(self, parser):
         parser.add_argument(
             "--max-loops",
             dest="max_loops",
+            type=int,
             default=0,
             action="store",
             help="Limit the number of loops to a maximum number. "
             "0 means no limit. Used only for testing.",
         )
 
+        parser.add_argument(
+            "--max-concurrent-projects",
+            dest="max_concurrent_projects",
+            type=int,
+            default=1,
+            action="store",
+            help="Limit the number of Projects that can be run at once.",
+        )
+
     def handle(self, *args, **options):
         self.verbosity = options["verbosity"]
         sleep = options["sleep"]
         run_async = options["async"]
         max_loops = options["max_loops"]
+        max_concurrent_projects = options["max_concurrent_projects"]
 
         loop_count = 0
         while True:
-            if max_loops and int(loop_count) >= int(max_loops):
+            if max_loops and loop_count >= max_loops:
                 self.stdout.write("loop max reached")
                 break
 
             time.sleep(sleep)
             loop_count += 1
 
+            # Usually, a worker can only run one Run at a time
+            queued_or_running = Run.objects.queued_or_running()
+            queued_or_running_count = queued_or_running.count()
+            if queued_or_running_count >= max_concurrent_projects:
+                self.stdout.write(
+                    "Continuing: number of queued or running Runs"
+                    f"({queued_or_running_count}) is greater "
+                    "than the number of max concurrent projects "
+                    f"({max_concurrent_projects})"
+                )
+                continue
+
             # 1. Get download url from purldb
             response = purldb.get_next_download_url()
             if response:
                 scannable_uri_uuid = response["scannable_uri_uuid"]
                 download_url = response["download_url"]
                 pipelines = response["pipelines"]
+                webhook_url = response["webhook_url"]
             else:
                 self.stderr.write("Bad response from PurlDB: unable to get next job.")
                 continue
 
             if not (download_url and scannable_uri_uuid):
                 self.stdout.write("No new job from PurlDB.")
                 continue
+            else:
+                formatted_pipeline_names = [f"\t\t{pipeline}" for pipeline in pipelines]
+                formatted_pipeline_names = "\n".join(formatted_pipeline_names)
+                msg = (
+                    "New job from PurlDB:\n"
+                    "\tscannable_uri_uuid:\n"
+                    f"\t\t{scannable_uri_uuid}\n"
+                    "\tdownload_url:\n"
+                    f"\t\t{download_url}\n"
+                    "\tpipelines:\n"
+                ) + formatted_pipeline_names
+                self.stdout.write(msg)
 
             try:
                 # 2. Create and run project
@@ -89,21 +126,12 @@ def handle(self, *args, **options):
                     scannable_uri_uuid=scannable_uri_uuid,
                     download_url=download_url,
                     pipelines=pipelines,
+                    webhook_url=webhook_url,
                     run_async=run_async,
                 )
 
-                # 3. Poll project results
-                purldb.poll_run_status(
-                    project=project,
-                    sleep=sleep,
-                )
-
-                # 4. Get project results and send to PurlDB
-                send_scan_project_results(
-                    project=project, scannable_uri_uuid=scannable_uri_uuid
-                )
                 self.stdout.write(
-                    "Scan results and other data have been sent to PurlDB",
+                    f"Project {project.name} has been created",
                     self.style.SUCCESS,
                 )
 
@@ -119,7 +147,7 @@ def handle(self, *args, **options):
 
 
 def create_scan_project(
-    command, scannable_uri_uuid, download_url, pipelines, run_async=False
+    command, scannable_uri_uuid, download_url, pipelines, webhook_url, run_async=False
 ):
     """
     Create and return a Project for the scan project request with ID of
@@ -135,30 +163,17 @@ def create_scan_project(
         name=name,
         pipelines=pipelines,
         input_urls=input_urls,
-        execute=True,
-        run_async=run_async,
     )
-    project.update_extra_data({"scannable_uri_uuid": scannable_uri_uuid})
-    return project
-
-
-def send_scan_project_results(project, scannable_uri_uuid):
-    """
-    Send the JSON summary and results of `project` to PurlDB for the scan
-    request `scannable_uri_uuid`.
-
-    Raise a PurlDBException if there is an issue sending results to PurlDB.
-    """
-    project.refresh_from_db()
-    scan_results_location = output.to_json(project)
-    scan_summary_location = project.get_latest_output(filename="summary")
-    response = purldb.send_results_to_purldb(
-        scannable_uri_uuid,
-        scan_results_location,
-        scan_summary_location,
-        project.extra_data,
+    project.update_extra_data(
+        {
+            "scannable_uri_uuid": scannable_uri_uuid,
+        }
     )
-    if not response:
-        raise purldb.PurlDBException(
-            "Bad response returned when sending results to PurlDB"
-        )
+    project.add_webhook_subscription(
+        target_url=webhook_url,
+        trigger_on_each_run=False,
+        include_summary=True,
+        include_results=True,
+    )
+    execute_project(project=project, run_async=run_async, command=command)
+    return project
diff --git a/scanpipe/pipes/purldb.py b/scanpipe/pipes/purldb.py
@@ -34,7 +34,6 @@
 
 from scanpipe.pipes import LoopProgress
 from scanpipe.pipes import _clean_package_data
-from scanpipe.pipes import poll_until_success
 
 
 class PurlDBException(Exception):
@@ -426,38 +425,6 @@ def get_next_download_url(timeout=DEFAULT_TIMEOUT, api_url=PURLDB_API_URL):
         return response
 
 
-def send_results_to_purldb(
-    scannable_uri_uuid,
-    scan_results_location,
-    scan_summary_location,
-    project_extra_data,
-    timeout=DEFAULT_TIMEOUT,
-    api_url=PURLDB_API_URL,
-):
-    """
-    Send project results to purldb for the package handeled by the ScannableURI
-    with uuid of `scannable_uri_uuid`
-    """
-    with open(scan_results_location, "rb") as scan_results_file:
-        with open(scan_summary_location, "rb") as scan_summary_file:
-            data = {
-                "scannable_uri_uuid": scannable_uri_uuid,
-                "scan_status": "scanned",
-                "project_extra_data": json.dumps(project_extra_data),
-            }
-            files = {
-                "scan_results_file": scan_results_file,
-                "scan_summary_file": scan_summary_file,
-            }
-            response = request_post(
-                url=f"{api_url}scan_queue/update_status/",
-                timeout=timeout,
-                data=data,
-                files=files,
-            )
-    return response
-
-
 def update_status(
     scannable_uri_uuid,
     status,
@@ -467,12 +434,11 @@ def update_status(
 ):
     """Update the status of a ScannableURI on a PurlDB scan queue"""
     data = {
-        "scannable_uri_uuid": scannable_uri_uuid,
         "scan_status": status,
         "scan_log": scan_log,
     }
     response = request_post(
-        url=f"{api_url}scan_queue/update_status/",
+        url=f"{api_url}scan_queue/{scannable_uri_uuid}/update_status/",
         timeout=timeout,
         data=data,
     )
@@ -486,18 +452,29 @@ def create_project_name(download_url, scannable_uri_uuid):
     return f"{slugify(download_url)}-{scannable_uri_uuid[0:8]}"
 
 
-def poll_run_status(project, sleep=10):
+def check_project_run_statuses(project, logger=None):
     """
-    Poll the status of all runs of `project`. Raise a PurlDBException with a
-    message containing the log of the run if the run has stopped, failed, or
-    gone stale, otherwise return None.
+    If any of the runs of this Project has failed, stopped, or gone stale,
+    update the status of the Scannable URI associated with this Project to
+    `failed` and send back a log of the failed runs.
     """
-    runs = project.runs.all()
-    for run in runs:
-        if not poll_until_success(check=get_run_status, sleep=sleep, run=run):
-            status = get_run_status(run)
-            msg = f"Run ended with status {status}:\n\n{run.log}"
-            raise PurlDBException(msg)
+    failed_runs = project.runs.failed()
+    if failed_runs.exists():
+        failure_msgs = []
+        for failed_run in failed_runs:
+            msg = f"{failed_run.pipeline_name} failed:\n\n{failed_run.log}\n"
+            failure_msgs.append(msg)
+        failure_msg = "\n".join(failure_msgs)
+
+        if logger:
+            logger(failure_msg)
+
+        scannable_uri_uuid = project.extra_data.get("scannable_uri_uuid")
+        update_status(
+            scannable_uri_uuid=scannable_uri_uuid,
+            status="failed",
+            scan_log=failure_msg,
+        )
 
 
 def get_run_status(run, **kwargs):
diff --git a/scanpipe/tests/pipes/test_purldb.py b/scanpipe/tests/pipes/test_purldb.py
@@ -150,18 +150,14 @@ def test_scanpipe_pipes_purldb_get_next_download_url(
         results = purldb.get_next_download_url()
         self.assertFalse(results)
 
-    def test_scanpipe_pipes_purldb_get_run_status(self):
-        now = timezone.now()
-        run = self.create_run(
-            pipeline="succeed",
-            task_start_date=now,
-            task_end_date=now,
-            task_exitcode=0,
-        )
-        status = purldb.get_run_status(run=run)
-        self.assertEqual("success", status)
-
-    def test_scanpipe_pipes_purldb_poll_run_status(self):
+    @mock.patch("scanpipe.pipes.purldb.request_post")
+    @mock.patch("scanpipe.pipes.purldb.is_available")
+    def test_scanpipe_pipes_purldb_check_project_run_statuses(
+        self, mock_is_available, mock_request_post
+    ):
+        mock_is_available.return_value = True
+        scannable_uri_uuid = "97627c6e-9acb-43e0-b8df-28bd92f2b7e5"
+        self.project1.extra_data.update({"scannable_uri_uuid": scannable_uri_uuid})
         now = timezone.now()
 
         # Test poll_run_status on individual pipelines
@@ -172,7 +168,10 @@ def test_scanpipe_pipes_purldb_poll_run_status(self):
             task_end_date=now,
             task_exitcode=0,
         )
-        purldb.poll_run_status(project=self.project1)
+        purldb.check_project_run_statuses(
+            project=self.project1,
+        )
+        mock_request_post.assert_not_called()
         self.project1.runs.all().delete()
 
         self.create_run(
@@ -182,9 +181,21 @@ def test_scanpipe_pipes_purldb_poll_run_status(self):
             task_exitcode=1,
             log="failed",
         )
-        with self.assertRaises(purldb.PurlDBException) as context:
-            purldb.poll_run_status(project=self.project1)
-            self.assertIn("failed", context.exception)
+        purldb.check_project_run_statuses(
+            project=self.project1,
+        )
+        mock_request_post.assert_called_once()
+        mock_request_post_call = mock_request_post.mock_calls[0]
+        mock_request_post_call_kwargs = mock_request_post_call.kwargs
+        purldb_update_status_url = (
+            f"{purldb.PURLDB_API_URL}scan_queue/{scannable_uri_uuid}/update_status/"
+        )
+        self.assertEqual(purldb_update_status_url, mock_request_post_call_kwargs["url"])
+        expected_data = {
+            "scan_status": "failed",
+            "scan_log": "failed failed:\n\nfailed\n",
+        }
+        self.assertEqual(expected_data, mock_request_post_call_kwargs["data"])
         self.project1.runs.all().delete()
 
         self.create_run(
@@ -194,9 +205,14 @@ def test_scanpipe_pipes_purldb_poll_run_status(self):
             task_exitcode=99,
             log="stopped",
         )
-        with self.assertRaises(purldb.PurlDBException) as context:
-            purldb.poll_run_status(project=self.project1)
-            self.assertIn("stopped", context.exception)
+        purldb.check_project_run_statuses(
+            project=self.project1,
+        )
+        self.assertEqual(2, mock_request_post.call_count)
+        mock_request_post_call = mock_request_post.mock_calls[0]
+        mock_request_post_call_kwargs = mock_request_post_call.kwargs
+        self.assertEqual(purldb_update_status_url, mock_request_post_call_kwargs["url"])
+        self.assertEqual(expected_data, mock_request_post_call_kwargs["data"])
         self.project1.runs.all().delete()
 
         self.create_run(
@@ -206,29 +222,14 @@ def test_scanpipe_pipes_purldb_poll_run_status(self):
             task_exitcode=88,
             log="stale",
         )
-        with self.assertRaises(purldb.PurlDBException) as context:
-            purldb.poll_run_status(project=self.project1)
-            self.assertIn("stale", context.exception)
-        self.project1.runs.all().delete()
-
-        # Test pipelines success, then failure
-        self.assertEqual(0, self.project1.runs.count())
-        self.create_run(
-            pipeline="succeed",
-            task_start_date=now,
-            task_end_date=now,
-            task_exitcode=0,
-        )
-        self.create_run(
-            pipeline="failed",
-            task_start_date=now,
-            task_end_date=now,
-            task_exitcode=1,
-            log="failed",
+        purldb.check_project_run_statuses(
+            project=self.project1,
         )
-        with self.assertRaises(purldb.PurlDBException) as context:
-            purldb.poll_run_status(project=self.project1)
-            self.assertIn("failed", context.exception)
+        self.assertEqual(3, mock_request_post.call_count)
+        mock_request_post_call = mock_request_post.mock_calls[0]
+        mock_request_post_call_kwargs = mock_request_post_call.kwargs
+        self.assertEqual(purldb_update_status_url, mock_request_post_call_kwargs["url"])
+        self.assertEqual(expected_data, mock_request_post_call_kwargs["data"])
         self.project1.runs.all().delete()
 
     def test_scanpipe_pipes_purldb_create_project_name(self):
diff --git a/scanpipe/tests/test_commands.py b/scanpipe/tests/test_commands.py
diff --git a/setup.cfg b/setup.cfg