Commit e8e3ae4

feat(taskworker) Collect another metric from workers (#94960)
We have some pools that run really smoothly, and others that don't run as smoothly. I think the number of times after startup that we call this method to fill the child_tasks queue is contributing to broker processing deadlines expiring.
1 parent 4711321 commit e8e3ae4

src/sentry/taskworker/worker.py

Lines changed: 12 additions & 0 deletions
@@ -153,6 +153,17 @@ def _add_task(self) -> bool:
         Add a task to child tasks queue. Returns False if no new task was fetched.
         """
         if self._child_tasks.full():
+            # I want to see how this differs between pools that operate well
+            # and those that are not as effective. I suspect that with a consistent
+            # load of slowish tasks (like 5-15 seconds) this will happen
+            # infrequently, leaving the child tasks queue full and
+            # causing processing deadline expiration.
+            # Whereas in pools with consistently short tasks, this happens
+            # more frequently, allowing workers to run more smoothly.
+            metrics.incr(
+                "taskworker.worker.add_tasks.child_tasks_full",
+                tags={"processing_pool": self._processing_pool_name},
+            )
             return False
 
         inflight = self.fetch_task()
@@ -344,6 +355,7 @@ def fetch_task(self) -> InflightTaskActivation | None:
             extra={"processing_pool": self._processing_pool_name},
         )
 
+        # TODO cap backoff to 5 seconds instead?
         self._gettask_backoff_seconds = min(self._gettask_backoff_seconds + 1, 10)
         return None
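
For readers skimming the change, here is a minimal, self-contained sketch of the behaviour the new counter measures: once the bounded child_tasks queue is full, _add_task short-circuits with False instead of fetching more work, and the counter records how often that happens per processing pool. This is not the actual Sentry worker; MiniWorker, the queue size, and the _FakeMetrics stand-in for the real metrics helper are illustrative assumptions.

    from __future__ import annotations

    import queue


    class _FakeMetrics:
        # Stand-in for the real metrics helper, used only so this sketch runs on its own.
        def incr(self, key: str, tags: dict[str, str] | None = None) -> None:
            print(f"incr {key} tags={tags}")


    metrics = _FakeMetrics()


    class MiniWorker:
        # Toy model of the child-task queue behaviour the new metric observes.
        def __init__(self, maxsize: int = 2) -> None:
            self._child_tasks: queue.Queue[int] = queue.Queue(maxsize=maxsize)
            self._processing_pool_name = "example-pool"

        def _add_task(self, task: int) -> bool:
            # Mirrors the patched branch: when the queue is already full, count the
            # event and return False rather than fetching another task.
            if self._child_tasks.full():
                metrics.incr(
                    "taskworker.worker.add_tasks.child_tasks_full",
                    tags={"processing_pool": self._processing_pool_name},
                )
                return False
            self._child_tasks.put_nowait(task)
            return True


    if __name__ == "__main__":
        worker = MiniWorker(maxsize=2)
        for task_id in range(4):
            print(task_id, worker._add_task(task_id))

Running the sketch shows the counter firing once the toy queue of size 2 is full, which is the per-pool signal the commit wants to compare across well-behaved and struggling pools.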
