Handle broken process pool in diffing server well (#38)

Mr0grog · web-flow · commit 20cb995f76d0 · 2020-11-23T19:13:30.000-08:00
We previously tried to reset the process pool that runs the actual diff routines every time it broke, but failed to handle the case where we exhausted all our retries (we'd wind up returning `None`, which is not an acceptable result). We now don't reset the pool on the last try and instead raise an exception, preventing weird errors from happening later on because we returned a bad value (`None`) from `DiffHandler.diff()`. This also does a little more work to ensure that we aren't thrashing the process pool when a lot of diffs are happening simultaneously. We previously had each diff blindly reset the pool, which means that if multiple diffs were in flight when the pool broke, it would get reset multiple times, even though it really only needed to get reset once. This was probably wasting a lot of memory and CPU time. Note that the tests here don't really test the first issue. There's not a good way to tell from the outside whether an unknown error was caused by the process pool or by something else. Fixes #33.
diff --git a/docs/source/release-history.rst b/docs/source/release-history.rst
@@ -2,6 +2,12 @@
 Release History
 ===============
 
+In Development
+--------------
+
+- Fixes an issue where the diffing server could reset the process pool that manages the actual diffs multiple times unnecessarily, leading to wasted memory and CPU. If you are tracking logs and errors, this will also make error messages about the diffing server clearer — you’ll see “BrokenProcessPool” instead of “'NoneType' object does not support item assignment.” (`#38 <https://github.com/edgi-govdata-archiving/web-monitoring-diff/issues/38>`_)
+
+
 Version 0.1.0
 -------------
 
diff --git a/web_monitoring_diff/server/server.py b/web_monitoring_diff/server/server.py
@@ -498,7 +498,16 @@ async def diff(self, func, a, b, params, tries=2):
                 return await loop.run_in_executor(
                     executor, functools.partial(caller, func, a, b, **params))
             except concurrent.futures.process.BrokenProcessPool:
-                executor = self.get_diff_executor(reset=True)
+                if attempt + 1 < tries:
+                    # There could be many diffs happening in parallel, so
+                    # before trying to reset the process pool, make sure other
+                    # parallel diffs haven't already done it. If it's already
+                    # been reset, then we can just go and use the new one.
+                    old_executor, executor = executor, self.get_diff_executor()
+                    if executor == old_executor:
+                        executor = self.get_diff_executor(reset=True)
+                else:
+                    raise
 
     # NOTE: this doesn't do anything async, but if we change it to do so, we
     # need to add a lock (either asyncio.Lock or tornado.locks.Lock).
diff --git a/web_monitoring_diff/tests/test_server_exc_handling.py b/web_monitoring_diff/tests/test_server_exc_handling.py
@@ -1,4 +1,6 @@
 import asyncio
+import concurrent.futures
+from concurrent.futures.process import BrokenProcessPool, ProcessPoolExecutor
 import json
 import os
 import unittest
@@ -387,6 +389,92 @@ async def responder(handler):
             assert len(result['diff'][0][1]) == 1024
 
 
+class BrokenProcessPoolExecutor(concurrent.futures.Executor):
+    "Fake process pool that only raises BrokenProcessPool exceptions."
+    submit_count = 0  # Class-level default; each instance counts its own submit() calls.
+
+    def submit(self, fn, *args, **kwargs):
+        self.submit_count += 1  # Record the attempt; `fn` itself is never run.
+        result = concurrent.futures.Future()
+        result.set_exception(BrokenProcessPool(  # Hand back an already-failed future.
+            'This pool is broken, yo'
+        ))
+        return result
+
+
+class ExecutionPoolTestCase(DiffingServerTestCase):
+    def fetch_async(self, path, **kwargs):
+        "Like AsyncHTTPTestCase.fetch, but async."
+        url = self.get_url(path)
+        return self.http_client.fetch(url, **kwargs)
+
+    def test_rebuilds_process_pool_when_broken(self):
+        # Get a custom executor that will always fail the first time, but get
+        # a real one that will succeed afterward.
+        did_get_executor = False
+        def get_executor(self, reset=False):
+            nonlocal did_get_executor
+            if did_get_executor:
+                return ProcessPoolExecutor(1)  # NOTE(review): new pool per call; never shut down here.
+            else:
+                did_get_executor = True
+                return BrokenProcessPoolExecutor()
+
+        with patch.object(df.DiffHandler, 'get_diff_executor', get_executor):
+            response = self.fetch('/html_source_dmp?format=json&'
+                                  f'a=file://{fixture_path("empty.txt")}&'
+                                  f'b=file://{fixture_path("empty.txt")}')
+            assert response.code == 200
+            assert did_get_executor == True  # The patched getter was actually used.
+
+    def test_diff_returns_error_if_process_pool_repeatedly_breaks(self):
+        # Set a custom executor that will always fail.
+        def get_executor(self, reset=False):
+            return BrokenProcessPoolExecutor()
+
+        with patch.object(df.DiffHandler, 'get_diff_executor', get_executor):
+            response = self.fetch('/html_source_dmp?format=json&'
+                                  f'a=file://{fixture_path("empty.txt")}&'
+                                  f'b=file://{fixture_path("empty.txt")}')
+            self.json_check(response)
+            assert response.code == 500
+
+    @tornado.testing.gen_test
+    async def test_rebuilds_process_pool_cooperatively(self):
+        """
+        Make sure that two parallel diffing failures only cause the process
+        pool to be rebuilt once, not multiple times.
+        """
+        # Hand out the always-failing executor until a reset is requested,
+        # then hand out a real one that will succeed. Resets are counted.
+        executor_resets = 0
+        good_executor = ProcessPoolExecutor(1)  # NOTE(review): never shut down in this test.
+        bad_executor = BrokenProcessPoolExecutor()
+        def get_executor(self, reset=False):
+            nonlocal executor_resets
+            if reset:
+                executor_resets += 1
+            if executor_resets > 0:
+                return good_executor
+            else:
+                return bad_executor
+
+        with patch.object(df.DiffHandler, 'get_diff_executor', get_executor):
+            one = self.fetch_async('/html_source_dmp?format=json&'
+                                   f'a=file://{fixture_path("empty.txt")}&'
+                                   f'b=file://{fixture_path("empty.txt")}')
+            two = self.fetch_async('/html_source_dmp?format=json&'
+                                   f'a=file://{fixture_path("empty.txt")}&'
+                                   f'b=file://{fixture_path("empty.txt")}')
+            response1, response2 = await asyncio.gather(one, two)
+            assert response1.code == 200
+            assert response2.code == 200
+            assert executor_resets == 1
+            # Ensure *both* diffs hit the bad executor, so we know we didn't
+            # have one reset because only one request hit the bad executor.
+            assert bad_executor.submit_count == 2
+
+
 def mock_diffing_method(c_body):
     return