From 1e554159b96d8ca18549ccf117e1d9c20a764109 Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Fri, 27 Sep 2024 06:55:34 +0000 Subject: [PATCH] CA-392674: nbd_client_manager retry connect on nbd device busy to connect to nbd devices, nbd_client_manager will 1. protect the operation with /var/run/nonpersistent/nbd_client_manager file lock 2. check whether nbd is being used by `nbd-client -check` 3. load nbd kernel module by `modprobe nbd` 4. call `nbd-client` to connect to nbd device However, step 3 will trigger systemd-udevd run asyncly, which would open and lock the same nbd devices, run udev rules, etc. This introduce races with step 4, e.g. both process want to open and lock the nbd device. Note: the file lock in step 1 does NOT resovle the issue here, as it only coordinate multiple nbd_client_manager processes. To fix the issue, - we patch nbd-client to report the device busy from kernel to nbd_client_manager - nbd_client_manager should check nbd-client exit code, and retry on device busy - nbd_client_manager call `udevadm settle` to wait for udevd parsing udev rules Note: checking nbd-client exit code is still necessary in case of racing with others Signed-off-by: Lin Liu --- python3/libexec/nbd_client_manager.py | 36 +++++++++++++++++------- python3/tests/test_nbd_client_manager.py | 6 ++-- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/python3/libexec/nbd_client_manager.py b/python3/libexec/nbd_client_manager.py index d0655df9756..3d0920a3845 100644 --- a/python3/libexec/nbd_client_manager.py +++ b/python3/libexec/nbd_client_manager.py @@ -24,6 +24,8 @@ # Don't wait more than 10 minutes for the NBD device MAX_DEVICE_WAIT_MINUTES = 10 +# According to https://github.com/thom311/libnl/blob/main/include/netlink/errno.h#L38 +NLE_BUSY = 25 class InvalidNbdDevName(Exception): """ @@ -80,7 +82,7 @@ def __exit__(self, *args): FILE_LOCK = FileLock(path=LOCK_FILE) -def _call(cmd_args, error=True): +def _call(cmd_args, raise_err=True, log_err=True): """ [call cmd_args] executes [cmd_args] and returns the exit code. If [error] and exit code != 0, log and throws a CalledProcessError. @@ -94,14 +96,16 @@ def _call(cmd_args, error=True): _, stderr = proc.communicate() - if error and proc.returncode != 0: - LOGGER.error( - "%s exited with code %d: %s", " ".join(cmd_args), proc.returncode, stderr - ) + if proc.returncode != 0: + if log_err: + LOGGER.error( + "%s exited with code %d: %s", " ".join(cmd_args), proc.returncode, stderr + ) - raise subprocess.CalledProcessError( - returncode=proc.returncode, cmd=cmd_args, output=stderr - ) + if raise_err: + raise subprocess.CalledProcessError( + returncode=proc.returncode, cmd=cmd_args, output=stderr + ) return proc.returncode @@ -116,7 +120,7 @@ def _is_nbd_device_connected(nbd_device): if not os.path.exists(nbd_device): raise NbdDeviceNotFound(nbd_device) cmd = ["nbd-client", "-check", nbd_device] - returncode = _call(cmd, error=False) + returncode = _call(cmd, raise_err=False, log_err=False) if returncode == 0: return True if returncode == 1: @@ -191,6 +195,8 @@ def connect_nbd(path, exportname): """Connects to a free NBD device using nbd-client and returns its path""" # We should not ask for too many nbds, as we might not have enough memory _call(["modprobe", "nbd", "nbds_max=24"]) + # Wait for systemd-udevd to process the udev rules + _call(["udevadm", "settle", "--timeout=30"]) retries = 0 while True: try: @@ -206,7 +212,17 @@ def connect_nbd(path, exportname): "-name", exportname, ] - _call(cmd) + ret = _call(cmd, raise_err=False, log_err=True) + if NLE_BUSY == ret: + # Although _find_unused_nbd_device tell us the nbd devcie is + # not connected by other nbd-client, it may be opened and locked + # by other process like systemd-udev, raise NbdDeviceNotFound to retry + LOGGER.warning("Device %s is busy, will retry", nbd_device) + raise NbdDeviceNotFound(nbd_device) + + if 0 != ret: + raise subprocess.CalledProcessError(returncode=ret, cmd=cmd) + _wait_for_nbd_device(nbd_device=nbd_device, connected=True) _persist_connect_info(nbd_device, path, exportname) nbd = ( diff --git a/python3/tests/test_nbd_client_manager.py b/python3/tests/test_nbd_client_manager.py index 224a1c3e2ea..b3f439f5442 100644 --- a/python3/tests/test_nbd_client_manager.py +++ b/python3/tests/test_nbd_client_manager.py @@ -43,7 +43,8 @@ def test_nbd_device_connected(self, mock_call, mock_exists): result = nbd_client_manager._is_nbd_device_connected('/dev/nbd0') self.assertTrue(result) - mock_call.assert_called_once_with(["nbd-client", "-check", "/dev/nbd0"], error=False) + mock_call.assert_called_once_with(["nbd-client", "-check", "/dev/nbd0"], + raise_err=False, log_err=False) @patch('nbd_client_manager._call') def test_nbd_device_not_connected(self, mock_call, mock_exists): @@ -53,7 +54,8 @@ def test_nbd_device_not_connected(self, mock_call, mock_exists): result = nbd_client_manager._is_nbd_device_connected('/dev/nbd1') self.assertFalse(result) - mock_call.assert_called_once_with(["nbd-client", "-check", "/dev/nbd1"], error=False) + mock_call.assert_called_once_with(["nbd-client", "-check", "/dev/nbd1"], + raise_err=False, log_err=False) def test_nbd_device_not_found(self, mock_exists): mock_exists.return_value = False