Skip to content

Commit 206ccf4

Browse files
Nick Child authored and gregkh committed
ibmvnic: Do partial reset on login failure
commit 23cc5f6 upstream.

Perform a partial reset before sending a login request if any of the following are true:

1. If a previous request times out. This can be dangerous because the VIOS could still receive the old login request at any point after the timeout. Therefore, it is best to re-register the CRQs and sub-CRQs before retrying.

2. If the previous request returns an error that is not described in PAPR. PAPR provides procedures if the login returns with partial success or aborted return codes (section L.5.1) but other values do not have a defined procedure.

Previously, these conditions just returned an error from the login function rather than trying to resolve the issue. This can cause further issues since most callers of the login function are not prepared to handle an error when logging in. This improper cleanup can lead to the device being permanently DOWN'd. For example, if the VIOS believes that the device is already logged in then it will return INVALID_STATE (-7). If we never re-register the CRQs then it will always think that the device is already logged in. This leaves the device inoperable.

The partial reset involves freeing the sub-CRQs, freeing the CRQ, then registering and initializing a new CRQ and sub-CRQs. This essentially restarts all communication with the VIOS to allow for a fresh login attempt that will be unhindered by any previous failed attempts.

Fixes: dff515a ("ibmvnic: Harden device login requests")
Signed-off-by: Nick Child <nnac123@linux.ibm.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20230809221038.51296-4-nnac123@linux.ibm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent 31ccd1b commit 206ccf4

File tree

1 file changed

+40
-6
lines changed

1 file changed

+40
-6
lines changed

drivers/net/ethernet/ibm/ibmvnic.c

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ static int pending_scrq(struct ibmvnic_adapter *,
9696
static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *,
9797
struct ibmvnic_sub_crq_queue *);
9898
static int ibmvnic_poll(struct napi_struct *napi, int data);
99+
static int reset_sub_crq_queues(struct ibmvnic_adapter *adapter);
100+
static inline void reinit_init_done(struct ibmvnic_adapter *adapter);
99101
static void send_query_map(struct ibmvnic_adapter *adapter);
100102
static int send_request_map(struct ibmvnic_adapter *, dma_addr_t, u32, u8);
101103
static int send_request_unmap(struct ibmvnic_adapter *, u8);
@@ -1336,11 +1338,9 @@ static int ibmvnic_login(struct net_device *netdev)
13361338

13371339
if (!wait_for_completion_timeout(&adapter->init_done,
13381340
timeout)) {
1339-
netdev_warn(netdev, "Login timed out, retrying...\n");
1340-
retry = true;
1341-
adapter->init_done_rc = 0;
1342-
retry_count++;
1343-
continue;
1341+
netdev_warn(netdev, "Login timed out\n");
1342+
adapter->login_pending = false;
1343+
goto partial_reset;
13441344
}
13451345

13461346
if (adapter->init_done_rc == ABORTED) {
@@ -1385,7 +1385,41 @@ static int ibmvnic_login(struct net_device *netdev)
13851385
} else if (adapter->init_done_rc) {
13861386
netdev_warn(netdev, "Adapter login failed, init_done_rc = %d\n",
13871387
adapter->init_done_rc);
1388-
return -EIO;
1388+
1389+
partial_reset:
1390+
/* adapter login failed, so free any CRQs or sub-CRQs
1391+
* and register again before attempting to login again.
1392+
* If we don't do this then the VIOS may think that
1393+
* we are already logged in and reject any subsequent
1394+
* attempts
1395+
*/
1396+
netdev_warn(netdev,
1397+
"Freeing and re-registering CRQs before attempting to login again\n");
1398+
retry = true;
1399+
adapter->init_done_rc = 0;
1400+
retry_count++;
1401+
release_sub_crqs(adapter, true);
1402+
reinit_init_done(adapter);
1403+
release_crq_queue(adapter);
1404+
/* If we don't sleep here then we risk an unnecessary
1405+
* failover event from the VIOS. This is a known VIOS
1406+
* issue caused by a vnic device freeing and registering
1407+
* a CRQ too quickly.
1408+
*/
1409+
msleep(1500);
1410+
rc = init_crq_queue(adapter);
1411+
if (rc) {
1412+
netdev_err(netdev, "login recovery: init CRQ failed %d\n",
1413+
rc);
1414+
return -EIO;
1415+
}
1416+
1417+
rc = ibmvnic_reset_init(adapter, false);
1418+
if (rc) {
1419+
netdev_err(netdev, "login recovery: Reset init failed %d\n",
1420+
rc);
1421+
return -EIO;
1422+
}
13891423
}
13901424
} while (retry);
13911425

0 commit comments

Comments
 (0)