Skip to content

Commit ad5cd12

Browse files
committed
Merge bitcoin/bitcoin#30660: qa: Verify clean shutdown on startup failure
10845cd qa: Add feature_framework_startup_failures.py (Hodlinator) 28e282e qa: assert_raises_message() - Stop assuming certain structure for exceptions (Hodlinator) 1f639ef qa: Work around Python socket timeout issue (Hodlinator) 9b24a40 qa: Only allow calling TestNode.stop() after connecting (Hodlinator) 6ad21b4 qa: Include ignored errors in RPC connection timeout (Hodlinator) 879243e qa refactor: wait_for_rpc_connection - Treat OSErrors the same (Hodlinator) Pull request description: Improves handling of startup errors in functional tests and puts tests in place to ensure knock-on errors don't creep in. - `wait_for_rpc_connection()` now appends specific failures leading up to the `Unable to connect to bitcoind` error to that error message: `[node 0] Unable to connect to bitcoind after 60s (ignored errors: {'missing_credentials': 1, 'OSError.ECONNREFUSED': 239}, latest error: ConnectionRefusedError(111, 'Connection refused'))` - Fixes Windows Python issue where `socket.timeout` exceptions end up with unset `errno`-fields. - Also adds comments, refactors code, improves logging. The underlying purpose is to ensure developer efficiency in finding root causes of test failures. Prior iterations of the PR partially focused on fixing the same issue as #31620. Originally inspired by #30390. ### Testing Can be tested by reverting either faf2f2c or fae3bf6 from #31620, or the "qa: Avoid calling stop-RPC if not connected" from this PR, and running *feature_framework_startup_failures.py*. ACKs for top commit: l0rinc: ACK 10845cd ryanofsky: Code review ACK 10845cd. Only changes since last review were adding a new commit tweaking assert_raises_message(), extending the new test to have a self-check, and to pass through all options to child tests instead of a hardcoded list of options. I left some cleanup suggestions below but they are not important. Tree-SHA512: f0235c5cbb6d1bb85d8dc5de492a08a34f6edc83499cbf0a5f9a3824809ff84635888c62c9c01101e3cc9ef9f1cdee2c9ab6537fea6feeb005b29f428caf8b22
2 parents 1656f6d + 10845cd commit ad5cd12

File tree

4 files changed

+215
-29
lines changed

4 files changed

+215
-29
lines changed
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) 2025-present The Bitcoin Core developers
3+
# Distributed under the MIT software license, see the accompanying
4+
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
5+
"""
6+
Verify framework startup failures only raise one exception since
7+
multiple exceptions being raised muddies the waters of what actually
8+
went wrong. We should maintain this bar of only raising one exception as
9+
long as additional maintenance and complexity is low.
10+
11+
Test relaunches itself into child processes in order to trigger failures
12+
without the parent process' BitcoinTestFramework also failing.
13+
"""
14+
15+
from test_framework.util import (
16+
assert_raises_message,
17+
rpc_port,
18+
)
19+
from test_framework.test_framework import BitcoinTestFramework
20+
21+
from hashlib import md5
22+
from os import linesep
23+
import re
24+
import subprocess
25+
import sys
26+
import time
27+
28+
class FeatureFrameworkStartupFailures(BitcoinTestFramework):
29+
def set_test_params(self):
30+
self.num_nodes = 1
31+
32+
def setup_network(self):
33+
# Don't start the node yet, as we want to measure during run_test.
34+
self.add_nodes(self.num_nodes, self.extra_args)
35+
36+
# Launches a child test process that runs this same file, but instantiates
37+
# a child test. Verifies that it raises only the expected exception, once.
38+
def _verify_startup_failure(self, test, internal_args, expected_exception):
39+
# Inherit args from parent, only modifying tmpdir so children don't fail
40+
# as a cause of colliding with the parent dir.
41+
parent_args = sys.argv.copy()
42+
assert self.options.tmpdir, "Framework should always set tmpdir."
43+
i, path = next(((i, m[1]) for i, arg in enumerate(parent_args) if (m := re.match(r'--tm(?:p(?:d(?:ir?)?)?)?=(.+)', arg))),
44+
(len(parent_args), self.options.tmpdir))
45+
subdir = md5(expected_exception.encode('utf-8')).hexdigest()[:8]
46+
parent_args[i:i + 1] = [f"--tmpdir={path}/{subdir}"]
47+
args = [sys.executable] + parent_args + [f"--internal_test={test.__name__}"] + internal_args
48+
49+
try:
50+
# The subprocess encoding argument gives different results for e.output
51+
# on Linux/Windows, so handle decoding by ourselves for consistency.
52+
output = subprocess.run(args, timeout=60 * self.options.timeout_factor,
53+
stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout.decode("utf-8")
54+
except subprocess.TimeoutExpired as e:
55+
print("Unexpected child process timeout!\n"
56+
"WARNING: Timeouts like this halt execution of TestNode logic, "
57+
"meaning dangling bitcoind processes are to be expected.\n"
58+
f"<CHILD OUTPUT BEGIN>\n{e.output.decode('utf-8')}\n<CHILD OUTPUT END>",
59+
file=sys.stderr)
60+
raise
61+
62+
errors = []
63+
if (n := output.count("Traceback")) != 1:
64+
errors.append(f"Found {n}/1 tracebacks - expecting exactly one with no knock-on exceptions.")
65+
if (n := len(re.findall(expected_exception, output))) != 1:
66+
errors.append(f"Found {n}/1 occurrences of the specific exception: {expected_exception}")
67+
if (n := output.count("Test failed. Test logging available at")) != 1:
68+
errors.append(f"Found {n}/1 test failure output messages.")
69+
70+
assert not errors, f"Child test didn't contain (only) expected errors:\n{linesep.join(errors)}\n<CHILD OUTPUT BEGIN>\n{output}\n<CHILD OUTPUT END>\n"
71+
72+
def run_test(self):
73+
self.log.info("Verifying _verify_startup_failure() functionality (self-check).")
74+
assert_raises_message(
75+
AssertionError,
76+
("Child test didn't contain (only) expected errors:\n" +
77+
linesep.join(["Found 0/1 tracebacks - expecting exactly one with no knock-on exceptions.",
78+
"Found 0/1 occurrences of the specific exception: NonExistentError",
79+
"Found 0/1 test failure output messages."])).encode("unicode_escape").decode("utf-8"),
80+
self._verify_startup_failure,
81+
TestSuccess, [],
82+
"NonExistentError",
83+
)
84+
85+
self.log.info("Parent process is measuring node startup duration in order to obtain a reasonable timeout value for later test...")
86+
node_start_time = time.time()
87+
self.nodes[0].start()
88+
self.nodes[0].wait_for_rpc_connection()
89+
node_start_duration = time.time() - node_start_time
90+
self.nodes[0].stop_node()
91+
self.log.info(f"...measured {node_start_duration:.1f}s.")
92+
93+
self.log.info("Verifying inability to connect to bitcoind's RPC interface due to wrong port results in one exception containing at least one OSError.")
94+
self._verify_startup_failure(
95+
TestWrongRpcPortStartupFailure, [f"--internal_node_start_duration={node_start_duration}"],
96+
r"AssertionError: \[node 0\] Unable to connect to bitcoind after \d+s \(ignored errors: {[^}]*'OSError \w+'?: \d+[^}]*}, latest error: \w+\([^)]+\)\)"
97+
)
98+
99+
self.log.info("Verifying startup failure due to invalid arg results in only one exception.")
100+
self._verify_startup_failure(
101+
TestInitErrorStartupFailure, [],
102+
r"FailedToStartError: \[node 0\] bitcoind exited with status 1 during initialization\. Error: Error parsing command line arguments: Invalid parameter -nonexistentarg"
103+
)
104+
105+
self.log.info("Verifying start() then stop_node() on a node without wait_for_rpc_connection() in between raises an assert.")
106+
self._verify_startup_failure(
107+
TestStartStopStartupFailure, [],
108+
r"AssertionError: \[node 0\] Should only call stop_node\(\) on a running node after wait_for_rpc_connection\(\) succeeded\. Did you forget to call the latter after start\(\)\? Not connected to process: \d+"
109+
)
110+
111+
class InternalTestMixin:
112+
def add_options(self, parser):
113+
# Just here to silence unrecognized argument error, we actually read the value in the if-main at the bottom.
114+
parser.add_argument("--internal_test", dest="internal_never_read", help="ONLY TO BE USED WHEN TEST RELAUNCHES ITSELF")
115+
116+
class TestWrongRpcPortStartupFailure(InternalTestMixin, BitcoinTestFramework):
117+
def add_options(self, parser):
118+
parser.add_argument("--internal_node_start_duration", dest="node_start_duration", help="ONLY TO BE USED WHEN TEST RELAUNCHES ITSELF", type=float)
119+
InternalTestMixin.add_options(self, parser)
120+
121+
def set_test_params(self):
122+
self.num_nodes = 1
123+
# Override RPC listen port to something TestNode isn't expecting so that
124+
# we are unable to establish an RPC connection.
125+
self.extra_args = [[f"-rpcport={rpc_port(2)}"]]
126+
# Override the timeout to avoid waiting unnecessarily long to realize
127+
# nothing is on that port. Divide by timeout_factor to counter
128+
# multiplication in base, 2 * node_start_duration should be enough.
129+
self.rpc_timeout = max(3, 2 * self.options.node_start_duration) / self.options.timeout_factor
130+
131+
def run_test(self):
132+
assert False, "Should have failed earlier during startup."
133+
134+
class TestInitErrorStartupFailure(InternalTestMixin, BitcoinTestFramework):
135+
def set_test_params(self):
136+
self.num_nodes = 1
137+
self.extra_args = [["-nonexistentarg"]]
138+
139+
def run_test(self):
140+
assert False, "Should have failed earlier during startup."
141+
142+
class TestStartStopStartupFailure(InternalTestMixin, BitcoinTestFramework):
143+
def set_test_params(self):
144+
self.num_nodes = 1
145+
146+
def setup_network(self):
147+
self.add_nodes(self.num_nodes, self.extra_args)
148+
self.nodes[0].start()
149+
self.nodes[0].stop_node()
150+
assert False, "stop_node() should raise an exception when we haven't called wait_for_rpc_connection()"
151+
152+
def run_test(self):
153+
assert False, "Should have failed earlier during startup."
154+
155+
class TestSuccess(InternalTestMixin, BitcoinTestFramework):
156+
def set_test_params(self):
157+
self.num_nodes = 1
158+
159+
def setup_network(self):
160+
pass # Don't need to start our node.
161+
162+
def run_test(self):
163+
pass # Just succeed.
164+
165+
166+
if __name__ == '__main__':
167+
if class_name := next((m[1] for arg in sys.argv[1:] if (m := re.match(r'--internal_test=(.+)', arg))), None):
168+
internal_test = globals().get(class_name)
169+
assert internal_test, f"Unrecognized test: {class_name}"
170+
internal_test(__file__).main()
171+
else:
172+
FeatureFrameworkStartupFailures(__file__).main()

test/functional/test_framework/test_node.py

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import decimal
99
import errno
1010
from enum import Enum
11-
import http.client
1211
import json
1312
import logging
1413
import os
@@ -264,6 +263,8 @@ def wait_for_rpc_connection(self, *, wait_for_import=True):
264263
"""Sets up an RPC connection to the bitcoind process. Returns False if unable to connect."""
265264
# Poll at a rate of four times per second
266265
poll_per_s = 4
266+
suppressed_errors = collections.defaultdict(int)
267+
latest_error = ""
267268
for _ in range(poll_per_s * self.rpc_timeout):
268269
if self.process.poll() is not None:
269270
# Attach abrupt shutdown error/s to the exception message
@@ -304,33 +305,46 @@ def wait_for_rpc_connection(self, *, wait_for_import=True):
304305
# overhead is trivial, and the added guarantees are worth
305306
# the minimal performance cost.
306307
self.log.debug("RPC successfully started")
308+
# Set rpc_connected even if we are in use_cli mode so that we know we can call self.stop() if needed.
309+
self.rpc_connected = True
307310
if self.use_cli:
308311
return
309312
self.rpc = rpc
310-
self.rpc_connected = True
311313
self.url = self.rpc.rpc_url
312314
return
313-
except JSONRPCException as e: # Initialization phase
315+
except JSONRPCException as e:
316+
# Suppress these as they are expected during initialization.
314317
# -28 RPC in warmup
315-
# -342 Service unavailable, RPC server started but is shutting down due to error
316-
if e.error['code'] != -28 and e.error['code'] != -342:
318+
# -342 Service unavailable, could be starting up or shutting down
319+
if e.error['code'] not in [-28, -342]:
317320
raise # unknown JSON RPC exception
318-
except ConnectionResetError:
319-
# This might happen when the RPC server is in warmup, but shut down before the call to getblockcount
320-
# succeeds. Try again to properly raise the FailedToStartError
321-
pass
321+
suppressed_errors[f"JSONRPCException {e.error['code']}"] += 1
322+
latest_error = repr(e)
322323
except OSError as e:
323-
if e.errno == errno.ETIMEDOUT:
324-
pass # Treat identical to ConnectionResetError
325-
elif e.errno == errno.ECONNREFUSED:
326-
pass # Port not yet open?
327-
else:
324+
error_num = e.errno
325+
# Work around issue where socket timeouts don't have errno set.
326+
# https://github.com/python/cpython/issues/109601
327+
if error_num is None and isinstance(e, TimeoutError):
328+
error_num = errno.ETIMEDOUT
329+
330+
# Suppress similarly to the above JSONRPCException errors.
331+
if error_num not in [
332+
errno.ECONNRESET, # This might happen when the RPC server is in warmup,
333+
# but shut down before the call to getblockcount succeeds.
334+
errno.ETIMEDOUT, # Treat identical to ECONNRESET
335+
errno.ECONNREFUSED # Port not yet open?
336+
]:
328337
raise # unknown OS error
329-
except ValueError as e: # cookie file not found and no rpcuser or rpcpassword; bitcoind is still starting
338+
suppressed_errors[f"OSError {errno.errorcode[error_num]}"] += 1
339+
latest_error = repr(e)
340+
except ValueError as e:
341+
# Suppress if cookie file isn't generated yet and no rpcuser or rpcpassword; bitcoind may be starting.
330342
if "No RPC credentials" not in str(e):
331343
raise
344+
suppressed_errors["missing_credentials"] += 1
345+
latest_error = repr(e)
332346
time.sleep(1.0 / poll_per_s)
333-
self._raise_assertion_error("Unable to connect to bitcoind after {}s".format(self.rpc_timeout))
347+
self._raise_assertion_error(f"Unable to connect to bitcoind after {self.rpc_timeout}s (ignored errors: {str(dict(suppressed_errors))}, latest error: {latest_error})")
334348

335349
def wait_for_cookie_credentials(self):
336350
"""Ensures auth cookie credentials can be read, e.g. for testing CLI with -rpcwait before RPC connection is up."""
@@ -387,15 +401,15 @@ def stop_node(self, expected_stderr='', *, wait=0, wait_until_stopped=True):
387401
"""Stop the node."""
388402
if not self.running:
389403
return
404+
assert self.rpc_connected, self._node_msg(
405+
"Should only call stop_node() on a running node after wait_for_rpc_connection() succeeded. "
406+
f"Did you forget to call the latter after start()? Not connected to process: {self.process.pid}")
390407
self.log.debug("Stopping node")
391-
try:
392-
# Do not use wait argument when testing older nodes, e.g. in wallet_backwards_compatibility.py
393-
if self.version_is_at_least(180000):
394-
self.stop(wait=wait)
395-
else:
396-
self.stop()
397-
except http.client.CannotSendRequest:
398-
self.log.exception("Unable to stop node.")
408+
# Do not use wait argument when testing older nodes, e.g. in wallet_backwards_compatibility.py
409+
if self.version_is_at_least(180000):
410+
self.stop(wait=wait)
411+
else:
412+
self.stop()
399413

400414
# If there are any running perf processes, stop them.
401415
for profile_name in tuple(self.perf_subprocesses.keys()):

test/functional/test_framework/util.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,9 @@ def assert_raises_message(exc, message, fun, *args, **kwds):
101101
except JSONRPCException:
102102
raise AssertionError("Use assert_raises_rpc_error() to test RPC failures")
103103
except exc as e:
104-
if message is not None and message not in e.error['message']:
105-
raise AssertionError(
106-
"Expected substring not found in error message:\nsubstring: '{}'\nerror message: '{}'.".format(
107-
message, e.error['message']))
104+
if message is not None and message not in repr(e):
105+
raise AssertionError("Expected substring not found in exception:\n"
106+
f"substring: '{message}'\nexception: {repr(e)}.")
108107
except Exception as e:
109108
raise AssertionError("Unexpected exception raised: " + type(e).__name__)
110109
else:

test/functional/test_runner.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/usr/bin/env python3
2-
# Copyright (c) 2014-2022 The Bitcoin Core developers
2+
# Copyright (c) 2014-present The Bitcoin Core developers
33
# Distributed under the MIT software license, see the accompanying
44
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
55
"""Run regression test suite.
@@ -355,6 +355,7 @@
355355
'p2p_handshake.py --v2transport',
356356
'feature_dirsymlinks.py',
357357
'feature_help.py',
358+
'feature_framework_startup_failures.py',
358359
'feature_shutdown.py',
359360
'wallet_migration.py',
360361
'p2p_ibd_txrelay.py',

0 commit comments

Comments
 (0)