Skip to content

Commit 1e0c5bd

Browse files
committed
Merge bitcoin/bitcoin#30125: test: improve BDB parser (handle internal/overflow pages, support all page sizes)
d45eb39 test: compare BDB dumps of test framework parser and wallet tool (Sebastian Falbesoner) 01ddd9f test: complete BDB parser (handle internal/overflow pages, support all page sizes) (Sebastian Falbesoner) Pull request description: This PR adds missing features to our test framework's BDB parser with the goal of hopefully being able to read all legacy wallets that are created with current and past versions of Bitcoin Core. This could be useful both for making review of bitcoin/bitcoin#26606 easier and for possibly improving our functional tests for the wallet BDB-ro parser by additionally validating it with an alternative implementation. The second commit introduces a test that creates a legacy wallet with huge label strings (in order to create overflow pages, i.e. pages needed for key/value data that is larger than the page size) and compares the dump outputs of wallet tool and the extended test framework BDB parser. It can be exercised via `$ ./test/functional/tool_wallet.py --legacy`. BDB support has to be compiled in (obviously). For some manual tests regarding different page sizes, the following patch can be used: ```diff diff --git a/src/wallet/bdb.cpp b/src/wallet/bdb.cpp index 38cca32..1bf39323d3 100644 --- a/src/wallet/bdb.cpp +++ b/src/wallet/bdb.cpp @@ -395,6 +395,7 @@ void BerkeleyDatabase::Open() DB_BTREE, // Database type nFlags, // Flags 0); + pdb_temp->set_pagesize(1<<9); /* valid BDB pagesizes are from 1<<9 (=512) to 1<<16 (=65536) */ if (ret != 0) { throw std::runtime_error(strprintf("BerkeleyDatabase: Error %d, can't open database %s", ret, strFile)); ``` I verified that the newly introduced test passes with all valid page sizes between 512 and 65536. ACKs for top commit: achow101: ACK d45eb39 furszy: utACK d45eb39 brunoerg: code review ACK d45eb39 Tree-SHA512: 9f8ac80452545f4fcd24a17ea6f9cf91b487cfb1fcb99a0ba9153fa4e3b239daa126454e26109fdcb72eb1c76a4ee3b46fd6af21dc318ab67bd12b3ebd26cfdd
2 parents 1d6c6e9 + d45eb39 commit 1e0c5bd

File tree

2 files changed

+144
-39
lines changed

2 files changed

+144
-39
lines changed

test/functional/test_framework/bdb.py

Lines changed: 98 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -6,44 +6,55 @@
66
Utilities for working directly with the wallet's BDB database file
77
88
This is specific to the configuration of BDB used in this project:
9-
- pagesize: 4096 bytes
109
- Outer database contains single subdatabase named 'main'
1110
- btree
12-
- btree leaf pages
11+
- btree internal, leaf and overflow pages
1312
14-
Each key-value pair is two entries in a btree leaf. The first is the key, the one that follows
13+
Each key-value pair is two entries in a btree leaf, which optionally refers to overflow pages
14+
if the data doesn't fit into a single page. The first entry is the key, the one that follows
1515
is the value. And so on. Note that the entry data is itself not in the correct order. Instead
1616
entry offsets are stored in the correct order and those offsets are needed to then retrieve
17-
the data itself.
17+
the data itself. Note that this implementation currently only supports reading databases that
18+
are in the same endianness as the host.
1819
1920
Page format can be found in BDB source code dbinc/db_page.h
20-
This only implements the deserialization of btree metadata pages and normal btree pages. Overflow
21-
pages are not implemented but may be needed in the future if dealing with wallets with large
22-
transactions.
2321
2422
`db_dump -da wallet.dat` is useful to see the data in a wallet.dat BDB file
2523
"""
2624

2725
import struct
2826

2927
# Important constants
30-
PAGESIZE = 4096
28+
PAGE_HEADER_SIZE = 26
3129
OUTER_META_PAGE = 0
32-
INNER_META_PAGE = 2
3330

3431
# Page type values
3532
BTREE_INTERNAL = 3
3633
BTREE_LEAF = 5
34+
OVERFLOW_DATA = 7
3735
BTREE_META = 9
3836

37+
# Record type values
38+
RECORD_KEYDATA = 1
39+
RECORD_OVERFLOW_DATA = 3
40+
3941
# Some magic numbers for sanity checking
4042
BTREE_MAGIC = 0x053162
4143
DB_VERSION = 9
42-
43-
# Deserializes a leaf page into a dict.
44-
# Btree internal pages have the same header, for those, return None.
45-
# For the btree leaf pages, deserialize them and put all the data into a dict
46-
def dump_leaf_page(data):
44+
SUBDATABASE_NAME = b'main'
45+
46+
# Deserializes an internal, leaf or overflow page into a dict.
47+
# In addition to the common page header fields, the result contains an 'entries'
48+
# array of dicts with the following fields, depending on the page type:
49+
# internal page [BTREE_INTERNAL]:
50+
# - 'page_num': referenced page number (used to find further pages to process)
51+
# leaf page [BTREE_LEAF]:
52+
# - 'record_type': record type, must be RECORD_KEYDATA or RECORD_OVERFLOW_DATA
53+
# - 'data': binary data (key or value payload), if record type is RECORD_KEYDATA
54+
# - 'page_num': referenced overflow page number, if record type is RECORD_OVERFLOW_DATA
55+
# overflow page [OVERFLOW_DATA]:
56+
# - 'data': binary data (part of key or value payload)
57+
def dump_page(data):
4758
page_info = {}
4859
page_header = data[0:26]
4960
_, pgno, prev_pgno, next_pgno, entries, hf_offset, level, pg_type = struct.unpack('QIIIHHBB', page_header)
@@ -56,20 +67,35 @@ def dump_leaf_page(data):
5667
page_info['entry_offsets'] = struct.unpack('{}H'.format(entries), data[26:26 + entries * 2])
5768
page_info['entries'] = []
5869

59-
if pg_type == BTREE_INTERNAL:
60-
# Skip internal pages. These are the internal nodes of the btree and don't contain anything relevant to us
61-
return None
70+
assert pg_type in (BTREE_INTERNAL, BTREE_LEAF, OVERFLOW_DATA)
6271

63-
assert pg_type == BTREE_LEAF, 'A non-btree leaf page has been encountered while dumping leaves'
72+
if pg_type == OVERFLOW_DATA:
73+
assert entries == 1
74+
page_info['entries'].append({'data': data[26:26 + hf_offset]})
75+
return page_info
6476

6577
for i in range(0, entries):
78+
entry = {}
6679
offset = page_info['entry_offsets'][i]
67-
entry = {'offset': offset}
68-
page_data_header = data[offset:offset + 3]
69-
e_len, pg_type = struct.unpack('HB', page_data_header)
70-
entry['len'] = e_len
71-
entry['pg_type'] = pg_type
72-
entry['data'] = data[offset + 3:offset + 3 + e_len]
80+
record_header = data[offset:offset + 3]
81+
offset += 3
82+
e_len, record_type = struct.unpack('HB', record_header)
83+
84+
if pg_type == BTREE_INTERNAL:
85+
assert record_type == RECORD_KEYDATA
86+
internal_record_data = data[offset:offset + 9]
87+
_, page_num, _ = struct.unpack('=BII', internal_record_data)
88+
entry['page_num'] = page_num
89+
elif pg_type == BTREE_LEAF:
90+
assert record_type in (RECORD_KEYDATA, RECORD_OVERFLOW_DATA)
91+
entry['record_type'] = record_type
92+
if record_type == RECORD_KEYDATA:
93+
entry['data'] = data[offset:offset + e_len]
94+
elif record_type == RECORD_OVERFLOW_DATA:
95+
overflow_record_data = data[offset:offset + 9]
96+
_, page_num, _ = struct.unpack('=BII', overflow_record_data)
97+
entry['page_num'] = page_num
98+
7399
page_info['entries'].append(entry)
74100

75101
return page_info
@@ -115,37 +141,70 @@ def dump_meta_page(page):
115141
return metadata
116142

117143
# Given the dict from dump_leaf_page, get the key-value pairs and put them into a dict
118-
def extract_kv_pairs(page_data):
144+
def extract_kv_pairs(page_data, pages):
119145
out = {}
120146
last_key = None
121147
for i, entry in enumerate(page_data['entries']):
148+
data = b''
149+
if entry['record_type'] == RECORD_KEYDATA:
150+
data = entry['data']
151+
elif entry['record_type'] == RECORD_OVERFLOW_DATA:
152+
next_page = entry['page_num']
153+
while next_page != 0:
154+
opage = pages[next_page]
155+
opage_info = dump_page(opage)
156+
data += opage_info['entries'][0]['data']
157+
next_page = opage_info['next_pgno']
158+
122159
# By virtue of these all being pairs, even number entries are keys, and odd are values
123160
if i % 2 == 0:
124161
out[entry['data']] = b''
125-
last_key = entry['data']
162+
last_key = data
126163
else:
127-
out[last_key] = entry['data']
164+
out[last_key] = data
128165
return out
129166

130167
# Extract the key-value pairs of the BDB file given in filename
131168
def dump_bdb_kv(filename):
132169
# Read in the BDB file and start deserializing it
133170
pages = []
134171
with open(filename, 'rb') as f:
135-
data = f.read(PAGESIZE)
136-
while len(data) > 0:
137-
pages.append(data)
138-
data = f.read(PAGESIZE)
172+
# Determine pagesize first
173+
data = f.read(PAGE_HEADER_SIZE)
174+
pagesize = struct.unpack('I', data[20:24])[0]
175+
assert pagesize in (512, 1024, 2048, 4096, 8192, 16384, 32768, 65536)
139176

140-
# Sanity check the meta pages
141-
dump_meta_page(pages[OUTER_META_PAGE])
142-
dump_meta_page(pages[INNER_META_PAGE])
177+
# Read rest of first page
178+
data += f.read(pagesize - PAGE_HEADER_SIZE)
179+
assert len(data) == pagesize
143180

144-
# Fetch the kv pairs from the leaf pages
181+
# Read all remaining pages
182+
while len(data) > 0:
183+
pages.append(data)
184+
data = f.read(pagesize)
185+
186+
# Sanity check the meta pages, read root page
187+
outer_meta_info = dump_meta_page(pages[OUTER_META_PAGE])
188+
root_page_info = dump_page(pages[outer_meta_info['root']])
189+
assert root_page_info['pg_type'] == BTREE_LEAF
190+
assert len(root_page_info['entries']) == 2
191+
assert root_page_info['entries'][0]['data'] == SUBDATABASE_NAME
192+
assert len(root_page_info['entries'][1]['data']) == 4
193+
inner_meta_page = int.from_bytes(root_page_info['entries'][1]['data'], 'big')
194+
inner_meta_info = dump_meta_page(pages[inner_meta_page])
195+
196+
# Fetch the kv pairs from the pages
145197
kv = {}
146-
for i in range(3, len(pages)):
147-
info = dump_leaf_page(pages[i])
148-
if info is not None:
149-
info_kv = extract_kv_pairs(info)
198+
pages_to_process = [inner_meta_info['root']]
199+
while len(pages_to_process) > 0:
200+
curr_page_no = pages_to_process.pop()
201+
assert curr_page_no <= outer_meta_info['last_pgno']
202+
info = dump_page(pages[curr_page_no])
203+
assert info['pg_type'] in (BTREE_INTERNAL, BTREE_LEAF)
204+
if info['pg_type'] == BTREE_INTERNAL:
205+
for entry in info['entries']:
206+
pages_to_process.append(entry['page_num'])
207+
elif info['pg_type'] == BTREE_LEAF:
208+
info_kv = extract_kv_pairs(info, pages)
150209
kv = {**kv, **info_kv}
151210
return kv

test/functional/tool_wallet.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,23 @@
66

77
import os
88
import platform
9+
import random
910
import stat
11+
import string
1012
import subprocess
1113
import textwrap
1214

1315
from collections import OrderedDict
1416

17+
from test_framework.bdb import dump_bdb_kv
18+
from test_framework.messages import ser_string
1519
from test_framework.test_framework import BitcoinTestFramework
1620
from test_framework.util import (
1721
assert_equal,
1822
assert_greater_than,
1923
sha256sum_file,
2024
)
25+
from test_framework.wallet import getnewdestination
2126

2227

2328
class ToolWalletTest(BitcoinTestFramework):
@@ -545,6 +550,44 @@ def test_dump_unclean_lsns(self):
545550
self.stop_node(0)
546551
self.assert_tool_output("The dumpfile may contain private keys. To ensure the safety of your Bitcoin, do not share the dumpfile.\n", "-wallet=unclean_lsn", f"-dumpfile={wallet_dump}", "dump")
547552

553+
def test_compare_legacy_dump_with_framework_bdb_parser(self):
554+
self.log.info("Verify that legacy wallet database dump matches the one from the test framework's BDB parser")
555+
wallet_name = "bdb_ro_test"
556+
self.start_node(0)
557+
# add some really large labels (above twice the largest valid page size) to create BDB overflow pages
558+
self.nodes[0].createwallet(wallet_name)
559+
wallet_rpc = self.nodes[0].get_wallet_rpc(wallet_name)
560+
generated_labels = {}
561+
for i in range(10):
562+
address = getnewdestination()[2]
563+
large_label = ''.join([random.choice(string.ascii_letters) for _ in range(150000)])
564+
wallet_rpc.setlabel(address, large_label)
565+
generated_labels[address] = large_label
566+
# fill the keypool to create BDB internal pages
567+
wallet_rpc.keypoolrefill(1000)
568+
self.stop_node(0)
569+
570+
wallet_dumpfile = self.nodes[0].datadir_path / "bdb_ro_test.dump"
571+
self.assert_tool_output("The dumpfile may contain private keys. To ensure the safety of your Bitcoin, do not share the dumpfile.\n", "-wallet={}".format(wallet_name), "-dumpfile={}".format(wallet_dumpfile), "dump")
572+
573+
expected_dump = self.read_dump(wallet_dumpfile)
574+
# remove extra entries from wallet tool dump that are not actual key/value pairs from the database
575+
del expected_dump['BITCOIN_CORE_WALLET_DUMP']
576+
del expected_dump['format']
577+
del expected_dump['checksum']
578+
bdb_ro_parser_dump_raw = dump_bdb_kv(self.nodes[0].wallets_path / wallet_name / "wallet.dat")
579+
bdb_ro_parser_dump = OrderedDict()
580+
assert any([len(bytes.fromhex(value)) >= 150000 for value in expected_dump.values()])
581+
for key, value in sorted(bdb_ro_parser_dump_raw.items()):
582+
bdb_ro_parser_dump[key.hex()] = value.hex()
583+
assert_equal(bdb_ro_parser_dump, expected_dump)
584+
585+
# check that all labels were created with the correct address
586+
for address, label in generated_labels.items():
587+
key_bytes = b'\x04name' + ser_string(address.encode())
588+
assert key_bytes in bdb_ro_parser_dump_raw
589+
assert_equal(bdb_ro_parser_dump_raw[key_bytes], ser_string(label.encode()))
590+
548591
def run_test(self):
549592
self.wallet_path = self.nodes[0].wallets_path / self.default_wallet_name / self.wallet_data_filename
550593
self.test_invalid_tool_commands_and_args()
@@ -561,6 +604,9 @@ def run_test(self):
561604
self.test_dump_createfromdump()
562605
self.test_chainless_conflicts()
563606
self.test_dump_very_large_records()
607+
if not self.options.descriptors and self.is_bdb_compiled() and not self.options.swap_bdb_endian:
608+
self.test_compare_legacy_dump_with_framework_bdb_parser()
609+
564610

565611
if __name__ == '__main__':
566612
ToolWalletTest(__file__).main()

0 commit comments

Comments
 (0)