Skip to content

Commit 28c70ac

Browse files
committed
Ensure all HTLCs for a claimed payment are claimed on startup
While the HTLC-claim process happens across all MPP parts under one lock, this doesn't imply that they are claimed fully atomically on disk. Ultimately, an application can crash after persisting one `ChannelMonitorUpdate` out of multiple monitor updates needed for the full claim. Previously, this would leave us in a very bad state - because of the all-channels-available check in `claim_funds` we'd refuse to claim the payment again on restart (even though the `PaymentReceived` event will be passed to the user again), and we'd end up having partially claimed the payment! The fix for the consistency part of this issue is pretty straightforward - just check for this condition on startup and complete the claim across all channels/`ChannelMonitor`s if we detect it. This still leaves us in a confused state from the perspective of the user, however - we've actually claimed a payment but when they call `claim_funds` we return `false` indicating it could not be claimed.
1 parent bd1e20d commit 28c70ac

File tree

5 files changed

+252
-5
lines changed

5 files changed

+252
-5
lines changed

lightning/src/chain/channelmonitor.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1085,7 +1085,8 @@ impl<Signer: Sign> ChannelMonitor<Signer> {
10851085
self.inner.lock().unwrap().provide_latest_holder_commitment_tx(holder_commitment_tx, htlc_outputs).map_err(|_| ())
10861086
}
10871087

1088-
#[cfg(test)]
1088+
/// This is used to provide payment preimage(s) out-of-band during startup without updating the
1089+
/// off-chain state with a new commitment transaction.
10891090
pub(crate) fn provide_payment_preimage<B: Deref, F: Deref, L: Deref>(
10901091
&self,
10911092
payment_hash: &PaymentHash,
@@ -1631,6 +1632,10 @@ impl<Signer: Sign> ChannelMonitor<Signer> {
16311632

16321633
res
16331634
}
1635+
1636+
pub(crate) fn get_stored_preimages(&self) -> HashMap<PaymentHash, PaymentPreimage> {
1637+
self.inner.lock().unwrap().payment_preimages.clone()
1638+
}
16341639
}
16351640

16361641
/// Compares a broadcasted commitment transaction's HTLCs with those in the latest state,

lightning/src/ln/channel.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1703,6 +1703,28 @@ impl<Signer: Sign> Channel<Signer> {
17031703
make_funding_redeemscript(&self.get_holder_pubkeys().funding_pubkey, self.counterparty_funding_pubkey())
17041704
}
17051705

1706+
/// Claims an HTLC while we're disconnected from a peer, dropping the ChannelMonitorUpdate
1707+
/// entirely.
1708+
///
1709+
/// The ChannelMonitor for this channel MUST be updated out-of-band with the preimage provided
1710+
/// (i.e. without calling [`crate::chain::Watch::update_channel`]).
1711+
///
1712+
/// The HTLC claim will end up in the holding cell (because the caller must ensure the peer is
1713+
/// disconnected).
1714+
pub fn claim_htlc_while_disconnected_dropping_mon_update<L: Deref>
1715+
(&mut self, htlc_id_arg: u64, payment_preimage_arg: PaymentPreimage, logger: &L)
1716+
where L::Target: Logger {
1717+
// Assert that we'll add the HTLC claim to the holding cell in `get_update_fulfill_htlc`
1718+
// (see equivalent if condition there).
1719+
assert!(self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::PeerDisconnected as u32 | ChannelState::MonitorUpdateFailed as u32) != 0);
1720+
let mon_update_id = self.latest_monitor_update_id; // Forget the ChannelMonitor update
1721+
let fulfill_resp = self.get_update_fulfill_htlc(htlc_id_arg, payment_preimage_arg, logger);
1722+
self.latest_monitor_update_id = mon_update_id;
1723+
if let UpdateFulfillFetch::NewClaim { msg, .. } = fulfill_resp {
1724+
assert!(msg.is_none()); // The HTLC must have ended up in the holding cell.
1725+
}
1726+
}
1727+
17061728
fn get_update_fulfill_htlc<L: Deref>(&mut self, htlc_id_arg: u64, payment_preimage_arg: PaymentPreimage, logger: &L) -> UpdateFulfillFetch where L::Target: Logger {
17071729
// Either ChannelFunded got set (which means it won't be unset) or there is no way any
17081730
// caller thought we could have something claimed (cause we wouldn't have accepted in an
@@ -1765,6 +1787,10 @@ impl<Signer: Sign> Channel<Signer> {
17651787
};
17661788

17671789
if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::PeerDisconnected as u32 | ChannelState::MonitorUpdateFailed as u32)) != 0 {
1790+
// Note that this condition is the same as the assertion in
1791+
// `claim_htlc_while_disconnected_dropping_mon_update` and must match exactly -
1792+
// `claim_htlc_while_disconnected_dropping_mon_update` would not work correctly if we
1793+
// do not not get into this branch.
17681794
for pending_update in self.holding_cell_htlc_updates.iter() {
17691795
match pending_update {
17701796
&HTLCUpdateAwaitingACK::ClaimHTLC { htlc_id, .. } => {

lightning/src/ln/channelmanager.rs

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6698,7 +6698,7 @@ impl<'a, Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref>
66986698
// payments which are still in-flight via their on-chain state.
66996699
// We only rebuild the pending payments map if we were most recently serialized by
67006700
// 0.0.102+
6701-
for (_, monitor) in args.channel_monitors {
6701+
for (_, monitor) in args.channel_monitors.iter() {
67026702
if by_id.get(&monitor.get_funding_txo().0.to_channel_id()).is_none() {
67036703
for (htlc_source, htlc) in monitor.get_pending_outbound_htlcs() {
67046704
if let HTLCSource::OutboundRoute { payment_id, session_priv, path, payment_secret, .. } = htlc_source {
@@ -6824,6 +6824,38 @@ impl<'a, Signer: Sign, M: Deref, T: Deref, K: Deref, F: Deref, L: Deref>
68246824
}
68256825
}
68266826

6827+
for (_, monitor) in args.channel_monitors.iter() {
6828+
for (payment_hash, payment_preimage) in monitor.get_stored_preimages() {
6829+
if let Some(claimable_htlcs) = claimable_htlcs.remove(&payment_hash) {
6830+
log_info!(args.logger, "Re-claimaing HTLCs with payment hash {} due to partial-claim.", log_bytes!(payment_hash.0));
6831+
for claimable_htlc in claimable_htlcs.1 {
6832+
// Add a holding-cell claim of the payment to the Channel, which should be
6833+
// applied ~immediately on peer reconnection. Because it won't generate a
6834+
// new commitment transaction we can just provide the payment preimage to
6835+
// the corresponding ChannelMonitor and nothing else.
6836+
//
6837+
// We do so directly instead of via the normal ChannelMonitor update
6838+
// procedure as the ChainMonitor hasn't yet been initialized, implying
6839+
// we're not allowed to call it directly yet. Further, we do the update
6840+
// without incrementing the ChannelMonitor update ID as there isn't any
6841+
// reason to.
6842+
// If we were to generate a new ChannelMonitor update ID here and then
6843+
// crash before the user finishes block connect we'd end up force-closing
6844+
// this channel as well. On the flip side, there's no harm in restarting
6845+
// without the new monitor persisted - we'll end up right back here on
6846+
// restart.
6847+
let previous_channel_id = claimable_htlc.prev_hop.outpoint.to_channel_id();
6848+
if let Some(channel) = by_id.get_mut(&previous_channel_id) {
6849+
channel.claim_htlc_while_disconnected_dropping_mon_update(claimable_htlc.prev_hop.htlc_id, payment_preimage, &args.logger);
6850+
}
6851+
if let Some(previous_hop_monitor) = args.channel_monitors.get(&claimable_htlc.prev_hop.outpoint) {
6852+
previous_hop_monitor.provide_payment_preimage(&payment_hash, &payment_preimage, &args.tx_broadcaster, &args.fee_estimator, &args.logger);
6853+
}
6854+
}
6855+
}
6856+
}
6857+
}
6858+
68276859
let channel_manager = ChannelManager {
68286860
genesis_hash,
68296861
fee_estimator: args.fee_estimator,

lightning/src/ln/functional_test_utils.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1476,7 +1476,7 @@ pub fn send_along_route_with_secret<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>,
14761476
payment_id
14771477
}
14781478

1479-
pub fn pass_along_path<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_path: &[&Node<'a, 'b, 'c>], recv_value: u64, our_payment_hash: PaymentHash, our_payment_secret: Option<PaymentSecret>, ev: MessageSendEvent, payment_received_expected: bool, expected_preimage: Option<PaymentPreimage>) {
1479+
pub fn do_pass_along_path<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_path: &[&Node<'a, 'b, 'c>], recv_value: u64, our_payment_hash: PaymentHash, our_payment_secret: Option<PaymentSecret>, ev: MessageSendEvent, payment_received_expected: bool, clear_recipient_events: bool, expected_preimage: Option<PaymentPreimage>) {
14801480
let mut payment_event = SendEvent::from_event(ev);
14811481
let mut prev_node = origin_node;
14821482

@@ -1489,7 +1489,7 @@ pub fn pass_along_path<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_path
14891489

14901490
expect_pending_htlcs_forwardable!(node);
14911491

1492-
if idx == expected_path.len() - 1 {
1492+
if idx == expected_path.len() - 1 && clear_recipient_events {
14931493
let events_2 = node.node.get_and_clear_pending_events();
14941494
if payment_received_expected {
14951495
assert_eq!(events_2.len(), 1);
@@ -1513,7 +1513,7 @@ pub fn pass_along_path<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_path
15131513
} else {
15141514
assert!(events_2.is_empty());
15151515
}
1516-
} else {
1516+
} else if idx != expected_path.len() - 1 {
15171517
let mut events_2 = node.node.get_and_clear_pending_msg_events();
15181518
assert_eq!(events_2.len(), 1);
15191519
check_added_monitors!(node, 1);
@@ -1525,6 +1525,10 @@ pub fn pass_along_path<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_path
15251525
}
15261526
}
15271527

1528+
pub fn pass_along_path<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_path: &[&Node<'a, 'b, 'c>], recv_value: u64, our_payment_hash: PaymentHash, our_payment_secret: Option<PaymentSecret>, ev: MessageSendEvent, payment_received_expected: bool, expected_preimage: Option<PaymentPreimage>) {
1529+
do_pass_along_path(origin_node, expected_path, recv_value, our_payment_hash, our_payment_secret, ev, payment_received_expected, true, expected_preimage);
1530+
}
1531+
15281532
pub fn pass_along_route<'a, 'b, 'c>(origin_node: &Node<'a, 'b, 'c>, expected_route: &[&[&Node<'a, 'b, 'c>]], recv_value: u64, our_payment_hash: PaymentHash, our_payment_secret: PaymentSecret) {
15291533
let mut events = origin_node.node.get_and_clear_pending_msg_events();
15301534
assert_eq!(events.len(), expected_route.len());

lightning/src/ln/functional_tests.rs

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9843,6 +9843,186 @@ fn test_keysend_payments_to_private_node() {
98439843
claim_payment(&nodes[0], &path, test_preimage);
98449844
}
98459845

9846+
fn do_test_partial_claim_before_restart(persist_both_monitors: bool) {
9847+
// Test what happens if a node receives an MPP payment, claims it, but crashes before
9848+
// persisting the ChannelManager. If `persist_both_monitors` is false, also crash after only
9849+
// updating one of the two channels' ChannelMonitors. As a result, on startup, we'll (a) still
9850+
// have the PaymentReceived event, (b) have one (or two) channel(s) that goes on chain with the
9851+
// HTLC preimage in them, and (c) optionally have one channel that is live off-chain but does
9852+
// not have the preimage tied to the still-pending HTLC.
9853+
//
9854+
// To get to the correct state, on startup we should propagate the preimage to the
9855+
// still-off-chain channel, claiming the HTLC as soon as the peer connects, with the monitor
9856+
// receiving the preimage without a state update.
9857+
let chanmon_cfgs = create_chanmon_cfgs(4);
9858+
let node_cfgs = create_node_cfgs(4, &chanmon_cfgs);
9859+
let node_chanmgrs = create_node_chanmgrs(4, &node_cfgs, &[None, None, None, None]);
9860+
9861+
let persister: test_utils::TestPersister;
9862+
let new_chain_monitor: test_utils::TestChainMonitor;
9863+
let nodes_3_deserialized: ChannelManager<EnforcingSigner, &test_utils::TestChainMonitor, &test_utils::TestBroadcaster, &test_utils::TestKeysInterface, &test_utils::TestFeeEstimator, &test_utils::TestLogger>;
9864+
9865+
let mut nodes = create_network(4, &node_cfgs, &node_chanmgrs);
9866+
9867+
create_announced_chan_between_nodes_with_value(&nodes, 0, 1, 100_000, 0, InitFeatures::known(), InitFeatures::known());
9868+
create_announced_chan_between_nodes_with_value(&nodes, 0, 2, 100_000, 0, InitFeatures::known(), InitFeatures::known());
9869+
let chan_id_persisted = create_announced_chan_between_nodes_with_value(&nodes, 1, 3, 100_000, 0, InitFeatures::known(), InitFeatures::known()).2;
9870+
let chan_id_not_persisted = create_announced_chan_between_nodes_with_value(&nodes, 2, 3, 100_000, 0, InitFeatures::known(), InitFeatures::known()).2;
9871+
9872+
// Create an MPP route for 15k sats, more than the default htlc-max of 10%
9873+
let (mut route, payment_hash, payment_preimage, payment_secret) = get_route_and_payment_hash!(nodes[0], nodes[3], 15_000_000);
9874+
assert_eq!(route.paths.len(), 2);
9875+
route.paths.sort_by(|path_a, _| {
9876+
// Sort the path so that the path through nodes[1] comes first
9877+
if path_a[0].pubkey == nodes[1].node.get_our_node_id() {
9878+
core::cmp::Ordering::Less } else { core::cmp::Ordering::Greater }
9879+
});
9880+
9881+
nodes[0].node.send_payment(&route, payment_hash, &Some(payment_secret)).unwrap();
9882+
check_added_monitors!(nodes[0], 2);
9883+
9884+
// Send the payment through to nodes[3] *without* clearing the PaymentReceived event
9885+
let mut send_events = nodes[0].node.get_and_clear_pending_msg_events();
9886+
assert_eq!(send_events.len(), 2);
9887+
do_pass_along_path(&nodes[0], &[&nodes[1], &nodes[3]], 15_000_000, payment_hash, Some(payment_secret), send_events[0].clone(), true, false, None);
9888+
do_pass_along_path(&nodes[0], &[&nodes[2], &nodes[3]], 15_000_000, payment_hash, Some(payment_secret), send_events[1].clone(), true, false, None);
9889+
9890+
// Now that we have an MPP payment pending, get the latest encoded copies of nodes[3]'s
9891+
// monitors and ChannelManager, for use later, if we don't want to persist both monitors.
9892+
let mut original_monitor = test_utils::TestVecWriter(Vec::new());
9893+
if !persist_both_monitors {
9894+
for outpoint in nodes[3].chain_monitor.chain_monitor.list_monitors() {
9895+
if outpoint.to_channel_id() == chan_id_not_persisted {
9896+
assert!(original_monitor.0.is_empty());
9897+
nodes[3].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap().write(&mut original_monitor).unwrap();
9898+
}
9899+
}
9900+
}
9901+
9902+
let mut original_manager = test_utils::TestVecWriter(Vec::new());
9903+
nodes[3].node.write(&mut original_manager).unwrap();
9904+
9905+
expect_payment_received!(nodes[3], payment_hash, payment_secret, 15_000_000);
9906+
9907+
nodes[3].node.claim_funds(payment_preimage);
9908+
check_added_monitors!(nodes[3], 2);
9909+
9910+
// Now fetch one of the two updated ChannelMonitors from nodes[3], and restart pretending we
9911+
// crashed in between the two persistence calls - using one old ChannelMonitor and one new one,
9912+
// with the old ChannelManager.
9913+
let mut updated_monitor = test_utils::TestVecWriter(Vec::new());
9914+
for outpoint in nodes[3].chain_monitor.chain_monitor.list_monitors() {
9915+
if outpoint.to_channel_id() == chan_id_persisted {
9916+
assert!(updated_monitor.0.is_empty());
9917+
nodes[3].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap().write(&mut updated_monitor).unwrap();
9918+
}
9919+
}
9920+
// If `persist_both_monitors` is set, get the second monitor here as well
9921+
if persist_both_monitors {
9922+
for outpoint in nodes[3].chain_monitor.chain_monitor.list_monitors() {
9923+
if outpoint.to_channel_id() == chan_id_not_persisted {
9924+
assert!(original_monitor.0.is_empty());
9925+
nodes[3].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap().write(&mut original_monitor).unwrap();
9926+
}
9927+
}
9928+
}
9929+
9930+
// Now restart nodes[3].
9931+
persister = test_utils::TestPersister::new();
9932+
let keys_manager = &chanmon_cfgs[3].keys_manager;
9933+
new_chain_monitor = test_utils::TestChainMonitor::new(Some(nodes[3].chain_source), nodes[3].tx_broadcaster.clone(), nodes[3].logger, node_cfgs[3].fee_estimator, &persister, keys_manager);
9934+
nodes[3].chain_monitor = &new_chain_monitor;
9935+
let mut monitors = Vec::new();
9936+
for mut monitor_data in [original_monitor, updated_monitor].iter() {
9937+
let (_, mut deserialized_monitor) = <(BlockHash, ChannelMonitor<EnforcingSigner>)>::read(&mut &monitor_data.0[..], keys_manager).unwrap();
9938+
monitors.push(deserialized_monitor);
9939+
}
9940+
9941+
let config = UserConfig::default();
9942+
nodes_3_deserialized = {
9943+
let mut channel_monitors = HashMap::new();
9944+
for monitor in monitors.iter_mut() {
9945+
channel_monitors.insert(monitor.get_funding_txo().0, monitor);
9946+
}
9947+
<(BlockHash, ChannelManager<EnforcingSigner, &test_utils::TestChainMonitor, &test_utils::TestBroadcaster, &test_utils::TestKeysInterface, &test_utils::TestFeeEstimator, &test_utils::TestLogger>)>::read(&mut &original_manager.0[..], ChannelManagerReadArgs {
9948+
default_config: config,
9949+
keys_manager,
9950+
fee_estimator: node_cfgs[3].fee_estimator,
9951+
chain_monitor: nodes[3].chain_monitor,
9952+
tx_broadcaster: nodes[3].tx_broadcaster.clone(),
9953+
logger: nodes[3].logger,
9954+
channel_monitors,
9955+
}).unwrap().1
9956+
};
9957+
nodes[3].node = &nodes_3_deserialized;
9958+
9959+
for monitor in monitors {
9960+
// On startup the preimage should have been copied into the non-persisted monitor:
9961+
assert!(monitor.get_stored_preimages().contains_key(&payment_hash));
9962+
nodes[3].chain_monitor.watch_channel(monitor.get_funding_txo().0.clone(), monitor).unwrap();
9963+
}
9964+
check_added_monitors!(nodes[3], 2);
9965+
9966+
nodes[1].node.peer_disconnected(&nodes[3].node.get_our_node_id(), false);
9967+
nodes[2].node.peer_disconnected(&nodes[3].node.get_our_node_id(), false);
9968+
9969+
// During deserialization, we should have closed one channel and broadcast its latest
9970+
// commitment transaction. We should also still have the original PaymentReceived event we
9971+
// never finished processing.
9972+
let events = nodes[3].node.get_and_clear_pending_events();
9973+
assert_eq!(events.len(), if persist_both_monitors { 3 } else { 2 });
9974+
if let Event::PaymentReceived { amt: 15_000_000, .. } = events[0] { } else { panic!(); }
9975+
if let Event::ChannelClosed { reason: ClosureReason::OutdatedChannelManager, .. } = events[1] { } else { panic!(); }
9976+
if persist_both_monitors {
9977+
if let Event::ChannelClosed { reason: ClosureReason::OutdatedChannelManager, .. } = events[2] { } else { panic!(); }
9978+
}
9979+
9980+
assert_eq!(nodes[3].node.list_channels().len(), if persist_both_monitors { 0 } else { 1 });
9981+
if !persist_both_monitors {
9982+
// If one of the two channels is still live, reveal the payment preimage over it.
9983+
9984+
nodes[3].node.peer_connected(&nodes[2].node.get_our_node_id(), &msgs::Init { features: InitFeatures::empty(), remote_network_address: None });
9985+
let reestablish_1 = get_chan_reestablish_msgs!(nodes[3], nodes[2]);
9986+
nodes[2].node.peer_connected(&nodes[3].node.get_our_node_id(), &msgs::Init { features: InitFeatures::empty(), remote_network_address: None });
9987+
let reestablish_2 = get_chan_reestablish_msgs!(nodes[2], nodes[3]);
9988+
9989+
nodes[2].node.handle_channel_reestablish(&nodes[3].node.get_our_node_id(), &reestablish_1[0]);
9990+
get_event_msg!(nodes[2], MessageSendEvent::SendChannelUpdate, nodes[3].node.get_our_node_id());
9991+
assert!(nodes[2].node.get_and_clear_pending_msg_events().is_empty());
9992+
9993+
nodes[3].node.handle_channel_reestablish(&nodes[2].node.get_our_node_id(), &reestablish_2[0]);
9994+
9995+
// Once we call `get_and_clear_pending_msg_events` the holding cell is cleared and the HTLC
9996+
// claim should fly.
9997+
let ds_msgs = nodes[3].node.get_and_clear_pending_msg_events();
9998+
check_added_monitors!(nodes[3], 1);
9999+
assert_eq!(ds_msgs.len(), 2);
10000+
if let MessageSendEvent::SendChannelUpdate { .. } = ds_msgs[1] {} else { panic!(); }
10001+
10002+
let cs_updates = match ds_msgs[0] {
10003+
MessageSendEvent::UpdateHTLCs { ref updates, .. } => {
10004+
nodes[2].node.handle_update_fulfill_htlc(&nodes[3].node.get_our_node_id(), &updates.update_fulfill_htlcs[0]);
10005+
check_added_monitors!(nodes[2], 1);
10006+
let cs_updates = get_htlc_update_msgs!(nodes[2], nodes[0].node.get_our_node_id());
10007+
expect_payment_forwarded!(nodes[2], nodes[0], nodes[3], Some(1000), false, false);
10008+
commitment_signed_dance!(nodes[2], nodes[3], updates.commitment_signed, false, true);
10009+
cs_updates
10010+
}
10011+
_ => panic!(),
10012+
};
10013+
10014+
nodes[0].node.handle_update_fulfill_htlc(&nodes[2].node.get_our_node_id(), &cs_updates.update_fulfill_htlcs[0]);
10015+
commitment_signed_dance!(nodes[0], nodes[2], cs_updates.commitment_signed, false, true);
10016+
expect_payment_sent!(nodes[0], payment_preimage);
10017+
}
10018+
}
10019+
10020+
#[test]
10021+
fn test_partial_claim_before_restart() {
10022+
do_test_partial_claim_before_restart(false);
10023+
do_test_partial_claim_before_restart(true);
10024+
}
10025+
984610026
/// The possible events which may trigger a `max_dust_htlc_exposure` breach
984710027
#[derive(Clone, Copy, PartialEq)]
984810028
enum ExposureEvent {

0 commit comments

Comments
 (0)