
Commit 7a26c7c

Drop the ChannelMonitorUpdateStatus::PermanentFailure variant
When a `ChannelMonitorUpdate` fails to apply, it generally means we cannot reach our storage backend. This is, in principle, a critical issue, but in practice it is often only transient. Sadly, users see the failure variant and return it on any I/O error, resulting in channel force-closures due to transient issues. Users generally don't expect force-closes here, and luckily, with async `ChannelMonitorUpdate`s supported, we take no risk by "delaying" a `ChannelMonitorUpdate` indefinitely.

Thus, here we drop the `PermanentFailure` variant entirely, making all failures instead be "the update is in progress, but won't ever complete", which is equivalent as long as we do not close the channel automatically.
1 parent 17be969 commit 7a26c7c
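As a rough sketch of the model this moves to, a `Persist` implementation now maps a failed write to `InProgress` and retries in the background rather than reporting a hard failure that would force-close the channel. The storage helpers below (`write_monitor_bytes`, `spawn_retry_task`) are illustrative placeholders, not LDK APIs:

    fn persist_new_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor<ChannelSigner>,
        update_id: MonitorUpdateId
    ) -> ChannelMonitorUpdateStatus {
        let bytes = monitor.encode();
        match self.write_monitor_bytes(&funding_txo, &bytes) {
            // The write (and any fsync) completed synchronously: normal channel operation continues.
            Ok(()) => ChannelMonitorUpdateStatus::Completed,
            Err(_) => {
                // Previously an I/O error here was often surfaced as PermanentFailure, closing the
                // channel. Now the update is reported as in-progress: the channel stays paused until
                // the write eventually lands and ChainMonitor::channel_monitor_updated is called.
                self.spawn_retry_task(funding_txo, bytes, update_id);
                ChannelMonitorUpdateStatus::InProgress
            }
        }
    }

`update_persisted_channel` would follow the same pattern; the only remaining way to express "this will never succeed" is to simply never complete the update.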

12 files changed (+112, -306 lines)

lightning-persister/src/fs_store.rs

Lines changed: 2 additions & 2 deletions
@@ -436,7 +436,7 @@ mod tests {
 }
 
 // Test that if the store's path to channel data is read-only, writing a
-// monitor to it results in the store returning a PermanentFailure.
+// monitor to it results in the store returning a InProgress.
 // Windows ignores the read-only flag for folders, so this test is Unix-only.
 #[cfg(not(target_os = "windows"))]
 #[test]
@@ -470,7 +470,7 @@ mod tests {
 index: 0
 };
 match store.persist_new_channel(test_txo, &added_monitors[0].1, update_id.2) {
-ChannelMonitorUpdateStatus::PermanentFailure => {},
+ChannelMonitorUpdateStatus::InProgress => {},
 _ => panic!("unexpected result from persisting new channel")
 }

lightning/src/chain/chainmonitor.rs

Lines changed: 14 additions & 56 deletions
@@ -78,7 +78,7 @@ impl MonitorUpdateId {
 /// `Persist` defines behavior for persisting channel monitors: this could mean
 /// writing once to disk, and/or uploading to one or more backup services.
 ///
-/// Each method can return three possible values:
+/// Each method can return two possible values:
 /// * If persistence (including any relevant `fsync()` calls) happens immediately, the
 /// implementation should return [`ChannelMonitorUpdateStatus::Completed`], indicating normal
 /// channel operation should continue.
@@ -91,10 +91,9 @@ impl MonitorUpdateId {
 /// Note that unlike the direct [`chain::Watch`] interface,
 /// [`ChainMonitor::channel_monitor_updated`] must be called once for *each* update which occurs.
 ///
-/// * If persistence fails for some reason, implementations should return
-/// [`ChannelMonitorUpdateStatus::PermanentFailure`], in which case the channel will likely be
-/// closed without broadcasting the latest state. See
-/// [`ChannelMonitorUpdateStatus::PermanentFailure`] for more details.
+/// If persistence fails for some reason, implementations should still return
+/// [`ChannelMonitorUpdateStatus::InProgress`], attempting to shut down or otherwise resolve the
+/// situation ASAP.
 ///
 /// Third-party watchtowers may be built as a part of an implementation of this trait, with the
 /// advantage that you can control whether to resume channel operation depending on if an update
@@ -335,11 +334,6 @@ where C::Target: chain::Filter,
 match self.persister.update_persisted_channel(*funding_outpoint, None, monitor, update_id) {
 ChannelMonitorUpdateStatus::Completed =>
 log_trace!(self.logger, "Finished syncing Channel Monitor for channel {}", log_funding_info!(monitor)),
-ChannelMonitorUpdateStatus::PermanentFailure => {
-monitor_state.channel_perm_failed.store(true, Ordering::Release);
-self.pending_monitor_events.lock().unwrap().push((*funding_outpoint, vec![MonitorEvent::UpdateFailed(*funding_outpoint)], monitor.get_counterparty_node_id()));
-self.event_notifier.notify();
-}
 ChannelMonitorUpdateStatus::InProgress => {
 log_debug!(self.logger, "Channel Monitor sync for channel {} in progress, holding events until completion!", log_funding_info!(monitor));
 pending_monitor_updates.push(update_id);
@@ -673,12 +667,12 @@ where C::Target: chain::Filter,
 ///
 /// Note that we persist the given `ChannelMonitor` while holding the `ChainMonitor`
 /// monitors lock.
-fn watch_channel(&self, funding_outpoint: OutPoint, monitor: ChannelMonitor<ChannelSigner>) -> ChannelMonitorUpdateStatus {
+fn watch_channel(&self, funding_outpoint: OutPoint, monitor: ChannelMonitor<ChannelSigner>) -> Result<ChannelMonitorUpdateStatus, ()> {
 let mut monitors = self.monitors.write().unwrap();
 let entry = match monitors.entry(funding_outpoint) {
 hash_map::Entry::Occupied(_) => {
 log_error!(self.logger, "Failed to add new channel data: channel monitor for given outpoint is already present");
-return ChannelMonitorUpdateStatus::PermanentFailure
+return Err(());
 },
 hash_map::Entry::Vacant(e) => e,
 };
@@ -691,10 +685,6 @@ where C::Target: chain::Filter,
 log_info!(self.logger, "Persistence of new ChannelMonitor for channel {} in progress", log_funding_info!(monitor));
 pending_monitor_updates.push(update_id);
 },
-ChannelMonitorUpdateStatus::PermanentFailure => {
-log_error!(self.logger, "Persistence of new ChannelMonitor for channel {} failed", log_funding_info!(monitor));
-return persist_res;
-},
 ChannelMonitorUpdateStatus::Completed => {
 log_info!(self.logger, "Persistence of new ChannelMonitor for channel {} completed", log_funding_info!(monitor));
 }
@@ -708,7 +698,7 @@ where C::Target: chain::Filter,
 channel_perm_failed: AtomicBool::new(false),
 last_chain_persist_height: AtomicUsize::new(self.highest_chain_height.load(Ordering::Acquire)),
 });
-persist_res
+Ok(persist_res)
 }
 
 /// Note that we persist the given `ChannelMonitor` update while holding the
@@ -723,10 +713,10 @@ where C::Target: chain::Filter,
 // We should never ever trigger this from within ChannelManager. Technically a
 // user could use this object with some proxying in between which makes this
 // possible, but in tests and fuzzing, this should be a panic.
-#[cfg(any(test, fuzzing))]
+#[cfg(debug_assertions)]
 panic!("ChannelManager generated a channel update for a channel that was not yet registered!");
-#[cfg(not(any(test, fuzzing)))]
-ChannelMonitorUpdateStatus::PermanentFailure
+#[cfg(not(debug_assertions))]
+ChannelMonitorUpdateStatus::InProgress
 },
 Some(monitor_state) => {
 let monitor = &monitor_state.monitor;
@@ -745,18 +735,14 @@ where C::Target: chain::Filter,
 pending_monitor_updates.push(update_id);
 log_debug!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} in progress", log_funding_info!(monitor));
 },
-ChannelMonitorUpdateStatus::PermanentFailure => {
-monitor_state.channel_perm_failed.store(true, Ordering::Release);
-log_error!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} failed", log_funding_info!(monitor));
-},
 ChannelMonitorUpdateStatus::Completed => {
 log_debug!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} completed", log_funding_info!(monitor));
 },
 }
 if update_res.is_err() {
-ChannelMonitorUpdateStatus::PermanentFailure
+ChannelMonitorUpdateStatus::InProgress
 } else if monitor_state.channel_perm_failed.load(Ordering::Acquire) {
-ChannelMonitorUpdateStatus::PermanentFailure
+ChannelMonitorUpdateStatus::InProgress
 } else {
 persist_res
 }
@@ -988,12 +974,8 @@ mod tests {
 chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::Completed);
 unwrap_send_err!(nodes[0].node.send_payment_with_route(&route, second_payment_hash,
 RecipientOnionFields::secret_only(second_payment_secret), PaymentId(second_payment_hash.0)
-), true, APIError::ChannelUnavailable { ref err },
-assert!(err.contains("ChannelMonitor storage failure")));
-check_added_monitors!(nodes[0], 2); // After the failure we generate a close-channel monitor update
-check_closed_broadcast!(nodes[0], true);
-check_closed_event!(nodes[0], 1, ClosureReason::ProcessingError { err: "ChannelMonitor storage failure".to_string() },
-[nodes[1].node.get_our_node_id()], 100000);
+), false, APIError::MonitorUpdateInProgress, {});
+check_added_monitors!(nodes[0], 1);
 
 // However, as the ChainMonitor is still waiting for the original persistence to complete,
 // it won't yet release the MonitorEvents.
@@ -1020,28 +1002,4 @@ mod tests {
 do_chainsync_pauses_events(false);
 do_chainsync_pauses_events(true);
 }
-
-#[test]
-fn update_during_chainsync_fails_channel() {
-let chanmon_cfgs = create_chanmon_cfgs(2);
-let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
-let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
-let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
-create_announced_chan_between_nodes(&nodes, 0, 1);
-
-chanmon_cfgs[0].persister.chain_sync_monitor_persistences.lock().unwrap().clear();
-chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::PermanentFailure);
-
-connect_blocks(&nodes[0], 1);
-// Before processing events, the ChannelManager will still think the Channel is open and
-// there won't be any ChannelMonitorUpdates
-assert_eq!(nodes[0].node.list_channels().len(), 1);
-check_added_monitors!(nodes[0], 0);
-// ... however once we get events once, the channel will close, creating a channel-closed
-// ChannelMonitorUpdate.
-check_closed_broadcast!(nodes[0], true);
-check_closed_event!(nodes[0], 1, ClosureReason::ProcessingError { err: "Failed to persist ChannelMonitor update during chain sync".to_string() },
-[nodes[1].node.get_our_node_id()], 100000);
-check_added_monitors!(nodes[0], 1);
-}
 }
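
For code driving a `ChainMonitor` directly (the `ChannelManager` handles this internally), the new `watch_channel` signature separates "persistence is still pending" from "this channel must not be opened at all". A sketch of how a caller might consume it, assuming `chain_monitor`, `funding_txo` and `monitor` are already in scope:

    match chain_monitor.watch_channel(funding_txo, monitor) {
        Ok(ChannelMonitorUpdateStatus::Completed) => {
            // The initial monitor was persisted synchronously; normal operation continues.
        },
        Ok(ChannelMonitorUpdateStatus::InProgress) => {
            // Persistence has started but not finished; the channel stays paused until
            // ChainMonitor::channel_monitor_updated is called for this update.
        },
        Err(()) => {
            // The watcher refused the channel (e.g. a monitor for this funding outpoint is
            // already registered); the channel should be closed without broadcasting the
            // funding transaction.
        },
    }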

lightning/src/chain/channelmonitor.rs

Lines changed: 7 additions & 16 deletions
@@ -151,10 +151,7 @@ pub enum MonitorEvent {
 monitor_update_id: u64,
 },
 
-/// Indicates a [`ChannelMonitor`] update has failed. See
-/// [`ChannelMonitorUpdateStatus::PermanentFailure`] for more information on how this is used.
-///
-/// [`ChannelMonitorUpdateStatus::PermanentFailure`]: super::ChannelMonitorUpdateStatus::PermanentFailure
+/// Indicates a [`ChannelMonitor`] update has failed.
 UpdateFailed(OutPoint),
 }
 impl_writeable_tlv_based_enum_upgradable!(MonitorEvent,
@@ -1492,17 +1489,14 @@ impl<Signer: WriteableEcdsaChannelSigner> ChannelMonitor<Signer> {
 /// the Channel was out-of-date.
 ///
 /// You may also use this to broadcast the latest local commitment transaction, either because
-/// a monitor update failed with [`ChannelMonitorUpdateStatus::PermanentFailure`] or because we've
-/// fallen behind (i.e. we've received proof that our counterparty side knows a revocation
-/// secret we gave them that they shouldn't know).
+/// a monitor update failed or because we've fallen behind (i.e. we've received proof that our
+/// counterparty side knows a revocation secret we gave them that they shouldn't know).
 ///
 /// Broadcasting these transactions in the second case is UNSAFE, as they allow counterparty
 /// side to punish you. Nevertheless you may want to broadcast them if counterparty doesn't
 /// close channel with their commitment transaction after a substantial amount of time. Best
 /// may be to contact the other node operator out-of-band to coordinate other options available
 /// to you. In any-case, the choice is up to you.
-///
-/// [`ChannelMonitorUpdateStatus::PermanentFailure`]: super::ChannelMonitorUpdateStatus::PermanentFailure
 pub fn get_latest_holder_commitment_txn<L: Deref>(&self, logger: &L) -> Vec<Transaction>
 where L::Target: Logger {
 self.inner.lock().unwrap().get_latest_holder_commitment_txn(logger)
@@ -2599,6 +2593,7 @@ impl<Signer: WriteableEcdsaChannelSigner> ChannelMonitorImpl<Signer> {
 ChannelMonitorUpdateStep::CommitmentSecret { idx, secret } => {
 log_trace!(logger, "Updating ChannelMonitor with commitment secret");
 if let Err(e) = self.provide_secret(*idx, *secret) {
+debug_assert!(false, "Latest counterparty commitment secret was invalid");
 log_error!(logger, "Providing latest counterparty commitment secret failed/was refused:");
 log_error!(logger, " {}", e);
 ret = Err(());
@@ -4482,18 +4477,14 @@ mod tests {
 let (route, payment_hash, _, payment_secret) = get_route_and_payment_hash!(nodes[1], nodes[0], 100_000);
 unwrap_send_err!(nodes[1].node.send_payment_with_route(&route, payment_hash,
 RecipientOnionFields::secret_only(payment_secret), PaymentId(payment_hash.0)
-), true, APIError::ChannelUnavailable { ref err },
-assert!(err.contains("ChannelMonitor storage failure")));
-check_added_monitors!(nodes[1], 2); // After the failure we generate a close-channel monitor update
-check_closed_broadcast!(nodes[1], true);
-check_closed_event!(nodes[1], 1, ClosureReason::ProcessingError { err: "ChannelMonitor storage failure".to_string() },
-[nodes[0].node.get_our_node_id()], 100000);
+), false, APIError::MonitorUpdateInProgress, {});
+check_added_monitors!(nodes[1], 1);
 
 // Build a new ChannelMonitorUpdate which contains both the failing commitment tx update
 // and provides the claim preimages for the two pending HTLCs. The first update generates
 // an error, but the point of this test is to ensure the later updates are still applied.
 let monitor_updates = nodes[1].chain_monitor.monitor_updates.lock().unwrap();
-let mut replay_update = monitor_updates.get(&channel.2).unwrap().iter().rev().skip(1).next().unwrap().clone();
+let mut replay_update = monitor_updates.get(&channel.2).unwrap().iter().rev().next().unwrap().clone();
 assert_eq!(replay_update.updates.len(), 1);
 if let ChannelMonitorUpdateStep::LatestCounterpartyCommitmentTXInfo { .. } = replay_update.updates[0] {
 } else { panic!(); }

lightning/src/chain/mod.rs

Lines changed: 17 additions & 55 deletions
@@ -192,10 +192,6 @@ pub enum ChannelMonitorUpdateStatus {
 /// have been successfully applied, a [`MonitorEvent::Completed`] can be used to restore the
 /// channel to an operational state.
 ///
-/// Note that a given [`ChannelManager`] will *never* re-generate a [`ChannelMonitorUpdate`].
-/// If you return this error you must ensure that it is written to disk safely before writing
-/// the latest [`ChannelManager`] state, or you should return [`PermanentFailure`] instead.
-///
 /// Even when a channel has been "frozen", updates to the [`ChannelMonitor`] can continue to
 /// occur (e.g. if an inbound HTLC which we forwarded was claimed upstream, resulting in us
 /// attempting to claim it on this channel) and those updates must still be persisted.
@@ -208,49 +204,8 @@ pub enum ChannelMonitorUpdateStatus {
 /// remote location (with local copies persisted immediately), it is anticipated that all
 /// updates will return [`InProgress`] until the remote copies could be updated.
 ///
-/// [`PermanentFailure`]: ChannelMonitorUpdateStatus::PermanentFailure
 /// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress
-/// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager
 InProgress,
-/// Used to indicate no further channel monitor updates will be allowed (likely a disk failure
-/// or a remote copy of this [`ChannelMonitor`] is no longer reachable and thus not updatable).
-///
-/// When this is returned, [`ChannelManager`] will force-close the channel but *not* broadcast
-/// our current commitment transaction. This avoids a dangerous case where a local disk failure
-/// (e.g. the Linux-default remounting of the disk as read-only) causes [`PermanentFailure`]s
-/// for all monitor updates. If we were to broadcast our latest commitment transaction and then
-/// restart, we could end up reading a previous [`ChannelMonitor`] and [`ChannelManager`],
-/// revoking our now-broadcasted state before seeing it confirm and losing all our funds.
-///
-/// Note that this is somewhat of a tradeoff - if the disk is really gone and we may have lost
-/// the data permanently, we really should broadcast immediately. If the data can be recovered
-/// with manual intervention, we'd rather close the channel, rejecting future updates to it,
-/// and broadcast the latest state only if we have HTLCs to claim which are timing out (which
-/// we do as long as blocks are connected).
-///
-/// In order to broadcast the latest local commitment transaction, you'll need to call
-/// [`ChannelMonitor::get_latest_holder_commitment_txn`] and broadcast the resulting
-/// transactions once you've safely ensured no further channel updates can be generated by your
-/// [`ChannelManager`].
-///
-/// Note that at least one final [`ChannelMonitorUpdate`] may still be provided, which must
-/// still be processed by a running [`ChannelMonitor`]. This final update will mark the
-/// [`ChannelMonitor`] as finalized, ensuring no further updates (e.g. revocation of the latest
-/// commitment transaction) are allowed.
-///
-/// Note that even if you return a [`PermanentFailure`] due to unavailability of secondary
-/// [`ChannelMonitor`] copies, you should still make an attempt to store the update where
-/// possible to ensure you can claim HTLC outputs on the latest commitment transaction
-/// broadcasted later.
-///
-/// In case of distributed watchtowers deployment, the new version must be written to disk, as
-/// state may have been stored but rejected due to a block forcing a commitment broadcast. This
-/// storage is used to claim outputs of rejected state confirmed onchain by another watchtower,
-/// lagging behind on block processing.
-///
-/// [`PermanentFailure`]: ChannelMonitorUpdateStatus::PermanentFailure
-/// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager
-PermanentFailure,
 }
 
 /// The `Watch` trait defines behavior for watching on-chain activity pertaining to channels as
@@ -262,37 +217,44 @@ pub enum ChannelMonitorUpdateStatus {
 /// requirements.
 ///
 /// Implementations **must** ensure that updates are successfully applied and persisted upon method
-/// completion. If an update fails with a [`PermanentFailure`], then it must immediately shut down
-/// without taking any further action such as persisting the current state.
+/// completion. If an update will not succeed, then it must immediately shut down.
 ///
 /// If an implementation maintains multiple instances of a channel's monitor (e.g., by storing
 /// backup copies), then it must ensure that updates are applied across all instances. Otherwise, it
 /// could result in a revoked transaction being broadcast, allowing the counterparty to claim all
 /// funds in the channel. See [`ChannelMonitorUpdateStatus`] for more details about how to handle
 /// multiple instances.
-///
-/// [`PermanentFailure`]: ChannelMonitorUpdateStatus::PermanentFailure
 pub trait Watch<ChannelSigner: WriteableEcdsaChannelSigner> {
 /// Watches a channel identified by `funding_txo` using `monitor`.
 ///
 /// Implementations are responsible for watching the chain for the funding transaction along
 /// with any spends of outputs returned by [`get_outputs_to_watch`]. In practice, this means
 /// calling [`block_connected`] and [`block_disconnected`] on the monitor.
 ///
-/// Note: this interface MUST error with [`ChannelMonitorUpdateStatus::PermanentFailure`] if
-/// the given `funding_txo` has previously been registered via `watch_channel`.
+/// A return of `Err(())` indicates that the channel should not open and the channel
+/// immediately force-closed without broadcasting the funding transaction.
+///
+/// If the given `funding_txo` has previously been registered via `watch_channel`, `Err(())`
+/// must be returned.
 ///
 /// [`get_outputs_to_watch`]: channelmonitor::ChannelMonitor::get_outputs_to_watch
 /// [`block_connected`]: channelmonitor::ChannelMonitor::block_connected
 /// [`block_disconnected`]: channelmonitor::ChannelMonitor::block_disconnected
-fn watch_channel(&self, funding_txo: OutPoint, monitor: ChannelMonitor<ChannelSigner>) -> ChannelMonitorUpdateStatus;
+fn watch_channel(&self, funding_txo: OutPoint, monitor: ChannelMonitor<ChannelSigner>) -> Result<ChannelMonitorUpdateStatus, ()>;
 
 /// Updates a channel identified by `funding_txo` by applying `update` to its monitor.
 ///
-/// Implementations must call [`update_monitor`] with the given update. See
-/// [`ChannelMonitorUpdateStatus`] for invariants around returning an error.
+/// Implementations must call [`ChannelMonitor::update_monitor`] with the given update. This
+/// may fail (returning an `Err(())`), in which case this should return
+/// [`ChannelMonitorUpdateStatus::InProgress`] (and the update should never complete). This
+/// generally implies the channel has been closed (either by the funding outpoint being spent
+/// on-chain or the [`ChannelMonitor`] having decided to do so and broadcasted a transaction),
+/// and the [`ChannelManager`] state will be updated once it sees the funding spend on-chain.
+///
+/// If persistence fails, this should return [`ChannelMonitorUpdateStatus::InProgress`] and
+/// the node should shut down immediately.
 ///
-/// [`update_monitor`]: channelmonitor::ChannelMonitor::update_monitor
+/// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager
 fn update_channel(&self, funding_txo: OutPoint, update: &ChannelMonitorUpdate) -> ChannelMonitorUpdateStatus;
 
 /// Returns any monitor events since the last call. Subsequent calls must only return new
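
Putting the revised `Watch` contract together, a custom `update_channel` implementation would look roughly like the sketch below. The field and helper names (`self.monitors`, `self.apply_update`, `self.persist_monitor`) are placeholders rather than LDK APIs; the point is that both a rejected update and a failed persistence are now reported as `InProgress`, never as a hard error, with the implementation expected to shut down or retry out-of-band:

    fn update_channel(&self, funding_txo: OutPoint, update: &ChannelMonitorUpdate) -> ChannelMonitorUpdateStatus {
        let monitors = self.monitors.lock().unwrap();
        let monitor = match monitors.get(&funding_txo) {
            Some(monitor) => monitor,
            // Unknown channel: report the update as pending and never complete it.
            None => return ChannelMonitorUpdateStatus::InProgress,
        };
        // apply_update wraps ChannelMonitor::update_monitor; an Err(()) generally means the channel
        // is already being resolved on-chain, so this update must never be marked completed.
        if self.apply_update(monitor, update).is_err() {
            return ChannelMonitorUpdateStatus::InProgress;
        }
        match self.persist_monitor(&funding_txo, monitor) {
            Ok(()) => ChannelMonitorUpdateStatus::Completed,
            Err(_) => {
                // Persistence failed: report InProgress and arrange to shut down or retry ASAP,
                // surfacing a MonitorEvent::Completed only once storage has actually caught up.
                ChannelMonitorUpdateStatus::InProgress
            }
        }
    }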
