Skip to content

Commit 2fbdf5d

Browse files
committed
Merge branch 'zoran/CON-1035-divergence-check2' into 'master'
fix(ic-backup, CON-1035) Improve more divergence check in order to reduce flakiness There was a flakiness that the test would pick the first checkpoint and modify it's memory in order to produce divergence. In a rare occasion that there is more than one checkpoint, it might pick the one that was already replayed and the test would fail. This change makes it pick the last one if there are more, and it is the one from which the replay will start. Closes CON-1035 Closes CON-1035 See merge request dfinity-lab/public/ic!12577
2 parents 1734012 + ad381d3 commit 2fbdf5d

File tree

2 files changed

+29
-16
lines changed

2 files changed

+29
-16
lines changed

rs/backup/src/backup_helper.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -901,7 +901,7 @@ fn last_dir_height(dir: &PathBuf, radix: u32) -> u64 {
901901
}
902902
}
903903

904-
fn last_checkpoint(dir: &Path) -> u64 {
904+
pub fn last_checkpoint(dir: &Path) -> u64 {
905905
last_dir_height(&dir.join("checkpoints"), 16)
906906
}
907907

rs/tests/src/orchestrator/backup_manager.rs

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ use crate::{
4242
},
4343
util::{block_on, get_nns_node},
4444
};
45-
use ic_backup::backup_helper::ls_path;
45+
use ic_backup::backup_helper::{last_checkpoint, ls_path};
4646
use ic_backup::config::{ColdStorage, Config, SubnetConfig};
4747
use ic_backup::util::sleep_secs;
4848
use ic_base_types::SubnetId;
@@ -77,7 +77,7 @@ pub fn config(env: TestEnv) {
7777
pub fn test(env: TestEnv) {
7878
let log = env.logger();
7979

80-
// Create all directories
80+
info!(log, "Create all directories");
8181
let root_dir = tempfile::TempDir::new()
8282
.expect("failed to create a temporary directory")
8383
.path()
@@ -91,6 +91,7 @@ pub fn test(env: TestEnv) {
9191
.path()
9292
.to_path_buf();
9393

94+
info!(log, "Fetch the replica version");
9495
let nns_node = get_nns_node(&env.topology_snapshot());
9596
let node_ip: IpAddr = nns_node.get_ip_addr();
9697
let subnet_id = env.topology_snapshot().root_subnet_id();
@@ -99,6 +100,10 @@ pub fn test(env: TestEnv) {
99100
let initial_replica_version = ReplicaVersion::try_from(replica_version.clone())
100101
.expect("Assigned replica version should be valid");
101102

103+
info!(
104+
log,
105+
"Copy the binaries needed for replay of the current version"
106+
);
102107
let backup_binaries_dir = backup_dir.join("binaries").join(&replica_version);
103108
fs::create_dir_all(&backup_binaries_dir).expect("failure creating backup binaries directory");
104109

@@ -109,6 +114,10 @@ pub fn test(env: TestEnv) {
109114
copy_file(&binaries_path, &backup_binaries_dir, "sandbox_launcher");
110115
copy_file(&binaries_path, &backup_binaries_dir, "canister_sandbox");
111116

117+
info!(
118+
log,
119+
"Download the binaries needed for replay of the mainnet version"
120+
);
112121
let mainnet_version = env
113122
.read_dependency_to_string("testnet/mainnet_nns_revision.txt")
114123
.expect("could not read mainnet version!");
@@ -130,6 +139,7 @@ pub fn test(env: TestEnv) {
130139
.expect("chmod command failed");
131140
chmod.wait_with_output().expect("chmod execution failed");
132141

142+
info!(log, "Run ECDSA signature test");
133143
let nns_node = env.get_first_healthy_nns_node_snapshot();
134144
let agent = nns_node.build_default_agent();
135145
let nns_canister = block_on(MessageCanister::new(
@@ -146,6 +156,7 @@ pub fn test(env: TestEnv) {
146156
);
147157
run_ecdsa_signature_test(&nns_canister, &log, key);
148158

159+
info!(log, "Install universal canister");
149160
let log2 = log.clone();
150161
let id = nns_node.effective_canister_id();
151162
let canister_id_hex: String = block_on({
@@ -155,13 +166,13 @@ pub fn test(env: TestEnv) {
155166
}
156167
});
157168

158-
// Update the registry with the backup key
169+
info!(log, "Update the registry with the backup key");
159170
let payload = get_updatesubnetpayload_with_keys(subnet_id, None, Some(vec![backup_public_key]));
160171
block_on(update_subnet_record(nns_node.get_public_url(), payload));
161172
let backup_mean = AuthMean::PrivateKey(backup_private_key);
162173
wait_until_authentication_is_granted(&node_ip, "backup", &backup_mean);
163174

164-
// Fetch NNS public key
175+
info!(log, "Fetch NNS public key");
165176
let nns_public_key = env
166177
.prep_dir("")
167178
.expect("missing NNS public key")
@@ -229,6 +240,7 @@ pub fn test(env: TestEnv) {
229240
&log,
230241
));
231242

243+
info!(log, "Wait for archived checkpoint");
232244
let archive_dir = backup_dir.join("archive").join(subnet_id.to_string());
233245
// make sure we have some archive of the old version before upgrading to the new one
234246
loop {
@@ -305,6 +317,7 @@ pub fn test(env: TestEnv) {
305317
info!(log, "Modify memory file: {:?}", memory_artifact_path);
306318
modify_byte_in_file(memory_artifact_path).expect("Modifying a byte failed");
307319

320+
info!(log, "Start again the backup process in a separate thread");
308321
let mut command = Command::new(&ic_backup_path);
309322
command
310323
.arg("--config-file")
@@ -317,10 +330,12 @@ pub fn test(env: TestEnv) {
317330
.expect("Failed to start backup process");
318331
info!(log, "Started process: {}", child.id());
319332

320-
assert!(cold_storage_exists(
321-
&log,
322-
cold_storage_dir.join(subnet_id.to_string())
323-
));
333+
if !cold_storage_exists(&log, cold_storage_dir.join(subnet_id.to_string())) {
334+
info!(log, "Kill child process");
335+
child.kill().expect("Error killing backup process");
336+
panic!("No cold storage");
337+
}
338+
324339
info!(log, "Artifacts and states are moved to cold storage");
325340

326341
let mut hash_mismatch = false;
@@ -365,17 +380,15 @@ fn some_checkpoint_dir(backup_dir: &Path, subnet_id: &SubnetId) -> Option<PathBu
365380
let dir = backup_dir
366381
.join("data")
367382
.join(subnet_id.to_string())
368-
.join("ic_state")
369-
.join("checkpoints");
383+
.join("ic_state");
370384
if !dir.exists() {
371385
return None;
372386
}
373-
if let Ok(mut cps) = fs::read_dir(dir) {
374-
if let Some(Ok(cp)) = cps.next() {
375-
return Some(cp.path());
376-
}
387+
let lcp = last_checkpoint(&dir);
388+
if lcp == 0 {
389+
return None;
377390
}
378-
None
391+
Some(dir.join(format!("checkpoints/{:016x}", lcp)))
379392
}
380393

381394
fn modify_byte_in_file(file_path: PathBuf) -> std::io::Result<()> {

0 commit comments

Comments
 (0)