Skip to content

Commit 538c2a2

Browse files
committed
pageserver - store timeline metadata durably
The metadata file is now always 512 bytes. The last 4 bytes are a little-endian crc32c checksum of the preceding 508 bytes. Zero padding is added between the serde-serialized data and the start of the checksum. A single write call is used, and the file is fsynced afterwards. On file creation, the parent directory is fsynced as well.
1 parent 62f8386 commit 538c2a2

File tree

2 files changed

+106
-10
lines changed

2 files changed

+106
-10
lines changed

pageserver/src/layered_repository.rs

Lines changed: 53 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ use serde::{Deserialize, Serialize};
2222
use std::collections::hash_map::Entry;
2323
use std::collections::HashMap;
2424
use std::collections::{BTreeSet, HashSet};
25-
use std::fs::File;
25+
use std::convert::TryInto;
26+
use std::fs::{File, OpenOptions};
2627
use std::io::Write;
2728
use std::ops::Bound::Included;
2829
use std::path::{Path, PathBuf};
@@ -73,6 +74,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
7374
// Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
7475
static TIMEOUT: Duration = Duration::from_secs(60);
7576

77+
// Fixed size in bytes of the on-disk metadata file; the value is taken from PostgreSQL's PG_CONTROL_MAX_SAFE_SIZE.
78+
const METADATA_MAX_SAFE_SIZE: usize = 512;
79+
const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
80+
const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE;
81+
7682
// Metrics collected on operations on the storage repository.
7783
lazy_static! {
7884
static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
@@ -135,7 +141,7 @@ impl Repository for LayeredRepository {
135141
ancestor_timeline: None,
136142
ancestor_lsn: Lsn(0),
137143
};
138-
Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata)?;
144+
Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?;
139145

140146
let timeline = LayeredTimeline::new(
141147
self.conf,
@@ -180,7 +186,7 @@ impl Repository for LayeredRepository {
180186
ancestor_lsn: start_lsn,
181187
};
182188
crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
183-
Self::save_metadata(self.conf, dst, self.tenantid, &metadata)?;
189+
Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?;
184190

185191
info!("branched timeline {} from {} at {}", dst, src, start_lsn);
186192

@@ -353,13 +359,36 @@ impl LayeredRepository {
353359
timelineid: ZTimelineId,
354360
tenantid: ZTenantId,
355361
data: &TimelineMetadata,
362+
first_save: bool,
356363
) -> Result<PathBuf> {
357-
let path = conf.timeline_path(&timelineid, &tenantid).join("metadata");
358-
let mut file = File::create(&path)?;
364+
let timeline_path = conf.timeline_path(&timelineid, &tenantid);
365+
let path = timeline_path.join("metadata");
366+
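// Use OpenOptions so the open fails unless the file's presence matches first_save: create_new(true) errors if the file already exists, and with create_new(false) the open errors if the file is missing.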
367+
let mut file = OpenOptions::new()
368+
.write(true)
369+
.create_new(first_save)
370+
.open(&path)?;
359371

360372
info!("saving metadata {}", path.display());
361373

362-
file.write_all(&TimelineMetadata::ser(data)?)?;
374+
let mut metadata_bytes = TimelineMetadata::ser(data)?;
375+
376+
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
377+
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
378+
379+
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
380+
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
381+
382+
if file.write(&metadata_bytes)? != metadata_bytes.len() {
383+
bail!("Could not write all the metadata bytes in a single call");
384+
}
385+
file.sync_all()?;
386+
387+
// fsync the parent directory to ensure the directory entry is durable
388+
if first_save {
389+
let timeline_dir = File::open(&timeline_path)?;
390+
timeline_dir.sync_all()?;
391+
}
363392

364393
Ok(path)
365394
}
@@ -370,9 +399,18 @@ impl LayeredRepository {
370399
tenantid: ZTenantId,
371400
) -> Result<TimelineMetadata> {
372401
let path = conf.timeline_path(&timelineid, &tenantid).join("metadata");
373-
let data = std::fs::read(&path)?;
402+
let metadata_bytes = std::fs::read(&path)?;
403+
ensure!(metadata_bytes.len() == METADATA_MAX_SAFE_SIZE);
404+
405+
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
406+
let calculated_checksum = crc32c::crc32c(&data);
374407

375-
let data = TimelineMetadata::des(&data)?;
408+
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
409+
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
410+
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
411+
ensure!(calculated_checksum == expected_checksum);
412+
413+
let data = TimelineMetadata::des_prefix(&data)?;
376414
assert!(data.disk_consistent_lsn.is_aligned());
377415

378416
Ok(data)
@@ -1450,8 +1488,13 @@ impl LayeredTimeline {
14501488
ancestor_timeline: ancestor_timelineid,
14511489
ancestor_lsn: self.ancestor_lsn,
14521490
};
1453-
let metadata_path =
1454-
LayeredRepository::save_metadata(self.conf, self.timelineid, self.tenantid, &metadata)?;
1491+
let metadata_path = LayeredRepository::save_metadata(
1492+
self.conf,
1493+
self.timelineid,
1494+
self.tenantid,
1495+
&metadata,
1496+
false,
1497+
)?;
14551498
if let Some(relish_uploader) = &self.relish_uploader {
14561499
relish_uploader.schedule_upload(self.timelineid, metadata_path);
14571500
}

pageserver/src/repository.rs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,23 @@ mod tests {
277277
Ok(repo)
278278
}
279279

280+
fn load_test_repo(test_name: &str, tenantid: ZTenantId) -> Result<Box<dyn Repository>> {
281+
let repo_dir = PageServerConf::test_repo_dir(test_name);
282+
283+
let conf = PageServerConf::dummy_conf(repo_dir);
284+
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
285+
286+
let walredo_mgr = TestRedoManager {};
287+
288+
let repo = Box::new(LayeredRepository::new(
289+
conf,
290+
Arc::new(walredo_mgr),
291+
tenantid,
292+
));
293+
294+
Ok(repo)
295+
}
296+
280297
#[test]
281298
fn test_relsize() -> Result<()> {
282299
let repo = get_test_repo("test_relsize")?;
@@ -706,6 +723,42 @@ mod tests {
706723
Ok(())
707724
}
708725

726+
/// Checks that a timeline whose `metadata` file has been tampered with cannot be loaded.
///
/// The test creates an empty timeline, flips one bit in the stored metadata file, reopens
/// the repository with `load_test_repo`, and expects `get_timeline` to fail with an error
/// whose message mentions "checksum".
#[test]
fn corrupt_metadata() -> Result<()> {
    const TEST_NAME: &str = "corrupt_metadata";
    // Layout of the metadata file as this test expects it: 512 bytes in total, of which
    // the trailing 4 bytes hold the checksum.
    const FILE_SIZE: usize = 512;
    const CHECKSUM_SIZE: usize = 4;

    let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();

    // Create the timeline, then let the repository go out of scope before touching its files.
    {
        let repo = get_test_repo(TEST_NAME)?;
        repo.create_empty_timeline(timelineid)?;
    }

    // The tenant id is not known up front; recover it from the only entry under `tenants/`.
    let tenants_root = PageServerConf::test_repo_dir(TEST_NAME).join("tenants");
    let mut tenant_entries = std::fs::read_dir(tenants_root)?;
    let tenant_dir = tenant_entries.next().unwrap().unwrap().path();
    assert!(tenant_dir.is_dir());
    let tenant_dir_name = tenant_dir.file_name().unwrap().to_str().unwrap();
    let tenantid = ZTenantId::from_str(tenant_dir_name)?;
    assert!(tenant_entries.next().is_none());

    let mut metadata_path = tenant_dir.join("timelines");
    metadata_path.push(timelineid.to_string());
    metadata_path.push("metadata");

    assert!(metadata_path.is_file());

    let mut metadata_bytes = std::fs::read(&metadata_path)?;
    assert_eq!(metadata_bytes.len(), FILE_SIZE);
    // Flip the lowest bit of a byte two positions before the checksum, i.e. inside the
    // region the checksum is computed over, leaving the stored checksum itself untouched.
    let corrupted_offset = FILE_SIZE - CHECKSUM_SIZE - 2;
    metadata_bytes[corrupted_offset] ^= 1;
    std::fs::write(metadata_path, metadata_bytes)?;

    // Reopening the repository and loading the timeline must now report the mismatch.
    let new_repo = load_test_repo(TEST_NAME, tenantid)?;
    let err = new_repo.get_timeline(timelineid).err().unwrap();
    assert!(err.to_string().contains("checksum"));

    Ok(())
}
761+
709762
// Mock WAL redo manager that doesn't do much
710763
struct TestRedoManager {}
711764

0 commit comments

Comments
 (0)