From 538c2a2a3ea62d25a3c70c73c570544a6721f8eb Mon Sep 17 00:00:00 2001
From: Patrick Insinger
Date: Wed, 29 Sep 2021 16:24:40 -0700
Subject: [PATCH] pageserver - store timeline metadata durably

The metadata file is now always 512 bytes. The last 4 bytes are a
crc32c checksum of the previous 508 bytes. Padding zeroes are added
between the serde serialization and the start of the checksum.

A single write call is used, and the file is fsynced after. On file
creation, the parent directory is fsynced as well.
---
 pageserver/src/layered_repository.rs | 63 +++++++++++++++++++++++-----
 pageserver/src/repository.rs         | 53 +++++++++++++++++++++++
 2 files changed, 106 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs
index b2984d5337..86d151b8cb 100644
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -22,7 +22,8 @@ use serde::{Deserialize, Serialize};
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::collections::{BTreeSet, HashSet};
-use std::fs::File;
+use std::convert::TryInto;
+use std::fs::{File, OpenOptions};
 use std::io::Write;
 use std::ops::Bound::Included;
 use std::path::{Path, PathBuf};
@@ -73,6 +74,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
 // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
 static TIMEOUT: Duration = Duration::from_secs(60);
 
+// Taken from PG_CONTROL_MAX_SAFE_SIZE
+const METADATA_MAX_SAFE_SIZE: usize = 512;
+const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
+const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE;
+
 // Metrics collected on operations on the storage repository.
 lazy_static! {
     static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
@@ -135,7 +141,7 @@ impl Repository for LayeredRepository {
             ancestor_timeline: None,
             ancestor_lsn: Lsn(0),
         };
-        Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata)?;
+        Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?;
 
         let timeline = LayeredTimeline::new(
             self.conf,
@@ -180,7 +186,7 @@ impl Repository for LayeredRepository {
             ancestor_lsn: start_lsn,
         };
         crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
-        Self::save_metadata(self.conf, dst, self.tenantid, &metadata)?;
+        Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?;
 
         info!("branched timeline {} from {} at {}", dst, src, start_lsn);
 
@@ -353,13 +359,36 @@ impl LayeredRepository {
         timelineid: ZTimelineId,
         tenantid: ZTenantId,
         data: &TimelineMetadata,
+        first_save: bool,
     ) -> Result<PathBuf> {
-        let path = conf.timeline_path(&timelineid, &tenantid).join("metadata");
-        let mut file = File::create(&path)?;
+        let timeline_path = conf.timeline_path(&timelineid, &tenantid);
+        let path = timeline_path.join("metadata");
+        // use OpenOptions to ensure file presence is consistent with first_save
+        let mut file = OpenOptions::new()
+            .write(true)
+            .create_new(first_save)
+            .open(&path)?;
 
         info!("saving metadata {}", path.display());
 
-        file.write_all(&TimelineMetadata::ser(data)?)?;
+        let mut metadata_bytes = TimelineMetadata::ser(data)?;
+
+        assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
+        metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
+
+        let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
+        metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
+
+        if file.write(&metadata_bytes)? != metadata_bytes.len() {
+            bail!("Could not write all the metadata bytes in a single call");
+        }
+        file.sync_all()?;
+
+        // fsync the parent directory to ensure the directory entry is durable
+        if first_save {
+            let timeline_dir = File::open(&timeline_path)?;
+            timeline_dir.sync_all()?;
+        }
 
         Ok(path)
     }
@@ -370,9 +399,18 @@ impl LayeredRepository {
         tenantid: ZTenantId,
     ) -> Result<TimelineMetadata> {
         let path = conf.timeline_path(&timelineid, &tenantid).join("metadata");
-        let data = std::fs::read(&path)?;
+        let metadata_bytes = std::fs::read(&path)?;
+        ensure!(metadata_bytes.len() == METADATA_MAX_SAFE_SIZE);
 
-        let data = TimelineMetadata::des(&data)?;
+        let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
+        let calculated_checksum = crc32c::crc32c(&data);
+
+        let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
+            metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
+        let expected_checksum = u32::from_le_bytes(*checksum_bytes);
+        ensure!(calculated_checksum == expected_checksum);
+
+        let data = TimelineMetadata::des_prefix(&data)?;
         assert!(data.disk_consistent_lsn.is_aligned());
         Ok(data)
     }
@@ -1450,8 +1488,13 @@ impl LayeredTimeline {
             ancestor_timeline: ancestor_timelineid,
             ancestor_lsn: self.ancestor_lsn,
         };
-        let metadata_path =
-            LayeredRepository::save_metadata(self.conf, self.timelineid, self.tenantid, &metadata)?;
+        let metadata_path = LayeredRepository::save_metadata(
+            self.conf,
+            self.timelineid,
+            self.tenantid,
+            &metadata,
+            false,
+        )?;
         if let Some(relish_uploader) = &self.relish_uploader {
             relish_uploader.schedule_upload(self.timelineid, metadata_path);
         }
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs
index a1af7d8a13..af0c7912ae 100644
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -277,6 +277,23 @@ mod tests {
         Ok(repo)
     }
 
+    fn load_test_repo(test_name: &str, tenantid: ZTenantId) -> Result<Box<dyn Repository>> {
+        let repo_dir = PageServerConf::test_repo_dir(test_name);
+
+        let conf = PageServerConf::dummy_conf(repo_dir);
+        let conf: &'static PageServerConf = Box::leak(Box::new(conf));
+
+        let walredo_mgr = TestRedoManager {};
+
+        let repo = Box::new(LayeredRepository::new(
+            conf,
+            Arc::new(walredo_mgr),
+            tenantid,
+        ));
+
+        Ok(repo)
+    }
+
     #[test]
     fn test_relsize() -> Result<()> {
         let repo = get_test_repo("test_relsize")?;
@@ -706,6 +723,42 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn corrupt_metadata() -> Result<()> {
+        const TEST_NAME: &str = "corrupt_metadata";
+        let repo = get_test_repo(TEST_NAME)?;
+
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        repo.create_empty_timeline(timelineid)?;
+        drop(repo);
+
+        let dir = PageServerConf::test_repo_dir(TEST_NAME);
+        let mut read_dir = std::fs::read_dir(dir.join("tenants"))?;
+        let tenant_dir = read_dir.next().unwrap().unwrap().path();
+        assert!(tenant_dir.is_dir());
+        let tenantid = tenant_dir.file_name().unwrap().to_str().unwrap();
+        let tenantid = ZTenantId::from_str(tenantid)?;
+        assert!(read_dir.next().is_none());
+
+        let metadata_path = tenant_dir
+            .join("timelines")
+            .join(timelineid.to_string())
+            .join("metadata");
+
+        assert!(metadata_path.is_file());
+
+        let mut metadata_bytes = std::fs::read(&metadata_path)?;
+        assert_eq!(metadata_bytes.len(), 512);
+        metadata_bytes[512 - 4 - 2] ^= 1;
+        std::fs::write(metadata_path, metadata_bytes)?;
+
+        let new_repo = load_test_repo(TEST_NAME, tenantid)?;
+        let err = new_repo.get_timeline(timelineid).err().unwrap();
+        assert!(err.to_string().contains("checksum"));
+
+        Ok(())
+    }
+
     // Mock WAL redo manager that doesn't do much
     struct TestRedoManager {}
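
For reference, the on-disk format introduced by this patch (a fixed 512-byte file: serde payload first, zero padding up to byte 508, then a little-endian crc32c of those 508 bytes in the last 4 bytes) can be exercised in isolation. The sketch below is illustrative rather than pageserver code: the helper names `seal_metadata` and `open_metadata` are invented here, and only the `crc32c` and `anyhow` crates that the patch itself relies on are assumed.

```rust
use std::convert::TryInto; // in the prelude on edition 2021; needed explicitly on 2018

use anyhow::{ensure, Result};

const METADATA_MAX_SAFE_SIZE: usize = 512;
const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE;

/// Pad the serialized payload to 512 bytes and store a crc32c of the first
/// 508 bytes, little-endian, in the last 4 bytes.
fn seal_metadata(payload: &[u8]) -> Result<Vec<u8>> {
    ensure!(payload.len() <= METADATA_MAX_DATA_SIZE, "payload too large");
    let mut buf = payload.to_vec();
    buf.resize(METADATA_MAX_SAFE_SIZE, 0u8);
    let checksum = crc32c::crc32c(&buf[..METADATA_MAX_DATA_SIZE]);
    buf[METADATA_MAX_DATA_SIZE..].copy_from_slice(&checksum.to_le_bytes());
    Ok(buf)
}

/// Verify the trailing checksum and return the 508-byte data prefix
/// (serde payload plus padding) for deserialization.
fn open_metadata(buf: &[u8]) -> Result<&[u8]> {
    ensure!(buf.len() == METADATA_MAX_SAFE_SIZE, "unexpected metadata size");
    let data = &buf[..METADATA_MAX_DATA_SIZE];
    let expected = u32::from_le_bytes(buf[METADATA_MAX_DATA_SIZE..].try_into()?);
    ensure!(crc32c::crc32c(data) == expected, "metadata checksum mismatch");
    Ok(data)
}

fn main() -> Result<()> {
    let sealed = seal_metadata(b"serialized timeline metadata goes here")?;
    assert_eq!(sealed.len(), METADATA_MAX_SAFE_SIZE);
    assert!(open_metadata(&sealed).is_ok());

    // Flipping a single bit anywhere in the checksummed prefix is detected,
    // mirroring the corrupt_metadata test above.
    let mut corrupted = sealed.clone();
    corrupted[METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE - 2] ^= 1;
    assert!(open_metadata(&corrupted).is_err());
    Ok(())
}
```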
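The write path in save_metadata also illustrates a reusable durability pattern: open with create_new on the first save so an unexpectedly pre-existing file is an error, write the whole fixed-size buffer with one write call, fsync the file, and on first creation fsync the parent directory as well so the new directory entry survives a crash. A minimal standalone sketch of that pattern, with the invented name `write_durably` standing in for the real function:

```rust
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::Path;

use anyhow::{bail, Context, Result};

/// Write a fixed-size buffer so that both the file contents and, on first
/// creation, the directory entry are on stable storage before returning.
fn write_durably(path: &Path, bytes: &[u8], first_save: bool) -> Result<()> {
    // create_new(true) fails if the file already exists, so the file's
    // presence on disk always matches what `first_save` claims.
    let mut file = OpenOptions::new()
        .write(true)
        .create_new(first_save)
        .open(path)?;

    // One write call for the whole buffer; callers always write the same
    // fixed size, so no truncation of an existing file is needed.
    if file.write(bytes)? != bytes.len() {
        bail!("could not write all bytes in a single call");
    }
    // Flush the file contents (and its metadata) to disk.
    file.sync_all()?;

    if first_save {
        // fsync the parent directory so the new directory entry is durable
        // too (opening a directory to fsync it works on Unix).
        let parent = path.parent().context("path has no parent directory")?;
        File::open(parent)?.sync_all()?;
    }
    Ok(())
}

fn main() -> Result<()> {
    let dir = std::env::temp_dir().join("metadata-write-example");
    std::fs::create_dir_all(&dir)?;
    let path = dir.join("metadata");
    let _ = std::fs::remove_file(&path); // keep the example rerunnable

    write_durably(&path, &[0u8; 512], true)?; // first save: create + dir fsync
    write_durably(&path, &[1u8; 512], false)?; // later saves overwrite in place
    Ok(())
}
```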