pageserver - store timeline metadata durably

The metadata file is now always 512 bytes. The last 4 bytes are a
crc32c checksum of the previous 508 bytes. Padding zeroes are added
between the serde serialization and the start of the checksum.

A single write call is used, and the file is fsynced afterwards.
On file creation (first save), the parent directory is fsynced as well
so that the new directory entry is durable.
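
For illustration only, a minimal standalone sketch of that on-disk format follows; it is not the pageserver code itself (which is in the diff below). It assumes the crc32c and anyhow crates that the patch already uses; the encode/decode helpers and the byte-string payload standing in for the serde-serialized TimelineMetadata are hypothetical.

// Sketch of the fixed 512-byte metadata image:
//   [ serde payload | zero padding | crc32c of the first 508 bytes (4 bytes, LE) ]
use std::convert::TryInto;

use anyhow::{bail, ensure, Result};

const METADATA_MAX_SAFE_SIZE: usize = 512;
const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE;

/// Pad the serialized payload to 508 bytes and append a crc32c of those
/// 508 bytes, producing the fixed 512-byte file image.
fn encode(payload: &[u8]) -> Result<Vec<u8>> {
    ensure!(payload.len() <= METADATA_MAX_DATA_SIZE, "payload too large");
    let mut buf = payload.to_vec();
    buf.resize(METADATA_MAX_SAFE_SIZE, 0u8);
    let checksum = crc32c::crc32c(&buf[..METADATA_MAX_DATA_SIZE]);
    buf[METADATA_MAX_DATA_SIZE..].copy_from_slice(&checksum.to_le_bytes());
    Ok(buf)
}

/// Verify the trailing checksum and return the 508-byte data region
/// (the deserializer is expected to ignore the trailing zero padding).
fn decode(image: &[u8]) -> Result<&[u8]> {
    ensure!(image.len() == METADATA_MAX_SAFE_SIZE, "unexpected metadata file size");
    let (data, checksum_bytes) = image.split_at(METADATA_MAX_DATA_SIZE);
    let expected = u32::from_le_bytes(checksum_bytes.try_into()?);
    if crc32c::crc32c(data) != expected {
        bail!("metadata checksum mismatch");
    }
    Ok(data)
}

fn main() -> Result<()> {
    let image = encode(b"hypothetical serialized TimelineMetadata")?;
    assert_eq!(image.len(), METADATA_MAX_SAFE_SIZE);
    assert!(decode(&image).is_ok());

    // Any bit flip in the first 508 bytes (payload or padding) is detected.
    let mut corrupted = image.clone();
    corrupted[METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE - 2] ^= 1;
    assert!(decode(&corrupted).is_err());
    Ok(())
}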
Patrick Insinger
2021-09-29 16:24:40 -07:00
committed by Patrick Insinger
parent 62f83869f1
commit 538c2a2a3e
2 changed files with 106 additions and 10 deletions

View File

@@ -22,7 +22,8 @@ use serde::{Deserialize, Serialize};
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::collections::{BTreeSet, HashSet};
-use std::fs::File;
+use std::convert::TryInto;
+use std::fs::{File, OpenOptions};
 use std::io::Write;
 use std::ops::Bound::Included;
 use std::path::{Path, PathBuf};
@@ -73,6 +74,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
 // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
 static TIMEOUT: Duration = Duration::from_secs(60);
 
+// Taken from PG_CONTROL_MAX_SAFE_SIZE
+const METADATA_MAX_SAFE_SIZE: usize = 512;
+const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
+const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE;
+
 // Metrics collected on operations on the storage repository.
 lazy_static! {
     static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
@@ -135,7 +141,7 @@ impl Repository for LayeredRepository {
             ancestor_timeline: None,
             ancestor_lsn: Lsn(0),
         };
-        Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata)?;
+        Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?;
 
         let timeline = LayeredTimeline::new(
             self.conf,
@@ -180,7 +186,7 @@ impl Repository for LayeredRepository {
             ancestor_lsn: start_lsn,
         };
         crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
-        Self::save_metadata(self.conf, dst, self.tenantid, &metadata)?;
+        Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?;
 
         info!("branched timeline {} from {} at {}", dst, src, start_lsn);
@@ -353,13 +359,36 @@ impl LayeredRepository {
         timelineid: ZTimelineId,
         tenantid: ZTenantId,
         data: &TimelineMetadata,
+        first_save: bool,
     ) -> Result<PathBuf> {
-        let path = conf.timeline_path(&timelineid, &tenantid).join("metadata");
-        let mut file = File::create(&path)?;
+        let timeline_path = conf.timeline_path(&timelineid, &tenantid);
+        let path = timeline_path.join("metadata");
+        // use OpenOptions to ensure file presence is consistent with first_save
+        let mut file = OpenOptions::new()
+            .write(true)
+            .create_new(first_save)
+            .open(&path)?;
 
         info!("saving metadata {}", path.display());
 
-        file.write_all(&TimelineMetadata::ser(data)?)?;
+        let mut metadata_bytes = TimelineMetadata::ser(data)?;
+        assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
+        metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
+
+        let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
+        metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
+
+        if file.write(&metadata_bytes)? != metadata_bytes.len() {
+            bail!("Could not write all the metadata bytes in a single call");
+        }
+        file.sync_all()?;
+
+        // fsync the parent directory to ensure the directory entry is durable
+        if first_save {
+            let timeline_dir = File::open(&timeline_path)?;
+            timeline_dir.sync_all()?;
+        }
 
         Ok(path)
     }
@@ -370,9 +399,18 @@ impl LayeredRepository {
         tenantid: ZTenantId,
     ) -> Result<TimelineMetadata> {
         let path = conf.timeline_path(&timelineid, &tenantid).join("metadata");
-        let data = std::fs::read(&path)?;
+        let metadata_bytes = std::fs::read(&path)?;
+        ensure!(metadata_bytes.len() == METADATA_MAX_SAFE_SIZE);
 
-        let data = TimelineMetadata::des(&data)?;
+        let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
+        let calculated_checksum = crc32c::crc32c(&data);
+
+        let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
+            metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
+        let expected_checksum = u32::from_le_bytes(*checksum_bytes);
+        ensure!(calculated_checksum == expected_checksum);
+
+        let data = TimelineMetadata::des_prefix(&data)?;
 
         assert!(data.disk_consistent_lsn.is_aligned());
 
         Ok(data)
@@ -1450,8 +1488,13 @@ impl LayeredTimeline {
             ancestor_timeline: ancestor_timelineid,
             ancestor_lsn: self.ancestor_lsn,
         };
-        let metadata_path =
-            LayeredRepository::save_metadata(self.conf, self.timelineid, self.tenantid, &metadata)?;
+        let metadata_path = LayeredRepository::save_metadata(
+            self.conf,
+            self.timelineid,
+            self.tenantid,
+            &metadata,
+            false,
+        )?;
         if let Some(relish_uploader) = &self.relish_uploader {
             relish_uploader.schedule_upload(self.timelineid, metadata_path);
         }

View File

@@ -277,6 +277,23 @@ mod tests {
         Ok(repo)
     }
 
+    fn load_test_repo(test_name: &str, tenantid: ZTenantId) -> Result<Box<dyn Repository>> {
+        let repo_dir = PageServerConf::test_repo_dir(test_name);
+        let conf = PageServerConf::dummy_conf(repo_dir);
+        let conf: &'static PageServerConf = Box::leak(Box::new(conf));
+
+        let walredo_mgr = TestRedoManager {};
+
+        let repo = Box::new(LayeredRepository::new(
+            conf,
+            Arc::new(walredo_mgr),
+            tenantid,
+        ));
+
+        Ok(repo)
+    }
+
     #[test]
     fn test_relsize() -> Result<()> {
         let repo = get_test_repo("test_relsize")?;
@@ -706,6 +723,42 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn corrupt_metadata() -> Result<()> {
+        const TEST_NAME: &str = "corrupt_metadata";
+        let repo = get_test_repo(TEST_NAME)?;
+
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        repo.create_empty_timeline(timelineid)?;
+        drop(repo);
+
+        let dir = PageServerConf::test_repo_dir(TEST_NAME);
+        let mut read_dir = std::fs::read_dir(dir.join("tenants"))?;
+        let tenant_dir = read_dir.next().unwrap().unwrap().path();
+        assert!(tenant_dir.is_dir());
+        let tenantid = tenant_dir.file_name().unwrap().to_str().unwrap();
+        let tenantid = ZTenantId::from_str(tenantid)?;
+        assert!(read_dir.next().is_none());
+
+        let metadata_path = tenant_dir
+            .join("timelines")
+            .join(timelineid.to_string())
+            .join("metadata");
+        assert!(metadata_path.is_file());
+
+        let mut metadata_bytes = std::fs::read(&metadata_path)?;
+        assert_eq!(metadata_bytes.len(), 512);
+        metadata_bytes[512 - 4 - 2] ^= 1;
+        std::fs::write(metadata_path, metadata_bytes)?;
+
+        let new_repo = load_test_repo(TEST_NAME, tenantid)?;
+        let err = new_repo.get_timeline(timelineid).err().unwrap();
+        assert!(err.to_string().contains("checksum"));
+
+        Ok(())
+    }
+
     // Mock WAL redo manager that doesn't do much
     struct TestRedoManager {}