mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 05:52:55 +00:00
pageserver - store timeline metadata durably
The metadata file is now always 512 bytes. The last 4 bytes are a crc32c checksum of the previous 508 bytes. Padding zeroes are added between the serde serialization and the start of the checksum. A single write call is used, and the file is fsync'd after. On file creation, the parent directory is fsync'd as well.
This commit is contained in:
committed by
Patrick Insinger
parent
62f83869f1
commit
538c2a2a3e
@@ -22,7 +22,8 @@ use serde::{Deserialize, Serialize};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::fs::File;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -73,6 +74,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
// Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
|
||||
static TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
// Taken from PG_CONTROL_MAX_SAFE_SIZE
|
||||
const METADATA_MAX_SAFE_SIZE: usize = 512;
|
||||
const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
|
||||
const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE;
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
lazy_static! {
|
||||
static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
|
||||
@@ -135,7 +141,7 @@ impl Repository for LayeredRepository {
|
||||
ancestor_timeline: None,
|
||||
ancestor_lsn: Lsn(0),
|
||||
};
|
||||
Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata)?;
|
||||
Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?;
|
||||
|
||||
let timeline = LayeredTimeline::new(
|
||||
self.conf,
|
||||
@@ -180,7 +186,7 @@ impl Repository for LayeredRepository {
|
||||
ancestor_lsn: start_lsn,
|
||||
};
|
||||
crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
|
||||
Self::save_metadata(self.conf, dst, self.tenantid, &metadata)?;
|
||||
Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?;
|
||||
|
||||
info!("branched timeline {} from {} at {}", dst, src, start_lsn);
|
||||
|
||||
@@ -353,13 +359,36 @@ impl LayeredRepository {
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
data: &TimelineMetadata,
|
||||
first_save: bool,
|
||||
) -> Result<PathBuf> {
|
||||
let path = conf.timeline_path(&timelineid, &tenantid).join("metadata");
|
||||
let mut file = File::create(&path)?;
|
||||
let timeline_path = conf.timeline_path(&timelineid, &tenantid);
|
||||
let path = timeline_path.join("metadata");
|
||||
// use OpenOptions to ensure file presence is consistent with first_save
|
||||
let mut file = OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(first_save)
|
||||
.open(&path)?;
|
||||
|
||||
info!("saving metadata {}", path.display());
|
||||
|
||||
file.write_all(&TimelineMetadata::ser(data)?)?;
|
||||
let mut metadata_bytes = TimelineMetadata::ser(data)?;
|
||||
|
||||
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
|
||||
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
|
||||
|
||||
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
|
||||
|
||||
if file.write(&metadata_bytes)? != metadata_bytes.len() {
|
||||
bail!("Could not write all the metadata bytes in a single call");
|
||||
}
|
||||
file.sync_all()?;
|
||||
|
||||
// fsync the parent directory to ensure the directory entry is durable
|
||||
if first_save {
|
||||
let timeline_dir = File::open(&timeline_path)?;
|
||||
timeline_dir.sync_all()?;
|
||||
}
|
||||
|
||||
Ok(path)
|
||||
}
|
||||
@@ -370,9 +399,18 @@ impl LayeredRepository {
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<TimelineMetadata> {
|
||||
let path = conf.timeline_path(&timelineid, &tenantid).join("metadata");
|
||||
let data = std::fs::read(&path)?;
|
||||
let metadata_bytes = std::fs::read(&path)?;
|
||||
ensure!(metadata_bytes.len() == METADATA_MAX_SAFE_SIZE);
|
||||
|
||||
let data = TimelineMetadata::des(&data)?;
|
||||
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
|
||||
let calculated_checksum = crc32c::crc32c(&data);
|
||||
|
||||
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
|
||||
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
|
||||
ensure!(calculated_checksum == expected_checksum);
|
||||
|
||||
let data = TimelineMetadata::des_prefix(&data)?;
|
||||
assert!(data.disk_consistent_lsn.is_aligned());
|
||||
|
||||
Ok(data)
|
||||
@@ -1450,8 +1488,13 @@ impl LayeredTimeline {
|
||||
ancestor_timeline: ancestor_timelineid,
|
||||
ancestor_lsn: self.ancestor_lsn,
|
||||
};
|
||||
let metadata_path =
|
||||
LayeredRepository::save_metadata(self.conf, self.timelineid, self.tenantid, &metadata)?;
|
||||
let metadata_path = LayeredRepository::save_metadata(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&metadata,
|
||||
false,
|
||||
)?;
|
||||
if let Some(relish_uploader) = &self.relish_uploader {
|
||||
relish_uploader.schedule_upload(self.timelineid, metadata_path);
|
||||
}
|
||||
|
||||
@@ -277,6 +277,23 @@ mod tests {
|
||||
Ok(repo)
|
||||
}
|
||||
|
||||
fn load_test_repo(test_name: &str, tenantid: ZTenantId) -> Result<Box<dyn Repository>> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let walredo_mgr = TestRedoManager {};
|
||||
|
||||
let repo = Box::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid,
|
||||
));
|
||||
|
||||
Ok(repo)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
let repo = get_test_repo("test_relsize")?;
|
||||
@@ -706,6 +723,42 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn corrupt_metadata() -> Result<()> {
|
||||
const TEST_NAME: &str = "corrupt_metadata";
|
||||
let repo = get_test_repo(TEST_NAME)?;
|
||||
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
repo.create_empty_timeline(timelineid)?;
|
||||
drop(repo);
|
||||
|
||||
let dir = PageServerConf::test_repo_dir(TEST_NAME);
|
||||
let mut read_dir = std::fs::read_dir(dir.join("tenants"))?;
|
||||
let tenant_dir = read_dir.next().unwrap().unwrap().path();
|
||||
assert!(tenant_dir.is_dir());
|
||||
let tenantid = tenant_dir.file_name().unwrap().to_str().unwrap();
|
||||
let tenantid = ZTenantId::from_str(tenantid)?;
|
||||
assert!(read_dir.next().is_none());
|
||||
|
||||
let metadata_path = tenant_dir
|
||||
.join("timelines")
|
||||
.join(timelineid.to_string())
|
||||
.join("metadata");
|
||||
|
||||
assert!(metadata_path.is_file());
|
||||
|
||||
let mut metadata_bytes = std::fs::read(&metadata_path)?;
|
||||
assert_eq!(metadata_bytes.len(), 512);
|
||||
metadata_bytes[512 - 4 - 2] ^= 1;
|
||||
std::fs::write(metadata_path, metadata_bytes)?;
|
||||
|
||||
let new_repo = load_test_repo(TEST_NAME, tenantid)?;
|
||||
let err = new_repo.get_timeline(timelineid).err().unwrap();
|
||||
assert!(err.to_string().contains("checksum"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
struct TestRedoManager {}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user