diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 2891ef5cbb..a65523c8f5 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -427,7 +427,7 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, - data_checksums: Some(true), + data_checksums_enabled: Some(true), }) .send()? .error_from_body()? diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 9529f45787..add17256be 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -60,13 +60,17 @@ pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) { /// Calculate page checksum and stamp it onto the page. /// NB: this will zero out and ignore any existing checksum. -pub fn page_set_checksum(page: &mut [u8], blkno: u32) { +/// # Safety +/// See safety notes for `pg_checksum_page` +pub unsafe fn page_set_checksum(page: &mut [u8], blkno: u32) { let checksum = pg_checksum_page(page, blkno); page[8..10].copy_from_slice(&checksum.to_le_bytes()); } /// Check if page checksum is valid. -pub fn page_verify_checksum(page: &[u8], blkno: u32) -> bool { +/// # Safety +/// See safety notes for `pg_checksum_page` +pub unsafe fn page_verify_checksum(page: &[u8], blkno: u32) -> bool { let checksum = pg_checksum_page(page, blkno); checksum == u16::from_le_bytes(page[8..10].try_into().unwrap()) } @@ -86,7 +90,7 @@ mod tests { } // Calculate the checksum. - let checksum = pg_checksum_page(&page[..], 0); + let checksum = unsafe { pg_checksum_page(&page[..], 0) }; // Sanity check: random bytes in the checksum attribute should not be // a valid checksum. @@ -96,12 +100,12 @@ mod tests { ); // Set the actual checksum. - page_set_checksum(&mut page, 0); + unsafe { page_set_checksum(&mut page, 0) }; // Verify the checksum. - assert!(page_verify_checksum(&page[..], 0)); + assert!(unsafe { page_verify_checksum(&page[..], 0) }); // Checksum is not valid with another block number. - assert!(!page_verify_checksum(&page[..], 1)); + assert!(!unsafe { page_verify_checksum(&page[..], 1) }); } } diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 0339939934..76790ddc98 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,6 +1,6 @@ -#![allow(unused)] - use criterion::{criterion_group, criterion_main, Criterion}; + +use utils::pg_checksum_page::pg_checksum_page; use utils::zid; pub fn bench_zid_stringify(c: &mut Criterion) { @@ -18,5 +18,20 @@ pub fn bench_zid_stringify(c: &mut Criterion) { }); } -criterion_group!(benches, bench_zid_stringify); +// NB: adding `black_box` around arguments doesn't seem to change anything. +pub fn pg_checksum_page_basic(c: &mut Criterion) { + const BLCKSZ: usize = 8192; + let mut page: [u8; BLCKSZ] = [0; BLCKSZ]; + for (i, byte) in page.iter_mut().enumerate().take(BLCKSZ) { + *byte = i as u8; + } + + c.bench_function("pg_checksum_page_basic", |b| { + b.iter(|| { + unsafe { pg_checksum_page(&page[..], 0) }; + }) + }); +} + +criterion_group!(benches, pg_checksum_page_basic, bench_zid_stringify); criterion_main!(benches); diff --git a/libs/utils/src/pg_checksum_page.rs b/libs/utils/src/pg_checksum_page.rs index d1886316b0..a4d1997175 100644 --- a/libs/utils/src/pg_checksum_page.rs +++ b/libs/utils/src/pg_checksum_page.rs @@ -1,16 +1,20 @@ /// -/// Port of Postgres pg_checksum_page +/// Rust implementation of Postgres pg_checksum_page +/// See: https://github.com/postgres/postgres/blob/88210542106de5b26fe6aa088d1811b68502d224/src/include/storage/checksum_impl.h +/// for additional comments. +/// +/// This is not a direct port of pg_checksum_page from Postgres, though. +/// For example, in the current state it can only produce a valid result +/// on the little-endian platform and with the standard 8 KB page size. /// const BLCKSZ: usize = 8192; const N_SUMS: usize = 32; -/* prime multiplier of FNV-1a hash */ +// Prime multiplier of FNV-1a hash const FNV_PRIME: u32 = 16777619; -/* - * Base offsets to initialize each of the parallel FNV hashes into a - * different initial state. - */ +// Base offsets to initialize each of the parallel FNV hashes into a +// different initial state. const CHECKSUM_BASE_OFFSETS: [u32; N_SUMS] = [ 0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A, 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C, 0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA, 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB, @@ -18,68 +22,80 @@ const CHECKSUM_BASE_OFFSETS: [u32; N_SUMS] = [ 0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E, 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756, ]; -/* - * Calculate one round of the checksum. - */ +// Calculate one round of the checksum. fn checksum_comp(checksum: u32, value: u32) -> u32 { let tmp = checksum ^ value; tmp.wrapping_mul(FNV_PRIME) ^ (tmp >> 17) } -/* - * Compute the checksum for a Postgres page. - * - * The page must be adequately aligned (at least on a 4-byte boundary). - * Beware also that the checksum field of the page is transiently zeroed. - * - * The checksum includes the block number (to detect the case where a page is - * somehow moved to a different location), the page header (excluding the - * checksum itself), and the page data. - * - * As in C implementation in Postgres, the checksum attribute on the page is - * excluded from the calculation and preserved. - */ -pub fn pg_checksum_page(data: &[u8], blkno: u32) -> u16 { - let page = unsafe { std::mem::transmute::<&[u8], &[u32]>(data) }; +/// Compute the checksum for a Postgres page. +/// +/// The page must be adequately aligned (at least on a 4-byte boundary). +/// +/// The checksum includes the block number (to detect the case where a page is +/// somehow moved to a different location), the page header (excluding the +/// checksum itself), and the page data. +/// +/// As in C implementation in Postgres, the checksum attribute on the page is +/// excluded from the calculation and preserved. +/// +/// NB: after doing any modifications run `cargo bench`. The baseline on the more +/// or less recent Intel laptop is around 700ns. If it's significantly higher, +/// then it's worth looking into. +/// +/// # Arguments +/// * `data` - the page to checksum +/// * `blkno` - the block number of the page +/// +/// # Safety +/// This function is safe to call only if: +/// * `data` is strictly a standard 8 KB Postgres page +/// * it's called on the little-endian platform +pub unsafe fn pg_checksum_page(data: &[u8], blkno: u32) -> u16 { + let page = std::mem::transmute::<&[u8], &[u32]>(data); let mut checksum: u32 = 0; let mut sums = CHECKSUM_BASE_OFFSETS; - /* main checksum calculation */ - for i in 0..(BLCKSZ / (4 * N_SUMS)) { - for (j, sum) in sums.iter_mut().enumerate().take(N_SUMS) { - let chunk_i = i * N_SUMS + j; - let chunk: u32; - if chunk_i == 2 { - let mut chunk_copy = page[chunk_i].to_le_bytes(); - chunk_copy[0] = 0; - chunk_copy[1] = 0; - chunk = u32::from_le_bytes(chunk_copy); - } else { - chunk = page[chunk_i]; - } + // Calculate the checksum of the first 'row' of the page. Do it separately as + // we do an expensive comparison here, which is not required for the rest of the + // page. Putting it into the main loop slows it down ~3 times. + for (j, sum) in sums.iter_mut().enumerate().take(N_SUMS) { + // Third 32-bit chunk of the page contains the checksum in the lower half + // (assuming we are on little-endian machine), which we need to zero out. + // See also `PageHeaderData` for reference. + let chunk: u32 = if j == 2 { + page[j] & 0xFFFF_0000 + } else { + page[j] + }; - *sum = checksum_comp(*sum, chunk); + *sum = checksum_comp(*sum, chunk); + } + + // Main checksum calculation loop + for i in 1..(BLCKSZ / (4 * N_SUMS)) { + for (j, sum) in sums.iter_mut().enumerate().take(N_SUMS) { + *sum = checksum_comp(*sum, page[i * N_SUMS + j]); } } - /* finally add in two rounds of zeroes for additional mixing */ + + // Finally, add in two rounds of zeroes for additional mixing for _i in 0..2 { for s in sums.iter_mut().take(N_SUMS) { *s = checksum_comp(*s, 0); } } - /* xor fold partial checksums together */ + // Xor fold partial checksums together for sum in sums { checksum ^= sum; } - /* Mix in the block number to detect transposed pages */ + // Mix in the block number to detect transposed pages checksum ^= blkno; - /* - * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of - * one. That avoids checksums of zero, which seems like a good idea. - */ + // Reduce to a uint16 (to fit in the pd_checksum field) with an offset of + // one. That avoids checksums of zero, which seems like a good idea. ((checksum % 65535) + 1) as u16 } @@ -96,25 +112,25 @@ mod tests { } // Calculate the checksum. - let checksum = pg_checksum_page(&page[..], 0); + let checksum = unsafe { pg_checksum_page(&page[..], 0) }; // Zero the checksum attribute on the page. page[8..10].copy_from_slice(&[0u8; 2]); // Calculate the checksum again, should be the same. - let new_checksum = pg_checksum_page(&page[..], 0); + let new_checksum = unsafe { pg_checksum_page(&page[..], 0) }; assert_eq!(checksum, new_checksum); // Set the correct checksum into the page. page[8..10].copy_from_slice(&checksum.to_le_bytes()); // Calculate the checksum again, should be the same. - let new_checksum = pg_checksum_page(&page[..], 0); + let new_checksum = unsafe { pg_checksum_page(&page[..], 0) }; assert_eq!(checksum, new_checksum); // Check that we protect from the page transposition, i.e. page is the // same, but in the wrong place. - let wrong_blockno_checksum = pg_checksum_page(&page[..], 1); + let wrong_blockno_checksum = unsafe { pg_checksum_page(&page[..], 1) }; assert_ne!(checksum, wrong_blockno_checksum); } } diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index be2b6e1dc6..6da1df0891 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -38,7 +38,7 @@ pub struct TenantCreateRequest { pub walreceiver_connect_timeout: Option, pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, - pub data_checksums: Option, + pub data_checksums_enabled: Option, } #[serde_as] diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 1541042efa..671ff9dbf0 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -494,7 +494,7 @@ components: type: string compaction_threshold: type: string - data_checksums: + data_checksums_enabled: type: boolean TenantConfigInfo: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 158c47c679..c93dfa43eb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -413,7 +413,7 @@ async fn tenant_create_handler(mut request: Request) -> Result, pub max_lsn_wal_lag: Option, - pub data_checksums: Option, + pub data_checksums_enabled: Option, } impl TenantConfOpt { @@ -141,7 +141,9 @@ impl TenantConfOpt { .lagging_wal_timeout .unwrap_or(global_conf.lagging_wal_timeout), max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag), - data_checksums: self.data_checksums.unwrap_or(global_conf.data_checksums), + data_checksums_enabled: self + .data_checksums_enabled + .unwrap_or(global_conf.data_checksums_enabled), } } @@ -179,8 +181,8 @@ impl TenantConfOpt { if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag { self.max_lsn_wal_lag = Some(max_lsn_wal_lag); } - if let Some(data_checksums) = other.data_checksums { - self.data_checksums = Some(data_checksums); + if let Some(data_checksums_enabled) = other.data_checksums_enabled { + self.data_checksums_enabled = Some(data_checksums_enabled); } } } @@ -209,7 +211,7 @@ impl TenantConf { .expect("cannot parse default walreceiver lagging wal timeout"), max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) .expect("cannot parse default max walreceiver Lsn wal lag"), - data_checksums: DEFAULT_DATA_CHECKSUMS, + data_checksums_enabled: DEFAULT_DATA_CHECKSUMS, } } @@ -240,7 +242,7 @@ impl TenantConf { .unwrap(), max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) .unwrap(), - data_checksums: defaults::DEFAULT_DATA_CHECKSUMS, + data_checksums_enabled: defaults::DEFAULT_DATA_CHECKSUMS, } } } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 84686c85a6..6dfd1a356d 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -266,11 +266,14 @@ pub fn create_tenant_repository( Ok(None) } Entry::Vacant(v) => { - let data_checksums = tenant_conf - .data_checksums + let data_checksums_enabled = tenant_conf + .data_checksums_enabled .unwrap_or(tenant_config::defaults::DEFAULT_DATA_CHECKSUMS); - let wal_redo_manager = - Arc::new(PostgresRedoManager::new(conf, data_checksums, tenant_id)); + let wal_redo_manager = Arc::new(PostgresRedoManager::new( + conf, + data_checksums_enabled, + tenant_id, + )); let repo = timelines::create_repo( conf, tenant_conf, @@ -576,11 +579,11 @@ fn load_local_repo( let mut m = tenants_state::write_tenants(); let tenant = m.entry(tenant_id).or_insert_with(|| { - let data_checksums = tenant_conf - .data_checksums + let data_checksums_enabled = tenant_conf + .data_checksums_enabled .unwrap_or(tenant_config::defaults::DEFAULT_DATA_CHECKSUMS); // Set up a WAL redo manager, for applying WAL records. - let walredo_mgr = PostgresRedoManager::new(conf, data_checksums, tenant_id); + let walredo_mgr = PostgresRedoManager::new(conf, data_checksums_enabled, tenant_id); // Set up an object repository, for actual data storage. let repo: Arc = Arc::new(LayeredRepository::new( diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 7cd2206bf5..01c6984152 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -313,7 +313,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { if !page_is_new(&image) { page_set_lsn(&mut image, lsn) } - page_set_checksum(&mut image, blk.blkno); + unsafe { page_set_checksum(&mut image, blk.blkno) }; assert_eq!(image.len(), pg_constants::BLCKSZ as usize); self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 0ddd99b261..044f453fd1 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -132,7 +132,7 @@ lazy_static! { pub struct PostgresRedoManager { tenantid: ZTenantId, conf: &'static PageServerConf, - data_checksums: bool, + data_checksums_enabled: bool, process: Mutex>, } @@ -233,14 +233,14 @@ impl PostgresRedoManager { /// pub fn new( conf: &'static PageServerConf, - data_checksums: bool, + data_checksums_enabled: bool, tenantid: ZTenantId, ) -> PostgresRedoManager { // The actual process is launched lazily, on first request. PostgresRedoManager { tenantid, conf, - data_checksums, + data_checksums_enabled, process: Mutex::new(None), } } @@ -280,7 +280,7 @@ impl PostgresRedoManager { base_img, records, wal_redo_timeout, - self.data_checksums, + self.data_checksums_enabled, ) .map_err(WalRedoError::IoError); @@ -730,7 +730,7 @@ impl PostgresRedoProcess { base_img: Option, records: &[(Lsn, ZenithWalRecord)], wal_redo_timeout: Duration, - data_checksums: bool, + data_checksums_enabled: bool, ) -> Result { // Serialize all the messages to send the WAL redo process first. // @@ -742,7 +742,7 @@ impl PostgresRedoProcess { if let Some(img) = base_img { // Checksums could be not stamped for old tenants, so check them only if they // are enabled (this is controlled by per-tenant config). - if data_checksums && !page_verify_checksum(&img, tag.blknum) { + if data_checksums_enabled && !unsafe { page_verify_checksum(&img, tag.blknum) } { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, format!("block {} of relation {} is invalid", tag.blknum, tag.rel),