From dc3f3d0acefd45f5b3379bd81512c17488ebc9ed Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 16 Feb 2022 13:16:35 +0300 Subject: [PATCH] Calculate postgres checksum for FPI stored in page server --- pageserver/src/walingest.rs | 4 ++ zenith_utils/src/lib.rs | 3 ++ zenith_utils/src/pg_checksum_page.rs | 70 ++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+) create mode 100644 zenith_utils/src/pg_checksum_page.rs diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 1962c9bbd3..f070c77355 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -37,6 +37,7 @@ use postgres_ffi::xlog_utils::*; use postgres_ffi::TransactionId; use postgres_ffi::{pg_constants, CheckPoint}; use zenith_utils::lsn::Lsn; +use zenith_utils::pg_checksum_page::pg_checksum_page; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); @@ -329,6 +330,9 @@ impl WalIngest { } image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); + image[8..10].copy_from_slice(&[0u8; 2]); + let checksum = pg_checksum_page(&image, blk.blkno); + image[8..10].copy_from_slice(&checksum.to_le_bytes()); assert_eq!(image.len(), pg_constants::BLCKSZ as usize); timeline.put_page_image(tag, blk.blkno, lsn, image.freeze())?; } else { diff --git a/zenith_utils/src/lib.rs b/zenith_utils/src/lib.rs index 7d8ef63b1c..4632fb2bb7 100644 --- a/zenith_utils/src/lib.rs +++ b/zenith_utils/src/lib.rs @@ -54,6 +54,9 @@ pub mod nonblock; // Default signal handling pub mod signals; +// Postgres checksum calculation +pub mod pg_checksum_page; + // This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages // // we have several cases: diff --git a/zenith_utils/src/pg_checksum_page.rs b/zenith_utils/src/pg_checksum_page.rs new file mode 100644 index 0000000000..6f79ceb3fc --- /dev/null +++ b/zenith_utils/src/pg_checksum_page.rs @@ -0,0 +1,70 @@ +/// 
/// Port of Postgres pg_checksum_page
///

/// Size in bytes of the page that is checksummed.
const BLCKSZ: usize = 8192;
/// Number of checksum lanes computed side by side over the page.
const N_SUMS: usize = 32;
/// Prime multiplier of the FNV-1a hash.
const FNV_PRIME: u32 = 16777619;

/// Base offsets to initialize each of the parallel FNV hashes into a
/// different initial state.
const CHECKSUM_BASE_OFFSETS: [u32; N_SUMS] = [
    0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A, 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C,
    0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA, 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB,
    0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE, 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4,
    0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E, 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756,
];

/// Calculate one round of the checksum: mix `value` into the running
/// `checksum` of a single lane and return the new lane state.
fn checksum_comp(checksum: u32, value: u32) -> u32 {
    let tmp = checksum ^ value;
    tmp.wrapping_mul(FNV_PRIME) ^ (tmp >> 17)
}

/// Compute the checksum for a Postgres page.
///
/// The first `BLCKSZ` bytes of `data` are read as native-endian 32-bit words
/// and distributed round-robin over `N_SUMS` lanes; `data` does not need any
/// particular alignment.
///
/// This function does NOT zero the checksum field itself: every byte of the
/// page is hashed as given. A caller that wants the checksum to exclude the
/// stored checksum must zero that field (bytes 8..10 of the page) before
/// calling.
///
/// The checksum includes the block number `blkno` (to detect the case where a
/// page is somehow moved to a different location) and the page contents.
///
/// The returned value is always in the range `1..=65535`.
///
/// # Panics
///
/// Panics if `data` is shorter than `BLCKSZ` bytes.
pub fn pg_checksum_page(data: &[u8], blkno: u32) -> u16 {
    assert!(
        data.len() >= BLCKSZ,
        "pg_checksum_page: page must be at least {} bytes, got {}",
        BLCKSZ,
        data.len()
    );
    let page = &data[..BLCKSZ];
    let mut sums = CHECKSUM_BASE_OFFSETS;

    /* main checksum calculation: each row feeds one word into every lane */
    for row in page.chunks_exact(4 * N_SUMS) {
        for (sum, word) in sums.iter_mut().zip(row.chunks_exact(4)) {
            let value = u32::from_ne_bytes([word[0], word[1], word[2], word[3]]);
            *sum = checksum_comp(*sum, value);
        }
    }

    /* finally add in two rounds of zeroes for additional mixing (every lane) */
    for _ in 0..2 {
        for sum in sums.iter_mut() {
            *sum = checksum_comp(*sum, 0);
        }
    }

    /*
     * xor fold partial checksums together; seeding the fold with the block
     * number mixes it in to detect transposed pages
     */
    let checksum = sums.iter().fold(blkno, |acc, &sum| acc ^ sum);

    /*
     * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of
     * one. That avoids checksums of zero, which seems like a good idea.
     */
    ((checksum % 65535) + 1) as u16
}