From 2501afba6e4aa7261e689ffa0b8ed078ccf8bb17 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 16 Feb 2022 13:16:35 +0300 Subject: [PATCH] Calculate postgres checksum for FPI stored in pageserver (neondatabase/cloud#536) --- libs/postgres_ffi/src/lib.rs | 9 ++++ libs/utils/src/lib.rs | 3 ++ libs/utils/src/pg_checksum_page.rs | 70 ++++++++++++++++++++++++++++++ pageserver/src/walingest.rs | 4 +- 4 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 libs/utils/src/pg_checksum_page.rs diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 28d9a13dbf..91f400b74d 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize}; use utils::lsn::Lsn; +use utils::pg_checksum_page::pg_checksum_page; include!(concat!(env!("OUT_DIR"), "/bindings.rs")); @@ -56,3 +57,11 @@ pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) { pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); } + +/// Calculate page checksum and stamp it onto the page. +/// NB: this will zero out and ignore any existing checksum. 
// ---------------------------------------------------------------------------
// libs/postgres_ffi/src/lib.rs
// (in the patch this imports `utils::pg_checksum_page::pg_checksum_page`)
// ---------------------------------------------------------------------------

/// Calculate the page checksum and stamp it onto the page.
///
/// The checksum field (`pd_checksum`, bytes `8..10` of the page) is zeroed
/// first, so any checksum already stored there is ignored, and is then
/// overwritten with the freshly computed value in little-endian byte order.
///
/// In the patch this is called from `pageserver/src/walingest.rs` on every
/// ingested full-page image, right after the (conditional) `page_set_lsn`.
///
/// # Panics
///
/// Panics if `page` is shorter than one block (8192 bytes).
pub fn page_set_checksum(page: &mut [u8], blkno: u32) {
    // The stored checksum must not feed into its own computation.
    page[8..10].fill(0);
    let checksum = pg_checksum_page(page, blkno);
    page[8..10].copy_from_slice(&checksum.to_le_bytes());
}

// ---------------------------------------------------------------------------
// libs/utils/src/pg_checksum_page.rs
// (declared in libs/utils/src/lib.rs as `pub mod pg_checksum_page;`)
//
// Port of Postgres pg_checksum_page
// ---------------------------------------------------------------------------

/// Size of a page in bytes.
const BLCKSZ: usize = 8192;
/// Number of FNV hashes computed in parallel.
const N_SUMS: usize = 32;
/// Prime multiplier of the FNV-1a hash.
const FNV_PRIME: u32 = 16777619;

/// Base offsets to initialize each of the parallel FNV hashes into a
/// different initial state.
const CHECKSUM_BASE_OFFSETS: [u32; N_SUMS] = [
    0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A, 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C,
    0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA, 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB,
    0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE, 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4,
    0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E, 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756,
];

/// Calculate one round of the checksum: fold `value` into `checksum`.
fn checksum_comp(checksum: u32, value: u32) -> u32 {
    let tmp = checksum ^ value;
    tmp.wrapping_mul(FNV_PRIME) ^ (tmp >> 17)
}

/// Compute the checksum for a Postgres page.
///
/// The first `BLCKSZ` bytes of `data` are read as native-endian 32-bit words.
/// The words are assembled from individual bytes, so `data` needs no
/// particular alignment.
///
/// The page is borrowed immutably, so the checksum field is *not* zeroed
/// here. Callers that want the stored checksum excluded must clear bytes
/// `8..10` first, as `page_set_checksum` does.
///
/// The checksum includes the block number (to detect the case where a page is
/// somehow moved to a different location) and every byte of the page.
///
/// The result is always in `1..=65535`; zero is never produced.
///
/// # Panics
///
/// Panics if `data` is shorter than `BLCKSZ` (8192) bytes.
pub fn pg_checksum_page(data: &[u8], blkno: u32) -> u16 {
    assert!(
        data.len() >= BLCKSZ,
        "pg_checksum_page: page is {} bytes, expected at least {}",
        data.len(),
        BLCKSZ
    );

    let mut sums = CHECKSUM_BASE_OFFSETS;

    // Main checksum calculation: every run of N_SUMS consecutive words feeds
    // one word into each of the N_SUMS parallel hashes.
    for row in data[..BLCKSZ].chunks_exact(4 * N_SUMS) {
        for (sum, word) in sums.iter_mut().zip(row.chunks_exact(4)) {
            let value = u32::from_ne_bytes([word[0], word[1], word[2], word[3]]);
            *sum = checksum_comp(*sum, value);
        }
    }

    // Finally add in two rounds of zeroes for additional mixing.
    for _ in 0..2 {
        for sum in sums.iter_mut() {
            *sum = checksum_comp(*sum, 0);
        }
    }

    // Xor-fold the partial checksums together, then mix in the block number
    // to detect transposed pages.
    let folded = sums.iter().fold(0u32, |acc, sum| acc ^ sum) ^ blkno;

    // Reduce to a u16 (to fit in the pd_checksum field) with an offset of
    // one. That avoids checksums of zero. `folded % 65535` is at most 65534,
    // so the sum always fits in a u16 and the cast cannot truncate.
    ((folded % 65535) + 1) as u16
}