From 1fb3d081854a31f9afd1f4e5161fa4cbf9738299 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 22 Apr 2022 21:31:27 +0300 Subject: [PATCH] Use a 1-byte length header for short blobs. Notably, this shaves 3 bytes from each small WAL record stored in ephemeral or delta layers. --- pageserver/src/layered_repository/blob_io.rs | 72 ++++++++++++++----- .../src/layered_repository/ephemeral_file.rs | 42 +++++++---- 2 files changed, 83 insertions(+), 31 deletions(-) diff --git a/pageserver/src/layered_repository/blob_io.rs b/pageserver/src/layered_repository/blob_io.rs index aa90bbd0cf..3aeeb2b2c8 100644 --- a/pageserver/src/layered_repository/blob_io.rs +++ b/pageserver/src/layered_repository/blob_io.rs @@ -1,12 +1,20 @@ //! //! Functions for reading and writing variable-sized "blobs". //! -//! Each blob begins with a 4-byte length, followed by the actual data. +//! Each blob begins with a 1- or 4-byte length field, followed by the +//! actual data. If the length is smaller than 128 bytes, the length +//! is written as one byte. Otherwise, the length +//! is written as a four-byte integer, in big-endian, with the high +//! bit set. This way, we can detect whether it's a 1- or 4-byte header +//! by peeking at the first byte. +//! +//! len < 128: 0XXXXXXX +//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
use crate::layered_repository::block_io::{BlockCursor, BlockReader}; use crate::page_cache::PAGE_SZ; use std::cmp::min; -use std::io::Error; +use std::io::{Error, ErrorKind}; /// For reading pub trait BlobCursor { @@ -40,21 +48,30 @@ where let mut buf = self.read_blk(blknum)?; - // read length - let mut len_buf = [0u8; 4]; - let thislen = PAGE_SZ - off; - if thislen < 4 { - // it is split across two pages - len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]); - blknum += 1; - buf = self.read_blk(blknum)?; - len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]); - off = 4 - thislen; + // peek at the first byte, to determine if it's a 1- or 4-byte length + let first_len_byte = buf[off]; + let len: usize = if first_len_byte < 0x80 { + // 1-byte length header + off += 1; + first_len_byte as usize } else { - len_buf.copy_from_slice(&buf[off..off + 4]); - off += 4; - } - let len = u32::from_ne_bytes(len_buf) as usize; + // 4-byte length header + let mut len_buf = [0u8; 4]; + let thislen = PAGE_SZ - off; + if thislen < 4 { + // it is split across two pages + len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]); + blknum += 1; + buf = self.read_blk(blknum)?; + len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]); + off = 4 - thislen; + } else { + len_buf.copy_from_slice(&buf[off..off + 4]); + off += 4; + } + len_buf[0] &= 0x7f; + u32::from_be_bytes(len_buf) as usize + }; dstbuf.clear(); @@ -130,10 +147,27 @@ where { fn write_blob(&mut self, srcbuf: &[u8]) -> Result { let offset = self.offset; - self.inner - .write_all(&((srcbuf.len()) as u32).to_ne_bytes())?; + + if srcbuf.len() < 128 { + // Short blob. 
Write a 1-byte length header + let len_buf = srcbuf.len() as u8; + self.inner.write_all(&[len_buf])?; + self.offset += 1; + } else { + // Write a 4-byte length header + if srcbuf.len() > 0x7fff_ffff { + return Err(Error::new( + ErrorKind::Other, + format!("blob too large ({} bytes)", srcbuf.len()), + )); + } + let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes(); + len_buf[0] |= 0x80; + self.inner.write_all(&len_buf)?; + self.offset += 4; + } self.inner.write_all(srcbuf)?; - self.offset += 4 + srcbuf.len() as u64; + self.offset += srcbuf.len() as u64; Ok(offset) } } diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/layered_repository/ephemeral_file.rs index 9537d3939c..cdde9d5d13 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/layered_repository/ephemeral_file.rs @@ -199,18 +199,24 @@ impl BlobWriter for EphemeralFile { let mut buf = self.get_buf_for_write(blknum)?; // Write the length field - let len_buf = u32::to_ne_bytes(srcbuf.len() as u32); - let thislen = PAGE_SZ - off; - if thislen < 4 { - // it needs to be split across pages - buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]); - blknum += 1; - buf = self.get_buf_for_write(blknum)?; - buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]); - off = 4 - thislen; + if srcbuf.len() < 0x80 { + buf[off] = srcbuf.len() as u8; + off += 1; } else { - buf[off..off + 4].copy_from_slice(&len_buf); - off += 4; + let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); + len_buf[0] |= 0x80; + let thislen = PAGE_SZ - off; + if thislen < 4 { + // it needs to be split across pages + buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]); + blknum += 1; + buf = self.get_buf_for_write(blknum)?; + buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]); + off = 4 - thislen; + } else { + buf[off..off + 4].copy_from_slice(&len_buf); + off += 4; + } } // Write the payload @@ -229,7 +235,13 @@ impl BlobWriter for EphemeralFile { 
buf_remain = &buf_remain[this_blk_len..]; } drop(buf); - self.size += 4 + srcbuf.len() as u64; + + if srcbuf.len() < 0x80 { + self.size += 1; + } else { + self.size += 4; + } + self.size += srcbuf.len() as u64; Ok(pos) } @@ -387,6 +399,12 @@ mod tests { let pos = file.write_blob(&data)?; blobs.push((pos, data)); } + // also test with large blobs + for i in 0..100 { + let data = format!("blob{}", i).as_bytes().repeat(100); + let pos = file.write_blob(&data)?; + blobs.push((pos, data)); + } let mut cursor = BlockCursor::new(&file); for (pos, expected) in blobs {