Add a buffer cache, and use it to store materialized pages.

The buffer cache is shared across all tenants, allowing memory to be
dynamically allocated where it's needed the most. The cache works on 8 kB
pages and uses the clock algorithm as its replacement policy, the same as the
PostgreSQL buffer cache.

One peculiarity is that the materialized page versions can be looked up
with an inexact LSN: the lookup finds the latest page version with an
LSN <= the search key.
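
In other words, the lookup returns the latest cached version at or before the
requested LSN. A minimal standalone sketch of that rule (illustrative only,
using plain u64 LSNs instead of the pageserver's Lsn type):

// Returns the index of the latest version with lsn <= search_lsn, if any.
// Assumes 'versions' is sorted by LSN, as the cache keeps it.
fn latest_at_or_before(versions: &[u64], search_lsn: u64) -> Option<usize> {
    match versions.binary_search(&search_lsn) {
        Ok(idx) => Some(idx),      // exact match
        Err(0) => None,            // every cached version is newer than search_lsn
        Err(idx) => Some(idx - 1), // previous entry is the latest one at or before it
    }
}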

The code is structured to support caching other kinds of pages in the same
cache in the future, but with a different mapping key.
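
For illustration, a second kind of cached object would get its own variant and
hash-key type in the CacheKey enum (hypothetical; neither the variant nor the
key type below exists in this commit):

enum CacheKey {
    MaterializedPage { hash_key: MaterializedPageHashKey, lsn: Lsn },
    // hypothetical future variant, keyed differently:
    // SlruPage { hash_key: SlruPageHashKey },
}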

Co-authored-by: Patrick Insinger <patrick@zenith.tech>
Author:    Heikki Linnakangas
Date:      2021-11-12 20:06:27 +02:00
Committer: Patrick Insinger
Commit:    431d32756b (parent 3d172d98a3)
8 changed files with 800 additions and 9 deletions


@@ -26,8 +26,8 @@ use clap::{App, Arg, ArgMatches};
use daemonize::Daemonize;
use pageserver::{
branches, defaults::*, http, page_service, remote_storage, tenant_mgr, virtual_file,
PageServerConf, RemoteStorageConfig, RemoteStorageKind, S3Config, LOG_FILE_NAME,
branches, defaults::*, http, page_cache, page_service, remote_storage, tenant_mgr,
virtual_file, PageServerConf, RemoteStorageConfig, RemoteStorageKind, S3Config, LOG_FILE_NAME,
};
use zenith_utils::http::endpoint;
use zenith_utils::postgres_backend;
@@ -44,6 +44,7 @@ struct CfgFileParams {
gc_horizon: Option<String>,
gc_period: Option<String>,
open_mem_limit: Option<String>,
page_cache_size: Option<String>,
max_file_descriptors: Option<String>,
pg_distrib_dir: Option<String>,
auth_validation_public_key_path: Option<String>,
@@ -107,6 +108,7 @@ impl CfgFileParams {
gc_horizon: get_arg("gc_horizon"),
gc_period: get_arg("gc_period"),
open_mem_limit: get_arg("open_mem_limit"),
page_cache_size: get_arg("page_cache_size"),
max_file_descriptors: get_arg("max_file_descriptors"),
pg_distrib_dir: get_arg("postgres-distrib"),
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
@@ -127,6 +129,7 @@ impl CfgFileParams {
gc_horizon: self.gc_horizon.or(other.gc_horizon),
gc_period: self.gc_period.or(other.gc_period),
open_mem_limit: self.open_mem_limit.or(other.open_mem_limit),
page_cache_size: self.page_cache_size.or(other.page_cache_size),
max_file_descriptors: self.max_file_descriptors.or(other.max_file_descriptors),
pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
auth_validation_public_key_path: self
@@ -177,6 +180,11 @@ impl CfgFileParams {
None => DEFAULT_OPEN_MEM_LIMIT,
};
let page_cache_size: usize = match self.page_cache_size.as_ref() {
Some(page_cache_size_str) => page_cache_size_str.parse()?,
None => DEFAULT_PAGE_CACHE_SIZE,
};
let max_file_descriptors: usize = match self.max_file_descriptors.as_ref() {
Some(max_file_descriptors_str) => max_file_descriptors_str.parse()?,
None => DEFAULT_MAX_FILE_DESCRIPTORS,
@@ -252,6 +260,7 @@ impl CfgFileParams {
gc_horizon,
gc_period,
open_mem_limit,
page_cache_size,
max_file_descriptors,
superuser: String::from(DEFAULT_SUPERUSER),
@@ -330,6 +339,13 @@ fn main() -> Result<()> {
.takes_value(true)
.help("Amount of memory reserved for buffering incoming WAL"),
)
.arg(
Arg::with_name("page_cache_size")
.long("page_cache_size")
.takes_value(true)
.help("Number of pages in the page cache"),
)
.arg(
Arg::with_name("max_file_descriptors")
.long("max_file_descriptors")
@@ -470,6 +486,8 @@ fn main() -> Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(conf.max_file_descriptors);
page_cache::init(conf);
// Create repo and exit if init was requested
if init {
branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
@@ -636,6 +654,7 @@ mod tests {
gc_horizon: Some("gc_horizon_VALUE".to_string()),
gc_period: Some("gc_period_VALUE".to_string()),
open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
page_cache_size: Some("page_cache_size_VALUE".to_string()),
max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
auth_validation_public_key_path: Some(
@@ -661,6 +680,7 @@ checkpoint_period = 'checkpoint_period_VALUE'
gc_horizon = 'gc_horizon_VALUE'
gc_period = 'gc_period_VALUE'
open_mem_limit = 'open_mem_limit_VALUE'
page_cache_size = 'page_cache_size_VALUE'
max_file_descriptors = 'max_file_descriptors_VALUE'
pg_distrib_dir = 'pg_distrib_dir_VALUE'
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
@@ -697,6 +717,7 @@ local_path = 'remote_storage_local_VALUE'
gc_horizon: Some("gc_horizon_VALUE".to_string()),
gc_period: Some("gc_period_VALUE".to_string()),
open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
page_cache_size: Some("page_cache_size_VALUE".to_string()),
max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
auth_validation_public_key_path: Some(
@@ -725,6 +746,7 @@ checkpoint_period = 'checkpoint_period_VALUE'
gc_horizon = 'gc_horizon_VALUE'
gc_period = 'gc_period_VALUE'
open_mem_limit = 'open_mem_limit_VALUE'
page_cache_size = 'page_cache_size_VALUE'
max_file_descriptors = 'max_file_descriptors_VALUE'
pg_distrib_dir = 'pg_distrib_dir_VALUE'
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'


@@ -18,6 +18,7 @@ use lazy_static::lazy_static;
use postgres_ffi::pg_constants::BLCKSZ;
use tracing::*;
use std::cmp;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::collections::{BTreeSet, HashSet};
@@ -26,11 +27,12 @@ use std::fs::{File, OpenOptions};
use std::io::Write;
use std::ops::{Bound::Included, Deref};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::atomic::{self, AtomicUsize};
use std::sync::{Arc, Mutex, MutexGuard};
use std::time::{Duration, Instant};
use self::metadata::{metadata_path, TimelineMetadata};
use crate::page_cache;
use crate::relish::*;
use crate::remote_storage::schedule_timeline_upload;
use crate::repository::{GcResult, Repository, Timeline, TimelineWriter, WALRecord};
@@ -769,7 +771,7 @@ impl Timeline for LayeredTimeline {
}
fn get_current_logical_size(&self) -> usize {
self.current_logical_size.load(Ordering::Acquire) as usize
self.current_logical_size.load(atomic::Ordering::Acquire) as usize
}
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize> {
@@ -919,7 +921,7 @@ impl LayeredTimeline {
/// Used to init current logical size on startup
///
fn init_current_logical_size(&mut self) -> Result<()> {
if self.current_logical_size.load(Ordering::Relaxed) != 0 {
if self.current_logical_size.load(atomic::Ordering::Relaxed) != 0 {
bail!("cannot init already initialized current logical size")
};
let lsn = self.get_last_record_lsn();
@@ -927,7 +929,7 @@ impl LayeredTimeline {
AtomicUsize::new(self.get_current_logical_size_non_incremental(lsn)?);
trace!(
"current_logical_size initialized to {}",
self.current_logical_size.load(Ordering::Relaxed)
self.current_logical_size.load(atomic::Ordering::Relaxed)
);
Ok(())
}
@@ -1514,6 +1516,22 @@ impl LayeredTimeline {
Ok(result)
}
fn lookup_cached_page(&self, seg: &SegmentTag, blknum: u32, lsn: Lsn) -> Option<(Lsn, Bytes)> {
if let RelishTag::Relation(rel_tag) = &seg.rel {
let (lsn, read_guard) = page_cache::get().lookup_materialized_page(
self.tenantid,
self.timelineid,
*rel_tag,
blknum,
lsn,
)?;
let img = Bytes::from(read_guard.to_vec());
Some((lsn, img))
} else {
None
}
}
///
/// Reconstruct a page version from given Layer
///
@@ -1524,6 +1542,22 @@ impl LayeredTimeline {
lsn: Lsn,
layer: &dyn Layer,
) -> Result<Bytes> {
// Check the page cache. We will get back the most recent page with lsn <= `lsn`.
// The cached image can be returned directly if there is no WAL between the cached image
// and requested LSN. The cached image can also be used to reduce the amount of WAL needed
// for redo.
let (cached_lsn_opt, cached_page_opt) = match self.lookup_cached_page(&seg, blknum, lsn) {
Some((cached_lsn, cached_img)) => {
match cached_lsn.cmp(&lsn) {
cmp::Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
cmp::Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
cmp::Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn
}
(Some(cached_lsn), Some((cached_lsn, cached_img)))
}
None => (None, None),
};
let mut data = PageReconstructData {
records: Vec::new(),
page_img: None,
@@ -1538,7 +1572,12 @@ impl LayeredTimeline {
let mut layer_ref = layer;
let mut curr_lsn = lsn;
loop {
match layer_ref.get_page_reconstruct_data(blknum, curr_lsn, &mut data)? {
match layer_ref.get_page_reconstruct_data(
blknum,
curr_lsn,
cached_lsn_opt,
&mut data,
)? {
PageReconstructResult::Complete => break,
PageReconstructResult::Continue(cont_lsn) => {
// Fetch base image / more WAL from the returned predecessor layer
@@ -1582,6 +1621,16 @@ impl LayeredTimeline {
lsn,
);
}
PageReconstructResult::Cached => {
let (cached_lsn, cached_img) = cached_page_opt.unwrap();
assert!(data.page_img.is_none());
if let Some((first_rec_lsn, first_rec)) = data.records.first() {
assert!(&cached_lsn < first_rec_lsn);
assert!(!first_rec.will_init);
}
data.page_img = Some(cached_img);
break;
}
}
}
@@ -1637,6 +1686,9 @@ impl LayeredTimeline {
} else {
trace!("found {} WAL records that will init the page for blk {} in {} at {}, performing WAL redo", data.records.len(), blknum, rel, request_lsn);
}
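// The reconstructed image is valid as of the last applied WAL record's LSN;
// remember that LSN so the image can be cached under it below.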
let last_rec_lsn = data.records.last().unwrap().0;
let img = self.walredo_mgr.request_redo(
rel,
blknum,
@@ -1645,6 +1697,17 @@ impl LayeredTimeline {
data.records,
)?;
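// Cache the materialized image so that later reads of this page at this or a
// higher LSN can skip (part of) the WAL redo. Only ordinary relation pages
// are cached for now.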
if let RelishTag::Relation(rel_tag) = &rel {
page_cache::get().memorize_materialized_page(
self.tenantid,
self.timelineid,
*rel_tag,
blknum,
last_rec_lsn,
&img,
);
}
Ok(img)
}
}
@@ -1656,7 +1719,7 @@ impl LayeredTimeline {
fn increase_current_logical_size(&self, diff: u32) {
let val = self
.current_logical_size
.fetch_add(diff as usize, Ordering::SeqCst);
.fetch_add(diff as usize, atomic::Ordering::SeqCst);
trace!(
"increase_current_logical_size: {} + {} = {}",
val,
@@ -1673,7 +1736,7 @@ impl LayeredTimeline {
fn decrease_current_logical_size(&self, diff: u32) {
let val = self
.current_logical_size
.fetch_sub(diff as usize, Ordering::SeqCst);
.fetch_sub(diff as usize, atomic::Ordering::SeqCst);
trace!(
"decrease_current_logical_size: {} - {} = {}",
val,


@@ -183,12 +183,20 @@ impl Layer for DeltaLayer {
&self,
blknum: u32,
lsn: Lsn,
cached_img_lsn: Option<Lsn>,
reconstruct_data: &mut PageReconstructData,
) -> Result<PageReconstructResult> {
let mut need_image = true;
assert!(self.seg.blknum_in_seg(blknum));
match &cached_img_lsn {
Some(cached_lsn) if &self.end_lsn <= cached_lsn => {
return Ok(PageReconstructResult::Cached)
}
_ => {}
}
{
// Open the file and lock the metadata in memory
let inner = self.load()?;
@@ -207,6 +215,13 @@ impl Layer for DeltaLayer {
.iter()
.rev();
for ((_blknum, pv_lsn), blob_range) in iter {
match &cached_img_lsn {
Some(cached_lsn) if pv_lsn <= cached_lsn => {
return Ok(PageReconstructResult::Cached)
}
_ => {}
}
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
match pv {


@@ -146,10 +146,16 @@ impl Layer for ImageLayer {
&self,
blknum: u32,
lsn: Lsn,
cached_img_lsn: Option<Lsn>,
reconstruct_data: &mut PageReconstructData,
) -> Result<PageReconstructResult> {
assert!(lsn >= self.lsn);
match cached_img_lsn {
Some(cached_lsn) if self.lsn <= cached_lsn => return Ok(PageReconstructResult::Cached),
_ => {}
}
let inner = self.load()?;
let base_blknum = blknum % RELISH_SEG_SIZE;


@@ -169,6 +169,7 @@ impl Layer for InMemoryLayer {
&self,
blknum: u32,
lsn: Lsn,
cached_img_lsn: Option<Lsn>,
reconstruct_data: &mut PageReconstructData,
) -> Result<PageReconstructResult> {
let mut need_image = true;
@@ -185,6 +186,13 @@ impl Layer for InMemoryLayer {
.iter()
.rev();
for (entry_lsn, pv) in iter {
match &cached_img_lsn {
Some(cached_lsn) if entry_lsn <= cached_lsn => {
return Ok(PageReconstructResult::Cached)
}
_ => {}
}
match pv {
PageVersion::Page(img) => {
reconstruct_data.page_img = Some(img.clone());


@@ -80,6 +80,8 @@ pub enum PageReconstructResult {
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
Missing(Lsn),
/// Use the cached image at `cached_img_lsn` as the base image
Cached,
}
///
@@ -127,6 +129,9 @@ pub trait Layer: Send + Sync {
/// of the *relish*, not the beginning of the segment. The requested
/// 'blknum' must be covered by this segment.
///
/// `cached_img_lsn` should be set to the LSN of a cached page image, which must
/// be < `lsn`. This function will then only return data newer than `cached_img_lsn`.
///
/// See PageReconstructResult for possible return values. The collected data
/// is appended to reconstruct_data; the caller should pass an empty struct
/// on first call. If this returns PageReconstructResult::Continue, look up
@@ -136,6 +141,7 @@ pub trait Layer: Send + Sync {
&self,
blknum: u32,
lsn: Lsn,
cached_img_lsn: Option<Lsn>,
reconstruct_data: &mut PageReconstructData,
) -> Result<PageReconstructResult>;


@@ -12,6 +12,7 @@ pub mod basebackup;
pub mod branches;
pub mod http;
pub mod layered_repository;
pub mod page_cache;
pub mod page_service;
pub mod relish;
pub mod remote_storage;
@@ -46,6 +47,7 @@ pub mod defaults {
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
pub const DEFAULT_OPEN_MEM_LIMIT: usize = 128 * 1024 * 1024;
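// 8192 pages of 8 kB each = 64 MB of page cache memory by default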
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
}
@@ -76,6 +78,7 @@ pub struct PageServerConf {
pub superuser: String,
pub open_mem_limit: usize,
pub page_cache_size: usize,
pub max_file_descriptors: usize,
// Repository directory, relative to current working directory.
@@ -161,6 +164,7 @@ impl PageServerConf {
gc_horizon: defaults::DEFAULT_GC_HORIZON,
gc_period: Duration::from_secs(10),
open_mem_limit: defaults::DEFAULT_OPEN_MEM_LIMIT,
page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),


@@ -0,0 +1,667 @@
//!
//! Global page cache
//!
//! The page cache uses up most of the memory in the page server. It is shared
//! by all tenants, and it is used to store different kinds of pages. Sharing
//! the cache allows memory to be dynamically allocated where it's needed the
//! most.
//!
//! The page cache consists of fixed-size buffers, 8 kB each to match the
//! PostgreSQL buffer size, and a Slot struct for each buffer to contain
//! information about what's stored in the buffer.
//!
//! # Locking
//!
//! There are two levels of locking involved: There's one lock for the "mapping"
//! from page identifier (tenant ID, timeline ID, rel, block, LSN) to the buffer
//! slot, and a separate lock on each slot. To read or write the contents of a
//! slot, you must hold the lock on the slot in read or write mode,
//! respectively. To change the mapping of a slot, i.e. to evict a page or to
//! assign a buffer for a page, you must hold the mapping lock and the lock on
//! the slot at the same time.
//!
//! Whenever you need to hold both locks simultaneously, the slot lock must be
//! acquired first. This consistent ordering avoids deadlocks. To look up a page
//! in the cache, you would first look up the mapping, while holding the mapping
//! lock, and then lock the slot. You must release the mapping lock in between,
//! to obey the lock ordering and avoid deadlock.
//!
//! A slot can momentarily have invalid contents, even if it's already been
//! inserted into the mapping, but you must hold the write-lock on the slot until
//! the contents are valid. If you need to release the lock without initializing
//! the contents, you must remove the mapping first. We make that easy for the
//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
//! page, the caller must explicitly call guard.mark_valid() after it has
//! initialized it. If the guard is dropped without calling mark_valid(), the
//! mapping is automatically removed and the slot is marked free.
//!
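//! For example, the read path in try_lock_for_read() below boils down to this
//! (simplified pseudocode):
//!
//! ```ignore
//! let slot_idx = self.search_mapping(&mut key)?;          // takes and releases the mapping lock
//! let inner = self.slots[slot_idx].inner.read().unwrap(); // then take the slot lock
//! if inner.key.as_ref() == Some(&key) {
//!     return Some(PageReadGuard(inner));                  // still the same page; use it
//! }
//! // otherwise the slot was recycled in between; treat it as a cache miss
//! ```
//!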
use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
sync::{
atomic::{AtomicU8, AtomicUsize, Ordering},
RwLock, RwLockReadGuard, RwLockWriteGuard,
},
};
use once_cell::sync::OnceCell;
use zenith_utils::{
lsn::Lsn,
zid::{ZTenantId, ZTimelineId},
};
use crate::{relish::RelTag, PageServerConf};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
const TEST_PAGE_CACHE_SIZE: usize = 10;
///
/// Initialize the page cache. This must be called once at page server startup.
///
pub fn init(conf: &'static PageServerConf) {
if PAGE_CACHE
.set(PageCache::new(conf.page_cache_size))
.is_err()
{
panic!("page cache already initialized");
}
}
///
/// Get a handle to the page cache.
///
pub fn get() -> &'static PageCache {
//
// In unit tests, page server startup doesn't happen and no one calls
// page_cache::init(). Initialize it here with a tiny cache, so that the
// page cache is usable in unit tests.
//
if cfg!(test) {
PAGE_CACHE.get_or_init(|| PageCache::new(TEST_PAGE_CACHE_SIZE))
} else {
PAGE_CACHE.get().expect("page cache not initialized")
}
}
const PAGE_SZ: usize = postgres_ffi::pg_constants::BLCKSZ as usize;
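// Cap on the clock-sweep usage count; it limits how many eviction sweeps an
// unused buffer can survive.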
const MAX_USAGE_COUNT: u8 = 5;
///
/// CacheKey uniquely identifies a "thing" to cache in the page cache.
///
#[derive(PartialEq, Eq, Clone)]
enum CacheKey {
MaterializedPage {
hash_key: MaterializedPageHashKey,
lsn: Lsn,
},
// Currently, we only store materialized page versions in the page cache.
// To cache another kind of "thing", add enum variant here.
}
#[derive(PartialEq, Eq, Hash, Clone)]
struct MaterializedPageHashKey {
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
rel_tag: RelTag,
blknum: u32,
}
#[derive(Clone)]
struct Version {
lsn: Lsn,
slot_idx: usize,
}
struct Slot {
inner: RwLock<SlotInner>,
usage_count: AtomicU8,
}
struct SlotInner {
key: Option<CacheKey>,
buf: &'static mut [u8; PAGE_SZ],
}
impl Slot {
/// Increment usage count on the buffer, with ceiling at MAX_USAGE_COUNT.
fn inc_usage_count(&self) {
let _ = self
.usage_count
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
if val == MAX_USAGE_COUNT {
None
} else {
Some(val + 1)
}
});
}
/// Decrement usage count on the buffer, unless it's already zero. Returns
/// the old usage count.
fn dec_usage_count(&self) -> u8 {
let count_res =
self.usage_count
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
if val == 0 {
None
} else {
Some(val - 1)
}
});
match count_res {
Ok(usage_count) => usage_count,
Err(usage_count) => usage_count,
}
}
}
pub struct PageCache {
/// This contains the mapping from the cache key to buffer slot that currently
/// contains the page, if any.
///
/// TODO: This is protected by a single lock. If that becomes a bottleneck,
/// this HashMap can be replaced with a more concurrent version; there are
/// plenty of such crates around.
///
/// If you add support for caching different kinds of objects, each object kind
/// can have a separate mapping map, next to this field.
materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
/// The actual buffers with their metadata.
slots: Box<[Slot]>,
/// Index of the next candidate to evict, for the Clock replacement algorithm.
/// This is interpreted modulo the page cache size.
next_evict_slot: AtomicUsize,
}
///
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
/// until the guard is dropped.
///
pub struct PageReadGuard<'i>(RwLockReadGuard<'i, SlotInner>);
impl std::ops::Deref for PageReadGuard<'_> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
self.0.buf
}
}
///
/// PageWriteGuard is a lease on a buffer for modifying it. The page is kept locked
/// until the guard is dropped.
///
/// Counterintuitively, this is used even for a read, if the requested page is not
/// currently found in the page cache. In that case, the caller of lock_for_read()
/// is expected to fill in the page contents and call mark_valid(). Similarly,
/// lock_for_write() can return an invalid buffer that the caller is expected
/// to initialize.
///
pub struct PageWriteGuard<'i> {
inner: RwLockWriteGuard<'i, SlotInner>,
// Are the page contents currently valid?
valid: bool,
}
impl std::ops::DerefMut for PageWriteGuard<'_> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.inner.buf
}
}
impl std::ops::Deref for PageWriteGuard<'_> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
self.inner.buf
}
}
impl PageWriteGuard<'_> {
/// Mark that the buffer contents are now valid.
pub fn mark_valid(&mut self) {
assert!(
!self.valid,
"mark_valid called on a buffer that was already valid"
);
self.valid = true;
}
}
impl Drop for PageWriteGuard<'_> {
///
/// If the buffer was allocated for a page that was not already in the
/// cache, but the lock_for_read/write() caller dropped the buffer without
/// initializing it, remove the mapping from the page cache.
///
fn drop(&mut self) {
if !self.valid {
let self_key = self.inner.key.as_ref().unwrap();
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
self.inner.key = None;
}
}
}
/// lock_for_read() return value
enum ReadBufResult<'a> {
Found(PageReadGuard<'a>),
NotFound(PageWriteGuard<'a>),
}
/// lock_for_write() return value
enum WriteBufResult<'a> {
Found(PageWriteGuard<'a>),
NotFound(PageWriteGuard<'a>),
}
impl PageCache {
//
// Section 1: Public interface functions for looking up and memorizing materialized page
// versions in the page cache
//
/// Look up a materialized page version.
///
/// The 'lsn' is an upper bound: this will return the latest version of
/// the given block, but not newer than 'lsn'. Returns the actual LSN of the
/// returned page.
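///
/// Example (illustrative; the identifiers passed in are placeholders):
///
/// ```ignore
/// if let Some((hit_lsn, page)) = page_cache::get()
///     .lookup_materialized_page(tenantid, timelineid, rel_tag, blknum, lsn)
/// {
///     // 'page' dereferences to the 8 kB page image, materialized at 'hit_lsn'
///     assert!(hit_lsn <= lsn);
/// }
/// ```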
pub fn lookup_materialized_page(
&self,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
rel_tag: RelTag,
blknum: u32,
lsn: Lsn,
) -> Option<(Lsn, PageReadGuard)> {
let mut cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_id,
timeline_id,
rel_tag,
blknum,
},
lsn,
};
if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key;
Some((lsn, guard))
} else {
None
}
}
///
/// Store an image of the given page in the cache.
///
pub fn memorize_materialized_page(
&self,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
rel_tag: RelTag,
blknum: u32,
lsn: Lsn,
img: &[u8],
) {
let cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_id,
timeline_id,
rel_tag,
blknum,
},
lsn,
};
match self.lock_for_write(&cache_key) {
WriteBufResult::Found(write_guard) => {
// We already had it in cache. Another thread must've put it there
// concurrently. Check that it had the same contents that we
// replayed.
assert!(*write_guard == img);
}
WriteBufResult::NotFound(mut write_guard) => {
write_guard.copy_from_slice(img);
write_guard.mark_valid();
}
}
}
//
// Section 2: Internal interface functions for lookup/update.
//
// Currently, the page cache only stores materialized page images. In the
// future, to add support for a new kind of "thing" to cache, you will need
// to add public interface routines above, and code to deal with the
// "mappings" after this section. But the routines in this section should
// not require changes.
/// Look up a page in the cache.
///
/// If the search criteria is not exact, *cache_key is updated with the
/// exact key of the returned page. (For materialized pages, that means
/// that the LSN in 'cache_key' is updated with the LSN of the returned page
/// version.)
///
/// If no page is found, returns None and *cache_key is left unmodified.
///
fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
let cache_key_orig = cache_key.clone();
if let Some(slot_idx) = self.search_mapping(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we released the mapping
// lock already, another thread could have evicted the page)
let slot = &self.slots[slot_idx];
let inner = slot.inner.read().unwrap();
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
return Some(PageReadGuard(inner));
} else {
// search_mapping might have modified the search key; restore it.
*cache_key = cache_key_orig;
}
}
None
}
/// Return a locked buffer for given block.
///
/// Like try_lock_for_read(), if the search criteria is not exact and the
/// page is already found in the cache, *cache_key is updated.
///
/// If the page is not found in the cache, this allocates a new buffer for
/// it. The caller may then initialize the buffer with the contents, and
/// call mark_valid().
///
/// Example usage:
///
/// ```ignore
/// let cache = page_cache::get();
///
/// match cache.lock_for_read(&key) {
/// ReadBufResult::Found(read_guard) => {
/// // The page was found in cache. Use it
/// },
/// ReadBufResult::NotFound(write_guard) => {
/// // The page was not found in cache. Read it from disk into the
/// // buffer.
/// //read_my_page_from_disk(write_guard);
///
/// // The buffer contents are now valid. Tell the page cache.
/// write_guard.mark_valid();
/// },
/// }
/// ```
///
#[allow(unused)] // this is currently unused
fn lock_for_read(&self, cache_key: &mut CacheKey) -> ReadBufResult {
loop {
// First check if the key already exists in the cache.
if let Some(read_guard) = self.try_lock_for_read(cache_key) {
return ReadBufResult::Found(read_guard);
}
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self.find_victim();
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
// our victim buffer unnecessarily. Put it into the free list and
// continue with the slot that the other thread chose.
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
// TODO: put to free list
// We now just loop back to start from beginning. This is not
// optimal, we'll perform the lookup in the mapping again, which
// is not really necessary because we already got
// 'existing_slot_idx'. But this shouldn't happen often enough
// to matter much.
continue;
}
// Make the slot ready
let slot = &self.slots[slot_idx];
inner.key = Some(cache_key.clone());
slot.usage_count.store(1, Ordering::Relaxed);
return ReadBufResult::NotFound(PageWriteGuard {
inner,
valid: false,
});
}
}
/// Look up a page in the cache and lock it in write mode. If it's not
/// found, returns None.
///
/// When locking a page for writing, the search criteria is always "exact".
fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we already released the mapping
// lock, another thread could have evicted the page)
let slot = &self.slots[slot_idx];
let inner = slot.inner.write().unwrap();
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
return Some(PageWriteGuard { inner, valid: true });
}
}
None
}
/// Return a write-locked buffer for given block.
///
/// Similar to lock_for_read(), but the returned buffer is write-locked and
/// may be modified by the caller even if it's already found in the cache.
fn lock_for_write(&self, cache_key: &CacheKey) -> WriteBufResult {
loop {
// First check if the key already exists in the cache.
if let Some(write_guard) = self.try_lock_for_write(cache_key) {
return WriteBufResult::Found(write_guard);
}
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self.find_victim();
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
// our victim buffer unnecessarily. Put it into the free list and
// continue with the slot that the other thread chose.
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
// TODO: put to free list
// We now just loop back to start from beginning. This is not
// optimal, we'll perform the lookup in the mapping again, which
// is not really necessary because we already got
// 'existing_slot_idx'. But this shouldn't happen often enough
// to matter much.
continue;
}
// Make the slot ready
let slot = &self.slots[slot_idx];
inner.key = Some(cache_key.clone());
slot.usage_count.store(1, Ordering::Relaxed);
return WriteBufResult::NotFound(PageWriteGuard {
inner,
valid: false,
});
}
}
//
// Section 3: Mapping functions
//
/// Search for a page in the cache using the given search key.
///
/// Returns the slot index, if any. If the search criteria is not exact,
/// *cache_key is updated with the actual key of the found page.
///
/// NOTE: We don't hold any lock on the mapping on return, so the slot might
/// get recycled for an unrelated page immediately after this function
/// returns. The caller is responsible for re-checking that the slot still
/// contains the page with the same key before using it.
///
fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
match cache_key {
CacheKey::MaterializedPage { hash_key, lsn } => {
let map = self.materialized_page_map.read().unwrap();
let versions = map.get(hash_key)?;
let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
Ok(version_idx) => version_idx,
Err(0) => return None,
Err(version_idx) => version_idx - 1,
};
let version = &versions[version_idx];
*lsn = version.lsn;
Some(version.slot_idx)
}
}
}
/// Search for a page in the cache using the given search key.
///
/// Like 'search_mapping', but performs an "exact" search. Used for
/// allocating a new buffer.
fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
match key {
CacheKey::MaterializedPage { hash_key, lsn } => {
let map = self.materialized_page_map.read().unwrap();
let versions = map.get(hash_key)?;
if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
Some(versions[version_idx].slot_idx)
} else {
None
}
}
}
}
///
/// Remove mapping for given key.
///
fn remove_mapping(&self, old_key: &CacheKey) {
match old_key {
CacheKey::MaterializedPage {
hash_key: old_hash_key,
lsn: old_lsn,
} => {
let mut map = self.materialized_page_map.write().unwrap();
if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
let versions = old_entry.get_mut();
if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
versions.remove(version_idx);
if versions.is_empty() {
old_entry.remove_entry();
}
}
} else {
panic!()
}
}
}
}
///
/// Insert mapping for given key.
///
/// If a mapping already existed for the given key, returns the slot index
/// of the existing mapping and leaves it untouched.
fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
match new_key {
CacheKey::MaterializedPage {
hash_key: new_key,
lsn: new_lsn,
} => {
let mut map = self.materialized_page_map.write().unwrap();
let versions = map.entry(new_key.clone()).or_default();
match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
Ok(version_idx) => Some(versions[version_idx].slot_idx),
Err(version_idx) => {
versions.insert(
version_idx,
Version {
lsn: *new_lsn,
slot_idx,
},
);
None
}
}
}
}
}
//
// Section 5: Misc internal helpers
//
/// Find a slot to evict.
///
/// On return, the slot is empty and write-locked.
fn find_victim(&self) -> (usize, RwLockWriteGuard<SlotInner>) {
let iter_limit = self.slots.len() * 2;
let mut iters = 0;
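// Sweep the clock hand, decrementing usage counts as we go. If two full
// passes find no slot with a zero usage count, evict whichever slot the
// hand is on, so the search always terminates.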
loop {
let slot_idx = self.next_evict_slot.fetch_add(1, Ordering::Relaxed) % self.slots.len();
let slot = &self.slots[slot_idx];
if slot.dec_usage_count() == 0 || iters >= iter_limit {
let mut inner = slot.inner.write().unwrap();
if let Some(old_key) = &inner.key {
// TODO: if we supported storing dirty pages, this is where
// we'd need to write it to disk
// remove mapping for old buffer
self.remove_mapping(old_key);
inner.key = None;
}
return (slot_idx, inner);
}
iters += 1;
}
}
/// Initialize a new page cache
///
/// This should be called only once at page server startup.
fn new(num_pages: usize) -> Self {
assert!(num_pages > 0, "page cache size must be > 0");
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
let slots = page_buffer
.chunks_exact_mut(PAGE_SZ)
.map(|chunk| {
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
Slot {
inner: RwLock::new(SlotInner { key: None, buf }),
usage_count: AtomicU8::new(0),
}
})
.collect();
Self {
materialized_page_map: Default::default(),
slots,
next_evict_slot: AtomicUsize::new(0),
}
}
}